From 3044646148cdfa83a311bf1c146a70e550280159 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 9 Nov 2008 10:07:58 -0800 Subject: x86: move iomap.h to the new include location a new file was accidentally added to include/asm-x86; move it to the new arch/x86/include/asm location Signed-off-by: Arjan van de Ven --- include/asm-x86/iomap.h | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 include/asm-x86/iomap.h (limited to 'include') diff --git a/include/asm-x86/iomap.h b/include/asm-x86/iomap.h deleted file mode 100644 index c1f06289b14b..000000000000 --- a/include/asm-x86/iomap.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright © 2008 Ingo Molnar - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. - */ - -#include -#include -#include -#include -#include -#include - -void * -iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); - -void -iounmap_atomic(void *kvaddr, enum km_type type); -- cgit v1.2.3-71-gd317 From f6d87f4bd259cf33e092cd1a8fde05f291c47af1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Nov 2008 13:18:30 +0100 Subject: genirq: keep affinities set from userspace across free/request_irq() Impact: preserve user-modified affinities on interrupts Kumar Galak noticed that commit 18404756765c713a0be4eb1082920c04822ce588 (genirq: Expose default irq affinity mask (take 3)) overrides an already set affinity setting across a free / request_irq(). Happens e.g. with ifdown/ifup of a network device. Change the logic to mark the affinities as set and keep them intact. 
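For context, such an affinity is normally pinned from userspace by writing a hex cpumask to /proc/irq/<irq>/smp_affinity, which lands in irq_affinity_proc_write() and, with this patch, marks the descriptor with IRQ_AFFINITY_SET. A minimal userspace sketch of that path (the IRQ number and mask below are made up):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Pin hypothetical IRQ 30 to CPU1 (hex mask 0x2). */
	int fd = open("/proc/irq/30/smp_affinity", O_WRONLY);

	if (fd < 0 || write(fd, "2\n", 2) != 2) {
		perror("smp_affinity");
		return 1;
	}
	close(fd);
	/* With IRQ_AFFINITY_SET, this mask now survives the driver's
	 * free_irq()/request_irq() cycle (e.g. ifdown/ifup). */
	return 0;
}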
This also fixes the unlocked access to irq_desc in irq_select_affinity() when called from irq_affinity_proc_write() Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/irq.h | 8 ++----- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 58 +++++++++++++++++++++++++++++++++++++++++--------- kernel/irq/migration.c | 11 ---------- kernel/irq/proc.c | 2 +- 5 files changed, 53 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index d058c57be02d..36b186eb318b 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -63,7 +63,8 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_MOVE_PENDING 0x00200000 /* need to re-target IRQ destination */ #define IRQ_NO_BALANCING 0x00400000 /* IRQ is excluded from balancing */ #define IRQ_SPURIOUS_DISABLED 0x00800000 /* IRQ was disabled by the spurious trap */ -#define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ +#define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ +#define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/ #ifdef CONFIG_IRQ_PER_CPU # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) @@ -210,7 +211,6 @@ extern int setup_irq(unsigned int irq, struct irqaction *new); #ifdef CONFIG_GENERIC_PENDING_IRQ -void set_pending_irq(unsigned int irq, cpumask_t mask); void move_native_irq(int irq); void move_masked_irq(int irq); @@ -228,10 +228,6 @@ static inline void move_masked_irq(int irq) { } -static inline void set_pending_irq(unsigned int irq, cpumask_t mask) -{ -} - #endif /* CONFIG_GENERIC_PENDING_IRQ */ #else /* CONFIG_SMP */ diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c9767e641980..64c1c7253dae 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -25,6 +25,8 @@ static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif +extern int irq_select_affinity_usr(unsigned int irq); + /* * Debugging printout: */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c498a1b8c621..634a2a955104 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -82,24 +82,27 @@ int irq_can_set_affinity(unsigned int irq) int irq_set_affinity(unsigned int irq, cpumask_t cpumask) { struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; if (!desc->chip->set_affinity) return -EINVAL; + spin_lock_irqsave(&desc->lock, flags); + #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); desc->affinity = cpumask; desc->chip->set_affinity(irq, cpumask); - spin_unlock_irqrestore(&desc->lock, flags); - } else - set_pending_irq(irq, cpumask); + } else { + desc->status |= IRQ_MOVE_PENDING; + desc->pending_mask = cpumask; + } #else desc->affinity = cpumask; desc->chip->set_affinity(irq, cpumask); #endif + desc->status |= IRQ_AFFINITY_SET; + spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -107,24 +110,59 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) /* * Generic version of the affinity autoselector. */ -int irq_select_affinity(unsigned int irq) +int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) { cpumask_t mask; - struct irq_desc *desc; if (!irq_can_set_affinity(irq)) return 0; cpus_and(mask, cpu_online_map, irq_default_affinity); - desc = irq_to_desc(irq); + /* + * Preserve an userspace affinity setup, but make sure that + * one of the targets is online. 
+ */ + if (desc->status & IRQ_AFFINITY_SET) { + if (cpus_intersects(desc->affinity, cpu_online_map)) + mask = desc->affinity; + else + desc->status &= ~IRQ_AFFINITY_SET; + } + desc->affinity = mask; desc->chip->set_affinity(irq, mask); return 0; } +#else +static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d) +{ + return irq_select_affinity(irq); +} #endif +/* + * Called when affinity is set via /proc/irq + */ +int irq_select_affinity_usr(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + int ret; + + spin_lock_irqsave(&desc->lock, flags); + ret = do_irq_select_affinity(irq, desc); + spin_unlock_irqrestore(&desc->lock, flags); + + return ret; +} + +#else +static inline int do_select_irq_affinity(int irq, struct irq_desc *desc) +{ + return 0; +} #endif /** @@ -446,7 +484,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) desc->depth = 1; /* Set default affinity mask once everything is setup */ - irq_select_affinity(irq); + do_irq_select_affinity(irq, desc); } else if ((new->flags & IRQF_TRIGGER_MASK) && (new->flags & IRQF_TRIGGER_MASK) diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 90b920d3f52b..9db681d95814 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -1,17 +1,6 @@ #include -void set_pending_irq(unsigned int irq, cpumask_t mask) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); - desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = mask; - spin_unlock_irqrestore(&desc->lock, flags); -} - void move_masked_irq(int irq) { struct irq_desc *desc = irq_to_desc(irq); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4d161c70ba55..d257e7d6a8a4 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -62,7 +62,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, if (!cpus_intersects(new_value, cpu_online_map)) /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - return irq_select_affinity(irq) ? -EINVAL : count; + return irq_select_affinity_usr(irq) ? -EINVAL : count; irq_set_affinity(irq, new_value); -- cgit v1.2.3-71-gd317 From a358324466b171e145df20bdb74fe81759906de6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 11 Nov 2008 15:01:42 -0500 Subject: ring-buffer: buffer record on/off switch Impact: enable/disable ring buffer recording API added Several kernel developers have requested that there be a way to stop recording into the ring buffers with a simple switch that can also be enabled from userspace. This patch addes a new kernel API to the ring buffers called: tracing_on() tracing_off() When tracing_off() is called, all ring buffers will not be able to record into their buffers. tracing_on() will enable the ring buffers again. These two act like an on/off switch. That is, there is no counting of the number of times tracing_off or tracing_on has been called. A new file is added to the debugfs/tracing directory called tracing_on This allows for userspace applications to also flip the switch. echo 0 > debugfs/tracing/tracing_on disables the tracing. echo 1 > /debugfs/tracing/tracing_on enables it. Note, this does not disable or enable any tracers. It only sets or clears a flag that needs to be set in order for the ring buffers to write to their buffers. It is a global flag, and affects all ring buffers. The buffers start out with tracing_on enabled. 
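A typical in-kernel use is to freeze the buffers the moment a suspect condition is seen, so the trace leading up to it is preserved; a minimal sketch (the anomaly check and helper name are hypothetical):

#include <linux/kernel.h>
#include <linux/ring_buffer.h>

static void my_check_state(int status)	/* hypothetical helper */
{
	if (unlikely(status < 0))
		tracing_off();	/* keep buffer contents, just stop recording */
}

Recording can later be resumed either with tracing_on() from the kernel or from userspace via the tracing_on file shown above.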
There are now three flags that control recording into the buffers: tracing_on: which affects all ring buffer tracers. buffer->record_disabled: which affects an allocated buffer, which may be set if an anomaly is detected, and tracing is disabled. cpu_buffer->record_disabled: which is set by tracing_stop() or if an anomaly is detected. tracing_start can not reenable this if an anomaly occurred. The userspace debugfs/tracing/tracing_enabled is implemented with tracing_stop() but the user space code can not enable it if the kernel called tracing_stop(). Userspace can enable the tracing_on even if the kernel disabled it. It is just a switch used to stop tracing if a condition was hit. tracing_on is not for protecting critical areas in the kernel nor is it for stopping tracing if an anomaly occurred. This is because userspace can reenable it at any time. Side effect: With this patch, I discovered a dead variable in ftrace.c called tracing_on. This patch removes it. Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 3 ++ kernel/trace/ftrace.c | 8 +--- kernel/trace/ring_buffer.c | 101 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 536b0ca46a03..e097c2e6b6dc 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -120,6 +120,9 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer); u64 ring_buffer_time_stamp(int cpu); void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); +void tracing_on(void); +void tracing_off(void); + enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4a39d24568c8..14fa52297b28 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -185,7 +185,6 @@ enum { }; static int ftrace_filtered; -static int tracing_on; static LIST_HEAD(ftrace_new_addrs); @@ -506,13 +505,10 @@ static int __ftrace_modify_code(void *data) { int *command = data; - if (*command & FTRACE_ENABLE_CALLS) { + if (*command & FTRACE_ENABLE_CALLS) ftrace_replace_code(1); - tracing_on = 1; - } else if (*command & FTRACE_DISABLE_CALLS) { + else if (*command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); - tracing_on = 0; - } if (*command & FTRACE_UPDATE_TRACE_FUNC) ftrace_update_ftrace_func(ftrace_trace_function); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2f76193c3489..b08ee9f00c8d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -16,6 +16,35 @@ #include #include +#include "trace.h" + +/* Global flag to disable all recording to ring buffers */ +static int ring_buffers_off __read_mostly; + +/** + * tracing_on - enable all tracing buffers + * + * This function enables all tracing buffers that may have been + * disabled with tracing_off. + */ +void tracing_on(void) +{ + ring_buffers_off = 0; +} + +/** + * tracing_off - turn off all tracing buffers + * + * This function stops all tracing buffers from recording data. + * It does not disable any overhead the tracers themselves may + * be causing. This function simply causes all recording to + * the ring buffers to fail. 
+ */ +void tracing_off(void) +{ + ring_buffers_off = 1; +} + /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 @@ -1133,6 +1162,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, struct ring_buffer_event *event; int cpu, resched; + if (ring_buffers_off) + return NULL; + if (atomic_read(&buffer->record_disabled)) return NULL; @@ -1249,6 +1281,9 @@ int ring_buffer_write(struct ring_buffer *buffer, int ret = -EBUSY; int cpu, resched; + if (ring_buffers_off) + return -EBUSY; + if (atomic_read(&buffer->record_disabled)) return -EBUSY; @@ -2070,3 +2105,69 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, return 0; } +static ssize_t +rb_simple_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + int *p = filp->private_data; + char buf[64]; + int r; + + /* !ring_buffers_off == tracing_on */ + r = sprintf(buf, "%d\n", !*p); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +rb_simple_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + int *p = filp->private_data; + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + /* !ring_buffers_off == tracing_on */ + *p = !val; + + (*ppos)++; + + return cnt; +} + +static struct file_operations rb_simple_fops = { + .open = tracing_open_generic, + .read = rb_simple_read, + .write = rb_simple_write, +}; + + +static __init int rb_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("tracing_on", 0644, d_tracer, + &ring_buffers_off, &rb_simple_fops); + if (!entry) + pr_warning("Could not create debugfs 'tracing_on' entry\n"); + + return 0; +} + +fs_initcall(rb_init_debugfs); -- cgit v1.2.3-71-gd317 From ba32929a91fe2c0628f5be62d1597b379c8d3062 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 10 Nov 2008 15:29:58 +0900 Subject: block: make add_partition() return pointer to hd_struct Make add_partition() return pointer to the new hd_struct on success and ERR_PTR() value on failure. This change will be used to fix md autodetection bug. Signed-off-by: Tejun Heo Cc: Neil Brown Signed-off-by: Jens Axboe --- block/ioctl.c | 7 +++---- fs/partitions/check.c | 25 +++++++++++++------------ include/linux/genhd.h | 4 +++- 3 files changed, 19 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/block/ioctl.c b/block/ioctl.c index c832d639b6e2..d03985b04d67 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -18,7 +18,6 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user struct disk_part_iter piter; long long start, length; int partno; - int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -61,10 +60,10 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user disk_part_iter_exit(&piter); /* all seems OK */ - err = add_partition(disk, partno, start, length, - ADDPART_FLAG_NONE); + part = add_partition(disk, partno, start, length, + ADDPART_FLAG_NONE); mutex_unlock(&bdev->bd_mutex); - return err; + return IS_ERR(part) ? 
PTR_ERR(part) : 0; case BLKPG_DEL_PARTITION: part = disk_get_part(disk, partno); if (!part) diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 90bcf136a9de..633025340239 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -348,8 +348,8 @@ static ssize_t whole_disk_show(struct device *dev, static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, whole_disk_show, NULL); -int add_partition(struct gendisk *disk, int partno, - sector_t start, sector_t len, int flags) +struct hd_struct *add_partition(struct gendisk *disk, int partno, + sector_t start, sector_t len, int flags) { struct hd_struct *p; dev_t devt = MKDEV(0, 0); @@ -361,15 +361,15 @@ int add_partition(struct gendisk *disk, int partno, err = disk_expand_part_tbl(disk, partno); if (err) - return err; + return ERR_PTR(err); ptbl = disk->part_tbl; if (ptbl->part[partno]) - return -EBUSY; + return ERR_PTR(-EBUSY); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) - return -ENOMEM; + return ERR_PTR(-EBUSY); if (!init_part_stats(p)) { err = -ENOMEM; @@ -424,20 +424,20 @@ int add_partition(struct gendisk *disk, int partno, if (!ddev->uevent_suppress) kobject_uevent(&pdev->kobj, KOBJ_ADD); - return 0; + return p; out_free_stats: free_part_stats(p); out_free: kfree(p); - return err; + return ERR_PTR(err); out_del: kobject_put(p->holder_dir); device_del(pdev); out_put: put_device(pdev); blk_free_devt(devt); - return err; + return ERR_PTR(err); } /* Not exported, helper to add_disk(). */ @@ -568,10 +568,11 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) disk->disk_name, p, (unsigned long long) size); size = get_capacity(disk) - from; } - res = add_partition(disk, p, from, size, state->parts[p].flags); - if (res) { - printk(KERN_ERR " %s: p%d could not be added: %d\n", - disk->disk_name, p, -res); + part = add_partition(disk, p, from, size, + state->parts[p].flags); + if (IS_ERR(part)) { + printk(KERN_ERR " %s: p%d could not be added: %ld\n", + disk->disk_name, p, -PTR_ERR(part)); continue; } #ifdef CONFIG_BLK_DEV_MD diff --git a/include/linux/genhd.h b/include/linux/genhd.h index e439e6aed832..3df7742ce246 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -522,7 +522,9 @@ extern char *disk_name (struct gendisk *hd, int partno, char *buf); extern int disk_expand_part_tbl(struct gendisk *disk, int target); extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); -extern int __must_check add_partition(struct gendisk *, int, sector_t, sector_t, int); +extern struct hd_struct * __must_check add_partition(struct gendisk *disk, + int partno, sector_t start, + sector_t len, int flags); extern void delete_partition(struct gendisk *, int); extern void printk_all_partitions(void); -- cgit v1.2.3-71-gd317 From 8e3bad65a59915f2ddc40f62a180ad81695d8440 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 17 Nov 2008 10:59:59 +0100 Subject: mac80211: remove ieee80211_notify_mac Before ieee80211_notify_mac() was added, it was presented with the use case of using it to tell mac80211 that the association may have been lost because the firmware crashed/reset. Since then, it has also been used by iwlwifi to (slightly) speed up re-association after resume, a workaround around the fact that mac80211 has no suspend/resume handling yet. It is also not used by any other drivers, so clearly it cannot be necessary for "good enough" suspend/resume. Unfortunately, the callback suffers from a severe problem: It only works for station mode. 
If suspend/resume happens while in IBSS or any other mode (but station), then the callback is pointless. Recently, it has created a number of locking issues, first because it required rtnl locking rather than RCU due to calling sleeping functions within the critical section, and now because it's called by iwlwifi from the mac80211 workqueue that may not use the rtnl because it is flushed under rtnl. (cf. http://bugzilla.kernel.org/show_bug.cgi?id=12046) I think, therefore, that we should take a step back, remove it entirely for now and add the small feature it provided properly. For suspend and resume we will need to introduce new hooks, and for the case where the firmware was reset the driver will probably simply just pretend it has done a suspend/resume cycle to get mac80211 to reprogram the hardware completely, not just try to connect to the current AP again in station mode. When doing so, we will need to take into account locking issues and possibly defer to schedule_work from within mac80211 for the resume operation, while the suspend operation must be done directly. Proper suspend/resume should also not necessarily try to reconnect to the current AP, the time spent in suspend may have been short enough to not be disconnected from the AP, mac80211 will detect that the AP went out of range quickly if it did, and if the association is lost then the AP will disassoc as soon as a data frame is sent. We might also take into account WWOL then, and have mac80211 program the hardware into such a mode where it is available and requested. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- drivers/net/wireless/iwlwifi/iwl-agn.c | 1 - drivers/net/wireless/iwlwifi/iwl3945-base.c | 1 - include/net/mac80211.h | 20 -------------------- net/mac80211/mlme.c | 22 ---------------------- 4 files changed, 44 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c index 8d690a0eb1a9..6751bb2b8ae2 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn.c @@ -2341,7 +2341,6 @@ static void iwl_bg_alive_start(struct work_struct *data) mutex_lock(&priv->mutex); iwl_alive_start(priv); mutex_unlock(&priv->mutex); - ieee80211_notify_mac(priv->hw, IEEE80211_NOTIFY_RE_ASSOC); } static void iwl4965_bg_rf_kill(struct work_struct *work) diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c index 285b53e7e261..45a6b0c35695 100644 --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c @@ -6012,7 +6012,6 @@ static void iwl3945_bg_alive_start(struct work_struct *data) mutex_lock(&priv->mutex); iwl3945_alive_start(priv); mutex_unlock(&priv->mutex); - ieee80211_notify_mac(priv->hw, IEEE80211_NOTIFY_RE_ASSOC); } static void iwl3945_bg_rf_kill(struct work_struct *work) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 8856e2d60e9f..73d81bc6aa75 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -73,14 +73,6 @@ * not do so then mac80211 may add this under certain circumstances. 
*/ -/** - * enum ieee80211_notification_type - Low level driver notification - * @IEEE80211_NOTIFY_RE_ASSOC: start the re-association sequence - */ -enum ieee80211_notification_types { - IEEE80211_NOTIFY_RE_ASSOC, -}; - /** * struct ieee80211_ht_bss_info - describing BSS's HT characteristics * @@ -1797,18 +1789,6 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u8 tid); void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_hw *hw, const u8 *ra, u16 tid); -/** - * ieee80211_notify_mac - low level driver notification - * @hw: pointer as obtained from ieee80211_alloc_hw(). - * @notif_type: enum ieee80211_notification_types - * - * This function must be called by low level driver to inform mac80211 of - * low level driver status change or force mac80211 to re-assoc for low - * level driver internal error that require re-assoc. - */ -void ieee80211_notify_mac(struct ieee80211_hw *hw, - enum ieee80211_notification_types notif_type); - /** * ieee80211_find_sta - find a station * diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 14d165f0df75..409bb7716236 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2560,25 +2560,3 @@ void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local) ieee80211_restart_sta_timer(sdata); rcu_read_unlock(); } - -/* driver notification call */ -void ieee80211_notify_mac(struct ieee80211_hw *hw, - enum ieee80211_notification_types notif_type) -{ - struct ieee80211_local *local = hw_to_local(hw); - struct ieee80211_sub_if_data *sdata; - - switch (notif_type) { - case IEEE80211_NOTIFY_RE_ASSOC: - rtnl_lock(); - list_for_each_entry(sdata, &local->interfaces, list) { - if (sdata->vif.type != NL80211_IFTYPE_STATION) - continue; - - ieee80211_sta_req_auth(sdata, &sdata->u.sta); - } - rtnl_unlock(); - break; - } -} -EXPORT_SYMBOL(ieee80211_notify_mac); -- cgit v1.2.3-71-gd317 From de11defebf00007677fb7ee91d9b089b78786fbb Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 19 Nov 2008 15:36:14 -0800 Subject: reintroduce accept4 Introduce a new accept4() system call. The addition of this system call matches analogous changes in 2.6.27 (dup3(), evenfd2(), signalfd4(), inotify_init1(), epoll_create1(), pipe2()) which added new system calls that differed from analogous traditional system calls in adding a flags argument that can be used to access additional functionality. The accept4() system call is exactly the same as accept(), except that it adds a flags bit-mask argument. Two flags are initially implemented. (Most of the new system calls in 2.6.27 also had both of these flags.) SOCK_CLOEXEC causes the close-on-exec (FD_CLOEXEC) flag to be enabled for the new file descriptor returned by accept4(). This is a useful security feature to avoid leaking information in a multithreaded program where one thread is doing an accept() at the same time as another thread is doing a fork() plus exec(). More details here: http://udrepper.livejournal.com/20407.html "Secure File Descriptor Handling", Ulrich Drepper). The other flag is SOCK_NONBLOCK, which causes the O_NONBLOCK flag to be enabled on the new open file description created by accept4(). (This flag is merely a convenience, saving the use of additional calls fcntl(F_GETFL) and fcntl (F_SETFL) to achieve the same result. Here's a test program. Works on x86-32. Should work on x86-64, but I (mtk) don't have a system to hand to test with. 
It tests accept4() with each of the four possible combinations of SOCK_CLOEXEC and SOCK_NONBLOCK set/clear in 'flags', and verifies that the appropriate flags are set on the file descriptor/open file description returned by accept4(). I tested Ulrich's patch in this thread by applying against 2.6.28-rc2, and it passes according to my test program. /* test_accept4.c Copyright (C) 2008, Linux Foundation, written by Michael Kerrisk Licensed under the GNU GPLv2 or later. */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #define PORT_NUM 33333 #define die(msg) do { perror(msg); exit(EXIT_FAILURE); } while (0) /**********************************************************************/ /* The following is what we need until glibc gets a wrapper for accept4() */ /* Flags for socket(), socketpair(), accept4() */ #ifndef SOCK_CLOEXEC #define SOCK_CLOEXEC O_CLOEXEC #endif #ifndef SOCK_NONBLOCK #define SOCK_NONBLOCK O_NONBLOCK #endif #ifdef __x86_64__ #define SYS_accept4 288 #elif __i386__ #define USE_SOCKETCALL 1 #define SYS_ACCEPT4 18 #else #error "Sorry -- don't know the syscall # on this architecture" #endif static int accept4(int fd, struct sockaddr *sockaddr, socklen_t *addrlen, int flags) { printf("Calling accept4(): flags = %x", flags); if (flags != 0) { printf(" ("); if (flags & SOCK_CLOEXEC) printf("SOCK_CLOEXEC"); if ((flags & SOCK_CLOEXEC) && (flags & SOCK_NONBLOCK)) printf(" "); if (flags & SOCK_NONBLOCK) printf("SOCK_NONBLOCK"); printf(")"); } printf("\n"); #if USE_SOCKETCALL long args[6]; args[0] = fd; args[1] = (long) sockaddr; args[2] = (long) addrlen; args[3] = flags; return syscall(SYS_socketcall, SYS_ACCEPT4, args); #else return syscall(SYS_accept4, fd, sockaddr, addrlen, flags); #endif } /**********************************************************************/ static int do_test(int lfd, struct sockaddr_in *conn_addr, int closeonexec_flag, int nonblock_flag) { int connfd, acceptfd; int fdf, flf, fdf_pass, flf_pass; struct sockaddr_in claddr; socklen_t addrlen; printf("=======================================\n"); connfd = socket(AF_INET, SOCK_STREAM, 0); if (connfd == -1) die("socket"); if (connect(connfd, (struct sockaddr *) conn_addr, sizeof(struct sockaddr_in)) == -1) die("connect"); addrlen = sizeof(struct sockaddr_in); acceptfd = accept4(lfd, (struct sockaddr *) &claddr, &addrlen, closeonexec_flag | nonblock_flag); if (acceptfd == -1) { perror("accept4()"); close(connfd); return 0; } fdf = fcntl(acceptfd, F_GETFD); if (fdf == -1) die("fcntl:F_GETFD"); fdf_pass = ((fdf & FD_CLOEXEC) != 0) == ((closeonexec_flag & SOCK_CLOEXEC) != 0); printf("Close-on-exec flag is %sset (%s); ", (fdf & FD_CLOEXEC) ? "" : "not ", fdf_pass ? "OK" : "failed"); flf = fcntl(acceptfd, F_GETFL); if (flf == -1) die("fcntl:F_GETFD"); flf_pass = ((flf & O_NONBLOCK) != 0) == ((nonblock_flag & SOCK_NONBLOCK) !=0); printf("nonblock flag is %sset (%s)\n", (flf & O_NONBLOCK) ? "" : "not ", flf_pass ? "OK" : "failed"); close(acceptfd); close(connfd); printf("Test result: %s\n", (fdf_pass && flf_pass) ? 
"PASS" : "FAIL"); return fdf_pass && flf_pass; } static int create_listening_socket(int port_num) { struct sockaddr_in svaddr; int lfd; int optval; memset(&svaddr, 0, sizeof(struct sockaddr_in)); svaddr.sin_family = AF_INET; svaddr.sin_addr.s_addr = htonl(INADDR_ANY); svaddr.sin_port = htons(port_num); lfd = socket(AF_INET, SOCK_STREAM, 0); if (lfd == -1) die("socket"); optval = 1; if (setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)) == -1) die("setsockopt"); if (bind(lfd, (struct sockaddr *) &svaddr, sizeof(struct sockaddr_in)) == -1) die("bind"); if (listen(lfd, 5) == -1) die("listen"); return lfd; } int main(int argc, char *argv[]) { struct sockaddr_in conn_addr; int lfd; int port_num; int passed; passed = 1; port_num = (argc > 1) ? atoi(argv[1]) : PORT_NUM; memset(&conn_addr, 0, sizeof(struct sockaddr_in)); conn_addr.sin_family = AF_INET; conn_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); conn_addr.sin_port = htons(port_num); lfd = create_listening_socket(port_num); if (!do_test(lfd, &conn_addr, 0, 0)) passed = 0; if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, 0)) passed = 0; if (!do_test(lfd, &conn_addr, 0, SOCK_NONBLOCK)) passed = 0; if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, SOCK_NONBLOCK)) passed = 0; close(lfd); exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); } [mtk.manpages@gmail.com: rewrote changelog, updated test program] Signed-off-by: Ulrich Drepper Tested-by: Michael Kerrisk Acked-by: Michael Kerrisk Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/unistd_64.h | 4 +- include/linux/net.h | 6 +-- include/linux/syscalls.h | 3 +- kernel/sys_ni.c | 2 +- net/compat.c | 50 +++---------------------- net/socket.c | 80 +++++----------------------------------- 6 files changed, 21 insertions(+), 124 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 834b2c1d89fb..d2e415e6666f 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -639,8 +639,8 @@ __SYSCALL(__NR_fallocate, sys_fallocate) __SYSCALL(__NR_timerfd_settime, sys_timerfd_settime) #define __NR_timerfd_gettime 287 __SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime) -#define __NR_paccept 288 -__SYSCALL(__NR_paccept, sys_paccept) +#define __NR_accept4 288 +__SYSCALL(__NR_accept4, sys_accept4) #define __NR_signalfd4 289 __SYSCALL(__NR_signalfd4, sys_signalfd4) #define __NR_eventfd2 290 diff --git a/include/linux/net.h b/include/linux/net.h index 6dc14a240042..4515efae4c39 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -40,7 +40,7 @@ #define SYS_GETSOCKOPT 15 /* sys_getsockopt(2) */ #define SYS_SENDMSG 16 /* sys_sendmsg(2) */ #define SYS_RECVMSG 17 /* sys_recvmsg(2) */ -#define SYS_PACCEPT 18 /* sys_paccept(2) */ +#define SYS_ACCEPT4 18 /* sys_accept4(2) */ typedef enum { SS_FREE = 0, /* not allocated */ @@ -100,7 +100,7 @@ enum sock_type { * remaining bits are used as flags. 
*/ #define SOCK_TYPE_MASK 0xf -/* Flags for socket, socketpair, paccept */ +/* Flags for socket, socketpair, accept4 */ #define SOCK_CLOEXEC O_CLOEXEC #ifndef SOCK_NONBLOCK #define SOCK_NONBLOCK O_NONBLOCK @@ -223,8 +223,6 @@ extern int sock_map_fd(struct socket *sock, int flags); extern struct socket *sockfd_lookup(int fd, int *err); #define sockfd_put(sock) fput(sock->file) extern int net_ratelimit(void); -extern long do_accept(int fd, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, int flags); #define net_random() random32() #define net_srandom(seed) srandom32((__force u32)seed) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index d6ff145919ca..04fb47bfb920 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -410,8 +410,7 @@ asmlinkage long sys_getsockopt(int fd, int level, int optname, asmlinkage long sys_bind(int, struct sockaddr __user *, int); asmlinkage long sys_connect(int, struct sockaddr __user *, int); asmlinkage long sys_accept(int, struct sockaddr __user *, int __user *); -asmlinkage long sys_paccept(int, struct sockaddr __user *, int __user *, - const __user sigset_t *, size_t, int); +asmlinkage long sys_accept4(int, struct sockaddr __user *, int __user *, int); asmlinkage long sys_getsockname(int, struct sockaddr __user *, int __user *); asmlinkage long sys_getpeername(int, struct sockaddr __user *, int __user *); asmlinkage long sys_send(int, void __user *, size_t, unsigned); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a77b27b11b04..e14a23281707 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -31,7 +31,7 @@ cond_syscall(sys_socketpair); cond_syscall(sys_bind); cond_syscall(sys_listen); cond_syscall(sys_accept); -cond_syscall(sys_paccept); +cond_syscall(sys_accept4); cond_syscall(sys_connect); cond_syscall(sys_getsockname); cond_syscall(sys_getpeername); diff --git a/net/compat.c b/net/compat.c index 6ce1a1cadcc0..a3a2ba0fac08 100644 --- a/net/compat.c +++ b/net/compat.c @@ -725,7 +725,7 @@ EXPORT_SYMBOL(compat_mc_getsockopt); static unsigned char nas[19]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3), AL(3),AL(3),AL(4),AL(4),AL(4),AL(6), AL(6),AL(2),AL(5),AL(5),AL(3),AL(3), - AL(6)}; + AL(4)}; #undef AL asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags) @@ -738,52 +738,13 @@ asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, uns return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); } -asmlinkage long compat_sys_paccept(int fd, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, - const compat_sigset_t __user *sigmask, - compat_size_t sigsetsize, int flags) -{ - compat_sigset_t ss32; - sigset_t ksigmask, sigsaved; - int ret; - - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) - return -EFAULT; - sigset_from_compat(&ksigmask, &ss32); - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } - - ret = do_accept(fd, upeer_sockaddr, upeer_addrlen, flags); - - if (ret == -ERESTARTNOHAND) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. 
- */ - if (sigmask) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - - return ret; -} - asmlinkage long compat_sys_socketcall(int call, u32 __user *args) { int ret; u32 a[6]; u32 a0, a1; - if (call < SYS_SOCKET || call > SYS_PACCEPT) + if (call < SYS_SOCKET || call > SYS_ACCEPT4) return -EINVAL; if (copy_from_user(a, args, nas[call])) return -EFAULT; @@ -804,7 +765,7 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args) ret = sys_listen(a0, a1); break; case SYS_ACCEPT: - ret = do_accept(a0, compat_ptr(a1), compat_ptr(a[2]), 0); + ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0); break; case SYS_GETSOCKNAME: ret = sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2])); @@ -844,9 +805,8 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args) case SYS_RECVMSG: ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]); break; - case SYS_PACCEPT: - ret = compat_sys_paccept(a0, compat_ptr(a1), compat_ptr(a[2]), - compat_ptr(a[3]), a[4], a[5]); + case SYS_ACCEPT4: + ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]); break; default: ret = -EINVAL; diff --git a/net/socket.c b/net/socket.c index 57550c3bcabe..92764d836891 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1426,8 +1426,8 @@ asmlinkage long sys_listen(int fd, int backlog) * clean when we restucture accept also. */ -long do_accept(int fd, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, int flags) +asmlinkage long sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, + int __user *upeer_addrlen, int flags) { struct socket *sock, *newsock; struct file *newfile; @@ -1510,66 +1510,10 @@ out_fd: goto out_put; } -#if 0 -#ifdef HAVE_SET_RESTORE_SIGMASK -asmlinkage long sys_paccept(int fd, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, - const sigset_t __user *sigmask, - size_t sigsetsize, int flags) -{ - sigset_t ksigmask, sigsaved; - int ret; - - if (sigmask) { - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) - return -EFAULT; - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } - - ret = do_accept(fd, upeer_sockaddr, upeer_addrlen, flags); - - if (ret < 0 && signal_pending(current)) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. - */ - if (sigmask) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - - return ret; -} -#else -asmlinkage long sys_paccept(int fd, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, - const sigset_t __user *sigmask, - size_t sigsetsize, int flags) -{ - /* The platform does not support restoring the signal mask in the - * return path. So we do not allow using paccept() with a signal - * mask. 
*/ - if (sigmask) - return -EINVAL; - - return do_accept(fd, upeer_sockaddr, upeer_addrlen, flags); -} -#endif -#endif - asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen) { - return do_accept(fd, upeer_sockaddr, upeer_addrlen, 0); + return sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0); } /* @@ -2096,7 +2040,7 @@ static const unsigned char nargs[19]={ AL(0),AL(3),AL(3),AL(3),AL(2),AL(3), AL(3),AL(3),AL(4),AL(4),AL(4),AL(6), AL(6),AL(2),AL(5),AL(5),AL(3),AL(3), - AL(6) + AL(4) }; #undef AL @@ -2115,7 +2059,7 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args) unsigned long a0, a1; int err; - if (call < 1 || call > SYS_PACCEPT) + if (call < 1 || call > SYS_ACCEPT4) return -EINVAL; /* copy_from_user should be SMP safe. */ @@ -2143,9 +2087,8 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args) err = sys_listen(a0, a1); break; case SYS_ACCEPT: - err = - do_accept(a0, (struct sockaddr __user *)a1, - (int __user *)a[2], 0); + err = sys_accept4(a0, (struct sockaddr __user *)a1, + (int __user *)a[2], 0); break; case SYS_GETSOCKNAME: err = @@ -2192,12 +2135,9 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args) case SYS_RECVMSG: err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]); break; - case SYS_PACCEPT: - err = - sys_paccept(a0, (struct sockaddr __user *)a1, - (int __user *)a[2], - (const sigset_t __user *) a[3], - a[4], a[5]); + case SYS_ACCEPT4: + err = sys_accept4(a0, (struct sockaddr __user *)a1, + (int __user *)a[2], a[3]); break; default: err = -EINVAL; -- cgit v1.2.3-71-gd317 From f481891fdc49d3d1b8a9674a1825d183069a805f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 19 Nov 2008 15:36:30 -0800 Subject: cpuset: update top cpuset's mems after adding a node After adding a node into the machine, top cpuset's mems isn't updated. By reviewing the code, we found that the update function cpuset_track_online_nodes() was invoked after node_states[N_ONLINE] changes. It is wrong because N_ONLINE just means node has pgdat, and if node has/added memory, we use N_HIGH_MEMORY. So, We should invoke the update function after node_states[N_HIGH_MEMORY] changes, just like its commit says. This patch fixes it. And we use notifier of memory hotplug instead of direct calling of cpuset_track_online_nodes(). 
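For reference, the memory-hotplug notifier pattern used here looks roughly like the sketch below (the callback and init names are illustrative; the real callback is cpuset_track_online_nodes(), registered with priority 10 as in the diff):

#include <linux/init.h>
#include <linux/memory.h>
#include <linux/notifier.h>

static int my_mem_callback(struct notifier_block *self,
			   unsigned long action, void *arg)
{
	switch (action) {
	case MEM_ONLINE:
	case MEM_OFFLINE:
		/* node_states[N_HIGH_MEMORY] has changed; update consumers */
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static int __init my_subsys_init(void)
{
	hotplug_memory_notifier(my_mem_callback, 10);
	return 0;
}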
Signed-off-by: Miao Xie Acked-by: Yasunori Goto Cc: David Rientjes Cc: Paul Menage Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 4 ---- kernel/cpuset.c | 19 ++++++++++++++++--- mm/memory_hotplug.c | 3 --- 3 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 2691926fb506..8e540d32c9fe 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -74,8 +74,6 @@ static inline int cpuset_do_slab_mem_spread(void) return current->flags & PF_SPREAD_SLAB; } -extern void cpuset_track_online_nodes(void); - extern int current_cpuset_is_being_rebound(void); extern void rebuild_sched_domains(void); @@ -151,8 +149,6 @@ static inline int cpuset_do_slab_mem_spread(void) return 0; } -static inline void cpuset_track_online_nodes(void) {} - static inline int current_cpuset_is_being_rebound(void) { return 0; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 81fc6791a296..da7ff6137f37 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -2015,12 +2016,23 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. * See also the previous routine cpuset_track_online_cpus(). */ -void cpuset_track_online_nodes(void) +static int cpuset_track_online_nodes(struct notifier_block *self, + unsigned long action, void *arg) { cgroup_lock(); - top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - scan_for_empty_cpusets(&top_cpuset); + switch (action) { + case MEM_ONLINE: + top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + break; + case MEM_OFFLINE: + top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + scan_for_empty_cpusets(&top_cpuset); + break; + default: + break; + } cgroup_unlock(); + return NOTIFY_OK; } #endif @@ -2036,6 +2048,7 @@ void __init cpuset_init_smp(void) top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; hotcpu_notifier(cpuset_track_online_cpus, 0); + hotplug_memory_notifier(cpuset_track_online_nodes, 10); } /** diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6837a1014372..b5b2b15085a8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -498,8 +497,6 @@ int add_memory(int nid, u64 start, u64 size) /* we online node here. we can't roll back from here. */ node_set_online(nid); - cpuset_track_online_nodes(); - if (new_pgdat) { ret = register_one_node(nid); /* -- cgit v1.2.3-71-gd317 From 21098c68df7115554fe041170899bdff709efd08 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 20 Nov 2008 10:58:01 -0500 Subject: [SCSI] fc_transport: fix old bug on bitflag definitions When the fastfail flag was added, it did not account for the flags being bit fields. Correct the definition so there is no longer a conflict. 
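The conflict is easy to see: 0x03 is DEVLOSS_PENDING | SCAN_PENDING rather than a distinct bit, so a bitwise test for the fast-fail flag fires whenever either of the other two flags is set. A small standalone illustration (macro names shortened here):

#include <stdio.h>

#define DEVLOSS_PENDING		0x01
#define SCAN_PENDING		0x02
#define OLD_FAST_FAIL_TIMEDOUT	0x03	/* overlaps the two bits above */
#define NEW_FAST_FAIL_TIMEDOUT	0x04	/* a bit of its own */

int main(void)
{
	unsigned int flags = DEVLOSS_PENDING;	/* fast-fail never timed out */

	printf("old test: %d (spuriously true)\n", !!(flags & OLD_FAST_FAIL_TIMEDOUT));
	printf("new test: %d (correct)\n", !!(flags & NEW_FAST_FAIL_TIMEDOUT));
	return 0;
}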
Signed-off-by: James Smart Signed-off-by: James Bottomley --- include/scsi/scsi_transport_fc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/scsi/scsi_transport_fc.h b/include/scsi/scsi_transport_fc.h index 49d8913c4f86..6e04e6fe79c7 100644 --- a/include/scsi/scsi_transport_fc.h +++ b/include/scsi/scsi_transport_fc.h @@ -357,7 +357,7 @@ struct fc_rport { /* aka fc_starget_attrs */ /* bit field values for struct fc_rport "flags" field: */ #define FC_RPORT_DEVLOSS_PENDING 0x01 #define FC_RPORT_SCAN_PENDING 0x02 -#define FC_RPORT_FAST_FAIL_TIMEDOUT 0x03 +#define FC_RPORT_FAST_FAIL_TIMEDOUT 0x04 #define dev_to_rport(d) \ container_of(d, struct fc_rport, dev) -- cgit v1.2.3-71-gd317 From 7e56b5d698707a9934833c47b24d78fb0bcaf764 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 21 Nov 2008 16:45:22 -0800 Subject: net: Fix memory leak in the proto_register function If the slub allocator is used, kmem_cache_create() may merge two or more kmem_cache's into one but the cache name pointer is not updated and kmem_cache_name() is no longer guaranteed to return the pointer passed to the former function. This patch stores the kmalloc'ed pointers in the corresponding request_sock_ops and timewait_sock_ops structures. Signed-off-by: Catalin Marinas Acked-by: Arnaldo Carvalho de Melo Reviewed-by: Christoph Lameter Signed-off-by: David S. Miller --- include/net/request_sock.h | 1 + include/net/timewait_sock.h | 1 + net/core/sock.c | 31 ++++++++++++------------------- 3 files changed, 14 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index cac811e51f6d..c7190846e128 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -31,6 +31,7 @@ struct request_sock_ops { int family; int obj_size; struct kmem_cache *slab; + char *slab_name; int (*rtx_syn_ack)(struct sock *sk, struct request_sock *req); void (*send_ack)(struct sock *sk, struct sk_buff *skb, diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h index 1e1ee3253fd8..97c3b14da55d 100644 --- a/include/net/timewait_sock.h +++ b/include/net/timewait_sock.h @@ -16,6 +16,7 @@ struct timewait_sock_ops { struct kmem_cache *twsk_slab; + char *twsk_slab_name; unsigned int twsk_obj_size; int (*twsk_unique)(struct sock *sk, struct sock *sktw, void *twp); diff --git a/net/core/sock.c b/net/core/sock.c index 341e39456952..edf7220889a4 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2035,9 +2035,6 @@ static inline void release_proto_idx(struct proto *prot) int proto_register(struct proto *prot, int alloc_slab) { - char *request_sock_slab_name = NULL; - char *timewait_sock_slab_name; - if (alloc_slab) { prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, SLAB_HWCACHE_ALIGN, NULL); @@ -2051,12 +2048,12 @@ int proto_register(struct proto *prot, int alloc_slab) if (prot->rsk_prot != NULL) { static const char mask[] = "request_sock_%s"; - request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); - if (request_sock_slab_name == NULL) + prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); + if (prot->rsk_prot->slab_name == NULL) goto out_free_sock_slab; - sprintf(request_sock_slab_name, mask, prot->name); - prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name, + sprintf(prot->rsk_prot->slab_name, mask, prot->name); + prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name, prot->rsk_prot->obj_size, 0, 
SLAB_HWCACHE_ALIGN, NULL); @@ -2070,14 +2067,14 @@ int proto_register(struct proto *prot, int alloc_slab) if (prot->twsk_prot != NULL) { static const char mask[] = "tw_sock_%s"; - timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); + prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); - if (timewait_sock_slab_name == NULL) + if (prot->twsk_prot->twsk_slab_name == NULL) goto out_free_request_sock_slab; - sprintf(timewait_sock_slab_name, mask, prot->name); + sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name); prot->twsk_prot->twsk_slab = - kmem_cache_create(timewait_sock_slab_name, + kmem_cache_create(prot->twsk_prot->twsk_slab_name, prot->twsk_prot->twsk_obj_size, 0, SLAB_HWCACHE_ALIGN, NULL); @@ -2093,14 +2090,14 @@ int proto_register(struct proto *prot, int alloc_slab) return 0; out_free_timewait_sock_slab_name: - kfree(timewait_sock_slab_name); + kfree(prot->twsk_prot->twsk_slab_name); out_free_request_sock_slab: if (prot->rsk_prot && prot->rsk_prot->slab) { kmem_cache_destroy(prot->rsk_prot->slab); prot->rsk_prot->slab = NULL; } out_free_request_sock_slab_name: - kfree(request_sock_slab_name); + kfree(prot->rsk_prot->slab_name); out_free_sock_slab: kmem_cache_destroy(prot->slab); prot->slab = NULL; @@ -2123,18 +2120,14 @@ void proto_unregister(struct proto *prot) } if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) { - const char *name = kmem_cache_name(prot->rsk_prot->slab); - kmem_cache_destroy(prot->rsk_prot->slab); - kfree(name); + kfree(prot->rsk_prot->slab_name); prot->rsk_prot->slab = NULL; } if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { - const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab); - kmem_cache_destroy(prot->twsk_prot->twsk_slab); - kfree(name); + kfree(prot->twsk_prot->twsk_slab_name); prot->twsk_prot->twsk_slab = NULL; } } -- cgit v1.2.3-71-gd317 From 2ed1cdcf9a83205d1343f29b630abff232eaa72c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 21 Nov 2008 16:59:57 -0800 Subject: irq.h: fix missing/extra kernel-doc Impact: fix kernel-doc build Fix missing & excess irq.h kernel-doc: Warning(include/linux/irq.h:182): No description found for parameter 'irq' Warning(include/linux/irq.h:182): Excess struct/union/enum/typedef member 'affinity_entry' description in 'irq_desc' Signed-off-by: Randy Dunlap Cc: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/irq.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index 36b186eb318b..3dddfa703ebd 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -131,7 +131,7 @@ struct irq_chip { /** * struct irq_desc - interrupt descriptor - * + * @irq: interrupt number for this descriptor * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()] * @chip: low level interrupt hardware access * @msi_desc: MSI descriptor @@ -150,7 +150,6 @@ struct irq_chip { * @cpu: cpu index useful for balancing * @pending_mask: pending rebalanced interrupts * @dir: /proc/irq/ procfs entry - * @affinity_entry: /proc/irq/smp_affinity procfs entry on SMP * @name: flow handler name for /proc/interrupts output */ struct irq_desc { -- cgit v1.2.3-71-gd317 From 52440211dcdc52c0b757f8b34d122e11b12cdd50 Mon Sep 17 00:00:00 2001 From: Keith Packard Date: Tue, 18 Nov 2008 09:30:25 -0800 Subject: drm: move drm vblank initialization/cleanup to driver load/unload drm vblank initialization keeps track of the changes in driver-supplied 
frame counts across vt switch and mode setting, but only if you let it by not tearing down the drm vblank structure. Signed-off-by: Keith Packard Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_drv.c | 2 ++ drivers/gpu/drm/drm_irq.c | 4 +--- drivers/gpu/drm/i915/i915_dma.c | 7 +++++++ drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_irq.c | 5 ----- drivers/gpu/drm/mga/mga_dma.c | 8 ++++++++ drivers/gpu/drm/mga/mga_irq.c | 5 ----- drivers/gpu/drm/r128/r128_drv.c | 6 ++++++ drivers/gpu/drm/r128/r128_drv.h | 1 + drivers/gpu/drm/r128/r128_irq.c | 2 +- drivers/gpu/drm/radeon/radeon_cp.c | 6 ++++++ drivers/gpu/drm/radeon/radeon_irq.c | 5 ----- drivers/gpu/drm/via/via_irq.c | 1 - drivers/gpu/drm/via/via_map.c | 11 ++++++++++- include/drm/drmP.h | 1 + 15 files changed, 45 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c index 3ab1e9cc4692..996097acb5e7 100644 --- a/drivers/gpu/drm/drm_drv.c +++ b/drivers/gpu/drm/drm_drv.c @@ -305,6 +305,8 @@ static void drm_cleanup(struct drm_device * dev) return; } + drm_vblank_cleanup(dev); + drm_lastclose(dev); if (drm_core_has_MTRR(dev) && drm_core_has_AGP(dev) && diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c index 15c8dabc3e97..1e787f894b3c 100644 --- a/drivers/gpu/drm/drm_irq.c +++ b/drivers/gpu/drm/drm_irq.c @@ -94,7 +94,7 @@ static void vblank_disable_fn(unsigned long arg) } } -static void drm_vblank_cleanup(struct drm_device *dev) +void drm_vblank_cleanup(struct drm_device *dev) { /* Bail if the driver didn't call drm_vblank_init() */ if (dev->num_crtcs == 0) @@ -278,8 +278,6 @@ int drm_irq_uninstall(struct drm_device * dev) free_irq(dev->pdev->irq, dev); - drm_vblank_cleanup(dev); - return 0; } EXPORT_SYMBOL(drm_irq_uninstall); diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index 0d215e38606a..ba89b42f790a 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -856,6 +856,13 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags) spin_lock_init(&dev_priv->user_irq_lock); + ret = drm_vblank_init(dev, I915_NUM_PIPE); + + if (ret) { + (void) i915_driver_unload(dev); + return ret; + } + return ret; } diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index ed1edcf8704f..b24d522f490c 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -47,6 +47,8 @@ enum pipe { PIPE_B, }; +#define I915_NUM_PIPE 2 + /* Interface history: * * 1.1: Original. 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index c3673581db58..fe3d9cc72bf5 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -498,11 +498,6 @@ void i915_driver_irq_preinstall(struct drm_device * dev) int i915_driver_irq_postinstall(struct drm_device *dev) { drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev->dev_private; - int ret, num_pipes = 2; - - ret = drm_vblank_init(dev, num_pipes); - if (ret) - return ret; dev_priv->vblank_pipe = DRM_I915_VBLANK_PIPE_A | DRM_I915_VBLANK_PIPE_B; diff --git a/drivers/gpu/drm/mga/mga_dma.c b/drivers/gpu/drm/mga/mga_dma.c index c1d12dbfa8d8..b49c5ff29585 100644 --- a/drivers/gpu/drm/mga/mga_dma.c +++ b/drivers/gpu/drm/mga/mga_dma.c @@ -396,6 +396,7 @@ int mga_freelist_put(struct drm_device * dev, struct drm_buf * buf) int mga_driver_load(struct drm_device * dev, unsigned long flags) { drm_mga_private_t *dev_priv; + int ret; dev_priv = drm_alloc(sizeof(drm_mga_private_t), DRM_MEM_DRIVER); if (!dev_priv) @@ -415,6 +416,13 @@ int mga_driver_load(struct drm_device * dev, unsigned long flags) dev->types[7] = _DRM_STAT_PRIMARY; dev->types[8] = _DRM_STAT_SECONDARY; + ret = drm_vblank_init(dev, 1); + + if (ret) { + (void) mga_driver_unload(dev); + return ret; + } + return 0; } diff --git a/drivers/gpu/drm/mga/mga_irq.c b/drivers/gpu/drm/mga/mga_irq.c index bab42f41188b..daa6041a483a 100644 --- a/drivers/gpu/drm/mga/mga_irq.c +++ b/drivers/gpu/drm/mga/mga_irq.c @@ -152,11 +152,6 @@ void mga_driver_irq_preinstall(struct drm_device * dev) int mga_driver_irq_postinstall(struct drm_device *dev) { drm_mga_private_t *dev_priv = (drm_mga_private_t *) dev->dev_private; - int ret; - - ret = drm_vblank_init(dev, 1); - if (ret) - return ret; DRM_INIT_WAITQUEUE(&dev_priv->fence_queue); diff --git a/drivers/gpu/drm/r128/r128_drv.c b/drivers/gpu/drm/r128/r128_drv.c index 3265d53ba91f..601f4c0e5da5 100644 --- a/drivers/gpu/drm/r128/r128_drv.c +++ b/drivers/gpu/drm/r128/r128_drv.c @@ -45,6 +45,7 @@ static struct drm_driver driver = { DRIVER_USE_AGP | DRIVER_USE_MTRR | DRIVER_PCI_DMA | DRIVER_SG | DRIVER_HAVE_DMA | DRIVER_HAVE_IRQ | DRIVER_IRQ_SHARED, .dev_priv_size = sizeof(drm_r128_buf_priv_t), + .load = r128_driver_load, .preclose = r128_driver_preclose, .lastclose = r128_driver_lastclose, .get_vblank_counter = r128_get_vblank_counter, @@ -84,6 +85,11 @@ static struct drm_driver driver = { .patchlevel = DRIVER_PATCHLEVEL, }; +int r128_driver_load(struct drm_device * dev, unsigned long flags) +{ + return drm_vblank_init(dev, 1); +} + static int __init r128_init(void) { driver.num_ioctls = r128_max_ioctl; diff --git a/drivers/gpu/drm/r128/r128_drv.h b/drivers/gpu/drm/r128/r128_drv.h index 5898b274279d..797a26c42dab 100644 --- a/drivers/gpu/drm/r128/r128_drv.h +++ b/drivers/gpu/drm/r128/r128_drv.h @@ -159,6 +159,7 @@ extern void r128_driver_irq_preinstall(struct drm_device * dev); extern int r128_driver_irq_postinstall(struct drm_device *dev); extern void r128_driver_irq_uninstall(struct drm_device * dev); extern void r128_driver_lastclose(struct drm_device * dev); +extern int r128_driver_load(struct drm_device * dev, unsigned long flags); extern void r128_driver_preclose(struct drm_device * dev, struct drm_file *file_priv); diff --git a/drivers/gpu/drm/r128/r128_irq.c b/drivers/gpu/drm/r128/r128_irq.c index d7349012a680..69810fb8ac49 100644 --- a/drivers/gpu/drm/r128/r128_irq.c +++ b/drivers/gpu/drm/r128/r128_irq.c @@ -102,7 +102,7 @@ void r128_driver_irq_preinstall(struct drm_device * dev) int 
r128_driver_irq_postinstall(struct drm_device *dev) { - return drm_vblank_init(dev, 1); + return 0; } void r128_driver_irq_uninstall(struct drm_device * dev) diff --git a/drivers/gpu/drm/radeon/radeon_cp.c b/drivers/gpu/drm/radeon/radeon_cp.c index abdc1ae38467..dcebb4bee7aa 100644 --- a/drivers/gpu/drm/radeon/radeon_cp.c +++ b/drivers/gpu/drm/radeon/radeon_cp.c @@ -1757,6 +1757,12 @@ int radeon_driver_load(struct drm_device *dev, unsigned long flags) if (ret != 0) return ret; + ret = drm_vblank_init(dev, 2); + if (ret) { + radeon_driver_unload(dev); + return ret; + } + DRM_DEBUG("%s card detected\n", ((dev_priv->flags & RADEON_IS_AGP) ? "AGP" : (((dev_priv->flags & RADEON_IS_PCIE) ? "PCIE" : "PCI")))); return ret; diff --git a/drivers/gpu/drm/radeon/radeon_irq.c b/drivers/gpu/drm/radeon/radeon_irq.c index 5079f7054a2f..97c0599fdb1e 100644 --- a/drivers/gpu/drm/radeon/radeon_irq.c +++ b/drivers/gpu/drm/radeon/radeon_irq.c @@ -337,15 +337,10 @@ int radeon_driver_irq_postinstall(struct drm_device *dev) { drm_radeon_private_t *dev_priv = (drm_radeon_private_t *) dev->dev_private; - int ret; atomic_set(&dev_priv->swi_emitted, 0); DRM_INIT_WAITQUEUE(&dev_priv->swi_queue); - ret = drm_vblank_init(dev, 2); - if (ret) - return ret; - dev->max_vblank_count = 0x001fffff; radeon_irq_set_state(dev, RADEON_SW_INT_ENABLE, 1); diff --git a/drivers/gpu/drm/via/via_irq.c b/drivers/gpu/drm/via/via_irq.c index 665d319b927b..c248c1d37268 100644 --- a/drivers/gpu/drm/via/via_irq.c +++ b/drivers/gpu/drm/via/via_irq.c @@ -314,7 +314,6 @@ int via_driver_irq_postinstall(struct drm_device *dev) if (!dev_priv) return -EINVAL; - drm_vblank_init(dev, 1); status = VIA_READ(VIA_REG_INTERRUPT); VIA_WRITE(VIA_REG_INTERRUPT, status | VIA_IRQ_GLOBAL | dev_priv->irq_enable_mask); diff --git a/drivers/gpu/drm/via/via_map.c b/drivers/gpu/drm/via/via_map.c index a967556be014..2c4f0b485792 100644 --- a/drivers/gpu/drm/via/via_map.c +++ b/drivers/gpu/drm/via/via_map.c @@ -107,8 +107,17 @@ int via_driver_load(struct drm_device *dev, unsigned long chipset) ret = drm_sman_init(&dev_priv->sman, 2, 12, 8); if (ret) { drm_free(dev_priv, sizeof(*dev_priv), DRM_MEM_DRIVER); + return ret; } - return ret; + + ret = drm_vblank_init(dev, 1); + if (ret) { + drm_sman_takedown(&dev_priv->sman); + drm_free(dev_priv, sizeof(drm_via_private_t), DRM_MEM_DRIVER); + return ret; + } + + return 0; } int via_driver_unload(struct drm_device *dev) diff --git a/include/drm/drmP.h b/include/drm/drmP.h index 28c7f1679d49..d5e8e5c89548 100644 --- a/include/drm/drmP.h +++ b/include/drm/drmP.h @@ -1151,6 +1151,7 @@ extern u32 drm_vblank_count(struct drm_device *dev, int crtc); extern void drm_handle_vblank(struct drm_device *dev, int crtc); extern int drm_vblank_get(struct drm_device *dev, int crtc); extern void drm_vblank_put(struct drm_device *dev, int crtc); +extern void drm_vblank_cleanup(struct drm_device *dev); /* Modesetting support */ extern int drm_modeset_ctl(struct drm_device *dev, void *data, struct drm_file *file_priv); -- cgit v1.2.3-71-gd317 From f79fca55f9a6fe54635ad32ddc8a38f92a94ec30 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Mon, 24 Nov 2008 16:06:17 -0800 Subject: netfilter: xtables: add missing const qualifier to xt_tgchk_param When entryinfo was a standalone parameter to functions, it used to be "const void *". Put the const back in. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- include/linux/netfilter/x_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index be41b609c88f..e52ce475d19f 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -251,7 +251,7 @@ struct xt_target_param { */ struct xt_tgchk_param { const char *table; - void *entryinfo; + const void *entryinfo; const struct xt_target *target; void *targinfo; unsigned int hook_mask; -- cgit v1.2.3-71-gd317 From 5f23b734963ec7eaa3ebcd9050da0c9b7d143dd3 Mon Sep 17 00:00:00 2001 From: dann frazier Date: Wed, 26 Nov 2008 15:32:27 -0800 Subject: net: Fix soft lockups/OOM issues w/ unix garbage collector This is an implementation of David Miller's suggested fix in: https://bugzilla.redhat.com/show_bug.cgi?id=470201 It has been updated to use wait_event() instead of wait_event_interruptible(). Paraphrasing the description from the above report, it makes sendmsg() block while UNIX garbage collection is in progress. This avoids a situation where child processes continue to queue new FDs over a AF_UNIX socket to a parent which is in the exit path and running garbage collection on these FDs. This contention can result in soft lockups and oom-killing of unrelated processes. Signed-off-by: dann frazier Signed-off-by: David S. Miller --- include/net/af_unix.h | 1 + net/unix/af_unix.c | 2 ++ net/unix/garbage.c | 13 ++++++++++--- 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index c29ff1da8a18..1614d78c60ed 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -9,6 +9,7 @@ extern void unix_inflight(struct file *fp); extern void unix_notinflight(struct file *fp); extern void unix_gc(void); +extern void wait_for_unix_gc(void); #define UNIX_HASH_SIZE 256 diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index eb90f77bb0e2..66d5ac4773ab 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1343,6 +1343,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; + wait_for_unix_gc(); err = scm_send(sock, msg, siocb->scm); if (err < 0) return err; @@ -1493,6 +1494,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; + wait_for_unix_gc(); err = scm_send(sock, msg, siocb->scm); if (err < 0) return err; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 6d4a9a8de5ef..abb3ab34cb1e 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include @@ -91,6 +92,7 @@ static LIST_HEAD(gc_inflight_list); static LIST_HEAD(gc_candidates); static DEFINE_SPINLOCK(unix_gc_lock); +static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); unsigned int unix_tot_inflight; @@ -266,12 +268,16 @@ static void inc_inflight_move_tail(struct unix_sock *u) list_move_tail(&u->link, &gc_candidates); } -/* The external entry point: unix_gc() */ +static bool gc_in_progress = false; -void unix_gc(void) +void wait_for_unix_gc(void) { - static bool gc_in_progress = false; + wait_event(unix_gc_wait, gc_in_progress == false); +} +/* The external entry point: unix_gc() */ +void unix_gc(void) +{ struct unix_sock *u; struct unix_sock *next; struct sk_buff_head hitlist; @@ -376,6 +382,7 @@ void unix_gc(void) /* All candidates should have been detached by now. 
*/ BUG_ON(!list_empty(&gc_candidates)); gc_in_progress = false; + wake_up(&unix_gc_wait); out: spin_unlock(&unix_gc_lock); -- cgit v1.2.3-71-gd317 From 95a28ed08619cc70f31611886ac7b26ab0e462dc Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Thu, 13 Nov 2008 11:01:34 +0800 Subject: ACPICA: Allow _WAK method to return an Integer This can happen if the _WAK method returns nothing (as per ACPI 1.0) but does return an integer if the implicit return mechanism is enabled. This is the only method that has this problem, since it is also defined to return a package of two integers (ACPI 1.0b+). In all other cases, if a method returns an object when one was not expected, no warning is issued. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- include/acpi/acpredef.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/acpi/acpredef.h b/include/acpi/acpredef.h index 619fb75f8861..e6452dbf39e3 100644 --- a/include/acpi/acpredef.h +++ b/include/acpi/acpredef.h @@ -346,7 +346,7 @@ static const union acpi_predefined_info predefined_names[] = { /* Acpi 1.0 defined _WAK with no return value. Later, it was changed to return a package */ - {.info = {"_WAK", 1, ACPI_RTYPE_NONE | ACPI_RTYPE_PACKAGE}}, + {.info = {"_WAK", 1, ACPI_RTYPE_NONE | ACPI_RTYPE_INTEGER | ACPI_RTYPE_PACKAGE}}, {.ret_info = {ACPI_PTYPE1_FIXED, ACPI_RTYPE_INTEGER, 2, 0, 0, 0}}, /* fixed (2 Int), but is optional */ {.ret_info = {0, 0, 0, 0, 0, 0}} /* Table terminator */ }; -- cgit v1.2.3-71-gd317 From e899b6485c332aa2d7510739507ab5e5d7b28e59 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Thu, 27 Nov 2008 14:42:30 +0800 Subject: ACPICA: disable _BIF warning A generic work-around from ACPICA is in the queue, but since Linux has a work-around in its battery driver, we can disable this warning now. Allow _BIF method to return an Package with Buffer elements http://bugzilla.kernel.org/show_bug.cgi?id=11822 Signed-off-by: Lin Ming Signed-off-by: Len Brown --- include/acpi/acpredef.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/acpi/acpredef.h b/include/acpi/acpredef.h index e6452dbf39e3..16a9ca9a66e4 100644 --- a/include/acpi/acpredef.h +++ b/include/acpi/acpredef.h @@ -167,7 +167,7 @@ static const union acpi_predefined_info predefined_names[] = { {.info = {"_BFS", 1, 0}}, {.info = {"_BIF", 0, ACPI_RTYPE_PACKAGE}}, {.ret_info = {ACPI_PTYPE1_FIXED, ACPI_RTYPE_INTEGER, 9, - ACPI_RTYPE_STRING, 4, 0}}, /* fixed (9 Int),(4 Str) */ + ACPI_RTYPE_STRING | ACPI_RTYPE_BUFFER, 4, 0}}, /* fixed (9 Int),(4 Str) */ {.info = {"_BLT", 3, 0}}, {.info = {"_BMC", 1, 0}}, {.info = {"_BMD", 0, ACPI_RTYPE_PACKAGE}}, {.ret_info = {ACPI_PTYPE1_FIXED, ACPI_RTYPE_INTEGER, 5, 0, 0, 0}}, /* fixed (5 Int) */ -- cgit v1.2.3-71-gd317 From 487ff32082a9bd7489d8185cf7d7a2fdf18a22fa Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 27 Nov 2008 11:13:58 +0000 Subject: Allow architectures to override copy_user_highpage() With aliasing VIPT cache support, the ARM implementation of clear_user_page() and copy_user_page() sets up a temporary kernel space mapping such that we have the same cache colour as the userspace page. This avoids having to consider any userspace aliases from this operation. However, when highmem is enabled, kmap_atomic() have to setup mappings. The copy_user_highpage() and clear_user_highpage() call these functions before delegating the copies to copy_user_page() and clear_user_page(). 
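For reference, the generic helper reads roughly as follows (simplified from the include/linux/highmem.h hunk further below, with comments added here for illustration):

    static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
    {
            /* first mapping: set up by the generic highmem helper */
            void *addr = kmap_atomic(page, KM_USER0);

            /* on an aliasing VIPT cache the arch clear_user_page() may then
             * set up a second, colour-matched mapping of its own */
            clear_user_page(addr, vaddr, page);
            kunmap_atomic(addr, KM_USER0);
    }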
The effect of this is that each of the *_user_highpage() functions setup their own kmap mapping, followed by the *_user_page() functions setting up another mapping. This is rather wasteful. Thankfully, copy_user_highpage() can be overriden by architectures by defining __HAVE_ARCH_COPY_USER_HIGHPAGE. However, replacement of clear_user_highpage() is more difficult because its inline definition is not conditional. It seems that you're expected to define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE and provide a replacement __alloc_zeroed_user_highpage() implementation instead. The allocation itself is fine, so we don't want to override that. What we really want to do is to override clear_user_highpage() with our own version which doesn't kmap_atomic() unnecessarily. Other VIPT architectures (PARISC and SH) would also like to override this function as well. Acked-by: Hugh Dickins Acked-by: James Bottomley Acked-by: Paul Mundt Signed-off-by: Russell King --- include/linux/highmem.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 7dcbc82f3b7b..13875ce9112a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -63,12 +63,14 @@ static inline void *kmap_atomic(struct page *page, enum km_type idx) #endif /* CONFIG_HIGHMEM */ /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ +#ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_atomic(page, KM_USER0); clear_user_page(addr, vaddr, page); kunmap_atomic(addr, KM_USER0); } +#endif #ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE /** -- cgit v1.2.3-71-gd317 From 9a5aa622dd4cd22b5e0fe83e4a9c0c768d4e2dea Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Fri, 28 Nov 2008 21:29:46 -0800 Subject: mlx4_core: Save/restore default port IB capability mask Commit 7ff93f8b ("mlx4_core: Multiple port type support") introduced support for different port types. As part of that support, SET_PORT is invoked to set the port type during driver startup. However, as a side-effect, for IB ports the invocation of this command also sets the port's capability mask to zero (losing the default value set by FW). To fix this, get the default ib port capabilities (via a MAD_IFC Port Info query) during driver startup, and save them for use in the mlx4_SET_PORT command when setting the port-type to Infiniband. This patch fixes problems with subnet manager (SM) failover such as , which occurred because the IsTrapSupported bit in the capability mask was zeroed. 
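In outline, the startup flow added by this patch is as follows (a sketch using the function and field names from the hunks below; error handling trimmed):

    for (port = 1; port <= dev->caps.num_ports; port++) {
            __be32 ib_port_default_caps = 0;

            /* MAD_IFC PortInfo query: read the capability mask set by FW */
            err = mlx4_get_port_ib_caps(dev, port, &ib_port_default_caps);
            if (err)
                    ;       /* warn and fall back to caps = 0 */

            dev->caps.ib_port_def_cap[port] = ib_port_default_caps;

            /* for IB ports, mlx4_SET_PORT() now writes the saved mask back
             * instead of implicitly zeroing it */
            err = mlx4_SET_PORT(dev, port);
    }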
Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/net/mlx4/main.c | 8 ++++++++ drivers/net/mlx4/mlx4.h | 1 + drivers/net/mlx4/port.c | 39 ++++++++++++++++++++++++++++++++++++++- include/linux/mlx4/device.h | 1 + 4 files changed, 48 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 468921b8f4b6..90a0281d15ea 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -753,6 +753,7 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); int err; int port; + __be32 ib_port_default_caps; err = mlx4_init_uar_table(dev); if (err) { @@ -852,6 +853,13 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) } for (port = 1; port <= dev->caps.num_ports; port++) { + ib_port_default_caps = 0; + err = mlx4_get_port_ib_caps(dev, port, &ib_port_default_caps); + if (err) + mlx4_warn(dev, "failed to get port %d default " + "ib capabilities (%d). Continuing with " + "caps = 0\n", port, err); + dev->caps.ib_port_def_cap[port] = ib_port_default_caps; err = mlx4_SET_PORT(dev, port); if (err) { mlx4_err(dev, "Failed to set port %d, aborting\n", diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index 56a2e213fe62..34c909deaff3 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -385,5 +385,6 @@ void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table); void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table); int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port); +int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps); #endif /* MLX4_H */ diff --git a/drivers/net/mlx4/port.c b/drivers/net/mlx4/port.c index e2fdab42c4ce..0a057e5dc63b 100644 --- a/drivers/net/mlx4/port.c +++ b/drivers/net/mlx4/port.c @@ -258,6 +258,42 @@ out: } EXPORT_SYMBOL_GPL(mlx4_unregister_vlan); +int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps) +{ + struct mlx4_cmd_mailbox *inmailbox, *outmailbox; + u8 *inbuf, *outbuf; + int err; + + inmailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(inmailbox)) + return PTR_ERR(inmailbox); + + outmailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(outmailbox)) { + mlx4_free_cmd_mailbox(dev, inmailbox); + return PTR_ERR(outmailbox); + } + + inbuf = inmailbox->buf; + outbuf = outmailbox->buf; + memset(inbuf, 0, 256); + memset(outbuf, 0, 256); + inbuf[0] = 1; + inbuf[1] = 1; + inbuf[2] = 1; + inbuf[3] = 1; + *(__be16 *) (&inbuf[16]) = cpu_to_be16(0x0015); + *(__be32 *) (&inbuf[20]) = cpu_to_be32(port); + + err = mlx4_cmd_box(dev, inmailbox->dma, outmailbox->dma, port, 3, + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C); + if (!err) + *caps = *(__be32 *) (outbuf + 84); + mlx4_free_cmd_mailbox(dev, inmailbox); + mlx4_free_cmd_mailbox(dev, outmailbox); + return err; +} + int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port) { struct mlx4_cmd_mailbox *mailbox; @@ -273,7 +309,8 @@ int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port) ((u8 *) mailbox->buf)[3] = 6; ((__be16 *) mailbox->buf)[4] = cpu_to_be16(1 << 15); ((__be16 *) mailbox->buf)[6] = cpu_to_be16(1 << 15); - } + } else + ((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port]; err = mlx4_cmd(dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index bd9977b89490..371086fd946f 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -179,6 +179,7 @@ struct mlx4_caps { int num_ports; int vl_cap[MLX4_MAX_PORTS + 
1]; int ib_mtu_cap[MLX4_MAX_PORTS + 1]; + __be32 ib_port_def_cap[MLX4_MAX_PORTS + 1]; u64 def_mac[MLX4_MAX_PORTS + 1]; int eth_mtu_cap[MLX4_MAX_PORTS + 1]; int gid_table_len[MLX4_MAX_PORTS + 1]; -- cgit v1.2.3-71-gd317 From 31168481c32c8a485e1003af9433124dede57f8d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 22 Nov 2008 17:33:24 +0000 Subject: meminit section warnings Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 4 ++-- mm/memory_hotplug.c | 9 +++++---- mm/page_cgroup.c | 13 +++++++------ mm/sparse.c | 2 +- 4 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index f546ad6fc028..1e6d34bfa094 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -17,7 +17,7 @@ struct page_cgroup { struct list_head lru; /* per cgroup LRU list */ }; -void __init pgdat_page_cgroup_init(struct pglist_data *pgdat); +void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat); void __init page_cgroup_init(void); struct page_cgroup *lookup_page_cgroup(struct page *page); @@ -91,7 +91,7 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc) #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; -static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat) +static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) { } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b5b2b15085a8..b17371185468 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -189,7 +189,7 @@ static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, pgdat->node_start_pfn; } -static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) +static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) { struct pglist_data *pgdat = zone->zone_pgdat; int nr_pages = PAGES_PER_SECTION; @@ -216,7 +216,7 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) return 0; } -static int __add_section(struct zone *zone, unsigned long phys_start_pfn) +static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn) { int nr_pages = PAGES_PER_SECTION; int ret; @@ -273,7 +273,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) * call this function after deciding the zone to which to * add the new pages. 
*/ -int __add_pages(struct zone *zone, unsigned long phys_start_pfn, +int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, unsigned long nr_pages) { unsigned long i; @@ -470,7 +470,8 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } -int add_memory(int nid, u64 start, u64 size) +/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ +int __ref add_memory(int nid, u64 start, u64 size) { pg_data_t *pgdat = NULL; int new_pgdat = 0; diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 1223d927904d..436c00229e70 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -21,7 +21,7 @@ static unsigned long total_usage; #if !defined(CONFIG_SPARSEMEM) -void __init pgdat_page_cgroup_init(struct pglist_data *pgdat) +void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) { pgdat->node_page_cgroup = NULL; } @@ -97,7 +97,8 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) return section->page_cgroup + pfn; } -int __meminit init_section_page_cgroup(unsigned long pfn) +/* __alloc_bootmem...() is protected by !slab_available() */ +int __init_refok init_section_page_cgroup(unsigned long pfn) { struct mem_section *section; struct page_cgroup *base, *pc; @@ -158,7 +159,7 @@ void __free_page_cgroup(unsigned long pfn) } } -int online_page_cgroup(unsigned long start_pfn, +int __meminit online_page_cgroup(unsigned long start_pfn, unsigned long nr_pages, int nid) { @@ -183,7 +184,7 @@ int online_page_cgroup(unsigned long start_pfn, return -ENOMEM; } -int offline_page_cgroup(unsigned long start_pfn, +int __meminit offline_page_cgroup(unsigned long start_pfn, unsigned long nr_pages, int nid) { unsigned long start, end, pfn; @@ -197,7 +198,7 @@ int offline_page_cgroup(unsigned long start_pfn, } -static int page_cgroup_callback(struct notifier_block *self, +static int __meminit page_cgroup_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; @@ -248,7 +249,7 @@ void __init page_cgroup_init(void) " want\n"); } -void __init pgdat_page_cgroup_init(struct pglist_data *pgdat) +void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) { return; } diff --git a/mm/sparse.c b/mm/sparse.c index 39db301b920d..083f5b63e7a8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -570,7 +570,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. */ -int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, +int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, int nr_pages) { unsigned long section_nr = pfn_to_section_nr(start_pfn); -- cgit v1.2.3-71-gd317 From 02d0e6753d8ab0173b63338157929e52eac86d12 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 22 Nov 2008 17:38:34 +0000 Subject: hotplug_memory_notifier section annotation Same as for hotplug_cpu - we want static notifier_block in there in meminitdata, to avoid false positives whenever it's used. 
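To illustrate, the macro (see the include/linux/memory.h hunk below) drops a static notifier_block into every call site, so the annotation has to live in the macro itself. A rough expansion for a hypothetical callback named my_mem_callback:

    /* hotplug_memory_notifier(my_mem_callback, 0) now expands to roughly: */
    {
            static __meminitdata struct notifier_block my_mem_callback_mem_nb =
                    { .notifier_call = my_mem_callback, .priority = 0 };
            register_memory_notifier(&my_mem_callback_mem_nb);
    }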
Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- include/linux/memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memory.h b/include/linux/memory.h index 2f5f8a5ef2a0..36c82c9e6ea7 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -91,7 +91,7 @@ extern int memory_notify(unsigned long val, void *v); #ifdef CONFIG_MEMORY_HOTPLUG #define hotplug_memory_notifier(fn, pri) { \ - static struct notifier_block fn##_mem_nb = \ + static __meminitdata struct notifier_block fn##_mem_nb =\ { .notifier_call = fn, .priority = pri }; \ register_memory_notifier(&fn##_mem_nb); \ } -- cgit v1.2.3-71-gd317 From 96b8936a9ed08746e47081458a5eb9e43a751e24 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Nov 2008 08:10:03 +0100 Subject: remove __ARCH_WANT_COMPAT_SYS_PTRACE All architectures now use the generic compat_sys_ptrace, as should every new architecture that needs 32bit compat (if we'll ever get another). Remove the now superflous __ARCH_WANT_COMPAT_SYS_PTRACE define, and also kill a comment about __ARCH_SYS_PTRACE that was added after __ARCH_SYS_PTRACE was already gone. Signed-off-by: Christoph Hellwig Acked-by: David S. Miller Signed-off-by: Linus Torvalds --- arch/Kconfig | 2 -- arch/ia64/include/asm/ptrace.h | 2 -- arch/mips/include/asm/ptrace.h | 4 ---- arch/parisc/include/asm/ptrace.h | 2 -- arch/powerpc/include/asm/ptrace.h | 2 -- arch/s390/include/asm/ptrace.h | 2 -- arch/sparc/include/asm/ptrace_64.h | 2 -- arch/x86/include/asm/ptrace.h | 2 -- include/linux/compat.h | 2 -- kernel/ptrace.c | 4 ++-- 10 files changed, 2 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index 8977d99987cb..471e72dbaf8b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -79,8 +79,6 @@ config HAVE_KRETPROBES # task_pt_regs() in asm/processor.h or asm/ptrace.h # arch_has_single_step() if there is hardware single-step support # arch_has_block_step() if there is hardware block-step support -# arch_ptrace() and not #define __ARCH_SYS_PTRACE -# compat_arch_ptrace() and #define __ARCH_WANT_COMPAT_SYS_PTRACE # asm/syscall.h supplying asm-generic/syscall.h interface # linux/regset.h user_regset interfaces # CORE_DUMP_USE_REGSET #define'd in linux/elf.h diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h index 6417c1ecb44e..14055c636adf 100644 --- a/arch/ia64/include/asm/ptrace.h +++ b/arch/ia64/include/asm/ptrace.h @@ -325,8 +325,6 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs) #define arch_has_block_step() (1) extern void user_enable_block_step(struct task_struct *); -#define __ARCH_WANT_COMPAT_SYS_PTRACE - #endif /* !__KERNEL__ */ /* pt_all_user_regs is used for PTRACE_GETREGS PTRACE_SETREGS */ diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index 813abd16255d..c2c8bac43307 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -9,10 +9,6 @@ #ifndef _ASM_PTRACE_H #define _ASM_PTRACE_H -#ifdef CONFIG_64BIT -#define __ARCH_WANT_COMPAT_SYS_PTRACE -#endif - /* 0 - 31 are integer registers, 32 - 63 are fp registers. 
*/ #define FPR_BASE 32 #define PC 64 diff --git a/arch/parisc/include/asm/ptrace.h b/arch/parisc/include/asm/ptrace.h index afa5333187b4..302f68dc889c 100644 --- a/arch/parisc/include/asm/ptrace.h +++ b/arch/parisc/include/asm/ptrace.h @@ -47,8 +47,6 @@ struct pt_regs { #define task_regs(task) ((struct pt_regs *) ((char *)(task) + TASK_REGS)) -#define __ARCH_WANT_COMPAT_SYS_PTRACE - struct task_struct; #define arch_has_single_step() 1 void user_disable_single_step(struct task_struct *task); diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 280a90cc9894..c9c678fb2538 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -55,8 +55,6 @@ struct pt_regs { #ifdef __powerpc64__ -#define __ARCH_WANT_COMPAT_SYS_PTRACE - #define STACK_FRAME_OVERHEAD 112 /* size of minimum stack frame */ #define STACK_FRAME_LR_SAVE 2 /* Location of LR in stack frame */ #define STACK_FRAME_REGS_MARKER ASM_CONST(0x7265677368657265) diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index a7226f8143fb..560ce8561dfd 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -486,8 +486,6 @@ struct task_struct; extern void user_enable_single_step(struct task_struct *); extern void user_disable_single_step(struct task_struct *); -#define __ARCH_WANT_COMPAT_SYS_PTRACE - #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) #define user_stack_pointer(regs)((regs)->gprs[15]) diff --git a/arch/sparc/include/asm/ptrace_64.h b/arch/sparc/include/asm/ptrace_64.h index 3d3e9c161d8b..84e969f06afe 100644 --- a/arch/sparc/include/asm/ptrace_64.h +++ b/arch/sparc/include/asm/ptrace_64.h @@ -142,8 +142,6 @@ struct global_reg_snapshot { }; extern struct global_reg_snapshot global_reg_snapshot[NR_CPUS]; -#define __ARCH_WANT_COMPAT_SYS_PTRACE - #define force_successful_syscall_return() \ do { current_thread_info()->syscall_noerror = 1; \ } while (0) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index d1531c8480b7..eefb0594b058 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -271,8 +271,6 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); -#define __ARCH_WANT_COMPAT_SYS_PTRACE - #endif /* __KERNEL__ */ #endif /* !__ASSEMBLY__ */ diff --git a/include/linux/compat.h b/include/linux/compat.h index f061a1ea1b74..e88f3ecf38b4 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -252,12 +252,10 @@ extern int compat_ptrace_request(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data); -#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data); asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, compat_long_t addr, compat_long_t data); -#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ /* * epoll (fs/eventpoll.c) compat bits follow ... diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1e68e4c39e2c..4c8bcd7dd8e0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -612,7 +612,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) return (copied == sizeof(data)) ? 
0 : -EIO; } -#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE +#if defined CONFIG_COMPAT #include int compat_ptrace_request(struct task_struct *child, compat_long_t request, @@ -709,4 +709,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, unlock_kernel(); return ret; } -#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */ +#endif /* CONFIG_COMPAT */ -- cgit v1.2.3-71-gd317 From ac70a964b0e22a95af3628c344815857a01461b7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 27 Nov 2008 13:36:48 +0900 Subject: libata: blacklist Seagate drives which time out FLUSH_CACHE when used with NCQ Some recent Seagate harddrives have firmware bug which causes FLUSH CACHE to timeout under certain circumstances if NCQ is being used. This can be worked around by disabling NCQ and fixed by updating the firmware. Implement ATA_HORKAGE_FIRMWARE_UPDATE and blacklist these devices. The wiki page has been updated to contain information on this issue. http://ata.wiki.kernel.org/index.php/Known_issues Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 21 +++++++++++++++++++++ include/linux/libata.h | 1 + 2 files changed, 22 insertions(+) (limited to 'include') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 4214bfb13bbd..5e2eb740df46 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2492,6 +2492,13 @@ int ata_dev_configure(struct ata_device *dev) } } + if ((dev->horkage & ATA_HORKAGE_FIRMWARE_WARN) && print_info) { + ata_dev_printk(dev, KERN_WARNING, "WARNING: device requires " + "firmware update to be fully functional.\n"); + ata_dev_printk(dev, KERN_WARNING, " contact the vendor " + "or visit http://ata.wiki.kernel.org.\n"); + } + return 0; err_out_nosup: @@ -4042,6 +4049,20 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "ST380817AS", "3.42", ATA_HORKAGE_NONCQ }, { "ST3160023AS", "3.42", ATA_HORKAGE_NONCQ }, + /* Seagate NCQ + FLUSH CACHE firmware bug */ + { "ST31500341AS", "9JU138", ATA_HORKAGE_NONCQ | + ATA_HORKAGE_FIRMWARE_WARN }, + { "ST31000333AS", "9FZ136", ATA_HORKAGE_NONCQ | + ATA_HORKAGE_FIRMWARE_WARN }, + { "ST3640623AS", "9FZ164", ATA_HORKAGE_NONCQ | + ATA_HORKAGE_FIRMWARE_WARN }, + { "ST3640323AS", "9FZ134", ATA_HORKAGE_NONCQ | + ATA_HORKAGE_FIRMWARE_WARN }, + { "ST3320813AS", "9FZ182", ATA_HORKAGE_NONCQ | + ATA_HORKAGE_FIRMWARE_WARN }, + { "ST3320613AS", "9FZ162", ATA_HORKAGE_NONCQ | + ATA_HORKAGE_FIRMWARE_WARN }, + /* Blacklist entries taken from Silicon Image 3124/3132 Windows driver .inf file - also several Linux problem reports */ { "HTS541060G9SA00", "MB3OC60D", ATA_HORKAGE_NONCQ, }, diff --git a/include/linux/libata.h b/include/linux/libata.h index 59b0f1c807b5..ed3f26eb5df1 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -375,6 +375,7 @@ enum { ATA_HORKAGE_BRIDGE_OK = (1 << 10), /* no bridge limits */ ATA_HORKAGE_ATAPI_MOD16_DMA = (1 << 11), /* use ATAPI DMA for commands not multiple of 16 bytes */ + ATA_HORKAGE_FIRMWARE_WARN = (1 << 12), /* firwmare update warning */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3-71-gd317 From 7ef9964e6d1b911b78709f144000aacadd0ebc21 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Mon, 1 Dec 2008 13:13:55 -0800 Subject: epoll: introduce resource usage limits It has been thought that the per-user file descriptors limit would also limit the resources that a normal user can request via the epoll interface. 
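That assumption does not hold: a process can register the same pollable files with many epoll instances, so the number of kernel-side watches grows multiplicatively while staying inside the fd limit. A minimal userspace sketch of the pattern (an illustration only, not the reproducer referred to below; it assumes stdin is pollable, e.g. a terminal):

    #include <sys/epoll.h>
    #include <unistd.h>

    int main(void)
    {
            int fds[512], i, j;

            for (i = 0; i < 512; i++)
                    fds[i] = dup(0);        /* many fds, one underlying file */

            for (i = 0; i < 384; i++) {     /* many epoll instances */
                    int epfd = epoll_create(1);

                    for (j = 0; j < 512; j++) {
                            struct epoll_event ev = { .events = EPOLLIN };

                            ev.data.fd = fds[j];
                            /* each successful ADD pins an epitem (plus an
                             * eppoll_entry) in unswappable kernel memory */
                            epoll_ctl(epfd, EPOLL_CTL_ADD, fds[j], &ev);
                    }
            }
            pause();
            return 0;
    }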
Vegard Nossum reported a very simple program (a modified version attached) that can make a normal user to request a pretty large amount of kernel memory, well within the its maximum number of fds. To solve such problem, default limits are now imposed, and /proc based configuration has been introduced. A new directory has been created, named /proc/sys/fs/epoll/ and inside there, there are two configuration points: max_user_instances = Maximum number of devices - per user max_user_watches = Maximum number of "watched" fds - per user The current default for "max_user_watches" limits the memory used by epoll to store "watches", to 1/32 of the amount of the low RAM. As example, a 256MB 32bit machine, will have "max_user_watches" set to roughly 90000. That should be enough to not break existing heavy epoll users. The default value for "max_user_instances" is set to 128, that should be enough too. This also changes the userspace, because a new error code can now come out from EPOLL_CTL_ADD (-ENOSPC). The EMFILE from epoll_create() was already listed, so that should be ok. [akpm@linux-foundation.org: use get_current_user()] Signed-off-by: Davide Libenzi Cc: Michael Kerrisk Cc: Cc: Cyrill Gorcunov Reported-by: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 27 ++++++++++++ fs/eventpoll.c | 85 ++++++++++++++++++++++++++++++++++---- include/linux/sched.h | 4 ++ kernel/sysctl.c | 10 +++++ 4 files changed, 118 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index bcceb99b81dd..bb1b0dd3bfcb 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -44,6 +44,7 @@ Table of Contents 2.14 /proc//io - Display the IO accounting fields 2.15 /proc//coredump_filter - Core dump filtering settings 2.16 /proc//mountinfo - Information about mounts + 2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface ------------------------------------------------------------------------------ Preface @@ -2483,4 +2484,30 @@ For more information on mount propagation see: Documentation/filesystems/sharedsubtree.txt +2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface +-------------------------------------------------------- + +This directory contains configuration options for the epoll(7) interface. + +max_user_instances +------------------ + +This is the maximum number of epoll file descriptors that a single user can +have open at a given time. The default value is 128, and should be enough +for normal users. + +max_user_watches +---------------- + +Every epoll file descriptor can store a number of files to be monitored +for event readiness. Each one of these monitored files constitutes a "watch". +This configuration option sets the maximum number of "watches" that are +allowed for each user. +Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes +on a 64bit one. +The current default value for max_user_watches is the 1/32 of the available +low memory, divided for the "watch" cost in bytes. 
+ + ------------------------------------------------------------------------------ + diff --git a/fs/eventpoll.c b/fs/eventpoll.c index aec5c13f6341..96355d505347 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -102,6 +102,8 @@ #define EP_UNACTIVE_PTR ((void *) -1L) +#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) + struct epoll_filefd { struct file *file; int fd; @@ -200,6 +202,9 @@ struct eventpoll { * holding ->lock. */ struct epitem *ovflist; + + /* The user that created the eventpoll descriptor */ + struct user_struct *user; }; /* Wait structure used by the poll hooks */ @@ -226,10 +231,18 @@ struct ep_pqueue { struct epitem *epi; }; +/* + * Configuration options available inside /proc/sys/fs/epoll/ + */ +/* Maximum number of epoll devices, per user */ +static int max_user_instances __read_mostly; +/* Maximum number of epoll watched descriptors, per user */ +static int max_user_watches __read_mostly; + /* * This mutex is used to serialize ep_free() and eventpoll_release_file(). */ -static struct mutex epmutex; +static DEFINE_MUTEX(epmutex); /* Safe wake up implementation */ static struct poll_safewake psw; @@ -240,6 +253,33 @@ static struct kmem_cache *epi_cache __read_mostly; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __read_mostly; +#ifdef CONFIG_SYSCTL + +#include + +static int zero; + +ctl_table epoll_table[] = { + { + .procname = "max_user_instances", + .data = &max_user_instances, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "max_user_watches", + .data = &max_user_watches, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero, + }, + { .ctl_name = 0 } +}; +#endif /* CONFIG_SYSCTL */ + /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, @@ -402,6 +442,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* At this point it is safe to free the eventpoll item */ kmem_cache_free(epi_cache, epi); + atomic_dec(&ep->user->epoll_watches); + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n", current, ep, file)); @@ -449,6 +491,8 @@ static void ep_free(struct eventpoll *ep) mutex_unlock(&epmutex); mutex_destroy(&ep->mtx); + atomic_dec(&ep->user->epoll_devs); + free_uid(ep->user); kfree(ep); } @@ -532,10 +576,19 @@ void eventpoll_release_file(struct file *file) static int ep_alloc(struct eventpoll **pep) { - struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL); + int error; + struct user_struct *user; + struct eventpoll *ep; - if (!ep) - return -ENOMEM; + user = get_current_user(); + error = -EMFILE; + if (unlikely(atomic_read(&user->epoll_devs) >= + max_user_instances)) + goto free_uid; + error = -ENOMEM; + ep = kzalloc(sizeof(*ep), GFP_KERNEL); + if (unlikely(!ep)) + goto free_uid; spin_lock_init(&ep->lock); mutex_init(&ep->mtx); @@ -544,12 +597,17 @@ static int ep_alloc(struct eventpoll **pep) INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT; ep->ovflist = EP_UNACTIVE_PTR; + ep->user = user; *pep = ep; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n", current, ep)); return 0; + +free_uid: + free_uid(user); + return error; } /* @@ -703,9 +761,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct epitem *epi; struct ep_pqueue epq; - error = -ENOMEM; + if (unlikely(atomic_read(&ep->user->epoll_watches) >= + max_user_watches)) + return -ENOSPC; if (!(epi = 
kmem_cache_alloc(epi_cache, GFP_KERNEL))) - goto error_return; + return -ENOMEM; /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); @@ -735,6 +795,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, * install process. Namely an allocation for a wait queue failed due * high memory pressure. */ + error = -ENOMEM; if (epi->nwait < 0) goto error_unregister; @@ -765,6 +826,8 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, spin_unlock_irqrestore(&ep->lock, flags); + atomic_inc(&ep->user->epoll_watches); + /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(&psw, &ep->poll_wait); @@ -789,7 +852,7 @@ error_unregister: spin_unlock_irqrestore(&ep->lock, flags); kmem_cache_free(epi_cache, epi); -error_return: + return error; } @@ -1078,6 +1141,7 @@ asmlinkage long sys_epoll_create1(int flags) flags & O_CLOEXEC); if (fd < 0) ep_free(ep); + atomic_inc(&ep->user->epoll_devs); error_return: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", @@ -1299,7 +1363,12 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, static int __init eventpoll_init(void) { - mutex_init(&epmutex); + struct sysinfo si; + + si_meminfo(&si); + max_user_instances = 128; + max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) / + EP_ITEM_COST; /* Initialize the structure used to perform safe poll wait head wake ups */ ep_poll_safewake_init(&psw); diff --git a/include/linux/sched.h b/include/linux/sched.h index 644ffbda17ca..55e30d114477 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -630,6 +630,10 @@ struct user_struct { atomic_t inotify_watches; /* How many inotify watches does this user have? */ atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ #endif +#ifdef CONFIG_EPOLL + atomic_t epoll_devs; /* The number of epoll descriptors currently open */ + atomic_t epoll_watches; /* The number of file descriptors currently watched */ +#endif #ifdef CONFIG_POSIX_MQUEUE /* protected by mq_lock */ unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9d048fa2d902..3d56fe7570da 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -176,6 +176,9 @@ extern struct ctl_table random_table[]; #ifdef CONFIG_INOTIFY_USER extern struct ctl_table inotify_table[]; #endif +#ifdef CONFIG_EPOLL +extern struct ctl_table epoll_table[]; +#endif #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT int sysctl_legacy_va_layout; @@ -1325,6 +1328,13 @@ static struct ctl_table fs_table[] = { .child = inotify_table, }, #endif +#ifdef CONFIG_EPOLL + { + .procname = "epoll", + .mode = 0555, + .child = epoll_table, + }, +#endif #endif { .ctl_name = KERN_SETUID_DUMPABLE, -- cgit v1.2.3-71-gd317 From 6ff2d39b91aec3dcae951afa982059e3dd9b49dc Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 1 Dec 2008 13:14:02 -0800 Subject: lib/idr.c: fix rcu related race with idr_find 2nd part of the fixes needed for http://bugzilla.kernel.org/show_bug.cgi?id=11796. When the idr tree is either grown or shrunk, then the update to the number of layers and the top pointer were not atomic. This race caused crashes. The attached patch fixes that by replicating the layers counter in each layer, thus idr_find doesn't need idp->layers anymore. 
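With the counter replicated, the RCU-side lookup derives the tree depth from the layer it has actually dereferenced, so it can no longer see a top pointer and a depth taken at different moments; per the lib/idr.c hunk below, idr_find() becomes roughly:

    void *idr_find(struct idr *idp, int id)
    {
            int n;
            struct idr_layer *p;

            p = rcu_dereference(idp->top);
            if (!p)
                    return NULL;
            /* depth comes from the node itself, not from idp->layers */
            n = (p->layer + 1) * IDR_BITS;

            id &= MAX_ID_MASK;
            if (id >= (1 << n))
                    return NULL;

            while (n > 0 && p) {
                    n -= IDR_BITS;
                    p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]);
            }
            return (void *)p;
    }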
Signed-off-by: Manfred Spraul Cc: Clement Calmels Cc: Nadia Derbey Cc: Pierre Peiffer Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/idr.h | 3 ++- lib/idr.c | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/idr.h b/include/linux/idr.h index fa035f96f2a3..dd846df8cd32 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -52,13 +52,14 @@ struct idr_layer { unsigned long bitmap; /* A zero bit means "space here" */ struct idr_layer *ary[1<layer = l-1; rcu_assign_pointer(p->ary[m], new); p->count++; } @@ -210,6 +211,7 @@ build_up: if (unlikely(!p)) { if (!(p = get_from_free_list(idp))) return -1; + p->layer = 0; layers = 1; } /* @@ -237,6 +239,7 @@ build_up: } new->ary[0] = p; new->count = 1; + new->layer = layers-1; if (p->bitmap == IDR_FULL) __set_bit(0, &new->bitmap); p = new; @@ -493,17 +496,21 @@ void *idr_find(struct idr *idp, int id) int n; struct idr_layer *p; - n = idp->layers * IDR_BITS; p = rcu_dereference(idp->top); + if (!p) + return NULL; + n = (p->layer+1) * IDR_BITS; /* Mask off upper bits we don't use for the search. */ id &= MAX_ID_MASK; if (id >= (1 << n)) return NULL; + BUG_ON(n == 0); while (n > 0 && p) { n -= IDR_BITS; + BUG_ON(n != p->layer*IDR_BITS); p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]); } return((void *)p); @@ -582,8 +589,11 @@ void *idr_replace(struct idr *idp, void *ptr, int id) int n; struct idr_layer *p, *old_p; - n = idp->layers * IDR_BITS; p = idp->top; + if (!p) + return ERR_PTR(-EINVAL); + + n = (p->layer+1) * IDR_BITS; id &= MAX_ID_MASK; -- cgit v1.2.3-71-gd317 From 6636487e8dc49a1c43fed336bdc4a2f3d7ce6881 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 2 Dec 2008 20:40:03 +0100 Subject: amd74xx: workaround unreliable AltStatus register for nVidia controllers It seems that on some nVidia controllers using AltStatus register can be unreliable so default to Status register if the PCI device is in Compatibility Mode. In order to achieve this: * Add ide_pci_is_in_compatibility_mode() inline helper to . * Add IDE_HFLAG_BROKEN_ALTSTATUS host flag and set it in amd74xx host driver for nVidia controllers in Compatibility Mode. * Teach actual_try_to_identify() and drive_is_ready() about the new flag. This fixes the regression caused by removal of CONFIG_IDEPCI_SHARE_IRQ config option in 2.6.25 and using AltStatus register unconditionally when available (kernel.org bugs #11659 and #10216). [ Moreover for CONFIG_IDEPCI_SHARE_IRQ=y (which is what most people and distributions use) it never worked correctly. ] Thanks to Remy LABENE and Lars Winterfeld for help with debugging the problem. More info at: http://bugzilla.kernel.org/show_bug.cgi?id=11659 http://bugzilla.kernel.org/show_bug.cgi?id=10216 Reported-by: Remy LABENE Tested-by: Remy LABENE Tested-by: Lars Winterfeld Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/amd74xx.c | 11 ++++++++++- drivers/ide/ide-iops.c | 3 ++- drivers/ide/ide-probe.c | 3 ++- include/linux/ide.h | 8 ++++++++ 4 files changed, 22 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/ide/amd74xx.c b/drivers/ide/amd74xx.c index 81ec73134eda..c6bcd3014a29 100644 --- a/drivers/ide/amd74xx.c +++ b/drivers/ide/amd74xx.c @@ -3,7 +3,7 @@ * IDE driver for Linux. 
* * Copyright (c) 2000-2002 Vojtech Pavlik - * Copyright (c) 2007 Bartlomiej Zolnierkiewicz + * Copyright (c) 2007-2008 Bartlomiej Zolnierkiewicz * * Based on the work of: * Andre Hedrick @@ -263,6 +263,15 @@ static int __devinit amd74xx_probe(struct pci_dev *dev, const struct pci_device_ d.udma_mask = ATA_UDMA5; } + /* + * It seems that on some nVidia controllers using AltStatus + * register can be unreliable so default to Status register + * if the device is in Compatibility Mode. + */ + if (dev->vendor == PCI_VENDOR_ID_NVIDIA && + ide_pci_is_in_compatibility_mode(dev)) + d.host_flags |= IDE_HFLAG_BROKEN_ALTSTATUS; + printk(KERN_INFO "%s %s: UDMA%s controller\n", d.name, pci_name(dev), amd_dma[fls(d.udma_mask) - 1]); diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 5d6ba14e211d..142b9573d64c 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -468,7 +468,8 @@ int drive_is_ready (ide_drive_t *drive) * an interrupt with another pci card/device. We make no assumptions * about possible isa-pnp and pci-pnp issues yet. */ - if (hwif->io_ports.ctl_addr) + if (hwif->io_ports.ctl_addr && + (hwif->host_flags & IDE_HFLAG_BROKEN_ALTSTATUS) == 0) stat = hwif->tp_ops->read_altstatus(hwif); else /* Note: this may clear a pending IRQ!! */ diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 1649ea54f76c..c55bdbd22314 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -266,7 +266,8 @@ static int actual_try_to_identify (ide_drive_t *drive, u8 cmd) /* take a deep breath */ msleep(50); - if (io_ports->ctl_addr) { + if (io_ports->ctl_addr && + (hwif->host_flags & IDE_HFLAG_BROKEN_ALTSTATUS) == 0) { a = tp_ops->read_altstatus(hwif); s = tp_ops->read_status(hwif); if ((a ^ s) & ~ATA_IDX) diff --git a/include/linux/ide.h b/include/linux/ide.h index 54525be4b5f8..010fb26a1579 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1296,6 +1296,13 @@ extern int __ide_pci_register_driver(struct pci_driver *driver, struct module *o #define ide_pci_register_driver(d) pci_register_driver(d) #endif +static inline int ide_pci_is_in_compatibility_mode(struct pci_dev *dev) +{ + if ((dev->class >> 8) == PCI_CLASS_STORAGE_IDE && (dev->class & 5) != 5) + return 1; + return 0; +} + void ide_pci_setup_ports(struct pci_dev *, const struct ide_port_info *, int, hw_regs_t *, hw_regs_t **); void ide_setup_pci_noise(struct pci_dev *, const struct ide_port_info *); @@ -1375,6 +1382,7 @@ enum { IDE_HFLAG_IO_32BIT = (1 << 24), /* unmask IRQs */ IDE_HFLAG_UNMASK_IRQS = (1 << 25), + IDE_HFLAG_BROKEN_ALTSTATUS = (1 << 26), /* serialize ports if DMA is possible (for sl82c105) */ IDE_HFLAG_SERIALIZE_DMA = (1 << 27), /* force host out of "simplex" mode */ -- cgit v1.2.3-71-gd317 From 1b79cd04fab80be61dcd2732e2423aafde9a4c1c Mon Sep 17 00:00:00 2001 From: "Junjiro R. Okajima" Date: Tue, 2 Dec 2008 10:31:46 -0800 Subject: nfsd: fix vm overcommit crash fix #2 The previous patch from Alan Cox ("nfsd: fix vm overcommit crash", commit 731572d39fcd3498702eda4600db4c43d51e0b26) fixed the problem where knfsd crashes on exported shmemfs objects and strict overcommit is set. But the patch forgot supporting the case when CONFIG_SECURITY is disabled. This patch copies a part of his fix which is mainly for detecting a bug earlier. Acked-by: James Morris Signed-off-by: Alan Cox Signed-off-by: Junjiro R. 
Okajima Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/security.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/security.h b/include/linux/security.h index c13f1cec9abb..e3d4ecda2673 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1818,17 +1818,21 @@ static inline int security_settime(struct timespec *ts, struct timezone *tz) static inline int security_vm_enough_memory(long pages) { + WARN_ON(current->mm == NULL); return cap_vm_enough_memory(current->mm, pages); } -static inline int security_vm_enough_memory_kern(long pages) +static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { - return cap_vm_enough_memory(current->mm, pages); + WARN_ON(mm == NULL); + return cap_vm_enough_memory(mm, pages); } -static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) +static inline int security_vm_enough_memory_kern(long pages) { - return cap_vm_enough_memory(mm, pages); + /* If current->mm is a kernel thread then we will pass NULL, + for this specific case that is fine */ + return cap_vm_enough_memory(current->mm, pages); } static inline int security_bprm_alloc(struct linux_binprm *bprm) -- cgit v1.2.3-71-gd317 From 53a08807c01989c6847bb135d8d43f61c5dfdda5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Dec 2008 12:41:26 +0100 Subject: block: internal dequeue shouldn't start timer blkdev_dequeue_request() and elv_dequeue_request() are equivalent and both start the timeout timer. Barrier code dequeues the original barrier request but doesn't passes the request itself to lower level driver, only broken down proxy requests; however, as the original barrier code goes through the same dequeue path and timeout timer is started on it. If barrier sequence takes long enough, this timer expires but the low level driver has no idea about this request and oops follows. Timeout timer shouldn't have been started on the original barrier request as it never goes through actual IO. This patch unexports elv_dequeue_request(), which has no external user anyway, and makes it operate on elevator proper w/o adding the timer and make blkdev_dequeue_request() call elv_dequeue_request() and add timer. Internal users which don't pass the request to driver - barrier code and end_that_request_last() - are converted to use elv_dequeue_request(). Signed-off-by: Tejun Heo Cc: Mike Anderson Signed-off-by: Jens Axboe --- block/blk-barrier.c | 4 ++-- block/blk-core.c | 24 +++++++++++++++++++++++- block/elevator.c | 7 ------- include/linux/blkdev.h | 7 ++----- 4 files changed, 27 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 5c99ff8d2db8..6e72d661ae42 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -161,7 +161,7 @@ static inline struct request *start_ordered(struct request_queue *q, /* * Prep proxy barrier request. */ - blkdev_dequeue_request(rq); + elv_dequeue_request(q, rq); q->orig_bar_rq = rq; rq = &q->bar_rq; blk_rq_init(q, rq); @@ -219,7 +219,7 @@ int blk_do_ordered(struct request_queue *q, struct request **rqp) * This can happen when the queue switches to * ORDERED_NONE while this request is on it. 
*/ - blkdev_dequeue_request(rq); + elv_dequeue_request(q, rq); if (__blk_end_request(rq, -EOPNOTSUPP, blk_rq_bytes(rq))) BUG(); diff --git a/block/blk-core.c b/block/blk-core.c index 10e8a64a5a5b..7a779d7c69c9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1636,6 +1636,28 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); +/** + * blkdev_dequeue_request - dequeue request and start timeout timer + * @req: request to dequeue + * + * Dequeue @req and start timeout timer on it. This hands off the + * request to the driver. + * + * Block internal functions which don't want to start timer should + * call elv_dequeue_request(). + */ +void blkdev_dequeue_request(struct request *req) +{ + elv_dequeue_request(req->q, req); + + /* + * We are now handing the request to the hardware, add the + * timeout handler. + */ + blk_add_timer(req); +} +EXPORT_SYMBOL(blkdev_dequeue_request); + /** * __end_that_request_first - end I/O on a request * @req: the request being processed @@ -1774,7 +1796,7 @@ static void end_that_request_last(struct request *req, int error) blk_queue_end_tag(req->q, req); if (blk_queued_rq(req)) - blkdev_dequeue_request(req); + elv_dequeue_request(req->q, req); if (unlikely(laptop_mode) && blk_fs_request(req)) laptop_io_completion(); diff --git a/block/elevator.c b/block/elevator.c index 9ac82dde99dd..a6951f76ba0c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -844,14 +844,7 @@ void elv_dequeue_request(struct request_queue *q, struct request *rq) */ if (blk_account_rq(rq)) q->in_flight++; - - /* - * We are now handing the request to the hardware, add the - * timeout handler. - */ - blk_add_timer(rq); } -EXPORT_SYMBOL(elv_dequeue_request); int elv_queue_empty(struct request_queue *q) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a135256b272c..9cc7cc5fdce1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -786,6 +786,8 @@ static inline void blk_run_address_space(struct address_space *mapping) blk_run_backing_dev(mapping->backing_dev_info, NULL); } +extern void blkdev_dequeue_request(struct request *req); + /* * blk_end_request() and friends. * __blk_end_request() and end_request() must be called with @@ -820,11 +822,6 @@ extern void blk_update_request(struct request *rq, int error, extern unsigned int blk_rq_bytes(struct request *rq); extern unsigned int blk_rq_cur_bytes(struct request *rq); -static inline void blkdev_dequeue_request(struct request *req) -{ - elv_dequeue_request(req->q, req); -} - /* * Access functions for manipulating queue properties */ -- cgit v1.2.3-71-gd317 From 0e435ac26e3f951d83338ed3d4ab7dc0fe0055bc Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 3 Dec 2008 12:55:08 +0100 Subject: block: fix setting of max_segment_size and seg_boundary mask Fix setting of max_segment_size and seg_boundary mask for stacked md/dm devices. When stacking devices (LVM over MD over SCSI) some of the request queue parameters are not set up correctly in some cases by default, namely max_segment_size and and seg_boundary mask. If you create MD device over SCSI, these attributes are zeroed. Problem become when there is over this mapping next device-mapper mapping - queue attributes are set in DM this way: request_queue max_segment_size seg_boundary_mask SCSI 65536 0xffffffff MD RAID1 0 0 LVM 65536 -1 (64bit) Unfortunately bio_add_page (resp. bio_phys_segments) calculates number of physical segments according to these parameters. 
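(For stacked queues the intended behaviour, implemented by the blk-settings.c hunk below, is that each level inherits the stricter of the two values and treats zero as "unset", roughly:

    t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
                                        b->seg_boundary_mask);

while blk_queue_make_request() now seeds every new queue with the BLK_SEG_BOUNDARY_MASK and MAX_SEGMENT_SIZE defaults instead of leaving them zero.)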
During the generic_make_request() is segment cout recalculated and can increase bio->bi_phys_segments count over the allowed limit. (After bio_clone() in stack operation.) Thi is specially problem in CCISS driver, where it produce OOPS here BUG_ON(creq->nr_phys_segments > MAXSGENTRIES); (MAXSEGENTRIES is 31 by default.) Sometimes even this command is enough to cause oops: dd iflag=direct if=/dev// of=/dev/null bs=128000 count=10 This command generates bios with 250 sectors, allocated in 32 4k-pages (last page uses only 1024 bytes). For LVM layer, it allocates bio with 31 segments (still OK for CCISS), unfortunatelly on lower layer it is recalculated to 32 segments and this violates CCISS restriction and triggers BUG_ON(). The patch tries to fix it by: * initializing attributes above in queue request constructor blk_queue_make_request() * make sure that blk_queue_stack_limits() inherits setting (DM uses its own function to set the limits because it blk_queue_stack_limits() was introduced later. It should probably switch to use generic stack limit function too.) * sets the default seg_boundary value in one place (blkdev.h) * use this mask as default in DM (instead of -1, which differs in 64bit) Bugs related to this: https://bugzilla.redhat.com/show_bug.cgi?id=471639 http://bugzilla.kernel.org/show_bug.cgi?id=8672 Signed-off-by: Milan Broz Reviewed-by: Alasdair G Kergon Cc: Neil Brown Cc: FUJITA Tomonori Cc: Tejun Heo Cc: Mike Miller Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-settings.c | 4 ++++ drivers/md/dm-table.c | 2 +- include/linux/blkdev.h | 2 ++ 4 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index 7a779d7c69c9..c36aa98fafa3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -592,7 +592,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) 1 << QUEUE_FLAG_STACKABLE); q->queue_lock = lock; - blk_queue_segment_boundary(q, 0xffffffff); + blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK); blk_queue_make_request(q, __make_request); blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); diff --git a/block/blk-settings.c b/block/blk-settings.c index 41392fbe19ff..afa55e14e278 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -125,6 +125,9 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) q->nr_requests = BLKDEV_MAX_RQ; blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); + blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK); + blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); + q->make_request_fn = mfn; q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; @@ -314,6 +317,7 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) /* zero is "infinity" */ t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); + t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); t->max_phys_segments = min(t->max_phys_segments, b->max_phys_segments); t->max_hw_segments = min(t->max_hw_segments, b->max_hw_segments); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index a63161aec487..04e5fd742c2c 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -668,7 +668,7 @@ static void check_for_valid_limits(struct io_restrictions *rs) if (!rs->max_segment_size) rs->max_segment_size = MAX_SEGMENT_SIZE; if (!rs->seg_boundary_mask) - 
rs->seg_boundary_mask = -1; + rs->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; if (!rs->bounce_pfn) rs->bounce_pfn = -1; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9cc7cc5fdce1..6dcd30d806cd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -918,6 +918,8 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter); #define MAX_SEGMENT_SIZE 65536 +#define BLK_SEG_BOUNDARY_MASK 0xFFFFFFFFUL + #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) static inline int queue_hardsect_size(struct request_queue *q) -- cgit v1.2.3-71-gd317
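As a usage note on the defaults added above: a queue set up through blk_queue_make_request()/blk_init_queue() now starts from BLK_SEG_BOUNDARY_MASK and MAX_SEGMENT_SIZE, and a driver with tighter hardware limits still overrides them explicitly. A hypothetical fragment (my_request_fn, my_lock and the limits are invented for illustration):

    /* inside a hypothetical driver's probe routine */
    struct request_queue *q = blk_init_queue(my_request_fn, &my_lock);

    if (q) {
            /* assumed hardware limits for this made-up controller */
            blk_queue_max_segment_size(q, 32 * 1024);   /* 32 KiB per SG entry */
            blk_queue_segment_boundary(q, 0xffff);      /* never cross 64 KiB */
    }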