From 9ac95f2f90e022c16d293d7978faddf7e779a1a9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 9 Feb 2006 22:41:50 +0300 Subject: [PATCH] do_sigaction: cleanup ->sa_mask manipulation Clear unblockable signals beforehand. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0cfcd1c7865e..9c1da0269a18 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1098,7 +1098,7 @@ extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); extern int send_sigqueue(int, struct sigqueue *, struct task_struct *); extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *); -extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *); +extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); /* These can be the second arg to send_sig_info/send_group_sig_info. */ -- cgit v1.2.3-71-gd317 From a70ea994a0d83fd0151a070be72b87d014ef0a7e Mon Sep 17 00:00:00 2001 From: Alexey Kuznetsov Date: Thu, 9 Feb 2006 16:40:11 -0800 Subject: [NETLINK]: Fix a severe bug Netlink overrun handling was broken during the netlink improvements: the destination socket is used where the source socket was meant. As a result, overrun is now never signalled to user netlink sockets when it should be, and it can even be set on a kernel socket, which results in a complete deadlock of rtnetlink. The suggested fix restores the status quo by passing the source socket as an additional argument to netlink_attachskb(). A little explanation: overrun is set on a socket when it has failed to receive a message and the sender does not handle, or has no way to handle, this error. This happens in two cases: 1. when the kernel sends something; the kernel never retransmits and cannot wait for buffer space. 2. when a user sends a broadcast and the message was not delivered to some recipients. Signed-off-by: Alexey Kuznetsov Signed-off-by: David S. 
Miller --- include/linux/netlink.h | 3 ++- ipc/mqueue.c | 3 ++- net/netlink/af_netlink.c | 7 ++++--- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 6a2ccf78a356..c256ebe2a7b4 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -160,7 +160,8 @@ extern int netlink_unregister_notifier(struct notifier_block *nb); /* finegrained unicast helpers: */ struct sock *netlink_getsockbyfilp(struct file *filp); -int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long timeo); +int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, + long timeo, struct sock *ssk); void netlink_detachskb(struct sock *sk, struct sk_buff *skb); int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 59302fc3643b..fd2e26b6f966 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1018,7 +1018,8 @@ retry: goto out; } - ret = netlink_attachskb(sock, nc, 0, MAX_SCHEDULE_TIMEOUT); + ret = netlink_attachskb(sock, nc, 0, + MAX_SCHEDULE_TIMEOUT, NULL); if (ret == 1) goto retry; if (ret) { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 2101b45d2ec6..6b9772d95872 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -702,7 +702,8 @@ struct sock *netlink_getsockbyfilp(struct file *filp) * 0: continue * 1: repeat lookup - reference dropped while waiting for socket memory. */ -int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long timeo) +int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, + long timeo, struct sock *ssk) { struct netlink_sock *nlk; @@ -712,7 +713,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long t test_bit(0, &nlk->state)) { DECLARE_WAITQUEUE(wait, current); if (!timeo) { - if (!nlk->pid) + if (!ssk || nlk_sk(ssk)->pid == 0) netlink_overrun(sk); sock_put(sk); kfree_skb(skb); @@ -797,7 +798,7 @@ retry: kfree_skb(skb); return PTR_ERR(sk); } - err = netlink_attachskb(sk, skb, nonblock, timeo); + err = netlink_attachskb(sk, skb, nonblock, timeo, ssk); if (err == 1) goto retry; if (err) -- cgit v1.2.3-71-gd317 From 9c15e852a524d55ab768cf48c97f5c684f876af2 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Fri, 10 Feb 2006 01:51:05 -0800 Subject: [PATCH] kexec: fix freeing of initrd when it overlaps the crashkernel region The reserved crashkernel region can overlap the initrd, since the bootloader sets the initrd location. When the initrd region is then freed, the second kernel's memory is no longer contiguous: kexec_load() can cause an oops since there is no contiguous memory to write the second kernel into, or this memory could be used by the first kernel itself and then not be part of the dump. For example, on powerpc, the initrd is located at 36MB and the crashkernel starts at 32MB; kexec_load() caused a panic since it wrote into non-allocated memory (after 36MB). A similar issue could show up on other arches. One possibility would be to move the initrd outside the crashkernel region, but the initrd region will be freed anyway before the system is up. This patch fixes the issue by freeing only those parts of the initrd that are not within the crashkernel region when the two overlap. Signed-off-by: Haren Myneni Acked-by: "Eric W. 
Biederman" Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 1 + init/initramfs.c | 24 +++++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index a311f58c8a7c..cfb3410e32b1 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -6,6 +6,7 @@ #include #include #include +#include #include /* Verify architecture specific macros are defined */ diff --git a/init/initramfs.c b/init/initramfs.c index 0c5d9a3f951b..637344b05981 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -466,10 +466,32 @@ static char * __init unpack_to_rootfs(char *buf, unsigned len, int check_only) extern char __initramfs_start[], __initramfs_end[]; #ifdef CONFIG_BLK_DEV_INITRD #include +#include static void __init free_initrd(void) { - free_initrd_mem(initrd_start, initrd_end); +#ifdef CONFIG_KEXEC + unsigned long crashk_start = (unsigned long)__va(crashk_res.start); + unsigned long crashk_end = (unsigned long)__va(crashk_res.end); + + /* + * If the initrd region is overlapped with crashkernel reserved region, + * free only memory that is not part of crashkernel region. + */ + if (initrd_start < crashk_end && initrd_end > crashk_start) { + /* + * Initialize initrd memory region since the kexec boot does + * not do. + */ + memset((void *)initrd_start, 0, initrd_end - initrd_start); + if (initrd_start < crashk_start) + free_initrd_mem(initrd_start, crashk_start); + if (initrd_end > crashk_end) + free_initrd_mem(crashk_end, initrd_end); + } else +#endif + free_initrd_mem(initrd_start, initrd_end); + initrd_start = 0; initrd_end = 0; } -- cgit v1.2.3-71-gd317 From 7a8ef1cb774e5438d292365626f9b96616283706 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 10 Feb 2006 01:51:08 -0800 Subject: [PATCH] x86: don't initialise cpu_possible_map to all ones Initialising cpu_possible_map to all-ones with CONFIG_HOTPLUG_CPU means that a) All for_each_cpu() loops will iterate across all NR_CPUS CPUs, rather than over possible ones. That can be quite expensive. b) Soon we'll be allocating per-cpu areas only for possible CPUs. So with CPU_MASK_ALL, we'll be wasting memory. I also switched voyager over to not use CPU_MASK_ALL in the non-CPU-hotplug case. Should be OK.. I note that parisc is also using CPU_MASK_ALL. Suggest that it stop doing that. 
Cc: James Bottomley Cc: Kyle McMartin Cc: Paul Jackson Cc: Ashok Raj Cc: Zwane Mwaikambo Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/smpboot.c | 4 ---- arch/i386/mach-voyager/voyager_smp.c | 2 +- include/linux/cpumask.h | 2 +- 3 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 255adb498268..fb00ab7b7612 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -87,11 +87,7 @@ EXPORT_SYMBOL(cpu_online_map); cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; EXPORT_SYMBOL(cpu_callout_map); -#ifdef CONFIG_HOTPLUG_CPU -cpumask_t cpu_possible_map = CPU_MASK_ALL; -#else cpumask_t cpu_possible_map; -#endif EXPORT_SYMBOL(cpu_possible_map); static cpumask_t smp_commenced_mask; diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c index 72a1b9cae2e4..6e4c3baef6cc 100644 --- a/arch/i386/mach-voyager/voyager_smp.c +++ b/arch/i386/mach-voyager/voyager_smp.c @@ -240,7 +240,7 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE; cpumask_t cpu_callin_map = CPU_MASK_NONE; cpumask_t cpu_callout_map = CPU_MASK_NONE; EXPORT_SYMBOL(cpu_callout_map); -cpumask_t cpu_possible_map = CPU_MASK_ALL; +cpumask_t cpu_possible_map = CPU_MASK_NONE; EXPORT_SYMBOL(cpu_possible_map); /* The per processor IRQ masks (these are usually kept in sync) */ diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 20b446f26ecd..60e56c6e03dd 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -328,7 +328,7 @@ static inline void __cpus_remap(cpumask_t *dstp, const cpumask_t *srcp, * bitmap of size NR_CPUS. * * #ifdef CONFIG_HOTPLUG_CPU - * cpu_possible_map - all NR_CPUS bits set + * cpu_possible_map - has bit 'cpu' set iff cpu is populatable * cpu_present_map - has bit 'cpu' set iff cpu is populated * cpu_online_map - has bit 'cpu' set iff cpu available to scheduler * #else -- cgit v1.2.3-71-gd317 From 8977d929e49021d9a6e031310aab01fa72f849c2 Mon Sep 17 00:00:00 2001 From: Paul Fulghum Date: Fri, 10 Feb 2006 01:51:14 -0800 Subject: [PATCH] tty buffering stall fix Prevent stalled processing of received data when a driver allocates tty buffer space but does not immediately follow the allocation with more data and a call to schedule receive tty processing. (example: hvc_console) This bug was introduced by the first locking patch for the new tty buffering. 
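As an illustration of the mechanism the fix introduces (a simplified model, not the actual structure layout; see the tty.h hunk below), each buffer now tracks three indices:

	/*
	 * Illustrative model of the new tty_buffer accounting:
	 *
	 *	0 <= read <= commit <= used <= size
	 *
	 * The driver stores characters up to 'used'; tty_flip_buffer_push()
	 * or con_schedule_flip() advances 'commit'; flush_to_ldisc() feeds
	 * the range [read, commit) to the line discipline even while the
	 * buffer is still active, so committed data can no longer stall
	 * behind an allocation that was never followed by more characters.
	 */
	struct tty_buffer_model {	/* hypothetical name, for exposition */
		int used;		/* filled by the driver */
		int commit;		/* visible to the line discipline */
		int read;		/* already delivered to receive_buf() */
	};
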
Signed-off-by: Paul Fulghum Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/tty_io.c | 30 ++++++++++++++++++++++-------- include/linux/kbd_kern.h | 4 +++- include/linux/tty.h | 2 ++ include/linux/tty_flip.h | 4 +++- 4 files changed, 30 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 076e07c1da38..a23816d3e9a1 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -268,6 +268,8 @@ static struct tty_buffer *tty_buffer_alloc(size_t size) p->size = size; p->next = NULL; p->active = 0; + p->commit = 0; + p->read = 0; p->char_buf_ptr = (char *)(p->data); p->flag_buf_ptr = (unsigned char *)p->char_buf_ptr + size; /* printk("Flip create %p\n", p); */ @@ -298,6 +300,8 @@ static struct tty_buffer *tty_buffer_find(struct tty_struct *tty, size_t size) *tbh = t->next; t->next = NULL; t->used = 0; + t->commit = 0; + t->read = 0; /* DEBUG ONLY */ memset(t->data, '*', size); /* printk("Flip recycle %p\n", t); */ @@ -335,6 +339,7 @@ int tty_buffer_request_room(struct tty_struct *tty, size_t size) if (b != NULL) { b->next = n; b->active = 0; + b->commit = b->used; } else tty->buf.head = n; tty->buf.tail = n; @@ -2752,6 +2757,9 @@ static void flush_to_ldisc(void *private_) unsigned long flags; struct tty_ldisc *disc; struct tty_buffer *tbuf; + int count; + char *char_buf; + unsigned char *flag_buf; disc = tty_ldisc_ref(tty); if (disc == NULL) /* !TTY_LDISC */ @@ -2765,16 +2773,20 @@ static void flush_to_ldisc(void *private_) goto out; } spin_lock_irqsave(&tty->buf.lock, flags); - while((tbuf = tty->buf.head) != NULL && !tbuf->active) { + while((tbuf = tty->buf.head) != NULL) { + while ((count = tbuf->commit - tbuf->read) != 0) { + char_buf = tbuf->char_buf_ptr + tbuf->read; + flag_buf = tbuf->flag_buf_ptr + tbuf->read; + tbuf->read += count; + spin_unlock_irqrestore(&tty->buf.lock, flags); + disc->receive_buf(tty, char_buf, flag_buf, count); + spin_lock_irqsave(&tty->buf.lock, flags); + } + if (tbuf->active) + break; tty->buf.head = tbuf->next; if (tty->buf.head == NULL) tty->buf.tail = NULL; - spin_unlock_irqrestore(&tty->buf.lock, flags); - /* printk("Process buffer %p for %d\n", tbuf, tbuf->used); */ - disc->receive_buf(tty, tbuf->char_buf_ptr, - tbuf->flag_buf_ptr, - tbuf->used); - spin_lock_irqsave(&tty->buf.lock, flags); tty_buffer_free(tty, tbuf); } spin_unlock_irqrestore(&tty->buf.lock, flags); @@ -2871,8 +2883,10 @@ void tty_flip_buffer_push(struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&tty->buf.lock, flags); - if (tty->buf.tail != NULL) + if (tty->buf.tail != NULL) { tty->buf.tail->active = 0; + tty->buf.tail->commit = tty->buf.tail->used; + } spin_unlock_irqrestore(&tty->buf.lock, flags); if (tty->low_latency) diff --git a/include/linux/kbd_kern.h b/include/linux/kbd_kern.h index 3aed37314ab8..e87c32a5c86a 100644 --- a/include/linux/kbd_kern.h +++ b/include/linux/kbd_kern.h @@ -153,8 +153,10 @@ static inline void con_schedule_flip(struct tty_struct *t) { unsigned long flags; spin_lock_irqsave(&t->buf.lock, flags); - if (t->buf.tail != NULL) + if (t->buf.tail != NULL) { t->buf.tail->active = 0; + t->buf.tail->commit = t->buf.tail->used; + } spin_unlock_irqrestore(&t->buf.lock, flags); schedule_work(&t->buf.work); } diff --git a/include/linux/tty.h b/include/linux/tty.h index a7bd3b4558d2..f45cd74e6f24 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -58,6 +58,8 @@ struct tty_buffer { int used; int size; int active; + int commit; + int 
read; /* Data points here */ unsigned long data[0]; }; diff --git a/include/linux/tty_flip.h b/include/linux/tty_flip.h index 82961eb19888..222faf97d5f9 100644 --- a/include/linux/tty_flip.h +++ b/include/linux/tty_flip.h @@ -29,8 +29,10 @@ _INLINE_ void tty_schedule_flip(struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&tty->buf.lock, flags); - if (tty->buf.tail != NULL) + if (tty->buf.tail != NULL) { tty->buf.tail->active = 0; + tty->buf.tail->commit = tty->buf.tail->used; + } spin_unlock_irqrestore(&tty->buf.lock, flags); schedule_delayed_work(&tty->buf.work, 1); } -- cgit v1.2.3-71-gd317 From cff2b760096d1e6feaa31948e7af4abbefe47822 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 11 Feb 2006 17:55:47 -0800 Subject: [PATCH] fstatat64 support The *at patches introduced fstatat and, due to insufficient research, I used the newfstat functions generally as the guideline. The result is that on 32-bit platforms we don't have all the information needed to implement fstatat64. This patch modifies the code to pass up 64-bit information if __ARCH_WANT_STAT64 is defined. I renamed the syscall entry point to make this clear. Other archs will continue to use the existing code. On x86-64 the compat code is implemented using a new sys32_ function. This is what is done for the other stat syscalls as well. This patch might break some other archs (those which define __ARCH_WANT_STAT64 and which already wired up the syscall). Yet others might need changes to accommodate the compatibility mode. I really don't want to do that work because all this stat handling is a mess (more so in glibc, but the kernel is also affected). It should be done by the arch maintainers. I'll provide a stand-alone test shortly. Those who are eager could compile glibc and run 'make check' (no installation needed). The patch below has been tested on x86 and x86-64. 
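A minimal user-space smoke test for the 32-bit entry point could look like the sketch below (assumptions: the i386 syscall slot is 300 as wired up in this patch, and AT_FDCWD carries the value used by the *at series):

	#define _LARGEFILE64_SOURCE		/* for struct stat64 */
	#include <sys/syscall.h>
	#include <sys/stat.h>
	#include <unistd.h>
	#include <stdio.h>

	#ifndef AT_FDCWD
	#define AT_FDCWD	-100		/* value used by the *at syscall series */
	#endif
	#ifndef __NR_fstatat64
	#define __NR_fstatat64	300		/* i386 slot wired up in this patch */
	#endif

	int main(void)
	{
		struct stat64 st;

		/* flag 0 follows symlinks like stat(); AT_SYMLINK_NOFOLLOW
		   would give lstat() semantics. */
		if (syscall(__NR_fstatat64, AT_FDCWD, "/etc/passwd", &st, 0) != 0) {
			perror("fstatat64");
			return 1;
		}
		printf("size=%llu\n", (unsigned long long)st.st_size);
		return 0;
	}
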
Signed-off-by: Ulrich Drepper Cc: Christoph Hellwig Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/syscall_table.S | 2 +- arch/x86_64/ia32/ia32entry.S | 2 +- arch/x86_64/ia32/sys_ia32.c | 22 ++++++++++++++++++++++ fs/stat.c | 22 ++++++++++++++++++++++ include/asm-i386/unistd.h | 2 +- include/asm-x86_64/ia32_unistd.h | 2 +- include/linux/syscalls.h | 2 ++ 7 files changed, 50 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index 5a8b3fb6d27b..ac687d00a1ce 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -299,7 +299,7 @@ ENTRY(sys_call_table) .long sys_mknodat .long sys_fchownat .long sys_futimesat - .long sys_newfstatat /* 300 */ + .long sys_fstatat64 /* 300 */ .long sys_unlinkat .long sys_renameat .long sys_linkat diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index ada4535d0161..00dee176c08e 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -677,7 +677,7 @@ ia32_sys_call_table: .quad sys_mknodat .quad sys_fchownat .quad compat_sys_futimesat - .quad compat_sys_newfstatat /* 300 */ + .quad sys32_fstatat /* 300 */ .quad sys_unlinkat .quad sys_renameat .quad sys_linkat diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index 54481af5344a..2bc55af95419 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -180,6 +180,28 @@ sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) return ret; } +asmlinkage long +sys32_fstatat(unsigned int dfd, char __user *filename, + struct stat64 __user* statbuf, int flag) +{ + struct kstat stat; + int error = -EINVAL; + + if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + goto out; + + if (flag & AT_SYMLINK_NOFOLLOW) + error = vfs_lstat_fd(dfd, filename, &stat); + else + error = vfs_stat_fd(dfd, filename, &stat); + + if (!error) + error = cp_stat64(statbuf, &stat); + +out: + return error; +} + /* * Linux/i386 didn't use to be able to handle more than * 4 system call parameters, so these system calls used a memory diff --git a/fs/stat.c b/fs/stat.c index 24211b030f39..9948cc1685a4 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -261,6 +261,7 @@ asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf) return error; } +#ifndef __ARCH_WANT_STAT64 asmlinkage long sys_newfstatat(int dfd, char __user *filename, struct stat __user *statbuf, int flag) { @@ -281,6 +282,7 @@ asmlinkage long sys_newfstatat(int dfd, char __user *filename, out: return error; } +#endif asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf) { @@ -395,6 +397,26 @@ asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user * statbuf) return error; } +asmlinkage long sys_fstatat64(int dfd, char __user *filename, + struct stat64 __user *statbuf, int flag) +{ + struct kstat stat; + int error = -EINVAL; + + if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + goto out; + + if (flag & AT_SYMLINK_NOFOLLOW) + error = vfs_lstat_fd(dfd, filename, &stat); + else + error = vfs_stat_fd(dfd, filename, &stat); + + if (!error) + error = cp_new_stat64(&stat, statbuf); + +out: + return error; +} #endif /* __ARCH_WANT_STAT64 */ void inode_add_bytes(struct inode *inode, loff_t bytes) diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index cf6f2cd9c514..dc81a55dd94d 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -305,7 +305,7 @@ #define __NR_mknodat 297 #define 
__NR_fchownat 298 #define __NR_futimesat 299 -#define __NR_newfstatat 300 +#define __NR_fstatat64 300 #define __NR_unlinkat 301 #define __NR_renameat 302 #define __NR_linkat 303 diff --git a/include/asm-x86_64/ia32_unistd.h b/include/asm-x86_64/ia32_unistd.h index 20468983d453..eeb2bcd635de 100644 --- a/include/asm-x86_64/ia32_unistd.h +++ b/include/asm-x86_64/ia32_unistd.h @@ -305,7 +305,7 @@ #define __NR_ia32_mknodat 297 #define __NR_ia32_fchownat 298 #define __NR_ia32_futimesat 299 -#define __NR_ia32_newfstatat 300 +#define __NR_ia32_fstatat64 300 #define __NR_ia32_unlinkat 301 #define __NR_ia32_renameat 302 #define __NR_ia32_linkat 303 diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3877209d23c3..d73501ba7e44 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -557,6 +557,8 @@ asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, int mode); asmlinkage long sys_newfstatat(int dfd, char __user *filename, struct stat __user *statbuf, int flag); +asmlinkage long sys_fstatat64(int dfd, char __user *filename, + struct stat64 __user *statbuf, int flag); asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf, int bufsiz); asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, -- cgit v1.2.3-71-gd317 From 643a654540579b0dcc7a206a4a7475276a41aff0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 11 Feb 2006 17:55:52 -0800 Subject: [PATCH] select: fix returned timeval With David Woodhouse: select() presently has a habit of increasing the value of the user's `timeout' argument on return. We were writing back a timeout larger than the original. We _deliberately_ round up, since we know we must wait at _least_ as long as the caller asks us to. The patch adds a couple of helper functions for magnitude comparison of timespecs and of timevals, and uses them to prevent the various poll and select functions from returning a timeout which is larger than the one which was passed in. The patch also fixes a bug in compat_sys_pselect7(): it was adding the new timeout value to the old one and was returning that. It should just return the new timeout value. (We have various handy timespec/timeval to/from nsec conversion functions in time.h, but this code open-codes it all.) Cc: "David S. 
Miller" Cc: Andi Kleen Cc: Ulrich Drepper Cc: Thomas Gleixner Cc: george anzinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat.c | 37 +++++++++++++++++++++++++------------ fs/select.c | 32 +++++++++++++++++++++++--------- include/linux/compat.h | 20 ++++++++++++++++++++ include/linux/time.h | 25 ++++++++++++++++++++++++- 4 files changed, 92 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/compat.c b/fs/compat.c index 70c5af4cc270..a2ba78bdf7f7 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1751,11 +1751,15 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, ret = compat_core_sys_select(n, inp, outp, exp, &timeout); if (tvp) { + struct compat_timeval rtv; + if (current->personality & STICKY_TIMEOUTS) goto sticky; - tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); - tv.tv_sec = timeout; - if (copy_to_user(tvp, &tv, sizeof(tv))) { + rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); + rtv.tv_sec = timeout; + if (compat_timeval_compare(&rtv, &tv) < 0) + rtv = tv; + if (copy_to_user(tvp, &rtv, sizeof(rtv))) { sticky: /* * If an application puts its timeval in read-only @@ -1822,13 +1826,17 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, } while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec)); if (tsp && !(current->personality & STICKY_TIMEOUTS)) { - ts.tv_sec += timeout / HZ; - ts.tv_nsec += (timeout % HZ) * (1000000000/HZ); - if (ts.tv_nsec >= 1000000000) { - ts.tv_sec++; - ts.tv_nsec -= 1000000000; + struct compat_timespec rts; + + rts.tv_sec = timeout / HZ; + rts.tv_nsec = (timeout % HZ) * (NSEC_PER_SEC/HZ); + if (rts.tv_nsec >= NSEC_PER_SEC) { + rts.tv_sec++; + rts.tv_nsec -= NSEC_PER_SEC; } - (void)copy_to_user(tsp, &ts, sizeof(ts)); + if (compat_timespec_compare(&rts, &ts) < 0) + rts = ts; + copy_to_user(tsp, &rts, sizeof(rts)); } if (ret == -ERESTARTNOHAND) { @@ -1918,12 +1926,17 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, sigprocmask(SIG_SETMASK, &sigsaved, NULL); if (tsp && timeout >= 0) { + struct compat_timespec rts; + if (current->personality & STICKY_TIMEOUTS) goto sticky; /* Yes, we know it's actually an s64, but it's also positive. 
*/ - ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; - ts.tv_sec = timeout; - if (copy_to_user(tsp, &ts, sizeof(ts))) { + rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * + 1000; + rts.tv_sec = timeout; + if (compat_timespec_compare(&rts, &ts) < 0) + rts = ts; + if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: /* * If an application puts its timeval in read-only diff --git a/fs/select.c b/fs/select.c index bc60a3e14ef3..6ce68a9c8976 100644 --- a/fs/select.c +++ b/fs/select.c @@ -398,11 +398,15 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, ret = core_sys_select(n, inp, outp, exp, &timeout); if (tvp) { + struct timeval rtv; + if (current->personality & STICKY_TIMEOUTS) goto sticky; - tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); - tv.tv_sec = timeout; - if (copy_to_user(tvp, &tv, sizeof(tv))) { + rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); + rtv.tv_sec = timeout; + if (timeval_compare(&rtv, &tv) < 0) + rtv = tv; + if (copy_to_user(tvp, &rtv, sizeof(rtv))) { sticky: /* * If an application puts its timeval in read-only @@ -460,11 +464,16 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp, ret = core_sys_select(n, inp, outp, exp, &timeout); if (tsp) { + struct timespec rts; + if (current->personality & STICKY_TIMEOUTS) goto sticky; - ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; - ts.tv_sec = timeout; - if (copy_to_user(tsp, &ts, sizeof(ts))) { + rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * + 1000; + rts.tv_sec = timeout; + if (timespec_compare(&rts, &ts) < 0) + rts = ts; + if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: /* * If an application puts its timeval in read-only @@ -758,12 +767,17 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, sigprocmask(SIG_SETMASK, &sigsaved, NULL); if (tsp && timeout >= 0) { + struct timespec rts; + if (current->personality & STICKY_TIMEOUTS) goto sticky; /* Yes, we know it's actually an s64, but it's also positive. 
*/ - ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; - ts.tv_sec = timeout; - if (copy_to_user(tsp, &ts, sizeof(ts))) { + rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * + 1000; + rts.tv_sec = timeout; + if (timespec_compare(&rts, &ts) < 0) + rts = ts; + if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: /* * If an application puts its timeval in read-only diff --git a/include/linux/compat.h b/include/linux/compat.h index f9ca534787e2..c9ab2a26348c 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -161,5 +161,25 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from); int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event); +static inline int compat_timeval_compare(struct compat_timeval *lhs, + struct compat_timeval *rhs) +{ + if (lhs->tv_sec < rhs->tv_sec) + return -1; + if (lhs->tv_sec > rhs->tv_sec) + return 1; + return lhs->tv_usec - rhs->tv_usec; +} + +static inline int compat_timespec_compare(struct compat_timespec *lhs, + struct compat_timespec *rhs) +{ + if (lhs->tv_sec < rhs->tv_sec) + return -1; + if (lhs->tv_sec > rhs->tv_sec) + return 1; + return lhs->tv_nsec - rhs->tv_nsec; +} + #endif /* CONFIG_COMPAT */ #endif /* _LINUX_COMPAT_H */ diff --git a/include/linux/time.h b/include/linux/time.h index 7b4dc36532bb..d9cdba54b789 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -33,11 +33,34 @@ struct timezone { #define NSEC_PER_SEC 1000000000L #define NSEC_PER_USEC 1000L -static __inline__ int timespec_equal(struct timespec *a, struct timespec *b) +static inline int timespec_equal(struct timespec *a, struct timespec *b) { return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); } +/* + * lhs < rhs: return <0 + * lhs == rhs: return 0 + * lhs > rhs: return >0 + */ +static inline int timespec_compare(struct timespec *lhs, struct timespec *rhs) +{ + if (lhs->tv_sec < rhs->tv_sec) + return -1; + if (lhs->tv_sec > rhs->tv_sec) + return 1; + return lhs->tv_nsec - rhs->tv_nsec; +} + +static inline int timeval_compare(struct timeval *lhs, struct timeval *rhs) +{ + if (lhs->tv_sec < rhs->tv_sec) + return -1; + if (lhs->tv_sec > rhs->tv_sec) + return 1; + return lhs->tv_usec - rhs->tv_usec; +} + extern unsigned long mktime(const unsigned int year, const unsigned int mon, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec); -- cgit v1.2.3-71-gd317 From bc7fc0601b3eb2254f080492f3fd69e319ed32d0 Mon Sep 17 00:00:00 2001 From: "Antonino A. 
Daplas" Date: Sat, 11 Feb 2006 17:56:07 -0800 Subject: [PATCH] nvidiafb: Add support for Geforce4 MX 4000 Add support for Geforce4 MX 4000 (0x185) Signed-off-by: Antonino Daplas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/nvidia/nvidia.c | 2 ++ include/linux/pci_ids.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/drivers/video/nvidia/nvidia.c b/drivers/video/nvidia/nvidia.c index dbcb8962e57d..a7c4e5e8ead6 100644 --- a/drivers/video/nvidia/nvidia.c +++ b/drivers/video/nvidia/nvidia.c @@ -138,6 +138,8 @@ static struct pci_device_id nvidiafb_pci_tbl[] = { PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_420_8X, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_4000, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_448_GO, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_488_GO, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 7a61ccdcbc4b..82b83da25d77 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1087,6 +1087,7 @@ #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_440_8X 0x0181 #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_440SE_8X 0x0182 #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_420_8X 0x0183 +#define PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_4000 0x0185 #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_448_GO 0x0186 #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_488_GO 0x0187 #define PCI_DEVICE_ID_NVIDIA_QUADRO4_580_XGL 0x0188 -- cgit v1.2.3-71-gd317 From 7c8903f6373f9abecf060bad53ca36bc4ac037f2 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 14 Feb 2006 13:53:03 -0800 Subject: [PATCH] jbd: revert checkpoint list changes This patch reverts commit f93ea411b73594f7d144855fd34278bcf34a9afc: [PATCH] jbd: split checkpoint lists This broke journal_flush() for OCFS2, which is its method of being sure that metadata is sent to disk for another node. And two related commits 8d3c7fce2d20ecc3264c8d8c91ae3beacdeaed1b and 43c3e6f5abdf6acac9b90c86bf03f995bf7d3d92 with the subjects: [PATCH] jbd: log_do_checkpoint fix [PATCH] jbd: remove_transaction fix These seem to be incremental bugfixes on the original patch and as such are no longer needed. Signed-off-by: Mark Fasheh Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/checkpoint.c | 418 ++++++++++++++++++++++------------------------------ fs/jbd/commit.c | 3 +- include/linux/jbd.h | 8 +- 3 files changed, 179 insertions(+), 250 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index e6265a0b56b8..543ed543d1e5 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -24,75 +24,29 @@ #include /* - * Unlink a buffer from a transaction checkpoint list. + * Unlink a buffer from a transaction. * * Called with j_list_lock held. */ -static void __buffer_unlink_first(struct journal_head *jh) +static inline void __buffer_unlink(struct journal_head *jh) { transaction_t *transaction; transaction = jh->b_cp_transaction; + jh->b_cp_transaction = NULL; jh->b_cpnext->b_cpprev = jh->b_cpprev; jh->b_cpprev->b_cpnext = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) { + if (transaction->t_checkpoint_list == jh) transaction->t_checkpoint_list = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) - transaction->t_checkpoint_list = NULL; - } -} - -/* - * Unlink a buffer from a transaction checkpoint(io) list. - * - * Called with j_list_lock held. 
- */ - -static inline void __buffer_unlink(struct journal_head *jh) -{ - transaction_t *transaction; - - transaction = jh->b_cp_transaction; - - __buffer_unlink_first(jh); - if (transaction->t_checkpoint_io_list == jh) { - transaction->t_checkpoint_io_list = jh->b_cpnext; - if (transaction->t_checkpoint_io_list == jh) - transaction->t_checkpoint_io_list = NULL; - } -} - -/* - * Move a buffer from the checkpoint list to the checkpoint io list - * - * Called with j_list_lock held - */ - -static inline void __buffer_relink_io(struct journal_head *jh) -{ - transaction_t *transaction; - - transaction = jh->b_cp_transaction; - __buffer_unlink_first(jh); - - if (!transaction->t_checkpoint_io_list) { - jh->b_cpnext = jh->b_cpprev = jh; - } else { - jh->b_cpnext = transaction->t_checkpoint_io_list; - jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; - jh->b_cpprev->b_cpnext = jh; - jh->b_cpnext->b_cpprev = jh; - } - transaction->t_checkpoint_io_list = jh; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; } /* * Try to release a checkpointed buffer from its transaction. - * Returns 1 if we released it and 2 if we also released the - * whole transaction. - * + * Returns 1 if we released it. * Requires j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ @@ -103,11 +57,12 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { JBUFFER_TRACE(jh, "remove from checkpoint list"); - ret = __journal_remove_checkpoint(jh) + 1; + __journal_remove_checkpoint(jh); jbd_unlock_bh_state(bh); journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); + ret = 1; } else { jbd_unlock_bh_state(bh); } @@ -162,53 +117,83 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) } /* - * Clean up transaction's list of buffers submitted for io. - * We wait for any pending IO to complete and remove any clean - * buffers. Note that we take the buffers in the opposite ordering - * from the one in which they were submitted for IO. + * Clean up a transaction's checkpoint list. + * + * We wait for any pending IO to complete and make sure any clean + * buffers are removed from the transaction. + * + * Return 1 if we performed any actions which might have destroyed the + * checkpoint. (journal_remove_checkpoint() deletes the transaction when + * the last checkpoint buffer is cleansed) * * Called with j_list_lock held. 
*/ - -static void __wait_cp_io(journal_t *journal, transaction_t *transaction) +static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) { - struct journal_head *jh; + struct journal_head *jh, *next_jh, *last_jh; struct buffer_head *bh; - tid_t this_tid; - int released = 0; - - this_tid = transaction->t_tid; -restart: - /* Didn't somebody clean up the transaction in the meanwhile */ - if (journal->j_checkpoint_transactions != transaction || - transaction->t_tid != this_tid) - return; - while (!released && transaction->t_checkpoint_io_list) { - jh = transaction->t_checkpoint_io_list; + int ret = 0; + + assert_spin_locked(&journal->j_list_lock); + jh = transaction->t_checkpoint_list; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + next_jh = jh; + do { + jh = next_jh; bh = jh2bh(jh); - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - spin_lock(&journal->j_list_lock); - goto restart; - } if (buffer_locked(bh)) { atomic_inc(&bh->b_count); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); __brelse(bh); - spin_lock(&journal->j_list_lock); - goto restart; + goto out_return_1; } + /* - * Now in whatever state the buffer currently is, we know that - * it has been written out and so we can drop it from the list + * This is foul */ - released = __journal_remove_checkpoint(jh); - jbd_unlock_bh_state(bh); - } + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + goto out_return_1; + } + + if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + log_start_commit(journal, tid); + log_wait_commit(journal, tid); + goto out_return_1; + } + + /* + * AKPM: I think the buffer_jbddirty test is redundant - it + * shouldn't have NULL b_transaction? + */ + next_jh = jh->b_cpnext; + if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) { + BUFFER_TRACE(bh, "remove from checkpoint"); + __journal_remove_checkpoint(jh); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + __brelse(bh); + ret = 1; + } else { + jbd_unlock_bh_state(bh); + } + } while (jh != last_jh); + + return ret; +out_return_1: + spin_lock(&journal->j_list_lock); + return 1; } #define NR_BATCH 64 @@ -218,7 +203,9 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) { int i; + spin_unlock(&journal->j_list_lock); ll_rw_block(SWRITE, *batch_count, bhs); + spin_lock(&journal->j_list_lock); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = bhs[i]; clear_buffer_jwrite(bh); @@ -234,46 +221,19 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Return 1 if something happened which requires us to abort the current * scan of the checkpoint list. * - * Called with j_list_lock held and drops it if 1 is returned + * Called with j_list_lock held. 
* Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ -static int __process_buffer(journal_t *journal, struct journal_head *jh, - struct buffer_head **bhs, int *batch_count) +static int __flush_buffer(journal_t *journal, struct journal_head *jh, + struct buffer_head **bhs, int *batch_count, + int *drop_count) { struct buffer_head *bh = jh2bh(jh); int ret = 0; - if (buffer_locked(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - put_bh(bh); - ret = 1; - } - else if (jh->b_transaction != NULL) { - transaction_t *t = jh->b_transaction; - tid_t tid = t->t_tid; + if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { + J_ASSERT_JH(jh, jh->b_transaction == NULL); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - log_start_commit(journal, tid); - log_wait_commit(journal, tid); - ret = 1; - } - else if (!buffer_dirty(bh)) { - J_ASSERT_JH(jh, !buffer_jbddirty(bh)); - BUFFER_TRACE(bh, "remove from checkpoint"); - __journal_remove_checkpoint(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - put_bh(bh); - ret = 1; - } - else { /* * Important: we are about to write the buffer, and * possibly block, while still holding the journal lock. @@ -286,30 +246,45 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, J_ASSERT_BH(bh, !buffer_jwrite(bh)); set_buffer_jwrite(bh); bhs[*batch_count] = bh; - __buffer_relink_io(jh); jbd_unlock_bh_state(bh); (*batch_count)++; if (*batch_count == NR_BATCH) { - spin_unlock(&journal->j_list_lock); __flush_batch(journal, bhs, batch_count); ret = 1; } + } else { + int last_buffer = 0; + if (jh->b_cpnext == jh) { + /* We may be about to drop the transaction. Tell the + * caller that the lists have changed. + */ + last_buffer = 1; + } + if (__try_to_free_cp_buf(jh)) { + (*drop_count)++; + ret = last_buffer; + } } return ret; } /* - * Perform an actual checkpoint. We take the first transaction on the - * list of transactions to be checkpointed and send all its buffers - * to disk. We submit larger chunks of data at once. + * Perform an actual checkpoint. We don't write out only enough to + * satisfy the current blocked requests: rather we submit a reasonably + * sized chunk of the outstanding data to disk at once for + * efficiency. __log_wait_for_space() will retry if we didn't free enough. * + * However, we _do_ take into account the amount requested so that once + * the IO has been queued, we can return as soon as enough of it has + * completed to disk. + * * The journal should be locked before calling this function. */ int log_do_checkpoint(journal_t *journal) { - transaction_t *transaction; - tid_t this_tid; int result; + int batch_count = 0; + struct buffer_head *bhs[NR_BATCH]; jbd_debug(1, "Start checkpoint\n"); @@ -324,70 +299,79 @@ int log_do_checkpoint(journal_t *journal) return result; /* - * OK, we need to start writing disk blocks. Take one transaction - * and write it. + * OK, we need to start writing disk blocks. Try to free up a + * quarter of the log in a single checkpoint if we can. 
*/ - spin_lock(&journal->j_list_lock); - if (!journal->j_checkpoint_transactions) - goto out; - transaction = journal->j_checkpoint_transactions; - this_tid = transaction->t_tid; -restart: /* - * If someone cleaned up this transaction while we slept, we're - * done (maybe it's a new transaction, but it fell at the same - * address). + * AKPM: check this code. I had a feeling a while back that it + * degenerates into a busy loop at unmount time. */ - if (journal->j_checkpoint_transactions == transaction && - transaction->t_tid == this_tid) { - int batch_count = 0; - struct buffer_head *bhs[NR_BATCH]; - struct journal_head *jh; - int retry = 0; - - while (!retry && transaction->t_checkpoint_list) { + spin_lock(&journal->j_list_lock); + while (journal->j_checkpoint_transactions) { + transaction_t *transaction; + struct journal_head *jh, *last_jh, *next_jh; + int drop_count = 0; + int cleanup_ret, retry = 0; + tid_t this_tid; + + transaction = journal->j_checkpoint_transactions; + this_tid = transaction->t_tid; + jh = transaction->t_checkpoint_list; + last_jh = jh->b_cpprev; + next_jh = jh; + do { struct buffer_head *bh; - jh = transaction->t_checkpoint_list; + jh = next_jh; + next_jh = jh->b_cpnext; bh = jh2bh(jh); if (!jbd_trylock_bh_state(bh)) { jbd_sync_bh(journal, bh); + spin_lock(&journal->j_list_lock); retry = 1; break; } - retry = __process_buffer(journal, jh, bhs, - &batch_count); - if (!retry && - lock_need_resched(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); + retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count); + if (cond_resched_lock(&journal->j_list_lock)) { retry = 1; break; } - } + } while (jh != last_jh && !retry); if (batch_count) { - if (!retry) { - spin_unlock(&journal->j_list_lock); - retry = 1; - } __flush_batch(journal, bhs, &batch_count); + retry = 1; } - if (retry) { - spin_lock(&journal->j_list_lock); - goto restart; - } /* - * Now we have cleaned up the first transaction's checkpoint - * list. Let's clean up the second one. + * If someone cleaned up this transaction while we slept, we're + * done + */ + if (journal->j_checkpoint_transactions != transaction) + break; + if (retry) + continue; + /* + * Maybe it's a new transaction, but it fell at the same + * address */ - __wait_cp_io(journal, transaction); + if (transaction->t_tid != this_tid) + continue; + /* + * We have walked the whole transaction list without + * finding anything to write to disk. We had better be + * able to make some progress or we are in trouble. + */ + cleanup_ret = __cleanup_transaction(journal, transaction); + J_ASSERT(drop_count != 0 || cleanup_ret != 0); + if (journal->j_checkpoint_transactions != transaction) + break; } -out: spin_unlock(&journal->j_list_lock); result = cleanup_journal_tail(journal); if (result < 0) return result; + return 0; } @@ -471,53 +455,6 @@ int cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ -/* - * journal_clean_one_cp_list - * - * Find all the written-back checkpoint buffers in the given list and release them. - * - * Called with the journal locked. - * Called with j_list_lock held. 
- * Returns number of bufers reaped (for debug) - */ - -static int journal_clean_one_cp_list(struct journal_head *jh, int *released) -{ - struct journal_head *last_jh; - struct journal_head *next_jh = jh; - int ret, freed = 0; - - *released = 0; - if (!jh) - return 0; - - last_jh = jh->b_cpprev; - do { - jh = next_jh; - next_jh = jh->b_cpnext; - /* Use trylock because of the ranking */ - if (jbd_trylock_bh_state(jh2bh(jh))) { - ret = __try_to_free_cp_buf(jh); - if (ret) { - freed++; - if (ret == 2) { - *released = 1; - return freed; - } - } - } - /* - * This function only frees up some memory if possible so we - * dont have an obligation to finish processing. Bail out if - * preemption requested: - */ - if (need_resched()) - return freed; - } while (jh != last_jh); - - return freed; -} - /* * journal_clean_checkpoint_list * @@ -525,38 +462,46 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) * * Called with the journal locked. * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) + * Returns number of bufers reaped (for debug) */ int __journal_clean_checkpoint_list(journal_t *journal) { transaction_t *transaction, *last_transaction, *next_transaction; - int ret = 0, released; + int ret = 0; transaction = journal->j_checkpoint_transactions; - if (!transaction) + if (transaction == 0) goto out; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { + struct journal_head *jh; + transaction = next_transaction; next_transaction = transaction->t_cpnext; - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_list, &released); - if (need_resched()) - goto out; - if (released) - continue; - /* - * It is essential that we are as careful as in the case of - * t_checkpoint_list with removing the buffer from the list as - * we can possibly see not yet submitted buffers on io_list - */ - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list, &released); - if (need_resched()) - goto out; + jh = transaction->t_checkpoint_list; + if (jh) { + struct journal_head *last_jh = jh->b_cpprev; + struct journal_head *next_jh = jh; + + do { + jh = next_jh; + next_jh = jh->b_cpnext; + /* Use trylock because of the ranknig */ + if (jbd_trylock_bh_state(jh2bh(jh))) + ret += __try_to_free_cp_buf(jh); + /* + * This function only frees up some memory + * if possible so we dont have an obligation + * to finish processing. Bail out if preemption + * requested: + */ + if (need_resched()) + goto out; + } while (jh != last_jh); + } } while (transaction != last_transaction); out: return ret; @@ -571,22 +516,18 @@ out: * buffer updates committed in that transaction have safely been stored * elsewhere on disk. To achieve this, all of the buffers in a * transaction need to be maintained on the transaction's checkpoint - * lists until they have been rewritten, at which point this function is + * list until they have been rewritten, at which point this function is * called to remove the buffer from the existing transaction's - * checkpoint lists. - * - * The function returns 1 if it frees the transaction, 0 otherwise. + * checkpoint list. * * This function is called with the journal locked. * This function is called with j_list_lock held. 
- * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ -int __journal_remove_checkpoint(struct journal_head *jh) +void __journal_remove_checkpoint(struct journal_head *jh) { transaction_t *transaction; journal_t *journal; - int ret = 0; JBUFFER_TRACE(jh, "entry"); @@ -597,10 +538,8 @@ int __journal_remove_checkpoint(struct journal_head *jh) journal = transaction->t_journal; __buffer_unlink(jh); - jh->b_cp_transaction = NULL; - if (transaction->t_checkpoint_list != NULL || - transaction->t_checkpoint_io_list != NULL) + if (transaction->t_checkpoint_list != NULL) goto out; JBUFFER_TRACE(jh, "transaction has no more buffers"); @@ -626,10 +565,8 @@ int __journal_remove_checkpoint(struct journal_head *jh) /* Just in case anybody was waiting for more transactions to be checkpointed... */ wake_up(&journal->j_wait_logspace); - ret = 1; out: JBUFFER_TRACE(jh, "exit"); - return ret; } /* @@ -691,7 +628,6 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) J_ASSERT(transaction->t_shadow_list == NULL); J_ASSERT(transaction->t_log_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); - J_ASSERT(transaction->t_checkpoint_io_list == NULL); J_ASSERT(transaction->t_updates == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 29e62d98bae6..002ad2bbc769 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -829,8 +829,7 @@ restart_loop: journal->j_committing_transaction = NULL; spin_unlock(&journal->j_state_lock); - if (commit_transaction->t_checkpoint_list == NULL && - commit_transaction->t_checkpoint_io_list == NULL) { + if (commit_transaction->t_checkpoint_list == NULL) { __journal_drop_transaction(journal, commit_transaction); } else { if (journal->j_checkpoint_transactions == NULL) { diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 0fe4aa891ddc..41ee79962bb2 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -497,12 +497,6 @@ struct transaction_s */ struct journal_head *t_checkpoint_list; - /* - * Doubly-linked circular list of all buffers submitted for IO while - * checkpointing. [j_list_lock] - */ - struct journal_head *t_checkpoint_io_list; - /* * Doubly-linked circular list of temporary buffers currently undergoing * IO in the log [j_list_lock] @@ -852,7 +846,7 @@ extern void journal_commit_transaction(journal_t *); /* Checkpoint list management */ int __journal_clean_checkpoint_list(journal_t *journal); -int __journal_remove_checkpoint(struct journal_head *); +void __journal_remove_checkpoint(struct journal_head *); void __journal_insert_checkpoint(struct journal_head *, transaction_t *); /* Buffer IO */ -- cgit v1.2.3-71-gd317 From 5ac5f9d1ce8492163dbde5d357dc5d03becf7e36 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 14 Feb 2006 13:53:04 -0800 Subject: [PATCH] NLM: Fix the NLM_GRANTED callback checks If 2 threads attached to the same process are blocking on different locks on different files (maybe even on different servers) but have the same lock arguments (i.e. same offset+length - actually quite common, since most processes try to lock the entire file) then the first GRANTED call that wakes one up will also wake the other. Currently when the NLM_GRANTED callback comes in, lockd walks the list of blocked locks in search of a match to the lock that the NLM server has granted. Although it checks the lock pid, start and end, it fails to check the filehandle and the server address. 
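A hypothetical reproducer (an illustration only, not from the patch): two threads of one process each take a whole-file lock on a different NFS file, producing identical (pid, start, end) triples:

	#include <fcntl.h>
	#include <unistd.h>

	/*
	 * Sketch: each thread runs this on its own path.  Both block in
	 * fcntl(); before this fix, a single NLM_GRANTED callback matched
	 * both blocked locks, since only the pid and the byte range were
	 * compared.
	 */
	static void *lock_whole_file(void *path)
	{
		int fd = open(path, O_RDWR);
		struct flock fl = {
			.l_type   = F_WRLCK,
			.l_whence = SEEK_SET,
			.l_start  = 0,
			.l_len    = 0,	/* 0 == to EOF: the common whole-file case */
		};

		fcntl(fd, F_SETLKW, &fl);	/* blocks; lockd issues NLM_LOCK */
		return NULL;
	}
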
By checking the filehandle and server IP address, we ensure that this only happens if the locks truly are referencing the same file. Signed-off-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/lockd/clntlock.c | 27 +++++++++++++++++---------- fs/lockd/svc4proc.c | 2 +- fs/lockd/svcproc.c | 2 +- include/linux/lockd/lockd.h | 6 +++--- 4 files changed, 22 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 3eaf6e701087..da6354baa0b8 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -111,9 +111,10 @@ long nlmclnt_block(struct nlm_rqst *req, long timeout) /* * The server lockd has called us back to tell us the lock was granted */ -u32 -nlmclnt_grant(struct nlm_lock *lock) +u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock) { + const struct file_lock *fl = &lock->fl; + const struct nfs_fh *fh = &lock->fh; struct nlm_wait *block; u32 res = nlm_lck_denied; @@ -122,14 +123,20 @@ nlmclnt_grant(struct nlm_lock *lock) * Warning: must not use cookie to match it! */ list_for_each_entry(block, &nlm_blocked, b_list) { - if (nlm_compare_locks(block->b_lock, &lock->fl)) { - /* Alright, we found a lock. Set the return status - * and wake up the caller - */ - block->b_status = NLM_LCK_GRANTED; - wake_up(&block->b_wait); - res = nlm_granted; - } + struct file_lock *fl_blocked = block->b_lock; + + if (!nlm_compare_locks(fl_blocked, fl)) + continue; + if (!nlm_cmp_addr(&block->b_host->h_addr, addr)) + continue; + if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_dentry->d_inode) ,fh) != 0) + continue; + /* Alright, we found a lock. Set the return status + * and wake up the caller + */ + block->b_status = NLM_LCK_GRANTED; + wake_up(&block->b_wait); + res = nlm_granted; } return res; } diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4063095d849e..b10f913aa06a 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -228,7 +228,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(&argp->lock); + resp->status = nlmclnt_grant(&rqstp->rq_addr, &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 3bc437e0cf5b..35681d9cf1fc 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -256,7 +256,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(&argp->lock); + resp->status = nlmclnt_grant(&rqstp->rq_addr, &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 920766cea79c..ef21ed296039 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -149,7 +149,7 @@ struct nlm_rqst * nlmclnt_alloc_call(void); int nlmclnt_prepare_block(struct nlm_rqst *req, struct nlm_host *host, struct file_lock *fl); void nlmclnt_finish_block(struct nlm_rqst *req); long nlmclnt_block(struct nlm_rqst *req, long timeout); -u32 nlmclnt_grant(struct nlm_lock *); +u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *); void nlmclnt_recovery(struct nlm_host *, u32); int nlmclnt_reclaim(struct nlm_host *, struct file_lock *); int nlmclnt_setgrantargs(struct nlm_rqst *, struct nlm_lock *); @@ -204,7 +204,7 
@@ nlmsvc_file_inode(struct nlm_file *file) * Compare two host addresses (needs modifying for ipv6) */ static __inline__ int -nlm_cmp_addr(struct sockaddr_in *sin1, struct sockaddr_in *sin2) +nlm_cmp_addr(const struct sockaddr_in *sin1, const struct sockaddr_in *sin2) { return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; } @@ -214,7 +214,7 @@ nlm_cmp_addr(struct sockaddr_in *sin1, struct sockaddr_in *sin2) * When the second lock is of type F_UNLCK, this acts like a wildcard. */ static __inline__ int -nlm_compare_locks(struct file_lock *fl1, struct file_lock *fl2) +nlm_compare_locks(const struct file_lock *fl1, const struct file_lock *fl2) { return fl1->fl_pid == fl2->fl_pid && fl1->fl_start == fl2->fl_start -- cgit v1.2.3-71-gd317 From d6077cb80cde4506720f9165eba99ee07438513f Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Tue, 14 Feb 2006 13:53:10 -0800 Subject: [PATCH] sched: revert "filter affine wakeups" Revert commit d7102e95b7b9c00277562c29aad421d2d521c5f6: [PATCH] sched: filter affine wakeups Apparently caused more than 10% performance regression for aim7 benchmark. The setup in use is 16-cpu HP rx8620, 64Gb of memory and 12 MSA1000s with 144 disks. Each disk is 72Gb with a single ext3 filesystem (courtesy of HP, who supplied benchmark results). The problem is, for aim7, the wake-up pattern is random, but it still needs load balancing action in the wake-up path to achieve best performance. With the above commit, lack of load balancing hurts that workload. However, for workloads like database transaction processing, the requirement is exactly opposite. In the wake up path, best performance is achieved with absolutely zero load balancing. We simply wake up the process on the CPU that it was previously run. Worst performance is obtained when we do load balancing at wake up. There isn't an easy way to auto detect the workload characteristics. Ingo's earlier patch that detects idle CPU and decide whether to load balance or not doesn't perform with aim7 either since all CPUs are busy (it causes even bigger perf. regression). Revert commit d7102e95b7b9c00277562c29aad421d2d521c5f6, which causes more than 10% performance regression with aim7. 
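For reference, the heuristic being removed amounts to the following (paraphrased from the try_to_wake_up() hunk below):

	/*
	 * Paraphrase of the reverted filter: wake-up balancing was only
	 * considered when the task was woken again by the same CPU that
	 * woke it last time.  That suits cache-affine patterns such as
	 * database transaction processing, but with aim7's random wakers
	 * it suppressed the balancing that workload depends on.
	 */
	if (p->last_waker_cpu != this_cpu)
		goto out_set_cpu;	/* skip the wake-up balancing checks */
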
Signed-off-by: Ken Chen Acked-by: Ingo Molnar Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 +---- kernel/sched.c | 10 +--------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c1da0269a18..b6f51e3a38ec 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -697,11 +697,8 @@ struct task_struct { int lock_depth; /* BKL lock depth */ -#if defined(CONFIG_SMP) - int last_waker_cpu; /* CPU that last woke this task up */ -#if defined(__ARCH_WANT_UNLOCKED_CTXSW) +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) int oncpu; -#endif #endif int prio, static_prio; struct list_head run_list; diff --git a/kernel/sched.c b/kernel/sched.c index 87d93be336a1..66d957227de9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1204,9 +1204,6 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) } } - if (p->last_waker_cpu != this_cpu) - goto out_set_cpu; - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) goto out_set_cpu; @@ -1277,8 +1274,6 @@ out_set_cpu: cpu = task_cpu(p); } - p->last_waker_cpu = this_cpu; - out_activate: #endif /* CONFIG_SMP */ if (old_state == TASK_UNINTERRUPTIBLE) { @@ -1360,12 +1355,9 @@ void fastcall sched_fork(task_t *p, int clone_flags) #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) - p->last_waker_cpu = cpu; -#if defined(__ARCH_WANT_UNLOCKED_CTXSW) +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) p->oncpu = 0; #endif -#endif #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; -- cgit v1.2.3-71-gd317 From ee68cea2c26b7a8222f9020f54d22c6067011e8b Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 15 Feb 2006 01:34:23 -0800 Subject: [NETFILTER]: Fix xfrm lookup after SNAT To find out if a packet needs to be handled by IPsec after SNAT, packets are currently rerouted in POST_ROUTING and a new xfrm lookup is done. This breaks SNAT of non-unicast packets to non-local addresses because the packet is routed as incoming packet and no neighbour entry is bound to the dst_entry. In general, it seems to be a bad idea to replace the dst_entry after the packet was already sent to the output routine because its state might not match what's expected. This patch changes the xfrm lookup in POST_ROUTING to re-use the original dst_entry without routing the packet again. This means no policy routing can be used for transport mode transforms (which keep the original route) when packets are SNATed to match the policy, but it looks like the best we can do for now. Signed-off-by: Patrick McHardy Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/netfilter_ipv4.h | 2 +- net/ipv4/netfilter.c | 41 ++++++++++++++++++++++++++++++++++ net/ipv4/netfilter/ip_nat_standalone.c | 6 ++--- 3 files changed, 45 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h index fdc4a9527343..43c09d790b83 100644 --- a/include/linux/netfilter_ipv4.h +++ b/include/linux/netfilter_ipv4.h @@ -79,7 +79,7 @@ enum nf_ip_hook_priorities { #ifdef __KERNEL__ extern int ip_route_me_harder(struct sk_buff **pskb); - +extern int ip_xfrm_me_harder(struct sk_buff **pskb); #endif /*__KERNEL__*/ #endif /*__LINUX_IP_NETFILTER_H*/ diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 52a3d7c57907..ed42cdc57cd9 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -78,6 +78,47 @@ int ip_route_me_harder(struct sk_buff **pskb) } EXPORT_SYMBOL(ip_route_me_harder); +#ifdef CONFIG_XFRM +int ip_xfrm_me_harder(struct sk_buff **pskb) +{ + struct flowi fl; + unsigned int hh_len; + struct dst_entry *dst; + + if (IPCB(*pskb)->flags & IPSKB_XFRM_TRANSFORMED) + return 0; + if (xfrm_decode_session(*pskb, &fl, AF_INET) < 0) + return -1; + + dst = (*pskb)->dst; + if (dst->xfrm) + dst = ((struct xfrm_dst *)dst)->route; + dst_hold(dst); + + if (xfrm_lookup(&dst, &fl, (*pskb)->sk, 0) < 0) + return -1; + + dst_release((*pskb)->dst); + (*pskb)->dst = dst; + + /* Change in oif may mean change in hh_len. */ + hh_len = (*pskb)->dst->dev->hard_header_len; + if (skb_headroom(*pskb) < hh_len) { + struct sk_buff *nskb; + + nskb = skb_realloc_headroom(*pskb, hh_len); + if (!nskb) + return -1; + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + } + return 0; +} +EXPORT_SYMBOL(ip_xfrm_me_harder); +#endif + void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *); EXPORT_SYMBOL(ip_nat_decode_session); diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 92c54999a19d..7c3f7d380240 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -235,19 +235,19 @@ ip_nat_out(unsigned int hooknum, return NF_ACCEPT; ret = ip_nat_fn(hooknum, pskb, in, out, okfn); +#ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.src.ip != ct->tuplehash[!dir].tuple.dst.ip -#ifdef CONFIG_XFRM || ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all -#endif ) - return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP; + return ip_xfrm_me_harder(pskb) == 0 ? ret : NF_DROP; } +#endif return ret; } -- cgit v1.2.3-71-gd317 From 5ecfbae093f0c37311e89b29bfc0c9d586eace87 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 15 Feb 2006 22:50:10 +0300 Subject: [PATCH] fix zap_thread's ptrace related problems 1. The tracee can go from ptrace_stop() to do_signal_stop() after __ptrace_unlink(p). 2. It is unsafe to __ptrace_unlink(p) while p->parent may wait for tasklist_lock in ptrace_detach(). Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Eric W. 
Biederman Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/ptrace.h | 1 + kernel/ptrace.c | 25 +++++++++++++++---------- 3 files changed, 17 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 055378d2513e..0e1c95074d42 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1403,7 +1403,7 @@ static void zap_threads (struct mm_struct *mm) do_each_thread(g,p) { if (mm == p->mm && p != tsk && p->ptrace && p->parent->mm == mm) { - __ptrace_unlink(p); + __ptrace_detach(p, 0); } } while_each_thread(g,p); write_unlock_irq(&tasklist_lock); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 9d5cd106b344..0d36750fc0f1 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -84,6 +84,7 @@ extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __us extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len); extern int ptrace_attach(struct task_struct *tsk); extern int ptrace_detach(struct task_struct *, unsigned int); +extern void __ptrace_detach(struct task_struct *, unsigned int); extern void ptrace_disable(struct task_struct *); extern int ptrace_check_attach(struct task_struct *task, int kill); extern int ptrace_request(struct task_struct *child, long request, long addr, long data); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d2cf144d0af5..d95a72c9279d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -72,8 +72,8 @@ void ptrace_untrace(task_t *child) */ void __ptrace_unlink(task_t *child) { - if (!child->ptrace) - BUG(); + BUG_ON(!child->ptrace); + child->ptrace = 0; if (!list_empty(&child->ptrace_list)) { list_del_init(&child->ptrace_list); @@ -184,22 +184,27 @@ bad: return retval; } +void __ptrace_detach(struct task_struct *child, unsigned int data) +{ + child->exit_code = data; + /* .. re-parent .. */ + __ptrace_unlink(child); + /* .. and wake it up. */ + if (child->exit_state != EXIT_ZOMBIE) + wake_up_process(child); +} + int ptrace_detach(struct task_struct *child, unsigned int data) { if (!valid_signal(data)) - return -EIO; + return -EIO; /* Architecture-specific hardware disable .. */ ptrace_disable(child); - /* .. re-parent .. */ - child->exit_code = data; - write_lock_irq(&tasklist_lock); - __ptrace_unlink(child); - /* .. and wake it up. */ - if (child->exit_state != EXIT_ZOMBIE) - wake_up_process(child); + if (child->ptrace) + __ptrace_detach(child, data); write_unlock_irq(&tasklist_lock); return 0; -- cgit v1.2.3-71-gd317 From 48d5cad87c3a4998d0bda16ccfb5c60dfe4de5fb Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 15 Feb 2006 15:10:22 -0800 Subject: [XFRM]: Fix SNAT-related crash in xfrm4_output_finish When a packet matching an IPsec policy is SNATed so it doesn't match any policy anymore, it loses its xfrm bundle, which makes xfrm4_output_finish crash because of a NULL pointer dereference. This patch directs these packets to the original output path instead. Since the packets have already passed the POST_ROUTING hook but need to start at the beginning of the original output path, which includes another POST_ROUTING invocation, a flag is added to the IPCB to indicate that the packet was rerouted and doesn't need to pass the POST_ROUTING hook again. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- include/linux/netfilter.h | 19 +++++++++++++++---- include/net/ip.h | 1 + include/net/xfrm.h | 1 - net/ipv4/ip_gre.c | 3 ++- net/ipv4/ip_output.c | 16 ++++++++++------ net/ipv4/ipip.c | 3 ++- net/ipv4/xfrm4_output.c | 13 ++++++++++--- 7 files changed, 40 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 4cf6088625c1..3ca3d9ee78a9 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -184,8 +184,11 @@ static inline int nf_hook_thresh(int pf, unsigned int hook, struct sk_buff **pskb, struct net_device *indev, struct net_device *outdev, - int (*okfn)(struct sk_buff *), int thresh) + int (*okfn)(struct sk_buff *), int thresh, + int cond) { + if (!cond) + return 1; #ifndef CONFIG_NETFILTER_DEBUG if (list_empty(&nf_hooks[pf][hook])) return 1; @@ -197,7 +200,7 @@ static inline int nf_hook(int pf, unsigned int hook, struct sk_buff **pskb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *)) { - return nf_hook_thresh(pf, hook, pskb, indev, outdev, okfn, INT_MIN); + return nf_hook_thresh(pf, hook, pskb, indev, outdev, okfn, INT_MIN, 1); } /* Activate hook; either okfn or kfree_skb called, unless a hook @@ -224,7 +227,13 @@ static inline int nf_hook(int pf, unsigned int hook, struct sk_buff **pskb, #define NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, thresh) \ ({int __ret; \ -if ((__ret=nf_hook_thresh(pf, hook, &(skb), indev, outdev, okfn, thresh)) == 1)\ +if ((__ret=nf_hook_thresh(pf, hook, &(skb), indev, outdev, okfn, thresh, 1)) == 1)\ + __ret = (okfn)(skb); \ +__ret;}) + +#define NF_HOOK_COND(pf, hook, skb, indev, outdev, okfn, cond) \ +({int __ret; \ +if ((__ret=nf_hook_thresh(pf, hook, &(skb), indev, outdev, okfn, INT_MIN, cond)) == 1)\ __ret = (okfn)(skb); \ __ret;}) @@ -295,11 +304,13 @@ extern struct proc_dir_entry *proc_net_netfilter; #else /* !CONFIG_NETFILTER */ #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb) +#define NF_HOOK_COND(pf, hook, skb, indev, outdev, okfn, cond) (okfn)(skb) static inline int nf_hook_thresh(int pf, unsigned int hook, struct sk_buff **pskb, struct net_device *indev, struct net_device *outdev, - int (*okfn)(struct sk_buff *), int thresh) + int (*okfn)(struct sk_buff *), int thresh, + int cond) { return okfn(*pskb); } diff --git a/include/net/ip.h b/include/net/ip.h index 8de0697b364c..fab3d5b3ab1c 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -41,6 +41,7 @@ struct inet_skb_parm #define IPSKB_XFRM_TUNNEL_SIZE 2 #define IPSKB_XFRM_TRANSFORMED 4 #define IPSKB_FRAG_COMPLETE 8 +#define IPSKB_REROUTED 16 }; struct ipcm_cookie diff --git a/include/net/xfrm.h b/include/net/xfrm.h index d09ca0e7d139..d6111a2f0a23 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -866,7 +866,6 @@ extern int xfrm_state_mtu(struct xfrm_state *x, int mtu); extern int xfrm_init_state(struct xfrm_state *x); extern int xfrm4_rcv(struct sk_buff *skb); extern int xfrm4_output(struct sk_buff *skb); -extern int xfrm4_output_finish(struct sk_buff *skb); extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler); extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler); extern int xfrm6_rcv_spi(struct sk_buff **pskb, u32 spi); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index abe23923e4e7..9981dcd68f11 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -830,7 +830,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb->h.raw = skb->nh.raw; skb->nh.raw = skb_push(skb, 
gre_hlen); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE|IPSKB_XFRM_TRANSFORMED); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | + IPSKB_REROUTED); dst_release(skb->dst); skb->dst = &rt->u.dst; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 3324fbfe528a..57d290d89ec2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -207,8 +207,10 @@ static inline int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ - if (skb->dst->xfrm != NULL) - return xfrm4_output_finish(skb); + if (skb->dst->xfrm != NULL) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } #endif if (skb->len > dst_mtu(skb->dst) && !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) @@ -271,8 +273,9 @@ int ip_mc_output(struct sk_buff *skb) newskb->dev, ip_dev_loopback_xmit); } - return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev, - ip_finish_output); + return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev, + ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); } int ip_output(struct sk_buff *skb) @@ -284,8 +287,9 @@ int ip_output(struct sk_buff *skb) skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, - ip_finish_output); + return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, + ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); } int ip_queue_xmit(struct sk_buff *skb, int ipfragok) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index e5cbe72c6b80..03d13742a4b8 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -622,7 +622,8 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb->h.raw = skb->nh.raw; skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE|IPSKB_XFRM_TRANSFORMED); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | + IPSKB_REROUTED); dst_release(skb->dst); skb->dst = &rt->u.dst; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index d4df0ddd424b..32ad229b4fed 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -152,10 +152,16 @@ error_nolock: goto out_exit; } -int xfrm4_output_finish(struct sk_buff *skb) +static int xfrm4_output_finish(struct sk_buff *skb) { int err; +#ifdef CONFIG_NETFILTER + if (!skb->dst->xfrm) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } +#endif while (likely((err = xfrm4_output_one(skb)) == 0)) { nf_reset(skb); @@ -178,6 +184,7 @@ int xfrm4_output(struct sk_buff *skb) { - return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev, - xfrm4_output_finish); + return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev, + xfrm4_output_finish, + !(IPCB(skb)->flags & IPSKB_REROUTED)); } -- cgit v1.2.3-71-gd317 From 9c92d3486434e7310cb288587953e2dae4a79701 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 15 Feb 2006 15:18:19 -0800 Subject: [NETFILTER]: Don't invoke okfn in CONFIG_NETFILTER=n variant of nf_hook() nf_hook() is supposed to call the netfilter hook and return control of the packet back to the caller in case it may pass; the okfn is only used for queueing. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- include/linux/netfilter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 3ca3d9ee78a9..468896939843 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -318,7 +318,7 @@ static inline int nf_hook(int pf, unsigned int hook, struct sk_buff **pskb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *)) { - return okfn(*pskb); + return 1; } static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} struct flowi; -- cgit v1.2.3-71-gd317 From b2ee9dbfad14ba8e34a589d552ddc67300a26bec Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Wed, 15 Feb 2006 15:17:40 -0800 Subject: [PATCH] hrtimer: fix multiple macro argument expansion For two macros the arguments were expanded twice; change them to inline functions to avoid this (with the macro form, an argument with side effects, such as a function call, was evaluated twice). Signed-off-by: Roman Zippel Acked-by: Ingo Molnar Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ktime.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 6aca67a569a2..f3dec45ef874 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -96,10 +96,16 @@ static inline ktime_t ktime_set(const long secs, const unsigned long nsecs) ({ (ktime_t){ .tv64 = (kt).tv64 + (nsval) }; }) /* convert a timespec to ktime_t format: */ -#define timespec_to_ktime(ts) ktime_set((ts).tv_sec, (ts).tv_nsec) +static inline ktime_t timespec_to_ktime(struct timespec ts) +{ + return ktime_set(ts.tv_sec, ts.tv_nsec); +} /* convert a timeval to ktime_t format: */ -#define timeval_to_ktime(tv) ktime_set((tv).tv_sec, (tv).tv_usec * 1000) +static inline ktime_t timeval_to_ktime(struct timeval tv) +{ + return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC); +} /* Map the ktime_t to timespec conversion to ns_to_timespec function */ #define ktime_to_timespec(kt) ns_to_timespec((kt).tv64) -- cgit v1.2.3-71-gd317 From a62eaf151d9cb478d127cfbc2e93c498869785b0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 16 Feb 2006 23:41:58 +0100 Subject: [PATCH] x86_64: Add boot option to disable randomized mappings and cleanup AMD SimNow!'s JIT doesn't like them at all in the guest. For distribution installation it's easiest if it's a boot-time option. Also, I moved the variable to a more appropriate place, made it independent from sysctl, and marked it __read_mostly, which it is. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 3 +++ arch/i386/kernel/cpu/transmeta.c | 1 + include/linux/kernel.h | 6 ------ include/linux/mm.h | 2 ++ kernel/sysctl.c | 2 -- mm/memory.c | 10 ++++++++++ 6 files changed, 16 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index ac75b57edf2e..b874771385cd 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1638,6 +1638,9 @@ running once the system is up. 
Format: ,,,,,[,[,[,]]] + norandmaps Don't use address space randomization + Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space + ______________________________________________________________________ Changelog: diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index bdbeb77f4e22..7214c9b577ab 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b49affa0ac5a..3b507bf05d09 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -326,12 +326,6 @@ struct sysinfo { /* Force a compilation error if condition is true */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) -#ifdef CONFIG_SYSCTL -extern int randomize_va_space; -#else -#define randomize_va_space 1 -#endif - /* Trap pasters of __FUNCTION__ at compile-time */ #define __FUNCTION__ (__func__) diff --git a/include/linux/mm.h b/include/linux/mm.h index 75e9f0724997..26e1663a5cbe 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1051,5 +1051,7 @@ int shrink_slab(unsigned long scanned, gfp_t gfp_mask, void drop_pagecache(void); void drop_slab(void); +extern int randomize_va_space; + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 71dd6f62efec..7654d55c47f5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -126,8 +126,6 @@ extern int sysctl_hz_timer; extern int acct_parm[]; #endif -int randomize_va_space = 1; - static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **); static int proc_doutsstring(ctl_table *table, int write, struct file *filp, diff --git a/mm/memory.c b/mm/memory.c index 2bee1f21aa8a..9abc6008544b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -82,6 +82,16 @@ EXPORT_SYMBOL(num_physpages); EXPORT_SYMBOL(high_memory); EXPORT_SYMBOL(vmalloc_earlyreserve); +int randomize_va_space __read_mostly = 1; + +static int __init disable_randmaps(char *s) +{ + randomize_va_space = 0; + return 0; +} +__setup("norandmaps", disable_randmaps); + + /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. Usually (but -- cgit v1.2.3-71-gd317 From 726c14bf499e91e7ede4f1728830aba05c675061 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 17 Feb 2006 10:30:23 +1100 Subject: [PATCH] Provide an interface for getting the current tick length This provides an interface for arch code to find out how many nanoseconds are going to be added on to xtime by the next call to do_timer. The value returned is a fixed-point number in 52.12 format in nanoseconds. The reason for this format is that it gives the full precision that the timekeeping code is using internally. The motivation for this is to fix a problem that has arisen on 32-bit powerpc in that the value returned by do_gettimeofday drifts apart from xtime if NTP is being used. PowerPC is now using a lockless do_gettimeofday based on reading the timebase register and performing some simple arithmetic. (This method of getting the time is also exported to userspace via the VDSO.) However, the factor and offset it uses were calculated based on the nominal tick length and weren't being adjusted when NTP varied the tick length. Note that 64-bit powerpc has had the lockless do_gettimeofday for a long time now. 
It also had an extremely hairy routine that got called from the 32-bit compat routine for adjtimex, which adjusted the factor and offset according to what it thought the timekeeping code was going to do. Not only was this only called if a 32-bit task did adjtimex (i.e. not if a 64-bit task did adjtimex), it was also duplicating computations from kernel/timer.c and it wasn't clear that it was (still) correct. The simple solution is to ask the timekeeping code how long the current jiffy will be on each timer interrupt, after calling do_timer. If this jiffy will be a different length from the last one, we then need to compute new values for the factor and offset used in the lockless do_gettimeofday. In this way we can keep xtime and do_gettimeofday in sync, even when NTP is varying the tick length. Note that when adjtimex varies the tick length, it almost always introduces the variation from the next tick on. The only case I could see where adjtimex would vary the length of the current tick is when an old-style adjtime adjustment is being cancelled. (It's not clear to me why the adjustment has to be cancelled immediately rather than from the next tick on.) Thus I don't see any real need for a hook in adjtimex; the rare case of an old-style adjustment being cancelled can be fixed up at the next tick. Signed-off-by: Paul Mackerras Acked-by: john stultz Signed-off-by: Linus Torvalds --- include/linux/timex.h | 3 +++ kernel/timer.c | 39 ++++++++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timex.h b/include/linux/timex.h index 04a4a8cb4ed3..b7ca1204e42a 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -345,6 +345,9 @@ time_interpolator_reset(void) #endif /* !CONFIG_TIME_INTERPOLATION */ +/* Returns how long ticks are at present, in ns / 2^(SHIFT_SCALE-10). */ +extern u64 current_tick_length(void); + #endif /* KERNEL */ #endif /* LINUX_TIMEX_H */ diff --git a/kernel/timer.c b/kernel/timer.c index b9dad3994676..fe3a9a9f8328 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -717,12 +717,16 @@ static void second_overflow(void) #endif } -/* in the NTP reference this is called "hardclock()" */ -static void update_wall_time_one_tick(void) +/* + * Returns how many microseconds we need to add to xtime this tick + * in doing an adjustment requested with adjtime. + */ +static long adjtime_adjustment(void) { - long time_adjust_step, delta_nsec; + long time_adjust_step; - if ((time_adjust_step = time_adjust) != 0 ) { + time_adjust_step = time_adjust; + if (time_adjust_step) { /* * We are doing an adjtime thing. Prepare time_adjust_step to * be within bounds. 
Note that a positive time_adjust means we @@ -733,10 +737,19 @@ static void update_wall_time_one_tick(void) */ time_adjust_step = min(time_adjust_step, (long)tickadj); time_adjust_step = max(time_adjust_step, (long)-tickadj); + } + return time_adjust_step; +} +/* in the NTP reference this is called "hardclock()" */ +static void update_wall_time_one_tick(void) +{ + long time_adjust_step, delta_nsec; + + time_adjust_step = adjtime_adjustment(); + if (time_adjust_step) /* Reduce by this step the amount of time left */ time_adjust -= time_adjust_step; - } delta_nsec = tick_nsec + time_adjust_step * 1000; /* * Advance the phase, once it gets to one microsecond, then @@ -758,6 +771,22 @@ static void update_wall_time_one_tick(void) } } +/* + * Return how long ticks are at the moment, that is, how much time + * update_wall_time_one_tick will add to xtime next time we call it + * (assuming no calls to do_adjtimex in the meantime). + * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 + * bits to the right of the binary point. + * This function has no side-effects. + */ +u64 current_tick_length(void) +{ + long delta_nsec; + + delta_nsec = tick_nsec + adjtime_adjustment() * 1000; + return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; +} + /* * Using a loop looks inefficient, but "ticks" is * usually just one (we shouldn't be losing ticks, -- cgit v1.2.3-71-gd317
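To make the fixed-point contract concrete: arch timekeeping code could detect NTP-driven tick-length changes on each timer interrupt roughly as follows. This is an editor's sketch, not code from the patch; SHIFT_SCALE is assumed to be 22 (giving the 12 fractional bits of the 52.12 format described above), and recalc_gtod_factors() is a hypothetical helper standing in for whatever recomputes the factor and offset used by the lockless do_gettimeofday.

	/* Hypothetical arch-side consumer of current_tick_length(). */
	static u64 last_tick_len;

	void arch_account_tick(void)
	{
		u64 tick_len = current_tick_length();

		if (tick_len != last_tick_len) {
			/*
			 * NTP changed the tick length: recompute the factor
			 * and offset used by the lockless do_gettimeofday()
			 * (and exported to userspace via the VDSO).  Shift
			 * away the fractional bits for whole nanoseconds.
			 */
			u64 tick_ns = tick_len >> (SHIFT_SCALE - 10);

			recalc_gtod_factors(tick_ns);	/* hypothetical */
			last_tick_len = tick_len;
		}
	}

Because current_tick_length() has no side effects and reflects adjtime_adjustment() plus the NTP phase adjustment, calling it once per tick after do_timer keeps such derived factors in sync with xtime.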