From 9343e79a7bb2d3268d68997163608b87d58d8098 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Tue, 17 Jan 2006 02:10:53 -0800
Subject: [IPV6]: Preserve procfs IPV6 address output format

Procfs always output IPV6 addresses without the colon
characters, and we cannot change that.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/kernel.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 323924edb26a..a5363324cf95 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -228,6 +228,7 @@ extern void dump_stack(void);
 	ntohs((addr).s6_addr16[6]), \
 	ntohs((addr).s6_addr16[7])
 #define NIP6_FMT "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x"
+#define NIP6_SEQFMT "%04x%04x%04x%04x%04x%04x%04x%04x"
 
 #if defined(__LITTLE_ENDIAN)
 #define HIPQUAD(addr) \
-- 
cgit v1.2.3-71-gd317


From e0069caede8387c585060b7e2e87729e9efcebc6 Mon Sep 17 00:00:00 2001
From: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Date: Tue, 17 Jan 2006 02:39:19 -0800
Subject: [NETFILTER] ip6tables: remove unused definitions

These definitions ware used for only internal use in kernel <= 2.6.13,
which had not introduced the unified parser of IPv6 extension header yet.

Signed-off-by: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv6/ip6t_ah.h   | 9 ---------
 include/linux/netfilter_ipv6/ip6t_esp.h  | 9 ---------
 include/linux/netfilter_ipv6/ip6t_frag.h | 9 ---------
 include/linux/netfilter_ipv6/ip6t_opts.h | 9 ---------
 include/linux/netfilter_ipv6/ip6t_rt.h   | 9 ---------
 5 files changed, 45 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6/ip6t_ah.h b/include/linux/netfilter_ipv6/ip6t_ah.h
index c4f0793a0a98..8531879eb464 100644
--- a/include/linux/netfilter_ipv6/ip6t_ah.h
+++ b/include/linux/netfilter_ipv6/ip6t_ah.h
@@ -18,13 +18,4 @@ struct ip6t_ah
 #define IP6T_AH_INV_LEN		0x02	/* Invert the sense of length. */
 #define IP6T_AH_INV_MASK	0x03	/* All possible flags. */
 
-#define MASK_HOPOPTS    128
-#define MASK_DSTOPTS    64
-#define MASK_ROUTING    32
-#define MASK_FRAGMENT   16
-#define MASK_AH         8
-#define MASK_ESP        4
-#define MASK_NONE       2
-#define MASK_PROTO      1
-
 #endif /*_IP6T_AH_H*/
diff --git a/include/linux/netfilter_ipv6/ip6t_esp.h b/include/linux/netfilter_ipv6/ip6t_esp.h
index 01142b98a231..a91b6abc8079 100644
--- a/include/linux/netfilter_ipv6/ip6t_esp.h
+++ b/include/linux/netfilter_ipv6/ip6t_esp.h
@@ -7,15 +7,6 @@ struct ip6t_esp
 	u_int8_t  invflags;			/* Inverse flags */
 };
 
-#define MASK_HOPOPTS    128
-#define MASK_DSTOPTS    64
-#define MASK_ROUTING    32
-#define MASK_FRAGMENT   16
-#define MASK_AH         8
-#define MASK_ESP        4
-#define MASK_NONE       2
-#define MASK_PROTO      1
-
 /* Values for "invflags" field in struct ip6t_esp. */
 #define IP6T_ESP_INV_SPI		0x01	/* Invert the sense of spi. */
 #define IP6T_ESP_INV_MASK	0x01	/* All possible flags. */
diff --git a/include/linux/netfilter_ipv6/ip6t_frag.h b/include/linux/netfilter_ipv6/ip6t_frag.h
index 449a57eca7dd..66070a0d6dfc 100644
--- a/include/linux/netfilter_ipv6/ip6t_frag.h
+++ b/include/linux/netfilter_ipv6/ip6t_frag.h
@@ -21,13 +21,4 @@ struct ip6t_frag
 #define IP6T_FRAG_INV_LEN	0x02	/* Invert the sense of length. */
 #define IP6T_FRAG_INV_MASK	0x03	/* All possible flags. */
 
-#define MASK_HOPOPTS    128
-#define MASK_DSTOPTS    64
-#define MASK_ROUTING    32
-#define MASK_FRAGMENT   16
-#define MASK_AH         8
-#define MASK_ESP        4
-#define MASK_NONE       2
-#define MASK_PROTO      1
-
 #endif /*_IP6T_FRAG_H*/
diff --git a/include/linux/netfilter_ipv6/ip6t_opts.h b/include/linux/netfilter_ipv6/ip6t_opts.h
index e259b6275bd2..a07e36380ae8 100644
--- a/include/linux/netfilter_ipv6/ip6t_opts.h
+++ b/include/linux/netfilter_ipv6/ip6t_opts.h
@@ -20,13 +20,4 @@ struct ip6t_opts
 #define IP6T_OPTS_INV_LEN	0x01	/* Invert the sense of length. */
 #define IP6T_OPTS_INV_MASK	0x01	/* All possible flags. */
 
-#define MASK_HOPOPTS    128
-#define MASK_DSTOPTS    64
-#define MASK_ROUTING    32
-#define MASK_FRAGMENT   16
-#define MASK_AH         8
-#define MASK_ESP        4
-#define MASK_NONE       2
-#define MASK_PROTO      1
-
 #endif /*_IP6T_OPTS_H*/
diff --git a/include/linux/netfilter_ipv6/ip6t_rt.h b/include/linux/netfilter_ipv6/ip6t_rt.h
index f1070fbf2757..52156023e8db 100644
--- a/include/linux/netfilter_ipv6/ip6t_rt.h
+++ b/include/linux/netfilter_ipv6/ip6t_rt.h
@@ -30,13 +30,4 @@ struct ip6t_rt
 #define IP6T_RT_INV_LEN		0x04	/* Invert the sense of length. */
 #define IP6T_RT_INV_MASK	0x07	/* All possible flags. */
 
-#define MASK_HOPOPTS    128
-#define MASK_DSTOPTS    64
-#define MASK_ROUTING    32
-#define MASK_FRAGMENT   16
-#define MASK_AH         8
-#define MASK_ESP        4
-#define MASK_NONE       2
-#define MASK_PROTO      1
-
 #endif /*_IP6T_RT_H*/
-- 
cgit v1.2.3-71-gd317


From 8243126c5e29030bf1a3fb75187a513966dcba62 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Tue, 17 Jan 2006 02:54:21 -0800
Subject: [NET]: Make second arg to skb_reserved() signed.

Some subsystems, such as PPP, can send negative values
here.  It just happened to work correctly on 32-bit with
an unsigned value, but on 64-bit this explodes.

Figured out by Paul Mackerras based upon several PPP crash
reports.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e5fd66c5650b..ad7cc22bd424 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -926,7 +926,7 @@ static inline int skb_tailroom(const struct sk_buff *skb)
  *	Increase the headroom of an empty &sk_buff by reducing the tail
  *	room. This is only allowed for an empty buffer.
  */
-static inline void skb_reserve(struct sk_buff *skb, unsigned int len)
+static inline void skb_reserve(struct sk_buff *skb, int len)
 {
 	skb->data += len;
 	skb->tail += len;
-- 
cgit v1.2.3-71-gd317


From 16cb4b333c9e7a00ce3b1d74ec0c9b4c2e956910 Mon Sep 17 00:00:00 2001
From: Per Liden <per.liden@nospam.ericsson.com>
Date: Fri, 13 Jan 2006 22:22:22 +0100
Subject: [TIPC] Updated link priority macros

Added macros for min/default/max link priority in tipc_config.h.
Also renamed TIPC_NUM_LINK_PRI to TIPC_MEDIA_LINK_PRI since that
is a more accurate description of what it is used for.

Signed-off-by: Per Liden <per.liden@ericsson.com>
---
 include/linux/tipc_config.h |  7 +++++--
 net/tipc/bcast.c            |  4 ++--
 net/tipc/bearer.c           | 19 +++++++++++++------
 net/tipc/eth_media.c        |  6 +++---
 net/tipc/link.c             |  3 ++-
 5 files changed, 25 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tipc_config.h b/include/linux/tipc_config.h
index a52c8c64a5a3..33a653913d94 100644
--- a/include/linux/tipc_config.h
+++ b/include/linux/tipc_config.h
@@ -168,10 +168,13 @@
 #define TIPC_MAX_LINK_NAME	60	/* format = Z.C.N:interface-Z.C.N:interface */
 
 /*
- * Link priority limits (range from 0 to # priorities - 1)
+ * Link priority limits (min, default, max, media default)
  */
 
-#define TIPC_NUM_LINK_PRI 32
+#define TIPC_MIN_LINK_PRI	0
+#define TIPC_DEF_LINK_PRI	10
+#define TIPC_MAX_LINK_PRI	31
+#define TIPC_MEDIA_LINK_PRI	(TIPC_MAX_LINK_PRI + 1)
 
 /*
  * Link tolerance limits (min, default, max), in ms
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 9713d622efb8..af9743a52d6c 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -82,7 +82,7 @@ struct bcbearer {
 	struct bearer bearer;
 	struct media media;
 	struct bcbearer_pair bpairs[MAX_BEARERS];
-	struct bcbearer_pair bpairs_temp[TIPC_NUM_LINK_PRI];
+	struct bcbearer_pair bpairs_temp[TIPC_MAX_LINK_PRI + 1];
 };
 
 /**
@@ -630,7 +630,7 @@ void bcbearer_sort(void)
 	bp_curr = bcbearer->bpairs;
 	memset(bcbearer->bpairs, 0, sizeof(bcbearer->bpairs));
 
-	for (pri = (TIPC_NUM_LINK_PRI - 1); pri >= 0; pri--) {
+	for (pri = TIPC_MAX_LINK_PRI; pri >= 0; pri--) {
 
 		if (!bp_temp[pri].primary)
 			continue;
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 3dd19fdc5a2c..02b6cf6ab7a4 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -119,7 +119,8 @@ int  tipc_register_media(u32 media_type,
 		warn("Media registration error: no broadcast address supplied\n");
 		goto exit;
 	}
-	if (bearer_priority >= TIPC_NUM_LINK_PRI) {
+	if ((bearer_priority < TIPC_MIN_LINK_PRI) &&
+	    (bearer_priority > TIPC_MAX_LINK_PRI)) {
 		warn("Media registration error: priority %u\n", bearer_priority);
 		goto exit;
 	}
@@ -476,10 +477,15 @@ int tipc_enable_bearer(const char *name, u32 bcast_scope, u32 priority)
 
 	if (tipc_mode != TIPC_NET_MODE)
 		return -ENOPROTOOPT;
+
 	if (!bearer_name_validate(name, &b_name) ||
 	    !addr_domain_valid(bcast_scope) ||
-	    !in_scope(bcast_scope, tipc_own_addr) ||
-	    (priority > TIPC_NUM_LINK_PRI))
+	    !in_scope(bcast_scope, tipc_own_addr))
+		return -EINVAL;
+
+	if ((priority < TIPC_MIN_LINK_PRI ||
+	     priority > TIPC_MAX_LINK_PRI) &&
+	    (priority != TIPC_MEDIA_LINK_PRI))
 		return -EINVAL;
 
 	write_lock_bh(&net_lock);
@@ -491,7 +497,8 @@ int tipc_enable_bearer(const char *name, u32 bcast_scope, u32 priority)
 		warn("No media <%s>\n", b_name.media_name);
 		goto failed;
 	}
-	if (priority == TIPC_NUM_LINK_PRI)
+
+	if (priority == TIPC_MEDIA_LINK_PRI)
 		priority = m_ptr->priority;
 
 restart:
@@ -547,8 +554,8 @@ restart:
 	}
 	b_ptr->publ.lock = SPIN_LOCK_UNLOCKED;
 	write_unlock_bh(&net_lock);
-	info("Enabled bearer <%s>, discovery domain %s\n",
-	     name, addr_string_fill(addr_string, bcast_scope));
+	info("Enabled bearer <%s>, discovery domain %s, priority %u\n",
+	     name, addr_string_fill(addr_string, bcast_scope), priority);
 	return 0;
 failed:
 	write_unlock_bh(&net_lock);
diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c
index 34d0462db3aa..2ab6c16280e6 100644
--- a/net/tipc/eth_media.c
+++ b/net/tipc/eth_media.c
@@ -42,9 +42,9 @@
 
 #define MAX_ETH_BEARERS		2
 #define TIPC_PROTOCOL		0x88ca
-#define ETH_LINK_PRIORITY	10
+#define ETH_LINK_PRIORITY	TIPC_DEF_LINK_PRI
 #define ETH_LINK_TOLERANCE	TIPC_DEF_LINK_TOL
-
+#define ETH_LINK_WINDOW		TIPC_DEF_LINK_WIN
 
 /**
  * struct eth_bearer - Ethernet bearer data structure
@@ -260,7 +260,7 @@ int eth_media_start(void)
 	res = tipc_register_media(TIPC_MEDIA_TYPE_ETH, "eth",
 				  enable_bearer, disable_bearer, send_msg, 
 				  eth_addr2str, &bcast_addr, ETH_LINK_PRIORITY, 
-				  ETH_LINK_TOLERANCE, TIPC_DEF_LINK_WIN);
+				  ETH_LINK_TOLERANCE, ETH_LINK_WINDOW);
 	if (res)
 		return res;
 
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 7265f4be4766..d1e1ae66464a 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -2812,7 +2812,8 @@ struct sk_buff *link_cmd_config(const void *req_tlv_area, int req_tlv_space,
 		}
 		break;
 	case TIPC_CMD_SET_LINK_PRI: 
-		if (new_value < TIPC_NUM_LINK_PRI) {
+		if ((new_value >= TIPC_MIN_LINK_PRI) &&
+		    (new_value <= TIPC_MAX_LINK_PRI)) {
 			l_ptr->priority = new_value;
 			link_send_proto_msg(l_ptr, STATE_MSG, 
 					    0, 0, 0, new_value, 0);
-- 
cgit v1.2.3-71-gd317


From 33a9c4da5ab16192ef1e961d4c4e45c18031cd67 Mon Sep 17 00:00:00 2001
From: Per Liden <per.liden@ericsson.com>
Date: Mon, 16 Jan 2006 11:42:12 +0100
Subject: [TIPC] Move ethernet protocol id to linux/if_ether.h

Signed-off-by: Per Liden <per.liden@ericsson.com>
---
 include/linux/if_ether.h | 1 +
 net/tipc/eth_media.c     | 5 ++---
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h
index fe26d431de87..7a92c1ce1457 100644
--- a/include/linux/if_ether.h
+++ b/include/linux/if_ether.h
@@ -72,6 +72,7 @@
 					 * over Ethernet
 					 */
 #define ETH_P_AOE	0x88A2		/* ATA over Ethernet		*/
+#define ETH_P_TIPC	0x88CA		/* TIPC 			*/
 
 /*
  *	Non DIX types. Won't clash for 1500 types.
diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c
index 2ab6c16280e6..64a880b5935a 100644
--- a/net/tipc/eth_media.c
+++ b/net/tipc/eth_media.c
@@ -41,7 +41,6 @@
 #include <linux/version.h>
 
 #define MAX_ETH_BEARERS		2
-#define TIPC_PROTOCOL		0x88ca
 #define ETH_LINK_PRIORITY	TIPC_DEF_LINK_PRI
 #define ETH_LINK_TOLERANCE	TIPC_DEF_LINK_TOL
 #define ETH_LINK_WINDOW		TIPC_DEF_LINK_WIN
@@ -78,7 +77,7 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr,
 		clone->nh.raw = clone->data;
 		dev = ((struct eth_bearer *)(tb_ptr->usr_handle))->dev;
 		clone->dev = dev;
-		dev->hard_header(clone, dev, TIPC_PROTOCOL, 
+		dev->hard_header(clone, dev, ETH_P_TIPC, 
 				 &dest->dev_addr.eth_addr,
 				 dev->dev_addr, clone->len);
 		dev_queue_xmit(clone);
@@ -141,7 +140,7 @@ static int enable_bearer(struct tipc_bearer *tb_ptr)
 		return -EDQUOT;
 	if (!eb_ptr->dev) {
 		eb_ptr->dev = dev;
-		eb_ptr->tipc_packet_type.type = __constant_htons(TIPC_PROTOCOL);
+		eb_ptr->tipc_packet_type.type = __constant_htons(ETH_P_TIPC);
 		eb_ptr->tipc_packet_type.dev = dev;
 		eb_ptr->tipc_packet_type.func = recv_msg;
 		eb_ptr->tipc_packet_type.af_packet_priv = eb_ptr;
-- 
cgit v1.2.3-71-gd317


From 8d238e012469a9a332c78d6a69a8a46ac4b1e9c2 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Tue, 17 Jan 2006 20:50:31 +0000
Subject: [PATCH] libata: Fix heuristic typos add LBA48PIO flag and support
 code, add IRQ flag for next diff

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 4 ++++
 include/linux/libata.h     | 9 ++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index b42acbe0e9a5..e6044455ca45 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -611,6 +611,10 @@ int ata_rwcmd_protocol(struct ata_queued_cmd *qc)
 	if (dev->flags & ATA_DFLAG_PIO) {
 		tf->protocol = ATA_PROT_PIO;
 		index = dev->multi_count ? 0 : 8;
+	} else if (lba48 && (qc->ap->flags & ATA_FLAG_PIO_LBA48)) {
+		/* Unable to use DMA due to host limitation */
+		tf->protocol = ATA_PROT_PIO;
+		index = dev->multi_count ? 0 : 4;
 	} else {
 		tf->protocol = ATA_PROT_DMA;
 		index = 16;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index af6624450f65..9e5db2949c58 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -126,16 +126,19 @@ enum {
 
 	ATA_FLAG_SUSPENDED	= (1 << 12), /* port is suspended */
 
+	ATA_FLAG_PIO_LBA48	= (1 << 13), /* Host DMA engine is LBA28 only */
+	ATA_FLAG_IRQ_MASK	= (1 << 14), /* Mask IRQ in PIO xfers */
+
 	ATA_QCFLAG_ACTIVE	= (1 << 1), /* cmd not yet ack'd to scsi lyer */
 	ATA_QCFLAG_SG		= (1 << 3), /* have s/g table? */
 	ATA_QCFLAG_SINGLE	= (1 << 4), /* no s/g, just a single buffer */
 	ATA_QCFLAG_DMAMAP	= ATA_QCFLAG_SG | ATA_QCFLAG_SINGLE,
 
 	/* various lengths of time */
-	ATA_TMOUT_EDD		= 5 * HZ,	/* hueristic */
+	ATA_TMOUT_EDD		= 5 * HZ,	/* heuristic */
 	ATA_TMOUT_PIO		= 30 * HZ,
-	ATA_TMOUT_BOOT		= 30 * HZ,	/* hueristic */
-	ATA_TMOUT_BOOT_QUICK	= 7 * HZ,	/* hueristic */
+	ATA_TMOUT_BOOT		= 30 * HZ,	/* heuristic */
+	ATA_TMOUT_BOOT_QUICK	= 7 * HZ,	/* heuristic */
 	ATA_TMOUT_CDB		= 30 * HZ,
 	ATA_TMOUT_CDB_QUICK	= 5 * HZ,
 	ATA_TMOUT_INTERNAL	= 30 * HZ,
-- 
cgit v1.2.3-71-gd317


From d9004eb466d03b7900ed432fecec6819012b4ed3 Mon Sep 17 00:00:00 2001
From: Alon Bar-Lev <alon.barlev@gmail.com>
Date: Wed, 18 Jan 2006 11:47:33 +0000
Subject: [SERIAL] Add 8250 support for Decision Computer International Co.
 PCCOM2

There is a new device which is look like:

	Serial controller: Decision Computer International Co. PCCOM2 (rev 02) (prog-if 02 [16550])
	0700: 6666:0004 (rev 02) (prog-if 02)
	Flags: medium devsel, IRQ 177
	Memory at fe000000 (32-bit, non-prefetchable) [size=128]
	I/O ports at e880 [size=128]
	I/O ports at e400 [size=256]

It has two 16550A, and is not listed in kernel, although the
manufacturer clams that it is supported...

I've created the following patch, it only add the new PCI id and the
card to the repository, it seems to work.

Signed-off-by: Alon Bar-Lev <alon.barlev@gmail.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/serial/8250_pci.c | 10 ++++++++++
 include/linux/pci_ids.h   |  1 +
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c
index 589fb076654a..2a912153321e 100644
--- a/drivers/serial/8250_pci.c
+++ b/drivers/serial/8250_pci.c
@@ -940,6 +940,7 @@ enum pci_board_num_t {
 	pbn_b2_bt_2_921600,
 	pbn_b2_bt_4_921600,
 
+	pbn_b3_2_115200,
 	pbn_b3_4_115200,
 	pbn_b3_8_115200,
 
@@ -1311,6 +1312,12 @@ static struct pciserial_board pci_boards[] __devinitdata = {
 		.uart_offset	= 8,
 	},
 
+	[pbn_b3_2_115200] = {
+		.flags		= FL_BASE3,
+		.num_ports	= 2,
+		.base_baud	= 115200,
+		.uart_offset	= 8,
+	},
 	[pbn_b3_4_115200] = {
 		.flags		= FL_BASE3,
 		.num_ports	= 4,
@@ -2272,6 +2279,9 @@ static struct pci_device_id serial_pci_tbl[] = {
 		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
 		pbn_nec_nile4 },
 
+	{	PCI_VENDOR_ID_DCI, PCI_DEVICE_ID_DCI_PCCOM2,
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_b3_2_115200 },
 	{	PCI_VENDOR_ID_DCI, PCI_DEVICE_ID_DCI_PCCOM4,
 		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
 		pbn_b3_4_115200 },
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 5403257ae3e7..ecc1fc1f0f04 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1992,6 +1992,7 @@
 #define PCI_VENDOR_ID_DCI		0x6666
 #define PCI_DEVICE_ID_DCI_PCCOM4	0x0001
 #define PCI_DEVICE_ID_DCI_PCCOM8	0x0002
+#define PCI_DEVICE_ID_DCI_PCCOM2	0x0004
 
 #define PCI_VENDOR_ID_INTEL		0x8086
 #define PCI_DEVICE_ID_INTEL_EESSC	0x0008
-- 
cgit v1.2.3-71-gd317


From 053837fce7aa79025ed57656855df09f80175527 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 18 Jan 2006 17:42:27 -0800
Subject: [PATCH] mm: migration page refcounting fix

Migration code currently does not take a reference to target page
properly, so between unlocking the pte and trying to take a new
reference to the page with isolate_lru_page, anything could happen to
it.

Fix this by holding the pte lock until we get a chance to elevate the
refcount.

Other small cleanups while we're here.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm_inline.h | 21 --------------
 include/linux/swap.h      |  1 +
 mm/filemap.c              |  1 +
 mm/mempolicy.c            | 29 +++++++++++--------
 mm/rmap.c                 |  2 +-
 mm/swap.c                 | 26 +++++++++++++++++
 mm/vmscan.c               | 71 ++++++++++++++++++++---------------------------
 7 files changed, 76 insertions(+), 75 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 49cc68af01f8..8ac854f7f190 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -39,24 +39,3 @@ del_page_from_lru(struct zone *zone, struct page *page)
 	}
 }
 
-/*
- * Isolate one page from the LRU lists.
- *
- * - zone->lru_lock must be held
- */
-static inline int __isolate_lru_page(struct page *page)
-{
-	if (unlikely(!TestClearPageLRU(page)))
-		return 0;
-
-	if (get_page_testone(page)) {
-		/*
-		 * It is being freed elsewhere
-		 */
-		__put_page(page);
-		SetPageLRU(page);
-		return -ENOENT;
-	}
-
-	return 1;
-}
diff --git a/include/linux/swap.h b/include/linux/swap.h
index e92054d6530b..d01f7efb0f2c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -167,6 +167,7 @@ extern void FASTCALL(lru_cache_add_active(struct page *));
 extern void FASTCALL(activate_page(struct page *));
 extern void FASTCALL(mark_page_accessed(struct page *));
 extern void lru_add_drain(void);
+extern int lru_add_drain_all(void);
 extern int rotate_reclaimable_page(struct page *page);
 extern void swap_setup(void);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index a965b6b35f26..44da3d476994 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -94,6 +94,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *    ->private_lock		(try_to_unmap_one)
  *    ->tree_lock		(try_to_unmap_one)
  *    ->zone.lru_lock		(follow_page->mark_page_accessed)
+ *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
  *    ->private_lock		(page_remove_rmap->set_page_dirty)
  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
  *    ->inode_lock		(page_remove_rmap->set_page_dirty)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3171f884d245..551cde40520b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -208,6 +208,17 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		page = vm_normal_page(vma, addr, *pte);
 		if (!page)
 			continue;
+		/*
+		 * The check for PageReserved here is important to avoid
+		 * handling zero pages and other pages that may have been
+		 * marked special by the system.
+		 *
+		 * If the PageReserved would not be checked here then f.e.
+		 * the location of the zero page could have an influence
+		 * on MPOL_MF_STRICT, zero pages would be counted for
+		 * the per node stats, and there would be useless attempts
+		 * to put zero pages on the migration list.
+		 */
 		if (PageReserved(page))
 			continue;
 		nid = page_to_nid(page);
@@ -216,11 +227,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 		if (flags & MPOL_MF_STATS)
 			gather_stats(page, private);
-		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-			spin_unlock(ptl);
+		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 			migrate_page_add(vma, page, private, flags);
-			spin_lock(ptl);
-		}
 		else
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -309,6 +317,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
 
+	/* Clear the LRU lists so pages can be isolated */
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		lru_add_drain_all();
+
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
@@ -555,15 +567,8 @@ static void migrate_page_add(struct vm_area_struct *vma,
 	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
 	    mapping_writably_mapped(page->mapping) ||
 	    single_mm_mapping(vma->vm_mm, page->mapping)) {
-		int rc = isolate_lru_page(page);
-
-		if (rc == 1)
+		if (isolate_lru_page(page))
 			list_add(&page->lru, pagelist);
-		/*
-		 * If the isolate attempt was not successful then we just
-		 * encountered an unswappable page. Something must be wrong.
-	 	 */
-		WARN_ON(rc == 0);
 	}
 }
 
diff --git a/mm/rmap.c b/mm/rmap.c
index dfbb89f99a15..d85a99d28c03 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -33,7 +33,7 @@
  *     mapping->i_mmap_lock
  *       anon_vma->lock
  *         mm->page_table_lock or pte_lock
- *           zone->lru_lock (in mark_page_accessed)
+ *           zone->lru_lock (in mark_page_accessed, isolate_lru_page)
  *           swap_lock (in swap_duplicate, swap_info_get)
  *             mmlist_lock (in mmput, drain_mmlist and others)
  *             mapping->private_lock (in __set_page_dirty_buffers)
diff --git a/mm/swap.c b/mm/swap.c
index cbb48e721ab9..bc2442a7b0ee 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -174,6 +174,32 @@ void lru_add_drain(void)
 	put_cpu();
 }
 
+#ifdef CONFIG_NUMA
+static void lru_add_drain_per_cpu(void *dummy)
+{
+	lru_add_drain();
+}
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+	return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+}
+
+#else
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+	lru_add_drain();
+	return 0;
+}
+#endif
+
 /*
  * This path almost never happens for VM activity - pages are normally
  * freed via pagevecs.  But it gets used by networking.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bf903b2d198f..827bf674577a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -586,7 +586,7 @@ static inline void move_to_lru(struct page *page)
 }
 
 /*
- * Add isolated pages on the list back to the LRU
+ * Add isolated pages on the list back to the LRU.
  *
  * returns the number of pages put back.
  */
@@ -760,46 +760,33 @@ next:
 	return nr_failed + retry;
 }
 
-static void lru_add_drain_per_cpu(void *dummy)
-{
-	lru_add_drain();
-}
-
 /*
  * Isolate one page from the LRU lists and put it on the
- * indicated list. Do necessary cache draining if the
- * page is not on the LRU lists yet.
+ * indicated list with elevated refcount.
  *
  * Result:
  *  0 = page not on LRU list
  *  1 = page removed from LRU list and added to the specified list.
- * -ENOENT = page is being freed elsewhere.
  */
 int isolate_lru_page(struct page *page)
 {
-	int rc = 0;
-	struct zone *zone = page_zone(page);
+	int ret = 0;
 
-redo:
-	spin_lock_irq(&zone->lru_lock);
-	rc = __isolate_lru_page(page);
-	if (rc == 1) {
-		if (PageActive(page))
-			del_page_from_active_list(zone, page);
-		else
-			del_page_from_inactive_list(zone, page);
-	}
-	spin_unlock_irq(&zone->lru_lock);
-	if (rc == 0) {
-		/*
-		 * Maybe this page is still waiting for a cpu to drain it
-		 * from one of the lru lists?
-		 */
-		rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
-		if (rc == 0 && PageLRU(page))
-			goto redo;
+	if (PageLRU(page)) {
+		struct zone *zone = page_zone(page);
+		spin_lock_irq(&zone->lru_lock);
+		if (TestClearPageLRU(page)) {
+			ret = 1;
+			get_page(page);
+			if (PageActive(page))
+				del_page_from_active_list(zone, page);
+			else
+				del_page_from_inactive_list(zone, page);
+		}
+		spin_unlock_irq(&zone->lru_lock);
 	}
-	return rc;
+
+	return ret;
 }
 #endif
 
@@ -831,18 +818,20 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
-		switch (__isolate_lru_page(page)) {
-		case 1:
-			/* Succeeded to isolate page */
-			list_move(&page->lru, dst);
-			nr_taken++;
-			break;
-		case -ENOENT:
-			/* Not possible to isolate */
-			list_move(&page->lru, src);
-			break;
-		default:
+		if (!TestClearPageLRU(page))
 			BUG();
+		list_del(&page->lru);
+		if (get_page_testone(page)) {
+			/*
+			 * It is being freed elsewhere
+			 */
+			__put_page(page);
+			SetPageLRU(page);
+			list_add(&page->lru, src);
+			continue;
+		} else {
+			list_add(&page->lru, dst);
+			nr_taken++;
 		}
 	}
 
-- 
cgit v1.2.3-71-gd317


From 9eeff2395e3cfd05c9b2e6074ff943a34b0c5c21 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 18 Jan 2006 17:42:31 -0800
Subject: [PATCH] Zone reclaim: Reclaim logic

Some bits for zone reclaim exists in 2.6.15 but they are not usable.  This
patch fixes them up, removes unused code and makes zone reclaim usable.

Zone reclaim allows the reclaiming of pages from a zone if the number of
free pages falls below the watermarks even if other zones still have enough
pages available.  Zone reclaim is of particular importance for NUMA
machines.  It can be more beneficial to reclaim a page than taking the
performance penalties that come with allocating a page on a remote zone.

Zone reclaim is enabled if the maximum distance to another node is higher
than RECLAIM_DISTANCE, which may be defined by an arch.  By default
RECLAIM_DISTANCE is 20.  20 is the distance to another node in the same
component (enclosure or motherboard) on IA64.  The meaning of the NUMA
distance information seems to vary by arch.

If zone reclaim is not successful then no further reclaim attempts will
occur for a certain time period (ZONE_RECLAIM_INTERVAL).

This patch was discussed before. See

http://marc.theaimsgroup.com/?l=linux-kernel&m=113519961504207&w=2
http://marc.theaimsgroup.com/?l=linux-kernel&m=113408418232531&w=2
http://marc.theaimsgroup.com/?l=linux-kernel&m=113389027420032&w=2
http://marc.theaimsgroup.com/?l=linux-kernel&m=113380938612205&w=2

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h   | 12 +++++----
 include/linux/swap.h     | 11 ++++++++
 include/linux/topology.h |  8 ++++++
 mm/page_alloc.c          | 17 +++++++++---
 mm/vmscan.c              | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 108 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 34cbefd2ebde..93a849f742db 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -149,14 +149,16 @@ struct zone {
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	int			all_unreclaimable; /* All pages pinned */
 
-	/*
-	 * Does the allocator try to reclaim pages from the zone as soon
-	 * as it fails a watermark_ok() in __alloc_pages?
-	 */
-	int			reclaim_pages;
 	/* A count of how many reclaimers are scanning this zone */
 	atomic_t		reclaim_in_progress;
 
+	/*
+	 * timestamp (in jiffies) of the last zone reclaim that did not
+	 * result in freeing of pages. This is used to avoid repeated scans
+	 * if all memory in the zone is in use.
+	 */
+	unsigned long		last_unsuccessful_zone_reclaim;
+
 	/*
 	 * prev_priority holds the scanning priority for this zone.  It is
 	 * defined as the scanning priority at which we achieved our reclaim
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d01f7efb0f2c..4a99e4a7fbf3 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -176,6 +176,17 @@ extern int try_to_free_pages(struct zone **, gfp_t);
 extern int shrink_all_memory(int);
 extern int vm_swappiness;
 
+#ifdef CONFIG_NUMA
+extern int zone_reclaim_mode;
+extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
+#else
+#define zone_reclaim_mode 0
+static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p);
 extern int putback_lru_pages(struct list_head *l);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 315a5163d6a0..e8eb0040ce3a 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -56,6 +56,14 @@
 #define REMOTE_DISTANCE		20
 #define node_distance(from,to)	((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
 #endif
+#ifndef RECLAIM_DISTANCE
+/*
+ * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
+ * (in whatever arch specific measurement units returned by node_distance())
+ * then switch on zone reclaim on boot.
+ */
+#define RECLAIM_DISTANCE 20
+#endif
 #ifndef PENALTY_FOR_NODE_WITH_CPUS
 #define PENALTY_FOR_NODE_WITH_CPUS	(1)
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c2e29743a8d1..df54e2fc8ee0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -878,7 +878,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 				mark = (*z)->pages_high;
 			if (!zone_watermark_ok(*z, order, mark,
 				    classzone_idx, alloc_flags))
-				continue;
+				if (!zone_reclaim_mode ||
+				    !zone_reclaim(*z, gfp_mask, order))
+					continue;
 		}
 
 		page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
@@ -1595,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat)
 	prev_node = local_node;
 	nodes_clear(used_mask);
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+		int distance = node_distance(local_node, node);
+
+		/*
+		 * If another node is sufficiently far away then it is better
+		 * to reclaim pages in a zone before going off node.
+		 */
+		if (distance > RECLAIM_DISTANCE)
+			zone_reclaim_mode = 1;
+
 		/*
 		 * We don't want to pressure a particular node.
 		 * So adding penalty to the first node in same
 		 * distance group to make it round-robin.
 		 */
-		if (node_distance(local_node, node) !=
-				node_distance(local_node, prev_node))
+
+		if (distance != node_distance(local_node, prev_node))
 			node_load[node] += load;
 		prev_node = node;
 		load--;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5117b6897a9..2e34b61a70c7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1572,3 +1572,71 @@ static int __init kswapd_init(void)
 }
 
 module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
+ */
+int zone_reclaim_mode __read_mostly;
+
+/*
+ * Mininum time between zone reclaim scans
+ */
+#define ZONE_RECLAIM_INTERVAL HZ/2
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+	int nr_pages = 1 << order;
+	struct task_struct *p = current;
+	struct reclaim_state reclaim_state;
+	struct scan_control sc = {
+		.gfp_mask	= gfp_mask,
+		.may_writepage	= 0,
+		.may_swap	= 0,
+		.nr_mapped	= read_page_state(nr_mapped),
+		.nr_scanned	= 0,
+		.nr_reclaimed	= 0,
+		.priority	= 0
+	};
+
+	if (!(gfp_mask & __GFP_WAIT) ||
+		zone->zone_pgdat->node_id != numa_node_id() ||
+		zone->all_unreclaimable ||
+		atomic_read(&zone->reclaim_in_progress) > 0)
+			return 0;
+
+	if (time_before(jiffies,
+		zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
+			return 0;
+
+	disable_swap_token();
+
+	if (nr_pages > SWAP_CLUSTER_MAX)
+		sc.swap_cluster_max = nr_pages;
+	else
+		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+	cond_resched();
+	p->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+	shrink_zone(zone, &sc);
+	p->reclaim_state = NULL;
+	current->flags &= ~PF_MEMALLOC;
+
+	if (sc.nr_reclaimed == 0)
+		zone->last_unsuccessful_zone_reclaim = jiffies;
+
+	return sc.nr_reclaimed > nr_pages;
+}
+#endif
+
-- 
cgit v1.2.3-71-gd317


From 1743660b911bfb849b1fb33830522254561b9f9b Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 18 Jan 2006 17:42:32 -0800
Subject: [PATCH] Zone reclaim: proc override

proc support for zone reclaim

This patch creates a proc entry /proc/sys/vm/zone_reclaim_mode that may be
used to override the automatic determination of the zone reclaim made on
bootup.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/sysctl/vm.txt | 18 ++++++++++++++++++
 include/linux/sysctl.h      |  1 +
 kernel/sysctl.c             | 11 +++++++++++
 3 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 6910c0136f8d..391dd64363e7 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -27,6 +27,7 @@ Currently, these files are in /proc/sys/vm:
 - laptop_mode
 - block_dump
 - drop-caches
+- zone_reclaim_mode
 
 ==============================================================
 
@@ -120,3 +121,20 @@ set to pcp->high/4.  The upper limit of batch is (PAGE_SHIFT * 8)
 
 The initial value is zero.  Kernel does not use this value at boot time to set
 the high water marks for each per cpu page list.
+
+===============================================================
+
+zone_reclaim_mode:
+
+This is set during bootup to 1 if it is determined that pages from
+remote zones will cause a significant performance reduction. The
+page allocator will then reclaim easily reusable pages (those page
+cache pages that are currently not used) before going off node.
+
+The user can override this setting. It may be beneficial to switch
+off zone reclaim if the system is used for a file server and all
+of memory should be used for caching files from disk.
+
+It may be beneficial to switch this on if one wants to do zone
+reclaim regardless of the numa distances in the system.
+
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 7f472127b7b5..8352a7ce5895 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -182,6 +182,7 @@ enum
 	VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
 	VM_DROP_PAGECACHE=29,	/* int: nuke lots of pagecache */
 	VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
+	VM_ZONE_RECLAIM_MODE=31,/* reclaim local zone memory before going off node */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f5d69b6e29f5..cb99a42f8b37 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -869,6 +869,17 @@ static ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec_jiffies,
 		.strategy	= &sysctl_jiffies,
 	},
+#endif
+#ifdef CONFIG_NUMA
+	{
+		.ctl_name	= VM_ZONE_RECLAIM_MODE,
+		.procname	= "zone_reclaim_mode",
+		.data		= &zone_reclaim_mode,
+		.maxlen		= sizeof(zone_reclaim_mode),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &zero,
+	},
 #endif
 	{ .ctl_name = 0 }
 };
-- 
cgit v1.2.3-71-gd317


From dc85da15d42b0efc792b0f5eab774dc5dbc1ceec Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 18 Jan 2006 17:42:36 -0800
Subject: [PATCH] NUMA policies in the slab allocator V2

This patch fixes a regression in 2.6.14 against 2.6.13 that causes an
imbalance in memory allocation during bootup.

The slab allocator in 2.6.13 is not numa aware and simply calls
alloc_pages().  This means that memory policies may control the behavior of
alloc_pages().  During bootup the memory policy is set to MPOL_INTERLEAVE
resulting in the spreading out of allocations during bootup over all
available nodes.  The slab allocator in 2.6.13 has only a single list of
slab pages.  As a result the per cpu slab cache and the spinlock controlled
page lists may contain slab entries from off node memory.  The slab
allocator in 2.6.13 makes no effort to discern the locality of an entry on
its lists.

The NUMA aware slab allocator in 2.6.14 controls locality of the slab pages
explicitly by calling alloc_pages_node().  The NUMA slab allocator manages
slab entries by having lists of available slab pages for each node.  The
per cpu slab cache can only contain slab entries associated with the node
local to the processor.  This guarantees that the default allocation mode
of the slab allocator always assigns local memory if available.

Setting MPOL_INTERLEAVE as a default policy during bootup has no effect
anymore.  In 2.6.14 all node unspecific slab allocations are performed on
the boot processor.  This means that most of key data structures are
allocated on one node.  Most processors will have to refer to these
structures making the boot node a potential bottleneck.  This may reduce
performance and cause unnecessary memory pressure on the boot node.

This patch implements NUMA policies in the slab layer.  There is the need
of explicit application of NUMA memory policies by the slab allcator itself
since the NUMA slab allocator does no longer let the page_allocator control
locality.

The check for policies is made directly at the beginning of __cache_alloc
using current->mempolicy.  The memory policy is already frequently checked
by the page allocator (alloc_page_vma() and alloc_page_current()).  So it
is highly likely that the cacheline is present.  For MPOL_INTERLEAVE
kmalloc() will spread out each request to one node after another so that an
equal distribution of allocations can be obtained during bootup.

It is not possible to push the policy check to lower layers of the NUMA
slab allocator since the per cpu caches are now only containing slab
entries from the current node.  If the policy says that the local node is
not to be preferred or forbidden then there is no point in checking the
slab cache or local list of slab pages.  The allocation better be directed
immediately to the lists containing slab entries for the allowed set of
nodes.

This way of applying policy also fixes another strange behavior in 2.6.13.
alloc_pages() is controlled by the memory allocation policy of the current
process.  It could therefore be that one process is running with
MPOL_INTERLEAVE and would f.e.  obtain a new page following that policy
since no slab entries are in the lists anymore.  A page can typically be
used for multiple slab entries but lets say that the current process is
only using one.  The other entries are then added to the slab lists.  These
are now non local entries in the slab lists despite of the possible
availability of local pages that would provide faster access and increase
the performance of the application.

Another process without MPOL_INTERLEAVE may now run and expect a local slab
entry from kmalloc().  However, there are still these free slab entries
from the off node page obtained from the other process via MPOL_INTERLEAVE
in the cache.  The process will then get an off node slab entry although
other slab entries may be available that are local to that process.  This
means that the policy if one process may contaminate the locality of the
slab caches for other processes.

This patch in effect insures that a per process policy is followed for the
allocation of slab entries and that there cannot be a memory policy
influence from one process to another.  A process with default policy will
always get a local slab entry if one is available.  And the process using
memory policies will get its memory arranged as requested.  Off-node slab
allocation will require the use of spinlocks and will make the use of per
cpu caches not possible.  A process using memory policies to redirect
allocations offnode will have to cope with additional lock overhead in
addition to the latency added by the need to access a remote slab entry.

Changes V1->V2
- Remove #ifdef CONFIG_NUMA by moving forward declaration into
  prior #ifdef CONFIG_NUMA section.

- Give the function determining the node number to use a saner
  name.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h |  1 +
 mm/mempolicy.c            | 30 ++++++++++++++++++++++++++++++
 mm/slab.c                 | 12 ++++++++++++
 3 files changed, 43 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d6a53ed6ab6c..bbd2221923c3 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -159,6 +159,7 @@ extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
 extern struct mempolicy default_policy;
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 		unsigned long addr);
+extern unsigned slab_node(struct mempolicy *policy);
 
 extern int policy_zone;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a683a66599b1..71430d440822 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -976,6 +976,36 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	return nid;
 }
 
+/*
+ * Depending on the memory policy provide a node from which to allocate the
+ * next slab entry.
+ */
+unsigned slab_node(struct mempolicy *policy)
+{
+	if (in_interrupt())
+		return numa_node_id();
+
+	switch (policy->policy) {
+	case MPOL_INTERLEAVE:
+		return interleave_nodes(policy);
+
+	case MPOL_BIND:
+		/*
+		 * Follow bind policy behavior and start allocation at the
+		 * first node.
+		 */
+		return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+
+	case MPOL_PREFERRED:
+		if (policy->v.preferred_node >= 0)
+			return policy->v.preferred_node;
+		/* Fall through */
+
+	default:
+		return numa_node_id();
+	}
+}
+
 /* Do static interleaving for a VMA with known offset. */
 static unsigned offset_il_node(struct mempolicy *pol,
 		struct vm_area_struct *vma, unsigned long off)
diff --git a/mm/slab.c b/mm/slab.c
index bd0317f1e06c..9025608696ec 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -103,6 +103,7 @@
 #include	<linux/rcupdate.h>
 #include	<linux/string.h>
 #include	<linux/nodemask.h>
+#include	<linux/mempolicy.h>
 #include	<linux/mutex.h>
 
 #include	<asm/uaccess.h>
@@ -773,6 +774,8 @@ static struct array_cache *alloc_arraycache(int node, int entries,
 }
 
 #ifdef CONFIG_NUMA
+static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int);
+
 static inline struct array_cache **alloc_alien_cache(int node, int limit)
 {
 	struct array_cache **ac_ptr;
@@ -2570,6 +2573,15 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
 	void *objp;
 	struct array_cache *ac;
 
+#ifdef CONFIG_NUMA
+	if (current->mempolicy) {
+		int nid = slab_node(current->mempolicy);
+
+		if (nid != numa_node_id())
+			return __cache_alloc_node(cachep, flags, nid);
+	}
+#endif
+
 	check_irq_off();
 	ac = ac_data(cachep);
 	if (likely(ac->avail)) {
-- 
cgit v1.2.3-71-gd317


From 5131cf154ad1c6e584efa58d17a469d0b80f49bd Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 18 Jan 2006 17:43:04 -0800
Subject: [PATCH] add missing syscall declarations

All standard system calls should be declared in include/linux/syscalls.h.

Add some of the new additions that were previously missed.

Signed-off-by: Arnd Bergmann <arndb@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inotify.c             |  1 +
 include/linux/syscalls.h | 19 +++++++++++++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inotify.c b/fs/inotify.c
index 2fecb7af4a77..878ccca61213 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -33,6 +33,7 @@
 #include <linux/list.h>
 #include <linux/writeback.h>
 #include <linux/inotify.h>
+#include <linux/syscalls.h>
 
 #include <asm/ioctls.h>
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3eed47347013..e666d6070569 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -510,9 +510,24 @@ asmlinkage long sys_keyctl(int cmd, unsigned long arg2, unsigned long arg3,
 asmlinkage long sys_ioprio_set(int which, int who, int ioprio);
 asmlinkage long sys_ioprio_get(int which, int who);
 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
-					unsigned long maxnode);
+				unsigned long maxnode);
 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
-			const unsigned long __user *from, const unsigned long __user *to);
+				const unsigned long __user *from,
+				const unsigned long __user *to);
+asmlinkage long sys_mbind(unsigned long start, unsigned long len,
+				unsigned long mode,
+				unsigned long __user *nmask,
+				unsigned long maxnode,
+				unsigned flags);
+asmlinkage long sys_get_mempolicy(int __user *policy,
+				unsigned long __user *nmask,
+				unsigned long maxnode,
+				unsigned long addr, unsigned long flags);
+
+asmlinkage long sys_inotify_init(void);
+asmlinkage long sys_inotify_add_watch(int fd, const char __user *path,
+					u32 mask);
+asmlinkage long sys_inotify_rm_watch(int fd, u32 wd);
 
 asmlinkage long sys_spu_run(int fd, __u32 __user *unpc,
 				 __u32 __user *ustatus);
-- 
cgit v1.2.3-71-gd317


From f193fbab2e4710f629e7c1859d4908646b47b126 Mon Sep 17 00:00:00 2001
From: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Date: Wed, 18 Jan 2006 17:43:13 -0800
Subject: [PATCH] nfsd: check error status from nfsd_sync_dir

Change nfsd_sync_dir to return an error if ->sync fails, and pass that error
up through the stack.  This involves a number of rearrangements of error
paths, and care to distinguish between Linux -errno numbers and NFSERR
numbers.

In the 'create' routines, we continue with the 'setattr' even if a previous
sync_dir failed.

This patch is quite different from Takashi's in a few ways, but there is still
a strong lineage.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/vfs.c             | 80 ++++++++++++++++++++++++++---------------------
 include/linux/nfsd/nfsd.h |  2 +-
 2 files changed, 45 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index eef0576a7785..acf637869ccf 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -710,14 +710,15 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
 {
 	struct inode *inode = dp->d_inode;
 	int (*fsync) (struct file *, struct dentry *, int);
-	int err = nfs_ok;
+	int err;
 
-	filemap_fdatawrite(inode->i_mapping);
-	if (fop && (fsync = fop->fsync))
-		err=fsync(filp, dp, 0);
-	filemap_fdatawait(inode->i_mapping);
+	err = filemap_fdatawrite(inode->i_mapping);
+	if (err == 0 && fop && (fsync = fop->fsync))
+		err = fsync(filp, dp, 0);
+	if (err == 0)
+		err = filemap_fdatawait(inode->i_mapping);
 
-	return nfserrno(err);
+	return err;
 }
 	
 
@@ -734,10 +735,10 @@ nfsd_sync(struct file *filp)
 	return err;
 }
 
-void
+int
 nfsd_sync_dir(struct dentry *dp)
 {
-	nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
+	return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
 }
 
 /*
@@ -1065,6 +1066,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (EX_ISSYNC(fhp->fh_export)) {
 		if (file->f_op && file->f_op->fsync) {
 			err = nfsd_sync(file);
+			err = nfserrno(err);
 		} else {
 			err = nfserr_notsupp;
 		}
@@ -1175,7 +1177,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		goto out_nfserr;
 
 	if (EX_ISSYNC(fhp->fh_export)) {
-		nfsd_sync_dir(dentry);
+		err = nfsd_sync_dir(dentry);
 		write_inode_now(dchild->d_inode, 1);
 	}
 
@@ -1185,9 +1187,11 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	 * send along the gid when it tries to implement setgid
 	 * directories via NFS.
 	 */
-	err = 0;
-	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0)
-		err = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
+		int err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+		if (err2)
+			err = err2;
+	}
 	/*
 	 * Update the file handle to get the new inode info.
 	 */
@@ -1306,17 +1310,12 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		goto out_nfserr;
 
 	if (EX_ISSYNC(fhp->fh_export)) {
-		nfsd_sync_dir(dentry);
+		err = nfsd_sync_dir(dentry);
+		if (err)
+			err = nfserrno(err);
 		/* setattr will sync the child (or not) */
 	}
 
-	/*
-	 * Update the filehandle to get the new inode info.
-	 */
-	err = fh_update(resfhp);
-	if (err)
-		goto out;
-
 	if (createmode == NFS3_CREATE_EXCLUSIVE) {
 		/* Cram the verifier into atime/mtime/mode */
 		iap->ia_valid = ATTR_MTIME|ATTR_ATIME
@@ -1337,8 +1336,17 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	 * implement setgid directories via NFS. Clear out all that cruft.
 	 */
  set_attr:
-	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID)) != 0)
- 		err = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID)) != 0) {
+ 		int err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+		if (err2)
+			err = nfserrno(err2);
+	}
+
+	/*
+	 * Update the filehandle to get the new inode info.
+	 */
+	if (!err)
+		err = fh_update(resfhp);
 
  out:
 	fh_unlock(fhp);
@@ -1447,10 +1455,10 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	} else
 		err = vfs_symlink(dentry->d_inode, dnew, path, mode);
 
-	if (!err) {
+	if (!err)
 		if (EX_ISSYNC(fhp->fh_export))
-			nfsd_sync_dir(dentry);
-	} else
+			err = nfsd_sync_dir(dentry);
+	if (err)
 		err = nfserrno(err);
 	fh_unlock(fhp);
 
@@ -1506,8 +1514,10 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 	err = vfs_link(dold, dirp, dnew);
 	if (!err) {
 		if (EX_ISSYNC(ffhp->fh_export)) {
-			nfsd_sync_dir(ddir);
+			err = nfsd_sync_dir(ddir);
 			write_inode_now(dest, 1);
+			if (err)
+				err = nfserrno(err);
 		}
 	} else {
 		if (err == -EXDEV && rqstp->rq_vers == 2)
@@ -1595,8 +1605,9 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 #endif
 	err = vfs_rename(fdir, odentry, tdir, ndentry);
 	if (!err && EX_ISSYNC(tfhp->fh_export)) {
-		nfsd_sync_dir(tdentry);
-		nfsd_sync_dir(fdentry);
+		err = nfsd_sync_dir(tdentry);
+		if (!err)
+			err = nfsd_sync_dir(fdentry);
 	}
 
  out_dput_new:
@@ -1671,17 +1682,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 
 	dput(rdentry);
 
-	if (err)
-		goto out_nfserr;
-	if (EX_ISSYNC(fhp->fh_export)) 
-		nfsd_sync_dir(dentry);
-
-out:
-	return err;
+	if (err == 0 &&
+	    EX_ISSYNC(fhp->fh_export))
+			err = nfsd_sync_dir(dentry);
 
 out_nfserr:
 	err = nfserrno(err);
-	goto out;
+out:
+	return err;
 }
 
 /*
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 51c231a1e5a6..ec7c2e872d72 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -124,7 +124,7 @@ int		nfsd_statfs(struct svc_rqst *, struct svc_fh *,
 
 int		nfsd_notify_change(struct inode *, struct iattr *);
 int		nfsd_permission(struct svc_export *, struct dentry *, int);
-void		nfsd_sync_dir(struct dentry *dp);
+int		nfsd_sync_dir(struct dentry *dp);
 
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
 #ifdef CONFIG_NFSD_V2_ACL
-- 
cgit v1.2.3-71-gd317


From 1918e341383ab787d6c5b17200f4ed901b10c777 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 18 Jan 2006 17:43:16 -0800
Subject: [PATCH] svcrpc: save and restore the daddr field when request
 deferred

The server code currently keeps track of the destination address on every
request so that it can reply using the same address.  However we forget to do
that in the case of a deferred request.  Remedy this oversight.  >From folks
at PolyServe.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sunrpc/svc.h | 1 +
 net/sunrpc/svcsock.c       | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index e4086ec8b952..50cab2a09f28 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -246,6 +246,7 @@ struct svc_deferred_req {
 	u32			prot;	/* protocol (UDP or TCP) */
 	struct sockaddr_in	addr;
 	struct svc_sock		*svsk;	/* where reply must go */
+	u32			daddr;	/* where reply must come from */
 	struct cache_deferred_req handle;
 	int			argslen;
 	u32			args[0];
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index e67613e4eb18..50580620e897 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1527,6 +1527,7 @@ svc_defer(struct cache_req *req)
 		dr->handle.owner = rqstp->rq_server;
 		dr->prot = rqstp->rq_prot;
 		dr->addr = rqstp->rq_addr;
+		dr->daddr = rqstp->rq_daddr;
 		dr->argslen = rqstp->rq_arg.len >> 2;
 		memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2);
 	}
@@ -1552,6 +1553,7 @@ static int svc_deferred_recv(struct svc_rqst *rqstp)
 	rqstp->rq_arg.len = dr->argslen<<2;
 	rqstp->rq_prot        = dr->prot;
 	rqstp->rq_addr        = dr->addr;
+	rqstp->rq_daddr       = dr->daddr;
 	return dr->argslen<<2;
 }
 
-- 
cgit v1.2.3-71-gd317


From 3a65588adc4401622b204caa897123e16a4a0318 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 18 Jan 2006 17:43:19 -0800
Subject: [PATCH] nfsd4: rename lk_stateowner

One of the things that's confusing about nfsd4_lock is that the lk_stateowner
field could be set to either of two different lockowners: the open owner or
the lock owner.  Rename to lk_replay_owner and add a comment to make it clear
that it's used for whichever stateowner has its sequence id bumped for replay
detection.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4state.c       | 16 ++++++++--------
 fs/nfsd/nfs4xdr.c         |  5 ++---
 include/linux/nfsd/xdr4.h |  5 +++--
 3 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 5bf7fd3947ce..578ea521c827 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2725,11 +2725,11 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 				        lock->lk_new_open_seqid,
 		                        &lock->lk_new_open_stateid,
 		                        CHECK_FH | OPEN_STATE,
-		                        &lock->lk_stateowner, &open_stp,
+		                        &lock->lk_replay_owner, &open_stp,
 					lock);
 		if (status)
 			goto out;
-		open_sop = lock->lk_stateowner;
+		open_sop = lock->lk_replay_owner;
 		/* create lockowner and lock stateid */
 		fp = open_stp->st_file;
 		strhashval = lock_ownerstr_hashval(fp->fi_inode, 
@@ -2752,12 +2752,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 				       lock->lk_old_lock_seqid, 
 				       &lock->lk_old_lock_stateid, 
 				       CHECK_FH | LOCK_STATE, 
-				       &lock->lk_stateowner, &lock_stp, lock);
+				       &lock->lk_replay_owner, &lock_stp, lock);
 		if (status)
 			goto out;
-		lock_sop = lock->lk_stateowner;
+		lock_sop = lock->lk_replay_owner;
 	}
-	/* lock->lk_stateowner and lock_stp have been created or found */
+	/* lock->lk_replay_owner and lock_stp have been created or found */
 	filp = lock_stp->st_vfs_file;
 
 	status = nfserr_grace;
@@ -2830,9 +2830,9 @@ conflicting_lock:
 out:
 	if (status && lock->lk_is_new && lock_sop)
 		release_stateowner(lock_sop);
-	if (lock->lk_stateowner) {
-		nfs4_get_stateowner(lock->lk_stateowner);
-		*replay_owner = lock->lk_stateowner;
+	if (lock->lk_replay_owner) {
+		nfs4_get_stateowner(lock->lk_replay_owner);
+		*replay_owner = lock->lk_replay_owner;
 	}
 	nfs4_unlock_state();
 	return status;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index dcd673186944..6b743327686c 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -528,7 +528,7 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
 {
 	DECODE_HEAD;
 
-	lock->lk_stateowner = NULL;
+	lock->lk_replay_owner = NULL;
 	/*
 	* type, reclaim(boolean), offset, length, new_lock_owner(boolean)
 	*/
@@ -1895,7 +1895,6 @@ nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie
 static void
 nfsd4_encode_lock(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock *lock)
 {
-
 	ENCODE_SEQID_OP_HEAD;
 
 	if (!nfserr) {
@@ -1906,7 +1905,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock
 	} else if (nfserr == nfserr_denied)
 		nfsd4_encode_lock_denied(resp, &lock->lk_denied);
 
-	ENCODE_SEQID_OP_TAIL(lock->lk_stateowner);
+	ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner);
 }
 
 static void
diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h
index 8903688890ce..77adba7d2281 100644
--- a/include/linux/nfsd/xdr4.h
+++ b/include/linux/nfsd/xdr4.h
@@ -145,8 +145,9 @@ struct nfsd4_lock {
 		} ok;
 		struct nfsd4_lock_denied        denied;
 	} u;
-
-	struct nfs4_stateowner *lk_stateowner;
+	/* The lk_replay_owner is the open owner in the open_to_lock_owner
+	 * case and the lock owner otherwise: */
+	struct nfs4_stateowner *lk_replay_owner;
 };
 #define lk_new_open_seqid       v.new.open_seqid
 #define lk_new_open_stateid     v.new.open_stateid
-- 
cgit v1.2.3-71-gd317


From 5590ff0d5528b60153c0b4e7b771472b5a95e297 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Wed, 18 Jan 2006 17:43:53 -0800
Subject: [PATCH] vfs: *at functions: core

Here is a series of patches which introduce in total 13 new system calls
which take a file descriptor/filename pair instead of a single file
name.  These functions, openat etc, have been discussed on numerous
occasions.  They are needed to implement race-free filesystem traversal,
they are necessary to implement a virtual per-thread current working
directory (think multi-threaded backup software), etc.

We have in glibc today implementations of the interfaces which use the
/proc/self/fd magic.  But this code is rather expensive.  Here are some
results (similar to what Jim Meyering posted before).

The test creates a deep directory hierarchy on a tmpfs filesystem.  Then
rm -fr is used to remove all directories.  Without syscall support I get
this:

real    0m31.921s
user    0m0.688s
sys     0m31.234s

With syscall support the results are much better:

real    0m20.699s
user    0m0.536s
sys     0m20.149s

The interfaces are for obvious reasons currently not much used.  But they'll
be used.  coreutils (and Jeff's posixutils) are already using them.
Furthermore, code like ftw/fts in libc (maybe even glob) will also start using
them.  I expect a patch to make follow soon.  Every program which is walking
the filesystem tree will benefit.

Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@ftp.linux.org.uk>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/osf_sys.c       |   2 +-
 arch/sparc64/kernel/sys_sparc32.c |   4 +-
 fs/compat.c                       |  48 +++++++++--
 fs/exec.c                         |   2 +-
 fs/namei.c                        | 172 ++++++++++++++++++++++++++++++--------
 fs/open.c                         |  83 ++++++++++++++----
 fs/stat.c                         |  66 ++++++++++++---
 include/linux/fcntl.h             |   7 ++
 include/linux/fs.h                |   7 +-
 include/linux/namei.h             |   7 +-
 include/linux/time.h              |   2 +-
 11 files changed, 320 insertions(+), 80 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 01fe990d3e54..7fb14f42a125 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -960,7 +960,7 @@ osf_utimes(char __user *filename, struct timeval32 __user *tvs)
 			return -EFAULT;
 	}
 
-	return do_utimes(filename, tvs ? ktvs : NULL);
+	return do_utimes(AT_FDCWD, filename, tvs ? ktvs : NULL);
 }
 
 #define MAX_SELECT_SECONDS \
diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c
index d4b7a100cb8a..9264ccbaaafa 100644
--- a/arch/sparc64/kernel/sys_sparc32.c
+++ b/arch/sparc64/kernel/sys_sparc32.c
@@ -821,7 +821,7 @@ asmlinkage long sys32_utimes(char __user *filename,
 			return -EFAULT;
 	}
 
-	return do_utimes(filename, (tvs ? &ktvs[0] : NULL));
+	return do_utimes(AT_FDCWD, filename, (tvs ? &ktvs[0] : NULL));
 }
 
 /* These are here just in case some old sparc32 binary calls it. */
@@ -1003,7 +1003,7 @@ asmlinkage long sys32_adjtimex(struct timex32 __user *utp)
 asmlinkage long sparc32_open(const char __user *filename,
 			     int flags, int mode)
 {
-	return do_sys_open(filename, flags, mode);
+	return do_sys_open(AT_FDCWD, filename, flags, mode);
 }
 
 extern unsigned long do_mremap(unsigned long addr,
diff --git a/fs/compat.c b/fs/compat.c
index 2468ac1df2f0..c6ba9deabada 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -68,10 +68,10 @@ asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __
 		tv[0].tv_usec = 0;
 		tv[1].tv_usec = 0;
 	}
-	return do_utimes(filename, t ? tv : NULL);
+	return do_utimes(AT_FDCWD, filename, t ? tv : NULL);
 }
 
-asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval __user *t)
+asmlinkage long compat_sys_futimesat(int dfd, char __user *filename, struct compat_timeval __user *t)
 {
 	struct timeval tv[2];
 
@@ -82,14 +82,19 @@ asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval _
 		    get_user(tv[1].tv_usec, &t[1].tv_usec))
 			return -EFAULT; 
 	} 
-	return do_utimes(filename, t ? tv : NULL);
+	return do_utimes(dfd, filename, t ? tv : NULL);
+}
+
+asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval __user *t)
+{
+	return compat_sys_futimesat(AT_FDCWD, filename, t);
 }
 
 asmlinkage long compat_sys_newstat(char __user * filename,
 		struct compat_stat __user *statbuf)
 {
 	struct kstat stat;
-	int error = vfs_stat(filename, &stat);
+	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
 
 	if (!error)
 		error = cp_compat_stat(&stat, statbuf);
@@ -100,13 +105,34 @@ asmlinkage long compat_sys_newlstat(char __user * filename,
 		struct compat_stat __user *statbuf)
 {
 	struct kstat stat;
-	int error = vfs_lstat(filename, &stat);
+	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
 
 	if (!error)
 		error = cp_compat_stat(&stat, statbuf);
 	return error;
 }
 
+asmlinkage long compat_sys_newfstatat(int dfd, char __user *filename,
+		struct compat_stat __user *statbuf, int flag)
+{
+	struct kstat stat;
+	int error = -EINVAL;
+
+	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+		goto out;
+
+	if (flag & AT_SYMLINK_NOFOLLOW)
+		error = vfs_lstat_fd(dfd, filename, &stat);
+	else
+		error = vfs_stat_fd(dfd, filename, &stat);
+
+	if (!error)
+		error = cp_compat_stat(&stat, statbuf);
+
+out:
+	return error;
+}
+
 asmlinkage long compat_sys_newfstat(unsigned int fd,
 		struct compat_stat __user * statbuf)
 {
@@ -1290,7 +1316,17 @@ out:
 asmlinkage long
 compat_sys_open(const char __user *filename, int flags, int mode)
 {
-	return do_sys_open(filename, flags, mode);
+	return do_sys_open(AT_FDCWD, filename, flags, mode);
+}
+
+/*
+ * Exactly like fs/open.c:sys_openat(), except that it doesn't set the
+ * O_LARGEFILE flag.
+ */
+asmlinkage long
+compat_sys_openat(int dfd, const char __user *filename, int flags, int mode)
+{
+	return do_sys_open(dfd, filename, flags, mode);
 }
 
 /*
diff --git a/fs/exec.c b/fs/exec.c
index 62b40af68cc4..055378d2513e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -477,7 +477,7 @@ struct file *open_exec(const char *name)
 	int err;
 	struct file *file;
 
-	err = path_lookup_open(name, LOOKUP_FOLLOW, &nd, FMODE_READ);
+	err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, FMODE_READ);
 	file = ERR_PTR(err);
 
 	if (!err) {
diff --git a/fs/namei.c b/fs/namei.c
index 33fb5bd34a81..4acdac043b6b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -30,6 +30,8 @@
 #include <linux/audit.h>
 #include <linux/capability.h>
 #include <linux/file.h>
+#include <linux/fcntl.h>
+#include <linux/namei.h>
 #include <asm/namei.h>
 #include <asm/uaccess.h>
 
@@ -1063,7 +1065,8 @@ set_it:
 }
 
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
-int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd)
+static int fastcall do_path_lookup(int dfd, const char *name,
+				unsigned int flags, struct nameidata *nd)
 {
 	int retval = 0;
 
@@ -1083,9 +1086,38 @@ int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata
 		}
 		nd->mnt = mntget(current->fs->rootmnt);
 		nd->dentry = dget(current->fs->root);
-	} else {
+	} else if (dfd == AT_FDCWD) {
 		nd->mnt = mntget(current->fs->pwdmnt);
 		nd->dentry = dget(current->fs->pwd);
+	} else {
+		struct file *file;
+		int fput_needed;
+		struct dentry *dentry;
+
+		file = fget_light(dfd, &fput_needed);
+		if (!file) {
+			retval = -EBADF;
+			goto out_fail;
+		}
+
+		dentry = file->f_dentry;
+
+		if (!S_ISDIR(dentry->d_inode->i_mode)) {
+			retval = -ENOTDIR;
+			fput_light(file, fput_needed);
+			goto out_fail;
+		}
+
+		retval = file_permission(file, MAY_EXEC);
+		if (retval) {
+			fput_light(file, fput_needed);
+			goto out_fail;
+		}
+
+		nd->mnt = mntget(file->f_vfsmnt);
+		nd->dentry = dget(dentry);
+
+		fput_light(file, fput_needed);
 	}
 	read_unlock(&current->fs->lock);
 	current->total_link_count = 0;
@@ -1094,11 +1126,19 @@ out:
 	if (unlikely(current->audit_context
 		     && nd && nd->dentry && nd->dentry->d_inode))
 		audit_inode(name, nd->dentry->d_inode, flags);
+out_fail:
 	return retval;
 }
 
-static int __path_lookup_intent_open(const char *name, unsigned int lookup_flags,
-		struct nameidata *nd, int open_flags, int create_mode)
+int fastcall path_lookup(const char *name, unsigned int flags,
+			struct nameidata *nd)
+{
+	return do_path_lookup(AT_FDCWD, name, flags, nd);
+}
+
+static int __path_lookup_intent_open(int dfd, const char *name,
+		unsigned int lookup_flags, struct nameidata *nd,
+		int open_flags, int create_mode)
 {
 	struct file *filp = get_empty_filp();
 	int err;
@@ -1108,7 +1148,7 @@ static int __path_lookup_intent_open(const char *name, unsigned int lookup_flags
 	nd->intent.open.file = filp;
 	nd->intent.open.flags = open_flags;
 	nd->intent.open.create_mode = create_mode;
-	err = path_lookup(name, lookup_flags|LOOKUP_OPEN, nd);
+	err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd);
 	if (IS_ERR(nd->intent.open.file)) {
 		if (err == 0) {
 			err = PTR_ERR(nd->intent.open.file);
@@ -1126,10 +1166,10 @@ static int __path_lookup_intent_open(const char *name, unsigned int lookup_flags
  * @nd: pointer to nameidata
  * @open_flags: open intent flags
  */
-int path_lookup_open(const char *name, unsigned int lookup_flags,
+int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags,
 		struct nameidata *nd, int open_flags)
 {
-	return __path_lookup_intent_open(name, lookup_flags, nd,
+	return __path_lookup_intent_open(dfd, name, lookup_flags, nd,
 			open_flags, 0);
 }
 
@@ -1141,12 +1181,12 @@ int path_lookup_open(const char *name, unsigned int lookup_flags,
  * @open_flags: open intent flags
  * @create_mode: create intent flags
  */
-static int path_lookup_create(const char *name, unsigned int lookup_flags,
-			      struct nameidata *nd, int open_flags,
-			      int create_mode)
+static int path_lookup_create(int dfd, const char *name,
+			      unsigned int lookup_flags, struct nameidata *nd,
+			      int open_flags, int create_mode)
 {
-	return __path_lookup_intent_open(name, lookup_flags|LOOKUP_CREATE, nd,
-			open_flags, create_mode);
+	return __path_lookup_intent_open(dfd, name, lookup_flags|LOOKUP_CREATE,
+			nd, open_flags, create_mode);
 }
 
 int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags,
@@ -1156,7 +1196,7 @@ int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags,
 	int err = PTR_ERR(tmp);
 
 	if (!IS_ERR(tmp)) {
-		err = __path_lookup_intent_open(tmp, lookup_flags, nd, open_flags, 0);
+		err = __path_lookup_intent_open(AT_FDCWD, tmp, lookup_flags, nd, open_flags, 0);
 		putname(tmp);
 	}
 	return err;
@@ -1248,18 +1288,24 @@ access:
  * that namei follows links, while lnamei does not.
  * SMP-safe
  */
-int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
+int fastcall __user_walk_fd(int dfd, const char __user *name, unsigned flags,
+			    struct nameidata *nd)
 {
 	char *tmp = getname(name);
 	int err = PTR_ERR(tmp);
 
 	if (!IS_ERR(tmp)) {
-		err = path_lookup(tmp, flags, nd);
+		err = do_path_lookup(dfd, tmp, flags, nd);
 		putname(tmp);
 	}
 	return err;
 }
 
+int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
+{
+	return __user_walk_fd(AT_FDCWD, name, flags, nd);
+}
+
 /*
  * It's inline, so penalty for filesystems that don't use sticky bit is
  * minimal.
@@ -1518,7 +1564,8 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
  * for symlinks (where the permissions are checked later).
  * SMP-safe
  */
-int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
+int open_namei(int dfd, const char *pathname, int flag,
+		int mode, struct nameidata *nd)
 {
 	int acc_mode, error;
 	struct path path;
@@ -1540,7 +1587,8 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
 	 * The simplest case - just a plain lookup.
 	 */
 	if (!(flag & O_CREAT)) {
-		error = path_lookup_open(pathname, lookup_flags(flag), nd, flag);
+		error = path_lookup_open(dfd, pathname, lookup_flags(flag),
+					 nd, flag);
 		if (error)
 			return error;
 		goto ok;
@@ -1549,7 +1597,7 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
 	/*
 	 * Create - we need to know the parent.
 	 */
-	error = path_lookup_create(pathname, LOOKUP_PARENT, nd, flag, mode);
+	error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode);
 	if (error)
 		return error;
 
@@ -1744,7 +1792,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	return error;
 }
 
-asmlinkage long sys_mknod(const char __user * filename, int mode, unsigned dev)
+asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
+				unsigned dev)
 {
 	int error = 0;
 	char * tmp;
@@ -1757,7 +1806,7 @@ asmlinkage long sys_mknod(const char __user * filename, int mode, unsigned dev)
 	if (IS_ERR(tmp))
 		return PTR_ERR(tmp);
 
-	error = path_lookup(tmp, LOOKUP_PARENT, &nd);
+	error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
 	if (error)
 		goto out;
 	dentry = lookup_create(&nd, 0);
@@ -1793,6 +1842,11 @@ out:
 	return error;
 }
 
+asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev)
+{
+	return sys_mknodat(AT_FDCWD, filename, mode, dev);
+}
+
 int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	int error = may_create(dir, dentry, NULL);
@@ -1815,7 +1869,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	return error;
 }
 
-asmlinkage long sys_mkdir(const char __user * pathname, int mode)
+asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
 {
 	int error = 0;
 	char * tmp;
@@ -1826,7 +1880,7 @@ asmlinkage long sys_mkdir(const char __user * pathname, int mode)
 		struct dentry *dentry;
 		struct nameidata nd;
 
-		error = path_lookup(tmp, LOOKUP_PARENT, &nd);
+		error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
 		if (error)
 			goto out;
 		dentry = lookup_create(&nd, 1);
@@ -1846,6 +1900,11 @@ out:
 	return error;
 }
 
+asmlinkage long sys_mkdir(const char __user *pathname, int mode)
+{
+	return sys_mkdirat(AT_FDCWD, pathname, mode);
+}
+
 /*
  * We try to drop the dentry early: we should have
  * a usage count of 2 if we're the only user of this
@@ -1907,7 +1966,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 	return error;
 }
 
-asmlinkage long sys_rmdir(const char __user * pathname)
+static long do_rmdir(int dfd, const char __user *pathname)
 {
 	int error = 0;
 	char * name;
@@ -1918,7 +1977,7 @@ asmlinkage long sys_rmdir(const char __user * pathname)
 	if(IS_ERR(name))
 		return PTR_ERR(name);
 
-	error = path_lookup(name, LOOKUP_PARENT, &nd);
+	error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd);
 	if (error)
 		goto exit;
 
@@ -1948,6 +2007,11 @@ exit:
 	return error;
 }
 
+asmlinkage long sys_rmdir(const char __user *pathname)
+{
+	return do_rmdir(AT_FDCWD, pathname);
+}
+
 int vfs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int error = may_delete(dir, dentry, 0);
@@ -1984,7 +2048,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
  * writeout happening, and we don't want to prevent access to the directory
  * while waiting on the I/O.
  */
-asmlinkage long sys_unlink(const char __user * pathname)
+static long do_unlinkat(int dfd, const char __user *pathname)
 {
 	int error = 0;
 	char * name;
@@ -1996,7 +2060,7 @@ asmlinkage long sys_unlink(const char __user * pathname)
 	if(IS_ERR(name))
 		return PTR_ERR(name);
 
-	error = path_lookup(name, LOOKUP_PARENT, &nd);
+	error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd);
 	if (error)
 		goto exit;
 	error = -EISDIR;
@@ -2031,6 +2095,22 @@ slashes:
 	goto exit2;
 }
 
+asmlinkage long sys_unlinkat(int dfd, const char __user *pathname, int flag)
+{
+	if ((flag & ~AT_REMOVEDIR) != 0)
+		return -EINVAL;
+
+	if (flag & AT_REMOVEDIR)
+		return do_rmdir(dfd, pathname);
+
+	return do_unlinkat(dfd, pathname);
+}
+
+asmlinkage long sys_unlink(const char __user *pathname)
+{
+	return do_unlinkat(AT_FDCWD, pathname);
+}
+
 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode)
 {
 	int error = may_create(dir, dentry, NULL);
@@ -2052,7 +2132,8 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, i
 	return error;
 }
 
-asmlinkage long sys_symlink(const char __user * oldname, const char __user * newname)
+asmlinkage long sys_symlinkat(const char __user *oldname,
+			      int newdfd, const char __user *newname)
 {
 	int error = 0;
 	char * from;
@@ -2067,7 +2148,7 @@ asmlinkage long sys_symlink(const char __user * oldname, const char __user * new
 		struct dentry *dentry;
 		struct nameidata nd;
 
-		error = path_lookup(to, LOOKUP_PARENT, &nd);
+		error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
 		if (error)
 			goto out;
 		dentry = lookup_create(&nd, 0);
@@ -2085,6 +2166,11 @@ out:
 	return error;
 }
 
+asmlinkage long sys_symlink(const char __user *oldname, const char __user *newname)
+{
+	return sys_symlinkat(oldname, AT_FDCWD, newname);
+}
+
 int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
 {
 	struct inode *inode = old_dentry->d_inode;
@@ -2132,7 +2218,8 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
  * with linux 2.0, and to avoid hard-linking to directories
  * and other special files.  --ADM
  */
-asmlinkage long sys_link(const char __user * oldname, const char __user * newname)
+asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
+			   int newdfd, const char __user *newname)
 {
 	struct dentry *new_dentry;
 	struct nameidata nd, old_nd;
@@ -2143,10 +2230,10 @@ asmlinkage long sys_link(const char __user * oldname, const char __user * newnam
 	if (IS_ERR(to))
 		return PTR_ERR(to);
 
-	error = __user_walk(oldname, 0, &old_nd);
+	error = __user_walk_fd(olddfd, oldname, 0, &old_nd);
 	if (error)
 		goto exit;
-	error = path_lookup(to, LOOKUP_PARENT, &nd);
+	error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
 	if (error)
 		goto out;
 	error = -EXDEV;
@@ -2169,6 +2256,11 @@ exit:
 	return error;
 }
 
+asmlinkage long sys_link(const char __user *oldname, const char __user *newname)
+{
+	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname);
+}
+
 /*
  * The worst of all namespace operations - renaming directory. "Perverted"
  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
@@ -2315,7 +2407,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return error;
 }
 
-static int do_rename(const char * oldname, const char * newname)
+static int do_rename(int olddfd, const char *oldname,
+			int newdfd, const char *newname)
 {
 	int error = 0;
 	struct dentry * old_dir, * new_dir;
@@ -2323,11 +2416,11 @@ static int do_rename(const char * oldname, const char * newname)
 	struct dentry * trap;
 	struct nameidata oldnd, newnd;
 
-	error = path_lookup(oldname, LOOKUP_PARENT, &oldnd);
+	error = do_path_lookup(olddfd, oldname, LOOKUP_PARENT, &oldnd);
 	if (error)
 		goto exit;
 
-	error = path_lookup(newname, LOOKUP_PARENT, &newnd);
+	error = do_path_lookup(newdfd, newname, LOOKUP_PARENT, &newnd);
 	if (error)
 		goto exit1;
 
@@ -2391,7 +2484,8 @@ exit:
 	return error;
 }
 
-asmlinkage long sys_rename(const char __user * oldname, const char __user * newname)
+asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
+			     int newdfd, const char __user *newname)
 {
 	int error;
 	char * from;
@@ -2403,13 +2497,18 @@ asmlinkage long sys_rename(const char __user * oldname, const char __user * newn
 	to = getname(newname);
 	error = PTR_ERR(to);
 	if (!IS_ERR(to)) {
-		error = do_rename(from,to);
+		error = do_rename(olddfd, from, newdfd, to);
 		putname(to);
 	}
 	putname(from);
 	return error;
 }
 
+asmlinkage long sys_rename(const char __user *oldname, const char __user *newname)
+{
+	return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
+}
+
 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
 {
 	int len;
@@ -2553,6 +2652,7 @@ struct inode_operations page_symlink_inode_operations = {
 };
 
 EXPORT_SYMBOL(__user_walk);
+EXPORT_SYMBOL(__user_walk_fd);
 EXPORT_SYMBOL(follow_down);
 EXPORT_SYMBOL(follow_up);
 EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
diff --git a/fs/open.c b/fs/open.c
index 8e20c1f32563..70e0230d8e77 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -20,6 +20,7 @@
 #include <linux/security.h>
 #include <linux/mount.h>
 #include <linux/vfs.h>
+#include <linux/fcntl.h>
 #include <asm/uaccess.h>
 #include <linux/fs.h>
 #include <linux/personality.h>
@@ -383,7 +384,7 @@ asmlinkage long sys_utime(char __user * filename, struct utimbuf __user * times)
 
 		error = get_user(newattrs.ia_atime.tv_sec, &times->actime);
 		newattrs.ia_atime.tv_nsec = 0;
-		if (!error) 
+		if (!error)
 			error = get_user(newattrs.ia_mtime.tv_sec, &times->modtime);
 		newattrs.ia_mtime.tv_nsec = 0;
 		if (error)
@@ -414,14 +415,14 @@ out:
  * must be owner or have write permission.
  * Else, update from *times, must be owner or super user.
  */
-long do_utimes(char __user * filename, struct timeval * times)
+long do_utimes(int dfd, char __user *filename, struct timeval *times)
 {
 	int error;
 	struct nameidata nd;
 	struct inode * inode;
 	struct iattr newattrs;
 
-	error = user_path_walk(filename, &nd);
+	error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd);
 
 	if (error)
 		goto out;
@@ -461,13 +462,18 @@ out:
 	return error;
 }
 
-asmlinkage long sys_utimes(char __user * filename, struct timeval __user * utimes)
+asmlinkage long sys_futimesat(int dfd, char __user *filename, struct timeval __user *utimes)
 {
 	struct timeval times[2];
 
 	if (utimes && copy_from_user(&times, utimes, sizeof(times)))
 		return -EFAULT;
-	return do_utimes(filename, utimes ? times : NULL);
+	return do_utimes(dfd, filename, utimes ? times : NULL);
+}
+
+asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes)
+{
+	return sys_futimesat(AT_FDCWD, filename, utimes);
 }
 
 
@@ -476,7 +482,7 @@ asmlinkage long sys_utimes(char __user * filename, struct timeval __user * utime
  * We do this by temporarily clearing all FS-related capabilities and
  * switching the fsuid/fsgid around to the real ones.
  */
-asmlinkage long sys_access(const char __user * filename, int mode)
+asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 {
 	struct nameidata nd;
 	int old_fsuid, old_fsgid;
@@ -506,7 +512,7 @@ asmlinkage long sys_access(const char __user * filename, int mode)
 	else
 		current->cap_effective = current->cap_permitted;
 
-	res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
+	res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
 	if (!res) {
 		res = vfs_permission(&nd, mode);
 		/* SuS v2 requires we report a read only fs too */
@@ -523,6 +529,11 @@ asmlinkage long sys_access(const char __user * filename, int mode)
 	return res;
 }
 
+asmlinkage long sys_access(const char __user *filename, int mode)
+{
+	return sys_faccessat(AT_FDCWD, filename, mode);
+}
+
 asmlinkage long sys_chdir(const char __user * filename)
 {
 	struct nameidata nd;
@@ -635,14 +646,15 @@ out:
 	return err;
 }
 
-asmlinkage long sys_chmod(const char __user * filename, mode_t mode)
+asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
+			     mode_t mode)
 {
 	struct nameidata nd;
 	struct inode * inode;
 	int error;
 	struct iattr newattrs;
 
-	error = user_path_walk(filename, &nd);
+	error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd);
 	if (error)
 		goto out;
 	inode = nd.dentry->d_inode;
@@ -669,6 +681,11 @@ out:
 	return error;
 }
 
+asmlinkage long sys_chmod(const char __user *filename, mode_t mode)
+{
+	return sys_fchmodat(AT_FDCWD, filename, mode);
+}
+
 static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
 {
 	struct inode * inode;
@@ -717,6 +734,26 @@ asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group)
 	return error;
 }
 
+asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
+			     gid_t group, int flag)
+{
+	struct nameidata nd;
+	int error = -EINVAL;
+	int follow;
+
+	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+		goto out;
+
+	follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+	error = __user_walk_fd(dfd, filename, follow, &nd);
+	if (!error) {
+		error = chown_common(nd.dentry, user, group);
+		path_release(&nd);
+	}
+out:
+	return error;
+}
+
 asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group)
 {
 	struct nameidata nd;
@@ -820,7 +857,8 @@ cleanup_file:
  * for the internal routines (ie open_namei()/follow_link() etc). 00 is
  * used by symlinks.
  */
-struct file *filp_open(const char * filename, int flags, int mode)
+static struct file *do_filp_open(int dfd, const char *filename, int flags,
+				 int mode)
 {
 	int namei_flags, error;
 	struct nameidata nd;
@@ -829,12 +867,17 @@ struct file *filp_open(const char * filename, int flags, int mode)
 	if ((namei_flags+1) & O_ACCMODE)
 		namei_flags++;
 
-	error = open_namei(filename, namei_flags, mode, &nd);
+	error = open_namei(dfd, filename, namei_flags, mode, &nd);
 	if (!error)
 		return nameidata_to_filp(&nd, flags);
 
 	return ERR_PTR(error);
 }
+
+struct file *filp_open(const char *filename, int flags, int mode)
+{
+	return do_filp_open(AT_FDCWD, filename, flags, mode);
+}
 EXPORT_SYMBOL(filp_open);
 
 /**
@@ -991,7 +1034,7 @@ void fastcall put_unused_fd(unsigned int fd)
 EXPORT_SYMBOL(put_unused_fd);
 
 /*
- * Install a file pointer in the fd array.  
+ * Install a file pointer in the fd array.
  *
  * The VFS is full of places where we drop the files lock between
  * setting the open_fds bitmap and installing the file in the file
@@ -1016,7 +1059,7 @@ void fastcall fd_install(unsigned int fd, struct file * file)
 
 EXPORT_SYMBOL(fd_install);
 
-long do_sys_open(const char __user *filename, int flags, int mode)
+long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
 {
 	char *tmp = getname(filename);
 	int fd = PTR_ERR(tmp);
@@ -1024,7 +1067,7 @@ long do_sys_open(const char __user *filename, int flags, int mode)
 	if (!IS_ERR(tmp)) {
 		fd = get_unused_fd();
 		if (fd >= 0) {
-			struct file *f = filp_open(tmp, flags, mode);
+			struct file *f = do_filp_open(dfd, tmp, flags, mode);
 			if (IS_ERR(f)) {
 				put_unused_fd(fd);
 				fd = PTR_ERR(f);
@@ -1043,10 +1086,20 @@ asmlinkage long sys_open(const char __user *filename, int flags, int mode)
 	if (force_o_largefile())
 		flags |= O_LARGEFILE;
 
-	return do_sys_open(filename, flags, mode);
+	return do_sys_open(AT_FDCWD, filename, flags, mode);
 }
 EXPORT_SYMBOL_GPL(sys_open);
 
+asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
+			   int mode)
+{
+	if (force_o_largefile())
+		flags |= O_LARGEFILE;
+
+	return do_sys_open(dfd, filename, flags, mode);
+}
+EXPORT_SYMBOL_GPL(sys_openat);
+
 #ifndef __alpha__
 
 /*
diff --git a/fs/stat.c b/fs/stat.c
index b8a0e5110ab2..24211b030f39 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -63,12 +63,12 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 
 EXPORT_SYMBOL(vfs_getattr);
 
-int vfs_stat(char __user *name, struct kstat *stat)
+int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat)
 {
 	struct nameidata nd;
 	int error;
 
-	error = user_path_walk(name, &nd);
+	error = __user_walk_fd(dfd, name, LOOKUP_FOLLOW, &nd);
 	if (!error) {
 		error = vfs_getattr(nd.mnt, nd.dentry, stat);
 		path_release(&nd);
@@ -76,14 +76,19 @@ int vfs_stat(char __user *name, struct kstat *stat)
 	return error;
 }
 
+int vfs_stat(char __user *name, struct kstat *stat)
+{
+	return vfs_stat_fd(AT_FDCWD, name, stat);
+}
+
 EXPORT_SYMBOL(vfs_stat);
 
-int vfs_lstat(char __user *name, struct kstat *stat)
+int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat)
 {
 	struct nameidata nd;
 	int error;
 
-	error = user_path_walk_link(name, &nd);
+	error = __user_walk_fd(dfd, name, 0, &nd);
 	if (!error) {
 		error = vfs_getattr(nd.mnt, nd.dentry, stat);
 		path_release(&nd);
@@ -91,6 +96,11 @@ int vfs_lstat(char __user *name, struct kstat *stat)
 	return error;
 }
 
+int vfs_lstat(char __user *name, struct kstat *stat)
+{
+	return vfs_lstat_fd(AT_FDCWD, name, stat);
+}
+
 EXPORT_SYMBOL(vfs_lstat);
 
 int vfs_fstat(unsigned int fd, struct kstat *stat)
@@ -151,7 +161,7 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
 asmlinkage long sys_stat(char __user * filename, struct __old_kernel_stat __user * statbuf)
 {
 	struct kstat stat;
-	int error = vfs_stat(filename, &stat);
+	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
 
 	if (!error)
 		error = cp_old_stat(&stat, statbuf);
@@ -161,7 +171,7 @@ asmlinkage long sys_stat(char __user * filename, struct __old_kernel_stat __user
 asmlinkage long sys_lstat(char __user * filename, struct __old_kernel_stat __user * statbuf)
 {
 	struct kstat stat;
-	int error = vfs_lstat(filename, &stat);
+	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
 
 	if (!error)
 		error = cp_old_stat(&stat, statbuf);
@@ -229,27 +239,50 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-asmlinkage long sys_newstat(char __user * filename, struct stat __user * statbuf)
+asmlinkage long sys_newstat(char __user *filename, struct stat __user *statbuf)
 {
 	struct kstat stat;
-	int error = vfs_stat(filename, &stat);
+	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
 
 	if (!error)
 		error = cp_new_stat(&stat, statbuf);
 
 	return error;
 }
-asmlinkage long sys_newlstat(char __user * filename, struct stat __user * statbuf)
+
+asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf)
 {
 	struct kstat stat;
-	int error = vfs_lstat(filename, &stat);
+	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
 
 	if (!error)
 		error = cp_new_stat(&stat, statbuf);
 
 	return error;
 }
-asmlinkage long sys_newfstat(unsigned int fd, struct stat __user * statbuf)
+
+asmlinkage long sys_newfstatat(int dfd, char __user *filename,
+				struct stat __user *statbuf, int flag)
+{
+	struct kstat stat;
+	int error = -EINVAL;
+
+	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+		goto out;
+
+	if (flag & AT_SYMLINK_NOFOLLOW)
+		error = vfs_lstat_fd(dfd, filename, &stat);
+	else
+		error = vfs_stat_fd(dfd, filename, &stat);
+
+	if (!error)
+		error = cp_new_stat(&stat, statbuf);
+
+out:
+	return error;
+}
+
+asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf)
 {
 	struct kstat stat;
 	int error = vfs_fstat(fd, &stat);
@@ -260,7 +293,8 @@ asmlinkage long sys_newfstat(unsigned int fd, struct stat __user * statbuf)
 	return error;
 }
 
-asmlinkage long sys_readlink(const char __user * path, char __user * buf, int bufsiz)
+asmlinkage long sys_readlinkat(int dfd, const char __user *path,
+				char __user *buf, int bufsiz)
 {
 	struct nameidata nd;
 	int error;
@@ -268,7 +302,7 @@ asmlinkage long sys_readlink(const char __user * path, char __user * buf, int bu
 	if (bufsiz <= 0)
 		return -EINVAL;
 
-	error = user_path_walk_link(path, &nd);
+	error = __user_walk_fd(dfd, path, 0, &nd);
 	if (!error) {
 		struct inode * inode = nd.dentry->d_inode;
 
@@ -285,6 +319,12 @@ asmlinkage long sys_readlink(const char __user * path, char __user * buf, int bu
 	return error;
 }
 
+asmlinkage long sys_readlink(const char __user *path, char __user *buf,
+				int bufsiz)
+{
+	return sys_readlinkat(AT_FDCWD, path, buf, bufsiz);
+}
+
 
 /* ---------- LFS-64 ----------- */
 #ifdef __ARCH_WANT_STAT64
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index 8a7c82151de9..c52a63755fdd 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -23,6 +23,13 @@
 #define DN_ATTRIB	0x00000020	/* File changed attibutes */
 #define DN_MULTISHOT	0x80000000	/* Don't remove notifier */
 
+#define AT_FDCWD		-100    /* Special value used to indicate
+                                           openat should use the current
+                                           working directory. */
+#define AT_SYMLINK_NOFOLLOW	0x100   /* Do not follow symbolic links.  */
+#define AT_REMOVEDIR		0x200   /* Remove directory instead of
+                                           unlinking file.  */
+
 #ifdef __KERNEL__
 
 #ifndef force_o_largefile
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b77f2608eef9..84bb449b9b01 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1340,7 +1340,8 @@ static inline int break_lease(struct inode *inode, unsigned int mode)
 
 extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
 		       struct file *filp);
-extern long do_sys_open(const char __user *filename, int flags, int mode);
+extern long do_sys_open(int fdf, const char __user *filename, int flags,
+			int mode);
 extern struct file *filp_open(const char *, int, int);
 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
 extern int filp_close(struct file *, fl_owner_t id);
@@ -1479,7 +1480,7 @@ static inline void allow_write_access(struct file *file)
 }
 extern int do_pipe(int *);
 
-extern int open_namei(const char *, int, int, struct nameidata *);
+extern int open_namei(int dfd, const char *, int, int, struct nameidata *);
 extern int may_open(struct nameidata *, int, int);
 
 extern int kernel_read(struct file *, unsigned long, char *, unsigned long);
@@ -1677,6 +1678,8 @@ extern int vfs_readdir(struct file *, filldir_t, void *);
 
 extern int vfs_stat(char __user *, struct kstat *);
 extern int vfs_lstat(char __user *, struct kstat *);
+extern int vfs_stat_fd(int dfd, char __user *, struct kstat *);
+extern int vfs_lstat_fd(int dfd, char __user *, struct kstat *);
 extern int vfs_fstat(unsigned int, struct kstat *);
 
 extern int vfs_ioctl(struct file *, unsigned int, unsigned int, unsigned long);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index b699e427c00c..e6698013e4d0 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -56,10 +56,11 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_ACCESS		(0x0400)
 
 extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *));
+extern int FASTCALL(__user_walk_fd(int dfd, const char __user *, unsigned, struct nameidata *));
 #define user_path_walk(name,nd) \
-	__user_walk(name, LOOKUP_FOLLOW, nd)
+	__user_walk_fd(AT_FDCWD, name, LOOKUP_FOLLOW, nd)
 #define user_path_walk_link(name,nd) \
-	__user_walk(name, 0, nd)
+	__user_walk_fd(AT_FDCWD, name, 0, nd)
 extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *));
 extern int FASTCALL(path_walk(const char *, struct nameidata *));
 extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
@@ -67,7 +68,7 @@ extern void path_release(struct nameidata *);
 extern void path_release_on_umount(struct nameidata *);
 
 extern int __user_path_lookup_open(const char __user *, unsigned lookup_flags, struct nameidata *nd, int open_flags);
-extern int path_lookup_open(const char *, unsigned lookup_flags, struct nameidata *, int open_flags);
+extern int path_lookup_open(int dfd, const char *name, unsigned lookup_flags, struct nameidata *, int open_flags);
 extern struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry,
 		int (*open)(struct inode *, struct file *));
 extern struct file *nameidata_to_filp(struct nameidata *nd, int flags);
diff --git a/include/linux/time.h b/include/linux/time.h
index f2aca7ec6325..614dd8465839 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -74,7 +74,7 @@ extern void do_gettimeofday(struct timeval *tv);
 extern int do_settimeofday(struct timespec *tv);
 extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz);
 #define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts)
-extern long do_utimes(char __user *filename, struct timeval *times);
+extern long do_utimes(int dfd, char __user *filename, struct timeval *times);
 struct itimerval;
 extern int do_setitimer(int which, struct itimerval *value,
 			struct itimerval *ovalue);
-- 
cgit v1.2.3-71-gd317


From 150256d8aadb3a337c31efa9e175cbd25bf06b06 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 18 Jan 2006 17:43:57 -0800
Subject: [PATCH] Generic sys_rt_sigsuspend()

The TIF_RESTORE_SIGMASK flag allows us to have a generic implementation of
sys_rt_sigsuspend() instead of duplicating it for each architecture.  This
provides such an implementation and makes arch/powerpc use it.

It also tidies up the ppc32 sys_sigsuspend() to use TIF_RESTORE_SIGMASK.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/powerpc/kernel/signal_32.c | 56 ++++-------------------------------------
 arch/powerpc/kernel/signal_64.c | 36 --------------------------
 include/asm-powerpc/unistd.h    |  2 ++
 include/linux/sched.h           |  1 +
 kernel/compat.c                 | 28 +++++++++++++++++++++
 kernel/signal.c                 | 26 +++++++++++++++++++
 6 files changed, 62 insertions(+), 87 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 177bba78fb0b..7f0d5ce2567d 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -252,8 +252,7 @@ int do_signal(sigset_t *oldset, struct pt_regs *regs);
 /*
  * Atomically swap in the new signal mask, and wait for a signal.
  */
-long sys_sigsuspend(old_sigset_t mask, int p2, int p3, int p4, int p6, int p7,
-	       struct pt_regs *regs)
+long sys_sigsuspend(old_sigset_t mask)
 {
 	sigset_t saveset;
 
@@ -264,55 +263,10 @@ long sys_sigsuspend(old_sigset_t mask, int p2, int p3, int p4, int p6, int p7,
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 
-	regs->result = -EINTR;
-	regs->gpr[3] = EINTR;
-	regs->ccr |= 0x10000000;
-	while (1) {
-		current->state = TASK_INTERRUPTIBLE;
-		schedule();
-		if (do_signal(&saveset, regs)) {
-			set_thread_flag(TIF_RESTOREALL);
-			return 0;
-		}
-	}
-}
-
-long sys_rt_sigsuspend(
-#ifdef CONFIG_PPC64
-		compat_sigset_t __user *unewset,
-#else
-		sigset_t __user *unewset,
-#endif
-		size_t sigsetsize, int p3, int p4,
-		int p6, int p7, struct pt_regs *regs)
-{
-	sigset_t saveset, newset;
-
-	/* XXX: Don't preclude handling different sized sigset_t's.  */
-	if (sigsetsize != sizeof(sigset_t))
-		return -EINVAL;
-
-	if (get_sigset_t(&newset, unewset))
-		return -EFAULT;
-	sigdelsetmask(&newset, ~_BLOCKABLE);
-
-	spin_lock_irq(&current->sighand->siglock);
-	saveset = current->blocked;
-	current->blocked = newset;
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-
-	regs->result = -EINTR;
-	regs->gpr[3] = EINTR;
-	regs->ccr |= 0x10000000;
-	while (1) {
-		current->state = TASK_INTERRUPTIBLE;
-		schedule();
-		if (do_signal(&saveset, regs)) {
-			set_thread_flag(TIF_RESTOREALL);
-			return 0;
-		}
-	}
+ 	current->state = TASK_INTERRUPTIBLE;
+ 	schedule();
+ 	set_thread_flag(TIF_RESTORE_SIGMASK);
+ 	return -ERESTARTNOHAND;
 }
 
 #ifdef CONFIG_PPC32
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 7b9d999e2115..a4a6812e815e 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -67,42 +67,6 @@ struct rt_sigframe {
 	char abigap[288];
 } __attribute__ ((aligned (16)));
 
-
-/*
- * Atomically swap in the new signal mask, and wait for a signal.
- */
-long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, int p3, int p4,
-		       int p6, int p7, struct pt_regs *regs)
-{
-	sigset_t saveset, newset;
-
-	/* XXX: Don't preclude handling different sized sigset_t's.  */
-	if (sigsetsize != sizeof(sigset_t))
-		return -EINVAL;
-
-	if (copy_from_user(&newset, unewset, sizeof(newset)))
-		return -EFAULT;
-	sigdelsetmask(&newset, ~_BLOCKABLE);
-
-	spin_lock_irq(&current->sighand->siglock);
-	saveset = current->blocked;
-	current->blocked = newset;
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-
-	regs->result = -EINTR;
-	regs->gpr[3] = EINTR;
-	regs->ccr |= 0x10000000;
-	while (1) {
-		current->state = TASK_INTERRUPTIBLE;
-		schedule();
-		if (do_signal(&saveset, regs)) {
-			set_thread_flag(TIF_RESTOREALL);
-			return 0;
-		}
-	}
-}
-
 long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, unsigned long r5,
 		     unsigned long r6, unsigned long r7, unsigned long r8,
 		     struct pt_regs *regs)
diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h
index 19eaac3fbbf9..76daec496062 100644
--- a/include/asm-powerpc/unistd.h
+++ b/include/asm-powerpc/unistd.h
@@ -444,11 +444,13 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6
 #define __ARCH_WANT_SYS_SIGPENDING
 #define __ARCH_WANT_SYS_SIGPROCMASK
 #define __ARCH_WANT_SYS_RT_SIGACTION
+#define __ARCH_WANT_SYS_RT_SIGSUSPEND
 #ifdef CONFIG_PPC32
 #define __ARCH_WANT_OLD_STAT
 #endif
 #ifdef CONFIG_PPC64
 #define __ARCH_WANT_COMPAT_SYS_TIME
+#define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
 #endif
 
 /*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2df1a1a2fee5..0cfcd1c7865e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -809,6 +809,7 @@ struct task_struct {
 	struct sighand_struct *sighand;
 
 	sigset_t blocked, real_blocked;
+	sigset_t saved_sigmask;		/* To be restored with TIF_RESTORE_SIGMASK */
 	struct sigpending pending;
 
 	unsigned long sas_ss_sp;
diff --git a/kernel/compat.c b/kernel/compat.c
index 256e5d9f0647..1867290c37e3 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -871,3 +871,31 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
 }
 
 #endif /* __ARCH_WANT_COMPAT_SYS_TIME */
+
+#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
+asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
+{
+	sigset_t newset;
+	compat_sigset_t newset32;
+
+	/* XXX: Don't preclude handling different sized sigset_t's.  */
+	if (sigsetsize != sizeof(sigset_t))
+		return -EINVAL;
+
+	if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
+		return -EFAULT;
+	sigset_from_compat(&newset, &newset32);
+	sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+	spin_lock_irq(&current->sighand->siglock);
+	current->saved_sigmask = current->blocked;
+	current->blocked = newset;
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+
+	current->state = TASK_INTERRUPTIBLE;
+	schedule();
+	set_thread_flag(TIF_RESTORE_SIGMASK);
+	return -ERESTARTNOHAND;
+}
+#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
diff --git a/kernel/signal.c b/kernel/signal.c
index 5dafbd36d62e..d3efafd8109a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2721,6 +2721,32 @@ sys_pause(void)
 
 #endif
 
+#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
+asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
+{
+	sigset_t newset;
+
+	/* XXX: Don't preclude handling different sized sigset_t's.  */
+	if (sigsetsize != sizeof(sigset_t))
+		return -EINVAL;
+
+	if (copy_from_user(&newset, unewset, sizeof(newset)))
+		return -EFAULT;
+	sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+	spin_lock_irq(&current->sighand->siglock);
+	current->saved_sigmask = current->blocked;
+	current->blocked = newset;
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+
+	current->state = TASK_INTERRUPTIBLE;
+	schedule();
+	set_thread_flag(TIF_RESTORE_SIGMASK);
+	return -ERESTARTNOHAND;
+}
+#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
+
 void __init signals_init(void)
 {
 	sigqueue_cachep =
-- 
cgit v1.2.3-71-gd317


From 9f72949f679df06021c9e43886c9191494fdb007 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 18 Jan 2006 17:44:05 -0800
Subject: [PATCH] Add pselect/ppoll system call implementation

The following implementation of ppoll() and pselect() system calls
depends on the architecture providing a TIF_RESTORE_SIGMASK flag in the
thread_info.

These system calls have to change the signal mask during their
operation, and signal handlers must be invoked using the new, temporary
signal mask. The old signal mask must be restored either upon successful
exit from the system call, or upon returning from the invoked signal
handler if the system call is interrupted. We can't simply restore the
original signal mask and return to userspace, since the restored signal
mask may actually block the signal which interrupted the system call.

The TIF_RESTORE_SIGMASK flag deals with this by causing the syscall exit
path to trap into do_signal() just as TIF_SIGPENDING does, and by
causing do_signal() to use the saved signal mask instead of the current
signal mask when setting up the stack frame for the signal handler -- or
by causing do_signal() to simply restore the saved signal mask in the
case where there is no handler to be invoked.

The first patch implements the sys_pselect() and sys_ppoll() system
calls, which are present only if TIF_RESTORE_SIGMASK is defined. That
#ifdef should go away in time when all architectures have implemented
it. The second patch implements TIF_RESTORE_SIGMASK for the PowerPC
kernel (in the -mm tree), and the third patch then removes the
arch-specific implementations of sys_rt_sigsuspend() and replaces them
with generic versions using the same trick.

The fourth and fifth patches, provided by David Howells, implement
TIF_RESTORE_SIGMASK for FR-V and i386 respectively, and the sixth patch
adds the syscalls to the i386 syscall table.

This patch:

Add the pselect() and ppoll() system calls, providing core routines usable by
the original select() and poll() system calls and also the new calls (with
their semantics w.r.t timeouts).

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat.c          | 260 ++++++++++++++++++++++++++++++++------
 fs/select.c          | 348 ++++++++++++++++++++++++++++++++++++++++++---------
 include/linux/poll.h |   6 +-
 3 files changed, 519 insertions(+), 95 deletions(-)

(limited to 'include/linux')

diff --git a/fs/compat.c b/fs/compat.c
index c6ba9deabada..18b21b4c9e3a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -53,6 +53,8 @@
 #include <asm/mmu_context.h>
 #include <asm/ioctls.h>
 
+extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
+
 /*
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
@@ -1657,36 +1659,14 @@ static void select_bits_free(void *bits, int size)
 #define MAX_SELECT_SECONDS \
 	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
 
-asmlinkage long
-compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp,
-		compat_ulong_t __user *exp, struct compat_timeval __user *tvp)
+int compat_core_sys_select(int n, compat_ulong_t __user *inp,
+	compat_ulong_t __user *outp, compat_ulong_t __user *exp, s64 *timeout)
 {
 	fd_set_bits fds;
 	char *bits;
-	long timeout;
 	int size, max_fdset, ret = -EINVAL;
 	struct fdtable *fdt;
 
-	timeout = MAX_SCHEDULE_TIMEOUT;
-	if (tvp) {
-		time_t sec, usec;
-
-		if (!access_ok(VERIFY_READ, tvp, sizeof(*tvp))
-		    || __get_user(sec, &tvp->tv_sec)
-		    || __get_user(usec, &tvp->tv_usec)) {
-			ret = -EFAULT;
-			goto out_nofds;
-		}
-
-		if (sec < 0 || usec < 0)
-			goto out_nofds;
-
-		if ((unsigned long) sec < MAX_SELECT_SECONDS) {
-			timeout = ROUND_UP(usec, 1000000/HZ);
-			timeout += sec * (unsigned long) HZ;
-		}
-	}
-
 	if (n < 0)
 		goto out_nofds;
 
@@ -1723,19 +1703,7 @@ compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp
 	zero_fd_set(n, fds.res_out);
 	zero_fd_set(n, fds.res_ex);
 
-	ret = do_select(n, &fds, &timeout);
-
-	if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
-		time_t sec = 0, usec = 0;
-		if (timeout) {
-			sec = timeout / HZ;
-			usec = timeout % HZ;
-			usec *= (1000000/HZ);
-		}
-		if (put_user(sec, &tvp->tv_sec) ||
-		    put_user(usec, &tvp->tv_usec))
-			ret = -EFAULT;
-	}
+	ret = do_select(n, &fds, timeout);
 
 	if (ret < 0)
 		goto out;
@@ -1756,6 +1724,224 @@ out_nofds:
 	return ret;
 }
 
+asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
+	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+	struct compat_timeval __user *tvp)
+{
+	s64 timeout = -1;
+	struct compat_timeval tv;
+	int ret;
+
+	if (tvp) {
+		if (copy_from_user(&tv, tvp, sizeof(tv)))
+			return -EFAULT;
+
+		if (tv.tv_sec < 0 || tv.tv_usec < 0)
+			return -EINVAL;
+
+		/* Cast to u64 to make GCC stop complaining */
+		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
+			timeout = -1;	/* infinite */
+		else {
+			timeout = ROUND_UP(tv.tv_sec, 1000000/HZ);
+			timeout += tv.tv_sec * HZ;
+		}
+	}
+
+	ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
+
+	if (tvp) {
+		if (current->personality & STICKY_TIMEOUTS)
+			goto sticky;
+		tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
+		tv.tv_sec = timeout;
+		if (copy_to_user(tvp, &tv, sizeof(tv))) {
+sticky:
+			/*
+			 * If an application puts its timeval in read-only
+			 * memory, we don't want the Linux-specific update to
+			 * the timeval to cause a fault after the select has
+			 * completed successfully. However, because we're not
+			 * updating the timeval, we can't restart the system
+			 * call.
+			 */
+			if (ret == -ERESTARTNOHAND)
+				ret = -EINTR;
+		}
+	}
+
+	return ret;
+}
+
+#ifdef TIF_RESTORE_SIGMASK
+asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
+	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+	struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
+	compat_size_t sigsetsize)
+{
+	compat_sigset_t ss32;
+	sigset_t ksigmask, sigsaved;
+	long timeout = MAX_SCHEDULE_TIMEOUT;
+	struct compat_timespec ts;
+	int ret;
+
+	if (tsp) {
+		if (copy_from_user(&ts, tsp, sizeof(ts)))
+			return -EFAULT;
+
+		if (ts.tv_sec < 0 || ts.tv_nsec < 0)
+			return -EINVAL;
+	}
+
+	if (sigmask) {
+		if (sigsetsize != sizeof(compat_sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+			return -EFAULT;
+		sigset_from_compat(&ksigmask, &ss32);
+
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	do {
+		if (tsp) {
+			if ((unsigned long)ts.tv_sec < MAX_SELECT_SECONDS) {
+				timeout = ROUND_UP(ts.tv_nsec, 1000000000/HZ);
+				timeout += ts.tv_sec * (unsigned long)HZ;
+				ts.tv_sec = 0;
+				ts.tv_nsec = 0;
+			} else {
+				ts.tv_sec -= MAX_SELECT_SECONDS;
+				timeout = MAX_SELECT_SECONDS * HZ;
+			}
+		}
+
+		ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
+
+	} while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec));
+
+	if (tsp && !(current->personality & STICKY_TIMEOUTS)) {
+		ts.tv_sec += timeout / HZ;
+		ts.tv_nsec += (timeout % HZ) * (1000000000/HZ);
+		if (ts.tv_nsec >= 1000000000) {
+			ts.tv_sec++;
+			ts.tv_nsec -= 1000000000;
+		}
+		(void)copy_to_user(tsp, &ts, sizeof(ts));
+	}
+
+	if (ret == -ERESTARTNOHAND) {
+		/*
+		 * Don't restore the signal mask yet. Let do_signal() deliver
+		 * the signal on the way back to userspace, before the signal
+		 * mask is restored.
+		 */
+		if (sigmask) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+					sizeof(sigsaved));
+			set_thread_flag(TIF_RESTORE_SIGMASK);
+		}
+	} else if (sigmask)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	return ret;
+}
+
+asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
+	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+	struct compat_timespec __user *tsp, void __user *sig)
+{
+	compat_size_t sigsetsize = 0;
+	compat_uptr_t up = 0;
+
+	if (sig) {
+		if (!access_ok(VERIFY_READ, sig,
+				sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
+		    	__get_user(up, (compat_uptr_t __user *)sig) ||
+		    	__get_user(sigsetsize,
+				(compat_size_t __user *)(sig+sizeof(up))))
+			return -EFAULT;
+	}
+	return compat_sys_pselect7(n, inp, outp, exp, tsp, compat_ptr(up),
+					sigsetsize);
+}
+
+asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
+	unsigned int nfds, struct compat_timespec __user *tsp,
+	const compat_sigset_t __user *sigmask, compat_size_t sigsetsize)
+{
+	compat_sigset_t ss32;
+	sigset_t ksigmask, sigsaved;
+	struct compat_timespec ts;
+	s64 timeout = -1;
+	int ret;
+
+	if (tsp) {
+		if (copy_from_user(&ts, tsp, sizeof(ts)))
+			return -EFAULT;
+
+		/* We assume that ts.tv_sec is always lower than
+		   the number of seconds that can be expressed in
+		   an s64. Otherwise the compiler bitches at us */
+		timeout = ROUND_UP(ts.tv_sec, 1000000000/HZ);
+		timeout += ts.tv_sec * HZ;
+	}
+
+	if (sigmask) {
+		if (sigsetsize |= sizeof(compat_sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+			return -EFAULT;
+		sigset_from_compat(&ksigmask, &ss32);
+
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	ret = do_sys_poll(ufds, nfds, &timeout);
+
+	/* We can restart this syscall, usually */
+	if (ret == -EINTR) {
+		/*
+		 * Don't restore the signal mask yet. Let do_signal() deliver
+		 * the signal on the way back to userspace, before the signal
+		 * mask is restored.
+		 */
+		if (sigmask) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+				sizeof(sigsaved));
+			set_thread_flag(TIF_RESTORE_SIGMASK);
+		}
+		ret = -ERESTARTNOHAND;
+	} else if (sigmask)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	if (tsp && timeout >= 0) {
+		if (current->personality & STICKY_TIMEOUTS)
+			goto sticky;
+		/* Yes, we know it's actually an s64, but it's also positive. */
+		ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000;
+		ts.tv_sec = timeout;
+		if (copy_to_user(tsp, &ts, sizeof(ts))) {
+sticky:
+			/*
+			 * If an application puts its timeval in read-only
+			 * memory, we don't want the Linux-specific update to
+			 * the timeval to cause a fault after the select has
+			 * completed successfully. However, because we're not
+			 * updating the timeval, we can't restart the system
+			 * call.
+			 */
+			if (ret == -ERESTARTNOHAND && timeout >= 0)
+				ret = -EINTR;
+		}
+	}
+
+	return ret;
+}
+#endif /* TIF_RESTORE_SIGMASK */
+
 #if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)
 /* Stuff for NFS server syscalls... */
 struct compat_nfsctl_svc {
diff --git a/fs/select.c b/fs/select.c
index f10a10317d54..c0f02d36c60e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -179,12 +179,11 @@ get_max:
 #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
 #define POLLEX_SET (POLLPRI)
 
-int do_select(int n, fd_set_bits *fds, long *timeout)
+int do_select(int n, fd_set_bits *fds, s64 *timeout)
 {
 	struct poll_wqueues table;
 	poll_table *wait;
 	int retval, i;
-	long __timeout = *timeout;
 
 	rcu_read_lock();
 	retval = max_select_fd(n, fds);
@@ -196,11 +195,12 @@ int do_select(int n, fd_set_bits *fds, long *timeout)
 
 	poll_initwait(&table);
 	wait = &table.pt;
-	if (!__timeout)
+	if (!*timeout)
 		wait = NULL;
 	retval = 0;
 	for (;;) {
 		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
+		long __timeout;
 
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -255,22 +255,32 @@ int do_select(int n, fd_set_bits *fds, long *timeout)
 				*rexp = res_ex;
 		}
 		wait = NULL;
-		if (retval || !__timeout || signal_pending(current))
+		if (retval || !*timeout || signal_pending(current))
 			break;
 		if(table.error) {
 			retval = table.error;
 			break;
 		}
+
+		if (*timeout < 0) {
+			/* Wait indefinitely */
+			__timeout = MAX_SCHEDULE_TIMEOUT;
+		} else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {
+			/* Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in a loop */
+			__timeout = MAX_SCHEDULE_TIMEOUT - 1;
+			*timeout -= __timeout;
+		} else {
+			__timeout = *timeout;
+			*timeout = 0;
+		}
 		__timeout = schedule_timeout(__timeout);
+		if (*timeout >= 0)
+			*timeout += __timeout;
 	}
 	__set_current_state(TASK_RUNNING);
 
 	poll_freewait(&table);
 
-	/*
-	 * Up-to-date the caller timeout.
-	 */
-	*timeout = __timeout;
 	return retval;
 }
 
@@ -295,36 +305,14 @@ static void select_bits_free(void *bits, int size)
 #define MAX_SELECT_SECONDS \
 	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
 
-asmlinkage long
-sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
+static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
+			   fd_set __user *exp, s64 *timeout)
 {
 	fd_set_bits fds;
 	char *bits;
-	long timeout;
 	int ret, size, max_fdset;
 	struct fdtable *fdt;
 
-	timeout = MAX_SCHEDULE_TIMEOUT;
-	if (tvp) {
-		time_t sec, usec;
-
-		if (!access_ok(VERIFY_READ, tvp, sizeof(*tvp))
-		    || __get_user(sec, &tvp->tv_sec)
-		    || __get_user(usec, &tvp->tv_usec)) {
-			ret = -EFAULT;
-			goto out_nofds;
-		}
-
-		ret = -EINVAL;
-		if (sec < 0 || usec < 0)
-			goto out_nofds;
-
-		if ((unsigned long) sec < MAX_SELECT_SECONDS) {
-			timeout = ROUND_UP(usec, 1000000/HZ);
-			timeout += sec * (unsigned long) HZ;
-		}
-	}
-
 	ret = -EINVAL;
 	if (n < 0)
 		goto out_nofds;
@@ -362,18 +350,7 @@ sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s
 	zero_fd_set(n, fds.res_out);
 	zero_fd_set(n, fds.res_ex);
 
-	ret = do_select(n, &fds, &timeout);
-
-	if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
-		time_t sec = 0, usec = 0;
-		if (timeout) {
-			sec = timeout / HZ;
-			usec = timeout % HZ;
-			usec *= (1000000/HZ);
-		}
-		put_user(sec, &tvp->tv_sec);
-		put_user(usec, &tvp->tv_usec);
-	}
+	ret = do_select(n, &fds, timeout);
 
 	if (ret < 0)
 		goto out;
@@ -395,6 +372,154 @@ out_nofds:
 	return ret;
 }
 
+asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
+			fd_set __user *exp, struct timeval __user *tvp)
+{
+	s64 timeout = -1;
+	struct timeval tv;
+	int ret;
+
+	if (tvp) {
+		if (copy_from_user(&tv, tvp, sizeof(tv)))
+			return -EFAULT;
+
+		if (tv.tv_sec < 0 || tv.tv_usec < 0)
+			return -EINVAL;
+
+		/* Cast to u64 to make GCC stop complaining */
+		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
+			timeout = -1;	/* infinite */
+		else {
+			timeout = ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
+			timeout += tv.tv_sec * HZ;
+		}
+	}
+
+	ret = core_sys_select(n, inp, outp, exp, &timeout);
+
+	if (tvp) {
+		if (current->personality & STICKY_TIMEOUTS)
+			goto sticky;
+		tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
+		tv.tv_sec = timeout;
+		if (copy_to_user(tvp, &tv, sizeof(tv))) {
+sticky:
+			/*
+			 * If an application puts its timeval in read-only
+			 * memory, we don't want the Linux-specific update to
+			 * the timeval to cause a fault after the select has
+			 * completed successfully. However, because we're not
+			 * updating the timeval, we can't restart the system
+			 * call.
+			 */
+			if (ret == -ERESTARTNOHAND)
+				ret = -EINTR;
+		}
+	}
+
+	return ret;
+}
+
+#ifdef TIF_RESTORE_SIGMASK
+asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
+		fd_set __user *exp, struct timespec __user *tsp,
+		const sigset_t __user *sigmask, size_t sigsetsize)
+{
+	s64 timeout = MAX_SCHEDULE_TIMEOUT;
+	sigset_t ksigmask, sigsaved;
+	struct timespec ts;
+	int ret;
+
+	if (tsp) {
+		if (copy_from_user(&ts, tsp, sizeof(ts)))
+			return -EFAULT;
+
+		if (ts.tv_sec < 0 || ts.tv_nsec < 0)
+			return -EINVAL;
+
+		/* Cast to u64 to make GCC stop complaining */
+		if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
+			timeout = -1;	/* infinite */
+		else {
+			timeout = ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
+			timeout += ts.tv_sec * HZ;
+		}
+	}
+
+	if (sigmask) {
+		/* XXX: Don't preclude handling different sized sigset_t's.  */
+		if (sigsetsize != sizeof(sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+			return -EFAULT;
+
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	ret = core_sys_select(n, inp, outp, exp, &timeout);
+
+	if (tsp) {
+		if (current->personality & STICKY_TIMEOUTS)
+			goto sticky;
+		ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000;
+		ts.tv_sec = timeout;
+		if (copy_to_user(tsp, &ts, sizeof(ts))) {
+sticky:
+			/*
+			 * If an application puts its timeval in read-only
+			 * memory, we don't want the Linux-specific update to
+			 * the timeval to cause a fault after the select has
+			 * completed successfully. However, because we're not
+			 * updating the timeval, we can't restart the system
+			 * call.
+			 */
+			if (ret == -ERESTARTNOHAND)
+				ret = -EINTR;
+		}
+	}
+
+	if (ret == -ERESTARTNOHAND) {
+		/*
+		 * Don't restore the signal mask yet. Let do_signal() deliver
+		 * the signal on the way back to userspace, before the signal
+		 * mask is restored.
+		 */
+		if (sigmask) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+					sizeof(sigsaved));
+			set_thread_flag(TIF_RESTORE_SIGMASK);
+		}
+	} else if (sigmask)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	return ret;
+}
+
+/*
+ * Most architectures can't handle 7-argument syscalls. So we provide a
+ * 6-argument version where the sixth argument is a pointer to a structure
+ * which has a pointer to the sigset_t itself followed by a size_t containing
+ * the sigset size.
+ */
+asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp,
+	fd_set __user *exp, struct timespec __user *tsp, void __user *sig)
+{
+	size_t sigsetsize = 0;
+	sigset_t __user *up = NULL;
+
+	if (sig) {
+		if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
+		    || __get_user(up, (sigset_t * __user *)sig)
+		    || __get_user(sigsetsize,
+				(size_t * __user)(sig+sizeof(void *))))
+			return -EFAULT;
+	}
+
+	return sys_pselect7(n, inp, outp, exp, tsp, up, sigsetsize);
+}
+#endif /* TIF_RESTORE_SIGMASK */
+
 struct poll_list {
 	struct poll_list *next;
 	int len;
@@ -436,16 +561,19 @@ static void do_pollfd(unsigned int num, struct pollfd * fdpage,
 }
 
 static int do_poll(unsigned int nfds,  struct poll_list *list,
-			struct poll_wqueues *wait, long timeout)
+		   struct poll_wqueues *wait, s64 *timeout)
 {
 	int count = 0;
 	poll_table* pt = &wait->pt;
 
-	if (!timeout)
+	/* Optimise the no-wait case */
+	if (!(*timeout))
 		pt = NULL;
  
 	for (;;) {
 		struct poll_list *walk;
+		long __timeout;
+
 		set_current_state(TASK_INTERRUPTIBLE);
 		walk = list;
 		while(walk != NULL) {
@@ -453,18 +581,36 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 			walk = walk->next;
 		}
 		pt = NULL;
-		if (count || !timeout || signal_pending(current))
+		if (count || !*timeout || signal_pending(current))
 			break;
 		count = wait->error;
 		if (count)
 			break;
-		timeout = schedule_timeout(timeout);
+
+		if (*timeout < 0) {
+			/* Wait indefinitely */
+			__timeout = MAX_SCHEDULE_TIMEOUT;
+		} else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT-1)) {
+			/*
+			 * Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in
+			 * a loop
+			 */
+			__timeout = MAX_SCHEDULE_TIMEOUT - 1;
+			*timeout -= __timeout;
+		} else {
+			__timeout = *timeout;
+			*timeout = 0;
+		}
+
+		__timeout = schedule_timeout(__timeout);
+		if (*timeout >= 0)
+			*timeout += __timeout;
 	}
 	__set_current_state(TASK_RUNNING);
 	return count;
 }
 
-asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long timeout)
+int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
 {
 	struct poll_wqueues table;
  	int fdcount, err;
@@ -482,14 +628,6 @@ asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long ti
 	if (nfds > max_fdset && nfds > OPEN_MAX)
 		return -EINVAL;
 
-	if (timeout) {
-		/* Careful about overflow in the intermediate values */
-		if ((unsigned long) timeout < MAX_SCHEDULE_TIMEOUT / HZ)
-			timeout = (unsigned long)(timeout*HZ+999)/1000+1;
-		else /* Negative or overflow */
-			timeout = MAX_SCHEDULE_TIMEOUT;
-	}
-
 	poll_initwait(&table);
 
 	head = NULL;
@@ -519,6 +657,7 @@ asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long ti
 		}
 		i -= pp->len;
 	}
+
 	fdcount = do_poll(nfds, head, &table, timeout);
 
 	/* OK, now copy the revents fields back to user space. */
@@ -547,3 +686,98 @@ out_fds:
 	poll_freewait(&table);
 	return err;
 }
+
+asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
+			long timeout_msecs)
+{
+	s64 timeout_jiffies = 0;
+
+	if (timeout_msecs) {
+#if HZ > 1000
+		/* We can only overflow if HZ > 1000 */
+		if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ)
+			timeout_jiffies = -1;
+		else
+#endif
+			timeout_jiffies = msecs_to_jiffies(timeout_msecs);
+	}
+
+	return do_sys_poll(ufds, nfds, &timeout_jiffies);
+}
+
+#ifdef TIF_RESTORE_SIGMASK
+asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
+	struct timespec __user *tsp, const sigset_t __user *sigmask,
+	size_t sigsetsize)
+{
+	sigset_t ksigmask, sigsaved;
+	struct timespec ts;
+	s64 timeout = -1;
+	int ret;
+
+	if (tsp) {
+		if (copy_from_user(&ts, tsp, sizeof(ts)))
+			return -EFAULT;
+
+		/* Cast to u64 to make GCC stop complaining */
+		if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
+			timeout = -1;	/* infinite */
+		else {
+			timeout = ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
+			timeout += ts.tv_sec * HZ;
+		}
+	}
+
+	if (sigmask) {
+		/* XXX: Don't preclude handling different sized sigset_t's.  */
+		if (sigsetsize != sizeof(sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+			return -EFAULT;
+
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	ret = do_sys_poll(ufds, nfds, &timeout);
+
+	/* We can restart this syscall, usually */
+	if (ret == -EINTR) {
+		/*
+		 * Don't restore the signal mask yet. Let do_signal() deliver
+		 * the signal on the way back to userspace, before the signal
+		 * mask is restored.
+		 */
+		if (sigmask) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+					sizeof(sigsaved));
+			set_thread_flag(TIF_RESTORE_SIGMASK);
+		}
+		ret = -ERESTARTNOHAND;
+	} else if (sigmask)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	if (tsp && timeout >= 0) {
+		if (current->personality & STICKY_TIMEOUTS)
+			goto sticky;
+		/* Yes, we know it's actually an s64, but it's also positive. */
+		ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000;
+		ts.tv_sec = timeout;
+		if (copy_to_user(tsp, &ts, sizeof(ts))) {
+		sticky:
+			/*
+			 * If an application puts its timeval in read-only
+			 * memory, we don't want the Linux-specific update to
+			 * the timeval to cause a fault after the select has
+			 * completed successfully. However, because we're not
+			 * updating the timeval, we can't restart the system
+			 * call.
+			 */
+			if (ret == -ERESTARTNOHAND && timeout >= 0)
+				ret = -EINTR;
+		}
+	}
+
+	return ret;
+}
+#endif /* TIF_RESTORE_SIGMASK */
diff --git a/include/linux/poll.h b/include/linux/poll.h
index f6da702088f4..8e8f6098508a 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -92,7 +92,11 @@ void zero_fd_set(unsigned long nr, unsigned long *fdset)
 	memset(fdset, 0, FDS_BYTES(nr));
 }
 
-extern int do_select(int n, fd_set_bits *fds, long *timeout);
+#define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)
+
+extern int do_select(int n, fd_set_bits *fds, s64 *timeout);
+extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds,
+		       s64 *timeout);
 
 #endif /* KERNEL */
 
-- 
cgit v1.2.3-71-gd317


From 4f2d7680cb1ac5c5a70f3ba2447d5aa5c0a1643a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Thu, 19 Jan 2006 16:58:37 -0800
Subject: [NETFILTER] x_tables: Make XT_ALIGN align as strictly as necessary.

Or else we break on ppc32 and other 32-bit platforms.

Based upon a patch from Harald Welte.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/x_tables.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 472f04834809..59ff6c430cf6 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -19,7 +19,7 @@ struct xt_get_revision
 /* For standard target */
 #define XT_RETURN (-NF_REPEAT - 1)
 
-#define XT_ALIGN(s) (((s) + (__alignof__(void *)-1)) & ~(__alignof__(void *)-1))
+#define XT_ALIGN(s) (((s) + (__alignof__(u_int64_t)-1)) & ~(__alignof__(u_int64_t)-1))
 
 /* Standard return verdict, or do jump. */
 #define XT_STANDARD_TARGET ""
-- 
cgit v1.2.3-71-gd317