From 793b883ed12a6ae6e2901ddb5e038b77d6f0c0ac Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 14 Sep 2005 16:06:14 -0700
Subject: [PATCH] sky2: driver update.

Here is revised patch against netdev sky2 branch.
It includes whitespace fixes, all the changes from the previous
review as well as some optimizations and timing fixes to
solve some of the hangs.

The stall problem is better but not perfect. It appears that
under stress the chip can't keep up with the bus
and sends a pause frame, then hangs. This version is for
testing, and hopefully other eyes might see the root
cause of the problem.

I don't want to reinvent the ugly watchdog code in the syskonnect
version of sk98lin.  If you read it you will see, the original
driver writer and the hardware developer obviously didn't
understand each other.

Dual port support is included, but not tested yet. It did
require small change to NAPI since both ports share same
IRQ.
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 include/linux/netdevice.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7c717907896d..5e90557715ab 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -780,11 +780,15 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
 }
 
 /* Schedule rx intr now? */
+static inline int netif_rx_schedule_test(struct net_device *dev)
+{
+	return !test_and_set_bit(__LINK_STATE_RX_SCHED, &dev->state);
+}
 
+/* Schedule only if device is up */
 static inline int netif_rx_schedule_prep(struct net_device *dev)
 {
-	return netif_running(dev) &&
-		!test_and_set_bit(__LINK_STATE_RX_SCHED, &dev->state);
+	return netif_running(dev) && netif_rx_schedule_test(dev);
 }
 
 /* Add interface to tail of rx poll list. This assumes that _prep has
-- 
cgit v1.2.3-71-gd317


From c2373ee98982a1c842dfb213c398f388d4227e63 Mon Sep 17 00:00:00 2001
From: Mitch Williams <mitch.a.williams@intel.com>
Date: Wed, 9 Nov 2005 10:34:45 -0800
Subject: [PATCH] net: make dev_valid_name public

dev_valid_name() is a useful function.  Make it public.

Signed-off-by: Mitch Williams <mitch.a.williams@intel.com>
Acked-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/netdevice.h | 1 +
 net/core/dev.c            | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 936f8b76114e..4fceff0e59ec 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -684,6 +684,7 @@ extern int		netif_rx(struct sk_buff *skb);
 extern int		netif_rx_ni(struct sk_buff *skb);
 #define HAVE_NETIF_RECEIVE_SKB 1
 extern int		netif_receive_skb(struct sk_buff *skb);
+extern int		dev_valid_name(const char *name);
 extern int		dev_ioctl(unsigned int cmd, void __user *);
 extern int		dev_ethtool(struct ifreq *);
 extern unsigned		dev_get_flags(const struct net_device *);
diff --git a/net/core/dev.c b/net/core/dev.c
index 0b48e294aafe..94e642ee6e2b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -626,7 +626,7 @@ struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mas
  *	Network device names need to be valid file names to
  *	to allow sysfs to work
  */
-static int dev_valid_name(const char *name)
+int dev_valid_name(const char *name)
 {
 	return !(*name == '\0' 
 		 || !strcmp(name, ".")
@@ -3269,6 +3269,7 @@ EXPORT_SYMBOL(__dev_get_by_index);
 EXPORT_SYMBOL(__dev_get_by_name);
 EXPORT_SYMBOL(__dev_remove_pack);
 EXPORT_SYMBOL(__skb_linearize);
+EXPORT_SYMBOL(dev_valid_name);
 EXPORT_SYMBOL(dev_add_pack);
 EXPORT_SYMBOL(dev_alloc_name);
 EXPORT_SYMBOL(dev_close);
-- 
cgit v1.2.3-71-gd317


From e74ac79956ecb56e71a398c57eb10fab8c58a562 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jgarzik@pobox.com>
Date: Mon, 14 Nov 2005 18:16:37 -0500
Subject: [libata] remove two unused fields from struct ata_port

---
 include/linux/libata.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index f2dbb684ce9e..83a83babff84 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -330,8 +330,6 @@ struct ata_port {
 
 	u8			ctl;	/* cache of ATA control register */
 	u8			last_ctl;	/* Cache last written value */
-	unsigned int		bus_state;
-	unsigned int		port_state;
 	unsigned int		pio_mask;
 	unsigned int		mwdma_mask;
 	unsigned int		udma_mask;
-- 
cgit v1.2.3-71-gd317


From 0a1225769763779288d759e904c4f5a660844ce4 Mon Sep 17 00:00:00 2001
From: "shemminger@osdl.org" <shemminger@osdl.org>
Date: Wed, 30 Nov 2005 11:45:17 -0800
Subject: [PATCH] sky2: change netif_rx_schedule_test to __netif_schedule_prep

I didn't like the name netif_rx_schedule_test(), in earlier patches
and changed to __netif_rx_schedule_prep to be more consistent.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/net/sky2.c        | 5 +++--
 include/linux/netdevice.h | 8 ++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index 2253140ff4dc..f56de9894208 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -1961,10 +1961,11 @@ static irqreturn_t sky2_intr(int irq, void *dev_id, struct pt_regs *regs)
 	if (status & Y2_IS_STAT_BMU) {
 		hw->intr_mask &= ~Y2_IS_STAT_BMU;
 		sky2_write32(hw, B0_IMSK, hw->intr_mask);
-		prefetch(&hw->st_le[hw->st_idx]);
 
-		if (netif_rx_schedule_test(dev0))
+		if (likely(__netif_rx_schedule_prep(dev0))) {
+			prefetch(&hw->st_le[hw->st_idx]);
 			__netif_rx_schedule(dev0);
+		}
 	}
 
 	if (status & Y2_IS_IRQ_PHY1)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 07e114d48bbb..7fda03d338d1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -802,16 +802,16 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
 	return (1 << debug_value) - 1;
 }
 
-/* Schedule rx intr now? */
-static inline int netif_rx_schedule_test(struct net_device *dev)
+/* Test if receive needs to be scheduled */
+static inline int __netif_rx_schedule_prep(struct net_device *dev)
 {
 	return !test_and_set_bit(__LINK_STATE_RX_SCHED, &dev->state);
 }
 
-/* Schedule only if device is up */
+/* Test if receive needs to be scheduled but only if up */
 static inline int netif_rx_schedule_prep(struct net_device *dev)
 {
-	return netif_running(dev) && netif_rx_schedule_test(dev);
+	return netif_running(dev) && __netif_rx_schedule_prep(dev);
 }
 
 /* Add interface to tail of rx poll list. This assumes that _prep has
-- 
cgit v1.2.3-71-gd317


From a22e2eb0710798009b8e696ae911aef745089dd6 Mon Sep 17 00:00:00 2001
From: Albert Lee <albertcc@tw.ibm.com>
Date: Mon, 5 Dec 2005 15:38:02 +0800
Subject: [PATCH] libata: move err_mask to ata_queued_cmd

  - remove err_mask from the parameter list of the complete functions
  - move err_mask to ata_queued_cmd
  - initialize qc->err_mask when needed
  - for each function call to ata_qc_complete(), replace the err_mask parameter with qc->err_mask.

Signed-off-by: Albert Lee <albertcc@tw.ibm.com>

===============
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/ahci.c         | 12 ++++++++----
 drivers/scsi/libata-core.c  | 32 +++++++++++++++++++-------------
 drivers/scsi/libata-scsi.c  | 18 ++++++++++--------
 drivers/scsi/libata.h       |  2 +-
 drivers/scsi/pdc_adma.c     | 11 +++++------
 drivers/scsi/sata_mv.c      |  9 ++++++---
 drivers/scsi/sata_promise.c | 14 ++++++++------
 drivers/scsi/sata_qstor.c   |  7 ++++---
 drivers/scsi/sata_sil24.c   | 15 ++++++++++-----
 drivers/scsi/sata_sx4.c     | 15 ++++++++++-----
 include/linux/libata.h      |  7 +++++--
 11 files changed, 86 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c
index cfbdd3f071b6..887eaa2a3ebf 100644
--- a/drivers/scsi/ahci.c
+++ b/drivers/scsi/ahci.c
@@ -643,7 +643,8 @@ static void ahci_eng_timeout(struct ata_port *ap)
 	 	 * not being called from the SCSI EH.
 	 	 */
 		qc->scsidone = scsi_finish_command;
-		ata_qc_complete(qc, AC_ERR_OTHER);
+		qc->err_mask |= AC_ERR_OTHER;
+		ata_qc_complete(qc);
 	}
 
 	spin_unlock_irqrestore(&host_set->lock, flags);
@@ -664,7 +665,8 @@ static inline int ahci_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
 	ci = readl(port_mmio + PORT_CMD_ISSUE);
 	if (likely((ci & 0x1) == 0)) {
 		if (qc) {
-			ata_qc_complete(qc, 0);
+			assert(qc->err_mask == 0);
+			ata_qc_complete(qc);
 			qc = NULL;
 		}
 	}
@@ -681,8 +683,10 @@ static inline int ahci_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
 		/* command processing has stopped due to error; restart */
 		ahci_restart_port(ap, status);
 
-		if (qc)
-			ata_qc_complete(qc, err_mask);
+		if (qc) {
+			qc->err_mask |= AC_ERR_OTHER;
+			ata_qc_complete(qc);
+		}
 	}
 
 	return 1;
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 0a959566f964..f56b4daf4189 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1053,9 +1053,9 @@ static int ata_qc_wait_err(struct ata_queued_cmd *qc,
 
 	if (wait_for_completion_timeout(wait, 30 * HZ) < 1) {
 		/* timeout handling */
-		unsigned int err_mask = ac_err_mask(ata_chk_status(qc->ap));
+		qc->err_mask |= ac_err_mask(ata_chk_status(qc->ap));
 
-		if (!err_mask) {
+		if (!qc->err_mask) {
 			printk(KERN_WARNING "ata%u: slow completion (cmd %x)\n",
 			       qc->ap->id, qc->tf.command);
 		} else {
@@ -1064,7 +1064,7 @@ static int ata_qc_wait_err(struct ata_queued_cmd *qc,
 			rc = -EIO;
 		}
 
-		ata_qc_complete(qc, err_mask);
+		ata_qc_complete(qc);
 	}
 
 	return rc;
@@ -1175,6 +1175,7 @@ retry:
 				qc->cursg_ofs = 0;
 				qc->cursect = 0;
 				qc->nsect = 1;
+				qc->err_mask = 0;
 				goto retry;
 			}
 		}
@@ -2777,7 +2778,7 @@ skip_map:
  *	None.  (grabs host lock)
  */
 
-void ata_poll_qc_complete(struct ata_queued_cmd *qc, unsigned int err_mask)
+void ata_poll_qc_complete(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
 	unsigned long flags;
@@ -2785,7 +2786,7 @@ void ata_poll_qc_complete(struct ata_queued_cmd *qc, unsigned int err_mask)
 	spin_lock_irqsave(&ap->host_set->lock, flags);
 	ap->flags &= ~ATA_FLAG_NOINTR;
 	ata_irq_on(ap);
-	ata_qc_complete(qc, err_mask);
+	ata_qc_complete(qc);
 	spin_unlock_irqrestore(&ap->host_set->lock, flags);
 }
 
@@ -2885,7 +2886,8 @@ static int ata_pio_complete (struct ata_port *ap)
 
 	ap->hsm_task_state = HSM_ST_IDLE;
 
-	ata_poll_qc_complete(qc, 0);
+	assert(qc->err_mask == 0);
+	ata_poll_qc_complete(qc);
 
 	/* another command may start at this point */
 
@@ -3261,7 +3263,8 @@ static void ata_pio_error(struct ata_port *ap)
 
 	ap->hsm_task_state = HSM_ST_IDLE;
 
-	ata_poll_qc_complete(qc, AC_ERR_ATA_BUS);
+	qc->err_mask |= AC_ERR_ATA_BUS;
+	ata_poll_qc_complete(qc);
 }
 
 static void ata_pio_task(void *_data)
@@ -3363,7 +3366,8 @@ static void ata_qc_timeout(struct ata_queued_cmd *qc)
 		       ap->id, qc->tf.command, drv_stat, host_stat);
 
 		/* complete taskfile transaction */
-		ata_qc_complete(qc, ac_err_mask(drv_stat));
+		qc->err_mask |= ac_err_mask(drv_stat);
+		ata_qc_complete(qc);
 		break;
 	}
 
@@ -3462,7 +3466,7 @@ struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap,
 	return qc;
 }
 
-int ata_qc_complete_noop(struct ata_queued_cmd *qc, unsigned int err_mask)
+int ata_qc_complete_noop(struct ata_queued_cmd *qc)
 {
 	return 0;
 }
@@ -3521,7 +3525,7 @@ void ata_qc_free(struct ata_queued_cmd *qc)
  *	spin_lock_irqsave(host_set lock)
  */
 
-void ata_qc_complete(struct ata_queued_cmd *qc, unsigned int err_mask)
+void ata_qc_complete(struct ata_queued_cmd *qc)
 {
 	int rc;
 
@@ -3538,7 +3542,7 @@ void ata_qc_complete(struct ata_queued_cmd *qc, unsigned int err_mask)
 	qc->flags &= ~ATA_QCFLAG_ACTIVE;
 
 	/* call completion callback */
-	rc = qc->complete_fn(qc, err_mask);
+	rc = qc->complete_fn(qc);
 
 	/* if callback indicates not to complete command (non-zero),
 	 * return immediately
@@ -3976,7 +3980,8 @@ inline unsigned int ata_host_intr (struct ata_port *ap,
 		ap->ops->irq_clear(ap);
 
 		/* complete taskfile transaction */
-		ata_qc_complete(qc, ac_err_mask(status));
+		qc->err_mask |= ac_err_mask(status);
+		ata_qc_complete(qc);
 		break;
 
 	default:
@@ -4111,7 +4116,8 @@ static void atapi_packet_task(void *_data)
 err_out_status:
 	status = ata_chk_status(ap);
 err_out:
-	ata_poll_qc_complete(qc, __ac_err_mask(status));
+	qc->err_mask |= __ac_err_mask(status);
+	ata_poll_qc_complete(qc);
 }
 
 
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index ef763ed9a0e5..2aef41112c43 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -1203,12 +1203,11 @@ nothing_to_do:
 	return 1;
 }
 
-static int ata_scsi_qc_complete(struct ata_queued_cmd *qc,
-				unsigned int err_mask)
+static int ata_scsi_qc_complete(struct ata_queued_cmd *qc)
 {
 	struct scsi_cmnd *cmd = qc->scsicmd;
 	u8 *cdb = cmd->cmnd;
- 	int need_sense = (err_mask != 0);
+ 	int need_sense = (qc->err_mask != 0);
 
 	/* For ATA pass thru (SAT) commands, generate a sense block if
 	 * user mandated it or if there's an error.  Note that if we
@@ -1955,9 +1954,9 @@ void ata_scsi_badcmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *), u8
 	done(cmd);
 }
 
-static int atapi_sense_complete(struct ata_queued_cmd *qc,unsigned int err_mask)
+static int atapi_sense_complete(struct ata_queued_cmd *qc)
 {
-	if (err_mask && ((err_mask & AC_ERR_DEV) == 0))
+	if (qc->err_mask && ((qc->err_mask & AC_ERR_DEV) == 0))
 		/* FIXME: not quite right; we don't want the
 		 * translation of taskfile registers into
 		 * a sense descriptors, since that's only
@@ -2015,15 +2014,18 @@ static void atapi_request_sense(struct ata_queued_cmd *qc)
 
 	qc->complete_fn = atapi_sense_complete;
 
-	if (ata_qc_issue(qc))
-		ata_qc_complete(qc, AC_ERR_OTHER);
+	if (ata_qc_issue(qc)) {
+		qc->err_mask |= AC_ERR_OTHER;
+		ata_qc_complete(qc);
+	}
 
 	DPRINTK("EXIT\n");
 }
 
-static int atapi_qc_complete(struct ata_queued_cmd *qc, unsigned int err_mask)
+static int atapi_qc_complete(struct ata_queued_cmd *qc)
 {
 	struct scsi_cmnd *cmd = qc->scsicmd;
+	unsigned int err_mask = qc->err_mask;
 
 	VPRINTK("ENTER, err_mask 0x%X\n", err_mask);
 
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index 8ebaa694d18e..686255df76b8 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -39,7 +39,7 @@ struct ata_scsi_args {
 
 /* libata-core.c */
 extern int atapi_enabled;
-extern int ata_qc_complete_noop(struct ata_queued_cmd *qc, unsigned int err_mask);
+extern int ata_qc_complete_noop(struct ata_queued_cmd *qc);
 extern struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap,
 				      struct ata_device *dev);
 extern void ata_rwcmd_protocol(struct ata_queued_cmd *qc);
diff --git a/drivers/scsi/pdc_adma.c b/drivers/scsi/pdc_adma.c
index f557f17ca00c..e8df0c9ec1e6 100644
--- a/drivers/scsi/pdc_adma.c
+++ b/drivers/scsi/pdc_adma.c
@@ -464,14 +464,12 @@ static inline unsigned int adma_intr_pkt(struct ata_host_set *host_set)
 			continue;
 		qc = ata_qc_from_tag(ap, ap->active_tag);
 		if (qc && (!(qc->tf.ctl & ATA_NIEN))) {
-			unsigned int err_mask = 0;
-
 			if ((status & (aPERR | aPSD | aUIRQ)))
-				err_mask = AC_ERR_OTHER;
+				qc->err_mask |= AC_ERR_OTHER;
 			else if (pp->pkt[0] != cDONE)
-				err_mask = AC_ERR_OTHER;
+				qc->err_mask |= AC_ERR_OTHER;
 
-			ata_qc_complete(qc, err_mask);
+			ata_qc_complete(qc);
 		}
 	}
 	return handled;
@@ -501,7 +499,8 @@ static inline unsigned int adma_intr_mmio(struct ata_host_set *host_set)
 		
 				/* complete taskfile transaction */
 				pp->state = adma_state_idle;
-				ata_qc_complete(qc, ac_err_mask(status));
+				qc->err_mask |= ac_err_mask(status);
+				ata_qc_complete(qc);
 				handled = 1;
 			}
 		}
diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c
index c94176693d1f..3e7866b51ac6 100644
--- a/drivers/scsi/sata_mv.c
+++ b/drivers/scsi/sata_mv.c
@@ -1242,8 +1242,10 @@ static void mv_host_intr(struct ata_host_set *host_set, u32 relevant,
 				VPRINTK("port %u IRQ found for qc, "
 					"ata_status 0x%x\n", port,ata_status);
 				/* mark qc status appropriately */
-				if (!(qc->tf.ctl & ATA_NIEN))
-					ata_qc_complete(qc, err_mask);
+				if (!(qc->tf.ctl & ATA_NIEN)) {
+					qc->err_mask |= err_mask;
+					ata_qc_complete(qc);
+				}
 			}
 		}
 	}
@@ -1864,7 +1866,8 @@ static void mv_eng_timeout(struct ata_port *ap)
 	 	 */
 		spin_lock_irqsave(&ap->host_set->lock, flags);
 		qc->scsidone = scsi_finish_command;
-		ata_qc_complete(qc, AC_ERR_OTHER);
+		qc->err_mask |= AC_ERR_OTHER;
+		ata_qc_complete(qc);
 		spin_unlock_irqrestore(&ap->host_set->lock, flags);
 	}
 }
diff --git a/drivers/scsi/sata_promise.c b/drivers/scsi/sata_promise.c
index 02089069b0f6..e2e146a14f97 100644
--- a/drivers/scsi/sata_promise.c
+++ b/drivers/scsi/sata_promise.c
@@ -401,7 +401,8 @@ static void pdc_eng_timeout(struct ata_port *ap)
 	case ATA_PROT_NODATA:
 		printk(KERN_ERR "ata%u: command timeout\n", ap->id);
 		drv_stat = ata_wait_idle(ap);
-		ata_qc_complete(qc, __ac_err_mask(drv_stat));
+		qc->err_mask |= __ac_err_mask(drv_stat);
+		ata_qc_complete(qc);
 		break;
 
 	default:
@@ -410,7 +411,8 @@ static void pdc_eng_timeout(struct ata_port *ap)
 		printk(KERN_ERR "ata%u: unknown timeout, cmd 0x%x stat 0x%x\n",
 		       ap->id, qc->tf.command, drv_stat);
 
-		ata_qc_complete(qc, ac_err_mask(drv_stat));
+		qc->err_mask |= ac_err_mask(drv_stat);
+		ata_qc_complete(qc);
 		break;
 	}
 
@@ -422,21 +424,21 @@ out:
 static inline unsigned int pdc_host_intr( struct ata_port *ap,
                                           struct ata_queued_cmd *qc)
 {
-	unsigned int handled = 0, err_mask = 0;
+	unsigned int handled = 0;
 	u32 tmp;
 	void __iomem *mmio = (void __iomem *) ap->ioaddr.cmd_addr + PDC_GLOBAL_CTL;
 
 	tmp = readl(mmio);
 	if (tmp & PDC_ERR_MASK) {
-		err_mask = AC_ERR_DEV;
+		qc->err_mask |= AC_ERR_DEV;
 		pdc_reset_port(ap);
 	}
 
 	switch (qc->tf.protocol) {
 	case ATA_PROT_DMA:
 	case ATA_PROT_NODATA:
-		err_mask |= ac_err_mask(ata_wait_idle(ap));
-		ata_qc_complete(qc, err_mask);
+		qc->err_mask |= ac_err_mask(ata_wait_idle(ap));
+		ata_qc_complete(qc);
 		handled = 1;
 		break;
 
diff --git a/drivers/scsi/sata_qstor.c b/drivers/scsi/sata_qstor.c
index 6b9c3ae07cb3..de05e2883f9c 100644
--- a/drivers/scsi/sata_qstor.c
+++ b/drivers/scsi/sata_qstor.c
@@ -409,8 +409,8 @@ static inline unsigned int qs_intr_pkt(struct ata_host_set *host_set)
 					case 3: /* device error */
 						pp->state = qs_state_idle;
 						qs_enter_reg_mode(qc->ap);
-						ata_qc_complete(qc,
-							ac_err_mask(sDST));
+						qc->err_mask |= ac_err_mask(sDST);
+						ata_qc_complete(qc);
 						break;
 					default:
 						break;
@@ -447,7 +447,8 @@ static inline unsigned int qs_intr_mmio(struct ata_host_set *host_set)
 
 				/* complete taskfile transaction */
 				pp->state = qs_state_idle;
-				ata_qc_complete(qc, ac_err_mask(status));
+				qc->err_mask |= ac_err_mask(status);
+				ata_qc_complete(qc);
 				handled = 1;
 			}
 		}
diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c
index e0d6f194f54f..a0ad3ed2200a 100644
--- a/drivers/scsi/sata_sil24.c
+++ b/drivers/scsi/sata_sil24.c
@@ -654,7 +654,8 @@ static void sil24_eng_timeout(struct ata_port *ap)
 	 */
 	printk(KERN_ERR "ata%u: command timeout\n", ap->id);
 	qc->scsidone = scsi_finish_command;
-	ata_qc_complete(qc, AC_ERR_OTHER);
+	qc->err_mask |= AC_ERR_OTHER;
+	ata_qc_complete(qc);
 
 	sil24_reset_controller(ap);
 }
@@ -711,8 +712,10 @@ static void sil24_error_intr(struct ata_port *ap, u32 slot_stat)
 		sil24_reset_controller(ap);
 	}
 
-	if (qc)
-		ata_qc_complete(qc, err_mask);
+	if (qc) {
+		qc->err_mask |= err_mask;
+		ata_qc_complete(qc);
+	}
 }
 
 static inline void sil24_host_intr(struct ata_port *ap)
@@ -734,8 +737,10 @@ static inline void sil24_host_intr(struct ata_port *ap)
 		 */
 		sil24_update_tf(ap);
 
-		if (qc)
-			ata_qc_complete(qc, ac_err_mask(pp->tf.command));
+		if (qc) {
+			qc->err_mask |= ac_err_mask(pp->tf.command);
+			ata_qc_complete(qc);
+		}
 	} else
 		sil24_error_intr(ap, slot_stat);
 }
diff --git a/drivers/scsi/sata_sx4.c b/drivers/scsi/sata_sx4.c
index 7c4b53575510..58da854a7c68 100644
--- a/drivers/scsi/sata_sx4.c
+++ b/drivers/scsi/sata_sx4.c
@@ -718,7 +718,8 @@ static inline unsigned int pdc20621_host_intr( struct ata_port *ap,
 			VPRINTK("ata%u: read hdma, 0x%x 0x%x\n", ap->id,
 				readl(mmio + 0x104), readl(mmio + PDC_HDMA_CTLSTAT));
 			/* get drive status; clear intr; complete txn */
-			ata_qc_complete(qc, ac_err_mask(ata_wait_idle(ap)));
+			qc->err_mask |= ac_err_mask(ata_wait_idle(ap));
+			ata_qc_complete(qc);
 			pdc20621_pop_hdma(qc);
 		}
 
@@ -756,7 +757,8 @@ static inline unsigned int pdc20621_host_intr( struct ata_port *ap,
 			VPRINTK("ata%u: write ata, 0x%x 0x%x\n", ap->id,
 				readl(mmio + 0x104), readl(mmio + PDC_HDMA_CTLSTAT));
 			/* get drive status; clear intr; complete txn */
-			ata_qc_complete(qc, ac_err_mask(ata_wait_idle(ap)));
+			qc->err_mask |= ac_err_mask(ata_wait_idle(ap));
+			ata_qc_complete(qc);
 			pdc20621_pop_hdma(qc);
 		}
 		handled = 1;
@@ -766,7 +768,8 @@ static inline unsigned int pdc20621_host_intr( struct ata_port *ap,
 
 		status = ata_busy_wait(ap, ATA_BUSY | ATA_DRQ, 1000);
 		DPRINTK("BUS_NODATA (drv_stat 0x%X)\n", status);
-		ata_qc_complete(qc, ac_err_mask(status));
+		qc->err_mask |= ac_err_mask(status);
+		ata_qc_complete(qc);
 		handled = 1;
 
 	} else {
@@ -881,7 +884,8 @@ static void pdc_eng_timeout(struct ata_port *ap)
 	case ATA_PROT_DMA:
 	case ATA_PROT_NODATA:
 		printk(KERN_ERR "ata%u: command timeout\n", ap->id);
-		ata_qc_complete(qc, __ac_err_mask(ata_wait_idle(ap)));
+		qc->err_mask |= __ac_err_mask(ata_wait_idle(ap));
+		ata_qc_complete(qc);
 		break;
 
 	default:
@@ -890,7 +894,8 @@ static void pdc_eng_timeout(struct ata_port *ap)
 		printk(KERN_ERR "ata%u: unknown timeout, cmd 0x%x stat 0x%x\n",
 		       ap->id, qc->tf.command, drv_stat);
 
-		ata_qc_complete(qc, ac_err_mask(drv_stat));
+		qc->err_mask |= ac_err_mask(drv_stat);
+		ata_qc_complete(qc);
 		break;
 	}
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 83a83babff84..e18ce039cdfd 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -194,7 +194,7 @@ struct ata_port;
 struct ata_queued_cmd;
 
 /* typedefs */
-typedef int (*ata_qc_cb_t) (struct ata_queued_cmd *qc, unsigned int err_mask);
+typedef int (*ata_qc_cb_t) (struct ata_queued_cmd *qc);
 
 struct ata_ioports {
 	unsigned long		cmd_addr;
@@ -279,6 +279,8 @@ struct ata_queued_cmd {
 	/* DO NOT iterate over __sg manually, use ata_for_each_sg() */
 	struct scatterlist	*__sg;
 
+	unsigned int		err_mask;
+
 	ata_qc_cb_t		complete_fn;
 
 	struct completion	*waiting;
@@ -475,7 +477,7 @@ extern void ata_bmdma_start (struct ata_queued_cmd *qc);
 extern void ata_bmdma_stop(struct ata_queued_cmd *qc);
 extern u8   ata_bmdma_status(struct ata_port *ap);
 extern void ata_bmdma_irq_clear(struct ata_port *ap);
-extern void ata_qc_complete(struct ata_queued_cmd *qc, unsigned int err_mask);
+extern void ata_qc_complete(struct ata_queued_cmd *qc);
 extern void ata_eng_timeout(struct ata_port *ap);
 extern void ata_scsi_simulate(u16 *id, struct scsi_cmnd *cmd,
 			      void (*done)(struct scsi_cmnd *));
@@ -667,6 +669,7 @@ static inline void ata_qc_reinit(struct ata_queued_cmd *qc)
 	qc->cursect = qc->cursg = qc->cursg_ofs = 0;
 	qc->nsect = 0;
 	qc->nbytes = qc->curbytes = 0;
+	qc->err_mask = 0;
 
 	ata_tf_init(qc->ap, &qc->tf, qc->dev->devno);
 }
-- 
cgit v1.2.3-71-gd317


From 95235ca2c20ac0b31a8eb39e2d599bcc3e9c9a10 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date: Fri, 2 Dec 2005 10:43:20 -0800
Subject: [CPUFREQ] CPU frequency display in /proc/cpuinfo

What is the value shown in "cpu MHz" of /proc/cpuinfo when CPUs are capable of
changing frequency?

Today the answer is: It depends.
On i386:
SMP kernel - It is always the boot frequency
UP kernel - Scales with the frequency change and shows that was last set.

On x86_64:
There is one single variable cpu_khz that gets written by all the CPUs. So,
the frequency set by last CPU will be seen on /proc/cpuinfo of all the
CPUs in the system. What you see also depends on whether you have constant_tsc
capable CPU or not.

On ia64:
It is always boot time frequency of a particular CPU that gets displayed.

The patch below changes this to:
Show the last known frequency of the particular CPU, when cpufreq is present. If
cpu doesnot support changing of frequency through cpufreq, then boot frequency
will be shown. The patch affects i386, x86_64 and ia64 architectures.

Signed-off-by: Venkatesh Pallipadi<venkatesh.pallipadi@intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/i386/kernel/cpu/proc.c |  6 +++++-
 arch/ia64/kernel/setup.c    |  8 +++++++-
 arch/x86_64/kernel/setup.c  |  6 +++++-
 drivers/cpufreq/cpufreq.c   | 24 ++++++++++++++++++++++++
 include/linux/cpufreq.h     | 10 ++++++++++
 5 files changed, 51 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index e7921315ae9d..6d91b274589c 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -3,6 +3,7 @@
 #include <linux/string.h>
 #include <asm/semaphore.h>
 #include <linux/seq_file.h>
+#include <linux/cpufreq.h>
 
 /*
  *	Get CPU information for use by the procfs.
@@ -86,8 +87,11 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		seq_printf(m, "stepping\t: unknown\n");
 
 	if ( cpu_has(c, X86_FEATURE_TSC) ) {
+		unsigned int freq = cpufreq_quick_get(n);
+		if (!freq)
+			freq = cpu_khz;
 		seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
-			cpu_khz / 1000, (cpu_khz % 1000));
+			freq / 1000, (freq % 1000));
 	}
 
 	/* Cache size */
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 5add0bcf87a7..088e5dded8dc 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -43,6 +43,7 @@
 #include <linux/initrd.h>
 #include <linux/platform.h>
 #include <linux/pm.h>
+#include <linux/cpufreq.h>
 
 #include <asm/ia32.h>
 #include <asm/machvec.h>
@@ -517,6 +518,7 @@ show_cpuinfo (struct seq_file *m, void *v)
 	char family[32], features[128], *cp, sep;
 	struct cpuinfo_ia64 *c = v;
 	unsigned long mask;
+	unsigned int proc_freq;
 	int i;
 
 	mask = c->features;
@@ -549,6 +551,10 @@ show_cpuinfo (struct seq_file *m, void *v)
 		sprintf(cp, " 0x%lx", mask);
 	}
 
+	proc_freq = cpufreq_quick_get(cpunum);
+	if (!proc_freq)
+		proc_freq = c->proc_freq / 1000;
+
 	seq_printf(m,
 		   "processor  : %d\n"
 		   "vendor     : %s\n"
@@ -565,7 +571,7 @@ show_cpuinfo (struct seq_file *m, void *v)
 		   "BogoMIPS   : %lu.%02lu\n",
 		   cpunum, c->vendor, family, c->model, c->revision, c->archrev,
 		   features, c->ppn, c->number,
-		   c->proc_freq / 1000000, c->proc_freq % 1000000,
+		   proc_freq / 1000, proc_freq % 1000,
 		   c->itc_freq / 1000000, c->itc_freq % 1000000,
 		   lpj*HZ/500000, (lpj*HZ/5000) % 100);
 #ifdef CONFIG_SMP
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 750e01dcbdf4..64c4534b930c 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -42,6 +42,7 @@
 #include <linux/edd.h>
 #include <linux/mmzone.h>
 #include <linux/kexec.h>
+#include <linux/cpufreq.h>
 
 #include <asm/mtrr.h>
 #include <asm/uaccess.h>
@@ -1256,8 +1257,11 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		seq_printf(m, "stepping\t: unknown\n");
 	
 	if (cpu_has(c,X86_FEATURE_TSC)) {
+		unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
+		if (!freq)
+			freq = cpu_khz;
 		seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
-			     cpu_khz / 1000, (cpu_khz % 1000));
+			     freq / 1000, (freq % 1000));
 	}
 
 	/* Cache size */
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 815902c2c856..a9163d02983a 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -822,6 +822,30 @@ static void cpufreq_out_of_sync(unsigned int cpu, unsigned int old_freq, unsigne
 }
 
 
+/** 
+ * cpufreq_quick_get - get the CPU frequency (in kHz) frpm policy->cur
+ * @cpu: CPU number
+ *
+ * This is the last known freq, without actually getting it from the driver.
+ * Return value will be same as what is shown in scaling_cur_freq in sysfs.
+ */
+unsigned int cpufreq_quick_get(unsigned int cpu)
+{
+	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
+	unsigned int ret = 0;
+
+	if (policy) {
+		down(&policy->lock);
+		ret = policy->cur;
+		up(&policy->lock);
+		cpufreq_cpu_put(policy);
+	}
+
+	return (ret);
+}
+EXPORT_SYMBOL(cpufreq_quick_get);
+
+
 /** 
  * cpufreq_get - get the current CPU frequency (in kHz)
  * @cpu: CPU number
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index d068176b7ad7..c31650df9241 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -256,6 +256,16 @@ int cpufreq_update_policy(unsigned int cpu);
 /* query the current CPU frequency (in kHz). If zero, cpufreq couldn't detect it */
 unsigned int cpufreq_get(unsigned int cpu);
 
+/* query the last known CPU freq (in kHz). If zero, cpufreq couldn't detect it */
+#ifdef CONFIG_CPU_FREQ
+unsigned int cpufreq_quick_get(unsigned int cpu);
+#else
+static inline unsigned int cpufreq_quick_get(unsigned int cpu)
+{
+	return 0;
+}
+#endif
+
 
 /*********************************************************************
  *                       CPUFREQ DEFAULT GOVERNOR                    *
-- 
cgit v1.2.3-71-gd317


From a2a7a662f80d8b7f2295a36de1f9b033ed0b910c Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Tue, 13 Dec 2005 14:48:31 +0900
Subject: [PATCH] libata: implement ata_exec_internal()

This patch implements ata_exec_internal() function which performs
libata internal command execution.  Previously, this was done by each
user by manually initializing a qc, issueing it, waiting for its
completion and handling errors.  In addition to obvious code
factoring, using ata_exec_internal() fixes the following bugs.

* qc not freed on issue failure
* ap->qactive clearing could race with the next internal command
* race between timeout handling and irq
* ignoring error condition not represented in tf->status

Also, qc & hardware are not accessed anymore once it's completed,
making internal commands more conformant with general semantics.
ata_exec_internal() also makes it easy to issue internal commands from
multiple threads if that becomes necessary.

This patch only implements ata_exec_internal().  A following patch
will convert all users.

Signed-off-by: Tejun Heo <htejun@gmail.com>

--

Jeff, all patches have been regenerated against upstream branch as of
today.  (575ab52a218e4ff0667a6cbd972c3af443ee8713)

Also, I took out a debug printk from ata_exec_internal (don't know how
that one got left there).  Other than that, all patches are identical
to the previous posting.

Thanks. :-)
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/libata.h     |  2 +
 2 files changed, 101 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index a0060cf31e0d..de80abeab065 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1046,6 +1046,105 @@ static unsigned int ata_pio_modes(const struct ata_device *adev)
 	return modes;
 }
 
+struct ata_exec_internal_arg {
+	unsigned int err_mask;
+	struct ata_taskfile *tf;
+	struct completion *waiting;
+};
+
+int ata_qc_complete_internal(struct ata_queued_cmd *qc)
+{
+	struct ata_exec_internal_arg *arg = qc->private_data;
+	struct completion *waiting = arg->waiting;
+
+	if (!(qc->err_mask & ~AC_ERR_DEV))
+		qc->ap->ops->tf_read(qc->ap, arg->tf);
+	arg->err_mask = qc->err_mask;
+	arg->waiting = NULL;
+	complete(waiting);
+
+	return 0;
+}
+
+/**
+ *	ata_exec_internal - execute libata internal command
+ *	@ap: Port to which the command is sent
+ *	@dev: Device to which the command is sent
+ *	@tf: Taskfile registers for the command and the result
+ *	@dma_dir: Data tranfer direction of the command
+ *	@buf: Data buffer of the command
+ *	@buflen: Length of data buffer
+ *
+ *	Executes libata internal command with timeout.  @tf contains
+ *	command on entry and result on return.  Timeout and error
+ *	conditions are reported via return value.  No recovery action
+ *	is taken after a command times out.  It's caller's duty to
+ *	clean up after timeout.
+ *
+ *	LOCKING:
+ *	None.  Should be called with kernel context, might sleep.
+ */
+
+static unsigned
+ata_exec_internal(struct ata_port *ap, struct ata_device *dev,
+		  struct ata_taskfile *tf,
+		  int dma_dir, void *buf, unsigned int buflen)
+{
+	u8 command = tf->command;
+	struct ata_queued_cmd *qc;
+	DECLARE_COMPLETION(wait);
+	unsigned long flags;
+	struct ata_exec_internal_arg arg;
+
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+
+	qc = ata_qc_new_init(ap, dev);
+	BUG_ON(qc == NULL);
+
+	qc->tf = *tf;
+	qc->dma_dir = dma_dir;
+	if (dma_dir != DMA_NONE) {
+		ata_sg_init_one(qc, buf, buflen);
+		qc->nsect = buflen / ATA_SECT_SIZE;
+	}
+
+	arg.waiting = &wait;
+	arg.tf = tf;
+	qc->private_data = &arg;
+	qc->complete_fn = ata_qc_complete_internal;
+
+	if (ata_qc_issue(qc))
+		goto issue_fail;
+
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+	if (!wait_for_completion_timeout(&wait, ATA_TMOUT_INTERNAL)) {
+		spin_lock_irqsave(&ap->host_set->lock, flags);
+
+		/* We're racing with irq here.  If we lose, the
+		 * following test prevents us from completing the qc
+		 * again.  If completion irq occurs after here but
+		 * before the caller cleans up, it will result in a
+		 * spurious interrupt.  We can live with that.
+		 */
+		if (arg.waiting) {
+			qc->err_mask = AC_ERR_OTHER;
+			ata_qc_complete(qc);
+			printk(KERN_WARNING "ata%u: qc timeout (cmd 0x%x)\n",
+			       ap->id, command);
+		}
+
+		spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	}
+
+	return arg.err_mask;
+
+ issue_fail:
+	ata_qc_free(qc);
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+	return AC_ERR_OTHER;
+}
+
 static int ata_qc_wait_err(struct ata_queued_cmd *qc,
 			   struct completion *wait)
 {
diff --git a/include/linux/libata.h b/include/linux/libata.h
index e18ce039cdfd..833e57afd54c 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -135,6 +135,8 @@ enum {
 	ATA_TMOUT_BOOT_QUICK	= 7 * HZ,	/* hueristic */
 	ATA_TMOUT_CDB		= 30 * HZ,
 	ATA_TMOUT_CDB_QUICK	= 5 * HZ,
+	ATA_TMOUT_INTERNAL	= 30 * HZ,
+	ATA_TMOUT_INTERNAL_QUICK = 5 * HZ,
 
 	/* ATA bus states */
 	BUS_UNKNOWN		= 0,
-- 
cgit v1.2.3-71-gd317


From b5632303401c231bf270ef36f1013e52caf4caf9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Tue, 13 Dec 2005 14:51:25 +0900
Subject: [PATCH] libata: remove unused qc->waiting

There is no user of qc->waiting left after ata_exec_internal()
changes.  Kill the field.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 14 ++------------
 include/linux/libata.h     |  2 --
 2 files changed, 2 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 1c4dbf3e9818..9ea102587914 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -3503,7 +3503,7 @@ struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap,
 static void __ata_qc_complete(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
-	unsigned int tag, do_clear = 0;
+	unsigned int tag;
 
 	qc->flags = 0;
 	tag = qc->tag;
@@ -3511,17 +3511,8 @@ static void __ata_qc_complete(struct ata_queued_cmd *qc)
 		if (tag == ap->active_tag)
 			ap->active_tag = ATA_TAG_POISON;
 		qc->tag = ATA_TAG_POISON;
-		do_clear = 1;
-	}
-
-	if (qc->waiting) {
-		struct completion *waiting = qc->waiting;
-		qc->waiting = NULL;
-		complete(waiting);
-	}
-
-	if (likely(do_clear))
 		clear_bit(tag, &ap->qactive);
+	}
 }
 
 /**
@@ -3537,7 +3528,6 @@ static void __ata_qc_complete(struct ata_queued_cmd *qc)
 void ata_qc_free(struct ata_queued_cmd *qc)
 {
 	assert(qc != NULL);	/* ata_qc_from_tag _might_ return NULL */
-	assert(qc->waiting == NULL);	/* nothing should be waiting */
 
 	__ata_qc_complete(qc);
 }
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 833e57afd54c..46337e71613e 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -285,8 +285,6 @@ struct ata_queued_cmd {
 
 	ata_qc_cb_t		complete_fn;
 
-	struct completion	*waiting;
-
 	void			*private_data;
 };
 
-- 
cgit v1.2.3-71-gd317


From 4e06cbd42c41f9e49fcfe5ee45c749eefaae9cf4 Mon Sep 17 00:00:00 2001
From: "Moore, Eric Dean" <Eric.Moore@lsil.com>
Date: Thu, 1 Dec 2005 16:51:02 -0700
Subject: [SCSI] pci_ids.h: add subclass code for SAS Controllers

Signed-off-by: Eric Moore <Eric.Moore@lsil.com>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 include/linux/pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 1e737e269db9..aa76fc4e38c2 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -15,6 +15,7 @@
 #define PCI_CLASS_STORAGE_FLOPPY	0x0102
 #define PCI_CLASS_STORAGE_IPI		0x0103
 #define PCI_CLASS_STORAGE_RAID		0x0104
+#define PCI_CLASS_STORAGE_SAS		0x0107
 #define PCI_CLASS_STORAGE_OTHER		0x0180
 
 #define PCI_BASE_CLASS_NETWORK		0x02
-- 
cgit v1.2.3-71-gd317


From 6e39b69e7ea9205c5f80aeac3ef999ab8fb1a4cc Mon Sep 17 00:00:00 2001
From: Mike Christie <michaelc@cs.wisc.edu>
Date: Fri, 11 Nov 2005 05:30:24 -0600
Subject: [SCSI] export blk layer functions needed for blk_execute_rq_nowait

To send async requests we need these two functions exported.

Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 block/ll_rw_blk.c      | 6 +++++-
 include/linux/blkdev.h | 5 +++++
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 99c9ca6d5992..c525b5a2b598 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -2306,6 +2306,8 @@ void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk,
 	generic_unplug_device(q);
 }
 
+EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
+
 /**
  * blk_execute_rq - insert a request into queue for execution
  * @q:		queue to insert the request in
@@ -2444,7 +2446,7 @@ void disk_round_stats(struct gendisk *disk)
 /*
  * queue lock must be held
  */
-static void __blk_put_request(request_queue_t *q, struct request *req)
+void __blk_put_request(request_queue_t *q, struct request *req)
 {
 	struct request_list *rl = req->rl;
 
@@ -2473,6 +2475,8 @@ static void __blk_put_request(request_queue_t *q, struct request *req)
 	}
 }
 
+EXPORT_SYMBOL_GPL(__blk_put_request);
+
 void blk_put_request(struct request *req)
 {
 	unsigned long flags;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a33a31e71bbc..9a68716dcf75 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -558,6 +558,7 @@ extern void blk_unregister_queue(struct gendisk *disk);
 extern void register_disk(struct gendisk *dev);
 extern void generic_make_request(struct bio *bio);
 extern void blk_put_request(struct request *);
+extern void __blk_put_request(request_queue_t *, struct request *);
 extern void blk_end_sync_rq(struct request *rq);
 extern void blk_attempt_remerge(request_queue_t *, struct request *);
 extern struct request *blk_get_request(request_queue_t *, int, gfp_t);
@@ -579,6 +580,10 @@ extern int blk_rq_map_kern(request_queue_t *, struct request *, void *, unsigned
 extern int blk_rq_map_user_iov(request_queue_t *, struct request *, struct sg_iovec *, int);
 extern int blk_execute_rq(request_queue_t *, struct gendisk *,
 			  struct request *, int);
+extern void blk_execute_rq_nowait(request_queue_t *, struct gendisk *,
+				  struct request *, int,
+				  void (*done)(struct request *));
+
 static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
 {
 	return bdev->bd_disk->queue;
-- 
cgit v1.2.3-71-gd317


From 6e68af666f5336254b5715dca591026b7324499a Mon Sep 17 00:00:00 2001
From: Mike Christie <michaelc@cs.wisc.edu>
Date: Fri, 11 Nov 2005 05:30:27 -0600
Subject: [SCSI] Convert SCSI mid-layer to scsi_execute_async

Add scsi helpers to create really-large-requests and convert
scsi-ml to scsi_execute_async().

Per Jens's previous comments, I placed this function in scsi_lib.c.
I made it follow all the queue's limits - I think I did at least :), so
I removed the warning on the function header.

I think the scsi_execute_* functions should eventually take a request_queue
and be placed some place where the dm-multipath hw_handler can use them
if that failover code is going to stay in the kernel. That conversion
patch will be sent in another mail though.

Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 drivers/scsi/scsi_error.c  |  47 ++-------
 drivers/scsi/scsi_lib.c    | 230 ++++++++++++++++++++++++++++++++++++++-------
 drivers/scsi/scsi_priv.h   |   1 -
 fs/bio.c                   |  20 ++++
 include/linux/bio.h        |   2 +
 include/scsi/scsi_device.h |   6 ++
 6 files changed, 233 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 18c5d2523014..53ea62d3b53d 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -1314,23 +1314,6 @@ int scsi_decide_disposition(struct scsi_cmnd *scmd)
 	}
 }
 
-/**
- * scsi_eh_lock_done - done function for eh door lock request
- * @scmd:	SCSI command block for the door lock request
- *
- * Notes:
- * 	We completed the asynchronous door lock request, and it has either
- * 	locked the door or failed.  We must free the command structures
- * 	associated with this request.
- **/
-static void scsi_eh_lock_done(struct scsi_cmnd *scmd)
-{
-	struct scsi_request *sreq = scmd->sc_request;
-
-	scsi_release_request(sreq);
-}
-
-
 /**
  * scsi_eh_lock_door - Prevent medium removal for the specified device
  * @sdev:	SCSI device to prevent medium removal
@@ -1353,29 +1336,17 @@ static void scsi_eh_lock_done(struct scsi_cmnd *scmd)
  **/
 static void scsi_eh_lock_door(struct scsi_device *sdev)
 {
-	struct scsi_request *sreq = scsi_allocate_request(sdev, GFP_KERNEL);
+	unsigned char cmnd[MAX_COMMAND_SIZE];
 
-	if (unlikely(!sreq)) {
-		printk(KERN_ERR "%s: request allocate failed,"
-		       "prevent media removal cmd not sent\n", __FUNCTION__);
-		return;
-	}
+	cmnd[0] = ALLOW_MEDIUM_REMOVAL;
+	cmnd[1] = 0;
+	cmnd[2] = 0;
+	cmnd[3] = 0;
+	cmnd[4] = SCSI_REMOVAL_PREVENT;
+	cmnd[5] = 0;
 
-	sreq->sr_cmnd[0] = ALLOW_MEDIUM_REMOVAL;
-	sreq->sr_cmnd[1] = 0;
-	sreq->sr_cmnd[2] = 0;
-	sreq->sr_cmnd[3] = 0;
-	sreq->sr_cmnd[4] = SCSI_REMOVAL_PREVENT;
-	sreq->sr_cmnd[5] = 0;
-	sreq->sr_data_direction = DMA_NONE;
-	sreq->sr_bufflen = 0;
-	sreq->sr_buffer = NULL;
-	sreq->sr_allowed = 5;
-	sreq->sr_done = scsi_eh_lock_done;
-	sreq->sr_timeout_per_command = 10 * HZ;
-	sreq->sr_cmd_len = COMMAND_SIZE(sreq->sr_cmnd[0]);
-
-	scsi_insert_special_req(sreq, 1);
+	scsi_execute_async(sdev, cmnd, DMA_NONE, NULL, 0, 0, 10 * HZ,
+			   5, NULL, NULL, GFP_KERNEL);
 }
 
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 1f2782767ca9..eb0cfbfbcf8f 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -63,39 +63,6 @@ static struct scsi_host_sg_pool scsi_sg_pools[] = {
 }; 	
 #undef SP
 
-
-/*
- * Function:    scsi_insert_special_req()
- *
- * Purpose:     Insert pre-formed request into request queue.
- *
- * Arguments:   sreq	- request that is ready to be queued.
- *              at_head	- boolean.  True if we should insert at head
- *                        of queue, false if we should insert at tail.
- *
- * Lock status: Assumed that lock is not held upon entry.
- *
- * Returns:     Nothing
- *
- * Notes:       This function is called from character device and from
- *              ioctl types of functions where the caller knows exactly
- *              what SCSI command needs to be issued.   The idea is that
- *              we merely inject the command into the queue (at the head
- *              for now), and then call the queue request function to actually
- *              process it.
- */
-int scsi_insert_special_req(struct scsi_request *sreq, int at_head)
-{
-	/*
-	 * Because users of this function are apt to reuse requests with no
-	 * modification, we have to sanitise the request flags here
-	 */
-	sreq->sr_request->flags &= ~REQ_DONTPREP;
-	blk_insert_request(sreq->sr_device->request_queue, sreq->sr_request,
-		       	   at_head, sreq);
-	return 0;
-}
-
 static void scsi_run_queue(struct request_queue *q);
 
 /*
@@ -249,8 +216,13 @@ void scsi_do_req(struct scsi_request *sreq, const void *cmnd,
 
 	/*
 	 * head injection *required* here otherwise quiesce won't work
+	 *
+	 * Because users of this function are apt to reuse requests with no
+	 * modification, we have to sanitise the request flags here
 	 */
-	scsi_insert_special_req(sreq, 1);
+	sreq->sr_request->flags &= ~REQ_DONTPREP;
+	blk_insert_request(sreq->sr_device->request_queue, sreq->sr_request,
+		       	   1, sreq);
 }
 EXPORT_SYMBOL(scsi_do_req);
 
@@ -327,6 +299,196 @@ int scsi_execute_req(struct scsi_device *sdev, const unsigned char *cmd,
 }
 EXPORT_SYMBOL(scsi_execute_req);
 
+struct scsi_io_context {
+	void *data;
+	void (*done)(void *data, char *sense, int result, int resid);
+	char sense[SCSI_SENSE_BUFFERSIZE];
+};
+
+static void scsi_end_async(struct request *req)
+{
+	struct scsi_io_context *sioc = req->end_io_data;
+
+	if (sioc->done)
+		sioc->done(sioc->data, sioc->sense, req->errors, req->data_len);
+
+	kfree(sioc);
+	__blk_put_request(req->q, req);
+}
+
+static int scsi_merge_bio(struct request *rq, struct bio *bio)
+{
+	struct request_queue *q = rq->q;
+
+	bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+	if (rq_data_dir(rq) == WRITE)
+		bio->bi_rw |= (1 << BIO_RW);
+	blk_queue_bounce(q, &bio);
+
+	if (!rq->bio)
+		blk_rq_bio_prep(q, rq, bio);
+	else if (!q->back_merge_fn(q, rq, bio))
+		return -EINVAL;
+	else {
+		rq->biotail->bi_next = bio;
+		rq->biotail = bio;
+		rq->hard_nr_sectors += bio_sectors(bio);
+		rq->nr_sectors = rq->hard_nr_sectors;
+	}
+
+	return 0;
+}
+
+static int scsi_bi_endio(struct bio *bio, unsigned int bytes_done, int error)
+{
+	if (bio->bi_size)
+		return 1;
+
+	bio_put(bio);
+	return 0;
+}
+
+/**
+ * scsi_req_map_sg - map a scatterlist into a request
+ * @rq:		request to fill
+ * @sg:		scatterlist
+ * @nsegs:	number of elements
+ * @bufflen:	len of buffer
+ * @gfp:	memory allocation flags
+ *
+ * scsi_req_map_sg maps a scatterlist into a request so that the
+ * request can be sent to the block layer. We do not trust the scatterlist
+ * sent to use, as some ULDs use that struct to only organize the pages.
+ */
+static int scsi_req_map_sg(struct request *rq, struct scatterlist *sgl,
+			   int nsegs, unsigned bufflen, gfp_t gfp)
+{
+	struct request_queue *q = rq->q;
+	int nr_pages = (bufflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	unsigned int data_len = 0, len, bytes, off;
+	struct page *page;
+	struct bio *bio = NULL;
+	int i, err, nr_vecs = 0;
+
+	for (i = 0; i < nsegs; i++) {
+		page = sgl[i].page;
+		off = sgl[i].offset;
+		len = sgl[i].length;
+		data_len += len;
+
+		while (len > 0) {
+			bytes = min_t(unsigned int, len, PAGE_SIZE - off);
+
+			if (!bio) {
+				nr_vecs = min_t(int, BIO_MAX_PAGES, nr_pages);
+				nr_pages -= nr_vecs;
+
+				bio = bio_alloc(gfp, nr_vecs);
+				if (!bio) {
+					err = -ENOMEM;
+					goto free_bios;
+				}
+				bio->bi_end_io = scsi_bi_endio;
+			}
+
+			if (bio_add_pc_page(q, bio, page, bytes, off) !=
+			    bytes) {
+				bio_put(bio);
+				err = -EINVAL;
+				goto free_bios;
+			}
+
+			if (bio->bi_vcnt >= nr_vecs) {
+				err = scsi_merge_bio(rq, bio);
+				if (err) {
+					bio_endio(bio, bio->bi_size, 0);
+					goto free_bios;
+				}
+				bio = NULL;
+			}
+
+			page++;
+			len -= bytes;
+			off = 0;
+		}
+	}
+
+	rq->buffer = rq->data = NULL;
+	rq->data_len = data_len;
+	return 0;
+
+free_bios:
+	while ((bio = rq->bio) != NULL) {
+		rq->bio = bio->bi_next;
+		/*
+		 * call endio instead of bio_put incase it was bounced
+		 */
+		bio_endio(bio, bio->bi_size, 0);
+	}
+
+	return err;
+}
+
+/**
+ * scsi_execute_async - insert request
+ * @sdev:	scsi device
+ * @cmd:	scsi command
+ * @data_direction: data direction
+ * @buffer:	data buffer (this can be a kernel buffer or scatterlist)
+ * @bufflen:	len of buffer
+ * @use_sg:	if buffer is a scatterlist this is the number of elements
+ * @timeout:	request timeout in seconds
+ * @retries:	number of times to retry request
+ * @flags:	or into request flags
+ **/
+int scsi_execute_async(struct scsi_device *sdev, const unsigned char *cmd,
+		       int data_direction, void *buffer, unsigned bufflen,
+		       int use_sg, int timeout, int retries, void *privdata,
+		       void (*done)(void *, char *, int, int), gfp_t gfp)
+{
+	struct request *req;
+	struct scsi_io_context *sioc;
+	int err = 0;
+	int write = (data_direction == DMA_TO_DEVICE);
+
+	sioc = kzalloc(sizeof(*sioc), gfp);
+	if (!sioc)
+		return DRIVER_ERROR << 24;
+
+	req = blk_get_request(sdev->request_queue, write, gfp);
+	if (!req)
+		goto free_sense;
+
+	if (use_sg)
+		err = scsi_req_map_sg(req, buffer, use_sg, bufflen, gfp);
+	else if (bufflen)
+		err = blk_rq_map_kern(req->q, req, buffer, bufflen, gfp);
+
+	if (err)
+		goto free_req;
+
+	req->cmd_len = COMMAND_SIZE(cmd[0]);
+	memcpy(req->cmd, cmd, req->cmd_len);
+	req->sense = sioc->sense;
+	req->sense_len = 0;
+	req->timeout = timeout;
+	req->flags |= REQ_BLOCK_PC | REQ_QUIET;
+	req->end_io_data = sioc;
+
+	sioc->data = privdata;
+	sioc->done = done;
+
+	blk_execute_rq_nowait(req->q, NULL, req, 1, scsi_end_async);
+	return 0;
+
+free_req:
+	blk_put_request(req);
+free_sense:
+	kfree(sioc);
+	return DRIVER_ERROR << 24;
+}
+EXPORT_SYMBOL_GPL(scsi_execute_async);
+
 /*
  * Function:    scsi_init_cmd_errh()
  *
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index a8d121c8fbcd..f04e7e11f57a 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -40,7 +40,6 @@ extern void scsi_exit_hosts(void);
 extern int scsi_dispatch_cmd(struct scsi_cmnd *cmd);
 extern int scsi_setup_command_freelist(struct Scsi_Host *shost);
 extern void scsi_destroy_command_freelist(struct Scsi_Host *shost);
-extern int scsi_insert_special_req(struct scsi_request *sreq, int);
 extern void scsi_init_cmd_from_req(struct scsi_cmnd *cmd,
 		struct scsi_request *sreq);
 extern void __scsi_release_request(struct scsi_request *sreq);
diff --git a/fs/bio.c b/fs/bio.c
index 460554b07ff9..4d21ee3873ec 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -385,6 +385,25 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
 	return len;
 }
 
+/**
+ *	bio_add_pc_page	-	attempt to add page to bio
+ *	@bio: destination bio
+ *	@page: page to add
+ *	@len: vec entry length
+ *	@offset: vec entry offset
+ *
+ *	Attempt to add a page to the bio_vec maplist. This can fail for a
+ *	number of reasons, such as the bio being full or target block
+ *	device limitations. The target block device must allow bio's
+ *      smaller than PAGE_SIZE, so it is always possible to add a single
+ *      page to an empty bio. This should only be used by REQ_PC bios.
+ */
+int bio_add_pc_page(request_queue_t *q, struct bio *bio, struct page *page,
+		    unsigned int len, unsigned int offset)
+{
+	return __bio_add_page(q, bio, page, len, offset);
+}
+
 /**
  *	bio_add_page	-	attempt to add page to bio
  *	@bio: destination bio
@@ -1228,6 +1247,7 @@ EXPORT_SYMBOL(bio_clone);
 EXPORT_SYMBOL(bio_phys_segments);
 EXPORT_SYMBOL(bio_hw_segments);
 EXPORT_SYMBOL(bio_add_page);
+EXPORT_SYMBOL(bio_add_pc_page);
 EXPORT_SYMBOL(bio_get_nr_vecs);
 EXPORT_SYMBOL(bio_map_user);
 EXPORT_SYMBOL(bio_unmap_user);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 685fd3720df5..b60ffe32cd21 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -292,6 +292,8 @@ extern struct bio *bio_clone(struct bio *, gfp_t);
 extern void bio_init(struct bio *);
 
 extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
+extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
+			   unsigned int, unsigned int);
 extern int bio_get_nr_vecs(struct block_device *);
 extern struct bio *bio_map_user(struct request_queue *, struct block_device *,
 				unsigned long, unsigned int, int);
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 063e32fe036c..e94ca4d36035 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -274,6 +274,12 @@ extern int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 extern int scsi_execute_req(struct scsi_device *sdev, const unsigned char *cmd,
 			    int data_direction, void *buffer, unsigned bufflen,
 			    struct scsi_sense_hdr *, int timeout, int retries);
+extern int scsi_execute_async(struct scsi_device *sdev,
+			      const unsigned char *cmd, int data_direction,
+			      void *buffer, unsigned bufflen, int use_sg,
+			      int timeout, int retries, void *privdata,
+			      void (*done)(void *, char *, int, int),
+			      gfp_t gfp);
 
 static inline unsigned int sdev_channel(struct scsi_device *sdev)
 {
-- 
cgit v1.2.3-71-gd317


From 17e01f216b611fc46956dcd9063aec4de75991e3 Mon Sep 17 00:00:00 2001
From: Mike Christie <michaelc@cs.wisc.edu>
Date: Fri, 11 Nov 2005 05:31:37 -0600
Subject: [SCSI] add retries field to request for REQ_BLOCK_PC use

For tape we need to control the retries. This patch adds a retries
counter on the request for REQ_BLOCK_PC commands originating from
scsi_execute* to use. REQ_BLOCK_PC commands comming from the block
layer SG_IO path continue to use the retires set in the ULD init_command.
(scsi_execute* does not set the gendisk so we do not execute
the init_command in that path).

Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 drivers/scsi/scsi_lib.c | 4 +++-
 include/linux/blkdev.h  | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index eb0cfbfbcf8f..365843a1561f 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -259,6 +259,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 	memcpy(req->cmd, cmd, req->cmd_len);
 	req->sense = sense;
 	req->sense_len = 0;
+	req->retries = retries;
 	req->timeout = timeout;
 	req->flags |= flags | REQ_BLOCK_PC | REQ_SPECIAL | REQ_QUIET;
 
@@ -472,6 +473,7 @@ int scsi_execute_async(struct scsi_device *sdev, const unsigned char *cmd,
 	req->sense = sioc->sense;
 	req->sense_len = 0;
 	req->timeout = timeout;
+	req->retries = retries;
 	req->flags |= REQ_BLOCK_PC | REQ_QUIET;
 	req->end_io_data = sioc;
 
@@ -1393,7 +1395,7 @@ static int scsi_prep_fn(struct request_queue *q, struct request *req)
 				cmd->sc_data_direction = DMA_NONE;
 			
 			cmd->transfersize = req->data_len;
-			cmd->allowed = 3;
+			cmd->allowed = req->retries;
 			cmd->timeout_per_command = req->timeout;
 			cmd->done = scsi_generic_done;
 		}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9a68716dcf75..509e9a03a328 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -184,6 +184,7 @@ struct request {
 	void *sense;
 
 	unsigned int timeout;
+	int retries;
 
 	/*
 	 * For Power Management requests
-- 
cgit v1.2.3-71-gd317


From defd94b75409b983f94548ea2f52ff5787ddb848 Mon Sep 17 00:00:00 2001
From: Mike Christie <michaelc@cs.wisc.edu>
Date: Mon, 5 Dec 2005 02:37:06 -0600
Subject: [SCSI] seperate max_sectors from max_hw_sectors

- export __blk_put_request and blk_execute_rq_nowait
needed for async REQ_BLOCK_PC requests
- seperate max_hw_sectors and max_sectors for block/scsi_ioctl.c and
SG_IO bio.c helpers per Jens's last comments. Since block/scsi_ioctl.c SG_IO was
already testing against max_sectors and SCSI-ml was setting max_sectors and
max_hw_sectors to the same value this does not change any scsi SG_IO behavior. It only
prepares ll_rw_blk.c, scsi_ioctl.c and bio.c for when SCSI-ml begins to set
a valid max_hw_sectors for all LLDs. Today if a LLD does not set it
SCSI-ml sets it to a safe default and some LLDs set it to a artificial low
value to overcome memory and feedback issues.

Note: Since we now cap max_sectors to BLK_DEF_MAX_SECTORS, which is 1024,
drivers that used to call blk_queue_max_sectors with a large value of
max_sectors will now see the fs requests capped to BLK_DEF_MAX_SECTORS.

Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 block/ll_rw_blk.c       | 34 ++++++++++++++++++++++++++--------
 block/scsi_ioctl.c      |  2 +-
 drivers/md/dm-table.c   |  2 +-
 drivers/scsi/scsi_lib.c |  2 +-
 fs/bio.c                | 20 +++++++++++---------
 include/linux/blkdev.h  |  3 ++-
 6 files changed, 42 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index c525b5a2b598..d4beb9a89ee0 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -239,7 +239,7 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
 	q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 	q->backing_dev_info.state = 0;
 	q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
-	blk_queue_max_sectors(q, MAX_SECTORS);
+	blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
 	blk_queue_hardsect_size(q, 512);
 	blk_queue_dma_alignment(q, 511);
 	blk_queue_congestion_threshold(q);
@@ -555,7 +555,12 @@ void blk_queue_max_sectors(request_queue_t *q, unsigned short max_sectors)
 		printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
 	}
 
-	q->max_sectors = q->max_hw_sectors = max_sectors;
+	if (BLK_DEF_MAX_SECTORS > max_sectors)
+		q->max_hw_sectors = q->max_sectors = max_sectors;
+ 	else {
+		q->max_sectors = BLK_DEF_MAX_SECTORS;
+		q->max_hw_sectors = max_sectors;
+	}
 }
 
 EXPORT_SYMBOL(blk_queue_max_sectors);
@@ -657,8 +662,8 @@ EXPORT_SYMBOL(blk_queue_hardsect_size);
 void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b)
 {
 	/* zero is "infinity" */
-	t->max_sectors = t->max_hw_sectors =
-		min_not_zero(t->max_sectors,b->max_sectors);
+	t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors);
+	t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors);
 
 	t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
 	t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
@@ -1293,9 +1298,15 @@ static inline int ll_new_hw_segment(request_queue_t *q,
 static int ll_back_merge_fn(request_queue_t *q, struct request *req, 
 			    struct bio *bio)
 {
+	unsigned short max_sectors;
 	int len;
 
-	if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) {
+	if (unlikely(blk_pc_request(req)))
+		max_sectors = q->max_hw_sectors;
+	else
+		max_sectors = q->max_sectors;
+
+	if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
 		req->flags |= REQ_NOMERGE;
 		if (req == q->last_merge)
 			q->last_merge = NULL;
@@ -1325,9 +1336,16 @@ static int ll_back_merge_fn(request_queue_t *q, struct request *req,
 static int ll_front_merge_fn(request_queue_t *q, struct request *req, 
 			     struct bio *bio)
 {
+	unsigned short max_sectors;
 	int len;
 
-	if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) {
+	if (unlikely(blk_pc_request(req)))
+		max_sectors = q->max_hw_sectors;
+	else
+		max_sectors = q->max_sectors;
+
+
+	if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
 		req->flags |= REQ_NOMERGE;
 		if (req == q->last_merge)
 			q->last_merge = NULL;
@@ -2144,7 +2162,7 @@ int blk_rq_map_user(request_queue_t *q, struct request *rq, void __user *ubuf,
 	struct bio *bio;
 	int reading;
 
-	if (len > (q->max_sectors << 9))
+	if (len > (q->max_hw_sectors << 9))
 		return -EINVAL;
 	if (!len || !ubuf)
 		return -EINVAL;
@@ -2259,7 +2277,7 @@ int blk_rq_map_kern(request_queue_t *q, struct request *rq, void *kbuf,
 {
 	struct bio *bio;
 
-	if (len > (q->max_sectors << 9))
+	if (len > (q->max_hw_sectors << 9))
 		return -EINVAL;
 	if (!len || !kbuf)
 		return -EINVAL;
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 382dea7b224c..4e390dfd3157 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -233,7 +233,7 @@ static int sg_io(struct file *file, request_queue_t *q,
 	if (verify_command(file, cmd))
 		return -EPERM;
 
-	if (hdr->dxfer_len > (q->max_sectors << 9))
+	if (hdr->dxfer_len > (q->max_hw_sectors << 9))
 		return -EIO;
 
 	if (hdr->dxfer_len)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index a6d3baa46f61..a6f2dc66c3db 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -638,7 +638,7 @@ int dm_split_args(int *argc, char ***argvp, char *input)
 static void check_for_valid_limits(struct io_restrictions *rs)
 {
 	if (!rs->max_sectors)
-		rs->max_sectors = MAX_SECTORS;
+		rs->max_sectors = SAFE_MAX_SECTORS;
 	if (!rs->max_phys_segments)
 		rs->max_phys_segments = MAX_PHYS_SEGMENTS;
 	if (!rs->max_hw_segments)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 54a72f197487..14ad2a785a34 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -462,6 +462,7 @@ int scsi_execute_async(struct scsi_device *sdev, const unsigned char *cmd,
 	req = blk_get_request(sdev->request_queue, write, gfp);
 	if (!req)
 		goto free_sense;
+	req->flags |= REQ_BLOCK_PC | REQ_QUIET;
 
 	if (use_sg)
 		err = scsi_req_map_sg(req, buffer, use_sg, bufflen, gfp);
@@ -477,7 +478,6 @@ int scsi_execute_async(struct scsi_device *sdev, const unsigned char *cmd,
 	req->sense_len = 0;
 	req->timeout = timeout;
 	req->retries = retries;
-	req->flags |= REQ_BLOCK_PC | REQ_QUIET;
 	req->end_io_data = sioc;
 
 	sioc->data = privdata;
diff --git a/fs/bio.c b/fs/bio.c
index 4d21ee3873ec..38d3e8023a07 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -313,7 +313,8 @@ int bio_get_nr_vecs(struct block_device *bdev)
 }
 
 static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
-			  *page, unsigned int len, unsigned int offset)
+			  *page, unsigned int len, unsigned int offset,
+			  unsigned short max_sectors)
 {
 	int retried_segments = 0;
 	struct bio_vec *bvec;
@@ -327,7 +328,7 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
 	if (bio->bi_vcnt >= bio->bi_max_vecs)
 		return 0;
 
-	if (((bio->bi_size + len) >> 9) > q->max_sectors)
+	if (((bio->bi_size + len) >> 9) > max_sectors)
 		return 0;
 
 	/*
@@ -401,7 +402,7 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
 int bio_add_pc_page(request_queue_t *q, struct bio *bio, struct page *page,
 		    unsigned int len, unsigned int offset)
 {
-	return __bio_add_page(q, bio, page, len, offset);
+	return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors);
 }
 
 /**
@@ -420,8 +421,8 @@ int bio_add_pc_page(request_queue_t *q, struct bio *bio, struct page *page,
 int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 		 unsigned int offset)
 {
-	return __bio_add_page(bdev_get_queue(bio->bi_bdev), bio, page,
-			      len, offset);
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+	return __bio_add_page(q, bio, page, len, offset, q->max_sectors);
 }
 
 struct bio_map_data {
@@ -533,7 +534,7 @@ struct bio *bio_copy_user(request_queue_t *q, unsigned long uaddr,
 			break;
 		}
 
-		if (__bio_add_page(q, bio, page, bytes, 0) < bytes) {
+		if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) {
 			ret = -EINVAL;
 			break;
 		}
@@ -647,7 +648,8 @@ static struct bio *__bio_map_user_iov(request_queue_t *q,
 			/*
 			 * sorry...
 			 */
-			if (__bio_add_page(q, bio, pages[j], bytes, offset) < bytes)
+			if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
+					    bytes)
 				break;
 
 			len -= bytes;
@@ -820,8 +822,8 @@ static struct bio *__bio_map_kern(request_queue_t *q, void *data,
 		if (bytes > len)
 			bytes = len;
 
-		if (__bio_add_page(q, bio, virt_to_page(data), bytes,
-				   offset) < bytes)
+		if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
+				    offset) < bytes)
 			break;
 
 		data += bytes;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 509e9a03a328..a18500d196e1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -702,7 +702,8 @@ extern int blkdev_issue_flush(struct block_device *, sector_t *);
 
 #define MAX_PHYS_SEGMENTS 128
 #define MAX_HW_SEGMENTS 128
-#define MAX_SECTORS 255
+#define SAFE_MAX_SECTORS 255
+#define BLK_DEF_MAX_SECTORS 1024
 
 #define MAX_SEGMENT_SIZE	65536
 
-- 
cgit v1.2.3-71-gd317


From f83b5e323f57d6e1f35a839d663e91cebe985e54 Mon Sep 17 00:00:00 2001
From: Ustyugov Roman <dr_unique@ymg.ru>
Date: Fri, 23 Sep 2005 08:42:11 +0400
Subject: kbuild: set correct KBUILD_MODNAME when using well known kernel
 symbols as module names

This patch fixes a problem when we use well known kernel symbols as module
names.

For example, if module source name is current.c, idle_stack.c or etc.,
we have a bad KBUILD_MODNAME value.
For example, KBUILD_MODNAME will be "get_current()" instead of "current", or
"(init_thread_union.stack)" instead of "idle_task".

The trick is to define a stringify macro on the commandline - named
KBUILD_STR for namespace reasons - and then to stringify the module
name.

There are a few uses of KBUILD_MODNAME throughout the tree but the usage
is for debug and will not be harmed by this change so left untouched for now.

While at it KBUILD_BASENAME was changed too. Any spinlock usage in the
unix module would have created wrong section names without it.
Usage in spinlock.h fixed so it no longer stringify KBUILD_BASENAME.

Original patch from Ustyogov Roman - all bugs introduced by me.

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
---
 include/linux/spinlock.h | 3 +--
 scripts/Makefile.lib     | 8 +++++---
 scripts/mod/modpost.c    | 3 +--
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 0e9682c9def5..799be6747944 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -59,8 +59,7 @@
 /*
  * Must define these before including other files, inline functions need them
  */
-#define LOCK_SECTION_NAME                       \
-        ".text.lock." __stringify(KBUILD_BASENAME)
+#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME
 
 #define LOCK_SECTION_START(extra)               \
         ".subsection 1\n\t"                     \
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 0f81dcfd6909..550798f57da5 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -81,8 +81,10 @@ obj-dirs	:= $(addprefix $(obj)/,$(obj-dirs))
 # Note: It's possible that one object gets potentially linked into more
 #       than one module. In that case KBUILD_MODNAME will be set to foo_bar,
 #       where foo and bar are the name of the modules.
-basename_flags = -DKBUILD_BASENAME=$(subst $(comma),_,$(subst -,_,$(*F)))
-modname_flags  = $(if $(filter 1,$(words $(modname))),-DKBUILD_MODNAME=$(subst $(comma),_,$(subst -,_,$(modname))))
+name-fix = $(subst $(comma),_,$(subst -,_,$1))
+basename_flags = -D"KBUILD_BASENAME=KBUILD_STR($(call name-fix,$(*F)))"
+modname_flags  = $(if $(filter 1,$(words $(modname))),\
+                 -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))")
 
 _c_flags       = $(CFLAGS) $(EXTRA_CFLAGS) $(CFLAGS_$(*F).o)
 _a_flags       = $(AFLAGS) $(EXTRA_AFLAGS) $(AFLAGS_$(*F).o)
@@ -113,7 +115,7 @@ endif
 
 c_flags        = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(CPPFLAGS) \
 		 $(__c_flags) $(modkern_cflags) \
-		 $(basename_flags) $(modname_flags)
+		 -D"KBUILD_STR(s)=\#s" $(basename_flags) $(modname_flags)
 
 a_flags        = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(CPPFLAGS) \
 		 $(__a_flags) $(modkern_aflags)
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 8ce5a6318684..f70ff13d4818 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -539,10 +539,9 @@ add_header(struct buffer *b, struct module *mod)
 	buf_printf(b, "\n");
 	buf_printf(b, "MODULE_INFO(vermagic, VERMAGIC_STRING);\n");
 	buf_printf(b, "\n");
-	buf_printf(b, "#undef unix\n"); /* We have a module called "unix" */
 	buf_printf(b, "struct module __this_module\n");
 	buf_printf(b, "__attribute__((section(\".gnu.linkonce.this_module\"))) = {\n");
-	buf_printf(b, " .name = __stringify(KBUILD_MODNAME),\n");
+	buf_printf(b, " .name = KBUILD_MODNAME,\n");
 	if (mod->has_init)
 		buf_printf(b, " .init = init_module,\n");
 	if (mod->has_cleanup)
-- 
cgit v1.2.3-71-gd317


From 9b4ffa48ae855c8657a36014c5b0243ff69f4722 Mon Sep 17 00:00:00 2001
From: Jaya Kumar <jayakumar.alsa@gmail.com>
Date: Thu, 17 Nov 2005 10:12:23 +0100
Subject: [ALSA] Add support for the CS5535 Audio device

Add support for the CS5535 Audio device.  I've fixed up some errors as per
Takashi's advice from the thread:

http://lkml.org/lkml/2005/9/15/119

 From: Alan Cox <alan@lxorguk.ukuu.org.uk>

        cs5535 is a 32bit x86 only device using weird CPU features

Signed-off-by: Jaya Kumar <jayakumar.alsa@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 CREDITS                                 |   1 +
 MAINTAINERS                             |   5 +
 include/linux/pci_ids.h                 |   4 +
 sound/pci/Kconfig                       |  13 +
 sound/pci/Makefile                      |   1 +
 sound/pci/cs5535audio/Makefile          |   8 +
 sound/pci/cs5535audio/cs5535audio.c     | 410 ++++++++++++++++++++++++++++++
 sound/pci/cs5535audio/cs5535audio.h     | 123 +++++++++
 sound/pci/cs5535audio/cs5535audio_pcm.c | 430 ++++++++++++++++++++++++++++++++
 9 files changed, 995 insertions(+)
 create mode 100644 sound/pci/cs5535audio/Makefile
 create mode 100644 sound/pci/cs5535audio/cs5535audio.c
 create mode 100644 sound/pci/cs5535audio/cs5535audio.h
 create mode 100644 sound/pci/cs5535audio/cs5535audio_pcm.c

(limited to 'include/linux')

diff --git a/CREDITS b/CREDITS
index 1b4f8694fa48..521f00d1b549 100644
--- a/CREDITS
+++ b/CREDITS
@@ -1883,6 +1883,7 @@ N: Jaya Kumar
 E: jayalk@intworks.biz
 W: http://www.intworks.biz
 D: Arc monochrome LCD framebuffer driver, x86 reboot fixups
+D: pirq addr, CS5535 alsa audio driver
 S: Gurgaon, India
 S: Kuala Lumpur, Malaysia
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 6af683025ae0..93f97b3afac5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -650,6 +650,11 @@ L:	linux-crypto@vger.kernel.org
 T:	git kernel.org:/pub/scm/linux/kernel/git/herbert/crypto-2.6.git
 S:	Maintained
 
+CS5535 Audio ALSA driver
+P:	Jaya Kumar
+M:	jayakumar.alsa@gmail.com
+S:	Maintained
+
 CYBERPRO FB DRIVER
 P:	Russell King
 M:	rmk@arm.linux.org.uk
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 4db67b3b05cc..9093f118f99d 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -376,6 +376,10 @@
 #define PCI_DEVICE_ID_NS_87560_USB	0x0012
 #define PCI_DEVICE_ID_NS_83815		0x0020
 #define PCI_DEVICE_ID_NS_83820		0x0022
+#define PCI_DEVICE_ID_NS_CS5535_IDE	0x002d
+#define PCI_DEVICE_ID_NS_CS5535_AUDIO	0x002e
+#define PCI_DEVICE_ID_NS_CS5535_USB	0x002f
+#define PCI_DEVICE_ID_NS_CS5535_VIDEO	0x0030
 #define PCI_DEVICE_ID_NS_SATURN		0x0035
 #define PCI_DEVICE_ID_NS_SCx200_BRIDGE	0x0500
 #define PCI_DEVICE_ID_NS_SCx200_SMI	0x0501
diff --git a/sound/pci/Kconfig b/sound/pci/Kconfig
index 0fb16cf335ea..920305c7402f 100644
--- a/sound/pci/Kconfig
+++ b/sound/pci/Kconfig
@@ -359,6 +359,19 @@ config SND_ENS1370
 	  To compile this driver as a module, choose M here: the module
 	  will be called snd-ens1370.
 
+config SND_CS5535AUDIO
+	tristate "CS5535 Audio"
+	depends on SND && X86 && !X86_64
+	select SND_PCM
+	select SND_AC97_CODEC
+	help
+	  Say Y here to include support for audio on CS5535 chips. It is
+	  referred to as NS CS5535 IO or AMD CS5535 IO companion in
+	  various literature.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called snd-cs5535audio.
+
 config SND_ENS1371
 	tristate "(Creative) Ensoniq AudioPCI 1371/1373"
 	depends on SND
diff --git a/sound/pci/Makefile b/sound/pci/Makefile
index 42fabfcfc2a9..82a9c734f84d 100644
--- a/sound/pci/Makefile
+++ b/sound/pci/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_SND) += \
 	au88x0/ \
 	ca0106/ \
 	cs46xx/ \
+	cs5535audio/ \
 	emu10k1/ \
 	hda/ \
 	ice1712/ \
diff --git a/sound/pci/cs5535audio/Makefile b/sound/pci/cs5535audio/Makefile
new file mode 100644
index 000000000000..08d8ee6547d3
--- /dev/null
+++ b/sound/pci/cs5535audio/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for cs5535audio
+#
+
+snd-cs5535audio-objs := cs5535audio.o cs5535audio_pcm.o
+
+# Toplevel Module Dependency
+obj-$(CONFIG_SND_CS5535AUDIO) += snd-cs5535audio.o
diff --git a/sound/pci/cs5535audio/cs5535audio.c b/sound/pci/cs5535audio/cs5535audio.c
new file mode 100644
index 000000000000..920c857fc223
--- /dev/null
+++ b/sound/pci/cs5535audio/cs5535audio.c
@@ -0,0 +1,410 @@
+/*
+ * Driver for audio on multifunction CS5535 companion device
+ * Copyright (C) Jaya Kumar
+ *
+ * Based on Jaroslav Kysela and Takashi Iwai's examples.
+ * This work was sponsored by CIS(M) Sdn Bhd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/moduleparam.h>
+#include <asm/io.h>
+#include <sound/driver.h>
+#include <sound/core.h>
+#include <sound/control.h>
+#include <sound/pcm.h>
+#include <sound/rawmidi.h>
+#include <sound/ac97_codec.h>
+#include <sound/initval.h>
+#include <sound/asoundef.h>
+#include "cs5535audio.h"
+
+#define DRIVER_NAME "cs5535audio"
+
+
+static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
+static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;
+static int enable[SNDRV_CARDS] = SNDRV_DEFAULT_ENABLE_PNP;
+
+static struct pci_device_id snd_cs5535audio_ids[] = {
+	{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_CS5535_AUDIO, PCI_ANY_ID,
+		PCI_ANY_ID, 0, 0, 0, },
+	{}
+};
+
+MODULE_DEVICE_TABLE(pci, snd_cs5535audio_ids);
+
+static void wait_till_cmd_acked(cs5535audio_t *cs5535au, unsigned long timeout)
+{
+	unsigned long tmp;
+	do {
+		tmp = cs_readl(cs5535au, ACC_CODEC_CNTL);
+		if (!(tmp & CMD_NEW))
+			break;
+		msleep(10);
+	} while (--timeout);
+	if (!timeout)
+		snd_printk(KERN_ERR "Failure writing to cs5535 codec\n");
+}
+
+static unsigned short snd_cs5535audio_codec_read(cs5535audio_t *cs5535au,
+							unsigned short reg)
+{
+	unsigned long regdata;
+	unsigned long timeout;
+	unsigned long val;
+
+	regdata = ((unsigned long) reg) << 24;
+	regdata |= ACC_CODEC_CNTL_RD_CMD;
+	regdata |= CMD_NEW;
+
+	cs_writel(cs5535au, ACC_CODEC_CNTL, regdata);
+	wait_till_cmd_acked(cs5535au, 500);
+
+	timeout = 50;
+	do {
+		val = cs_readl(cs5535au, ACC_CODEC_STATUS);
+		if (	(val & STS_NEW) &&
+			((unsigned long) reg == ((0xFF000000 & val)>>24)) )
+			break;
+		msleep(10);
+	} while (--timeout);
+	if (!timeout)
+		snd_printk(KERN_ERR "Failure reading cs5535 codec\n");
+
+	return ((unsigned short) val);
+}
+
+static void snd_cs5535audio_codec_write(cs5535audio_t *cs5535au,
+				   unsigned short reg, unsigned short val)
+{
+	unsigned long regdata;
+
+	regdata = ((unsigned long) reg) << 24;
+	regdata |= (unsigned long) val;
+	regdata &= CMD_MASK;
+	regdata |= CMD_NEW;
+	regdata &= ACC_CODEC_CNTL_WR_CMD;
+
+	cs_writel(cs5535au, ACC_CODEC_CNTL, regdata);
+	wait_till_cmd_acked(cs5535au, 50);
+}
+
+static void snd_cs5535audio_ac97_codec_write(ac97_t *ac97,
+				   unsigned short reg, unsigned short val)
+{
+	cs5535audio_t *cs5535au = ac97->private_data;
+	snd_cs5535audio_codec_write(cs5535au, reg, val);
+}
+
+static unsigned short snd_cs5535audio_ac97_codec_read(ac97_t *ac97,
+					    unsigned short reg)
+{
+	cs5535audio_t *cs5535au = ac97->private_data;
+	return snd_cs5535audio_codec_read(cs5535au, reg);
+}
+
+static void snd_cs5535audio_mixer_free_ac97(ac97_t *ac97)
+{
+	cs5535audio_t *cs5535audio = ac97->private_data;
+	cs5535audio->ac97 = NULL;
+}
+
+static int snd_cs5535audio_mixer(cs5535audio_t *cs5535au)
+{
+	snd_card_t *card = cs5535au->card;
+	ac97_bus_t *pbus;
+	ac97_template_t ac97;
+	int err;
+	static ac97_bus_ops_t ops = {
+		.write = snd_cs5535audio_ac97_codec_write,
+		.read = snd_cs5535audio_ac97_codec_read,
+	};
+
+	if ((err = snd_ac97_bus(card, 0, &ops, NULL, &pbus)) < 0)
+		return err;
+
+	memset(&ac97, 0, sizeof(ac97));
+	ac97.scaps = AC97_SCAP_AUDIO|AC97_SCAP_SKIP_MODEM;
+	ac97.private_data = cs5535au;
+	ac97.pci = cs5535au->pci;
+	ac97.private_free = snd_cs5535audio_mixer_free_ac97;
+
+	if ((err = snd_ac97_mixer(pbus, &ac97, &cs5535au->ac97)) < 0) {
+		snd_printk("mixer failed\n");
+		return err;
+	}
+
+	return 0;
+}
+
+static void process_bm0_irq(cs5535audio_t *cs5535au)
+{
+	u8 bm_stat;
+	spin_lock(&cs5535au->reg_lock);
+	bm_stat = cs_readb(cs5535au, ACC_BM0_STATUS);
+	spin_unlock(&cs5535au->reg_lock);
+	if (bm_stat & EOP) {
+		cs5535audio_dma_t *dma;
+		dma = cs5535au->playback_substream->runtime->private_data;
+		snd_pcm_period_elapsed(cs5535au->playback_substream);
+	} else {
+		snd_printk(KERN_ERR "unexpected bm0 irq src, bm_stat=%x\n",
+					bm_stat);
+	}
+}
+
+static void process_bm1_irq(cs5535audio_t *cs5535au)
+{
+	u8 bm_stat;
+	spin_lock(&cs5535au->reg_lock);
+	bm_stat = cs_readb(cs5535au, ACC_BM1_STATUS);
+	spin_unlock(&cs5535au->reg_lock);
+	if (bm_stat & EOP) {
+		cs5535audio_dma_t *dma;
+		dma = cs5535au->capture_substream->runtime->private_data;
+		snd_pcm_period_elapsed(cs5535au->capture_substream);
+	}
+}
+
+static irqreturn_t snd_cs5535audio_interrupt(int irq, void *dev_id,
+						struct pt_regs *regs)
+{
+	u16 acc_irq_stat;
+	u8 bm_stat;
+	unsigned char count;
+	cs5535audio_t *cs5535au = dev_id;
+
+	if (cs5535au == NULL)
+		return IRQ_NONE;
+
+	acc_irq_stat = cs_readw(cs5535au, ACC_IRQ_STATUS);
+
+	if (!acc_irq_stat)
+		return IRQ_NONE;
+	for (count=0; count < 10; count++) {
+		if (acc_irq_stat & (1<<count)) {
+			switch (count) {
+			case IRQ_STS:
+				cs_readl(cs5535au, ACC_GPIO_STATUS);
+				break;
+			case WU_IRQ_STS:
+				cs_readl(cs5535au, ACC_GPIO_STATUS);
+				break;
+			case BM0_IRQ_STS:
+				process_bm0_irq(cs5535au);
+				break;
+			case BM1_IRQ_STS:
+				process_bm1_irq(cs5535au);
+				break;
+			case BM2_IRQ_STS:
+				bm_stat = cs_readb(cs5535au, ACC_BM2_STATUS);
+				break;
+			case BM3_IRQ_STS:
+				bm_stat = cs_readb(cs5535au, ACC_BM3_STATUS);
+				break;
+			case BM4_IRQ_STS:
+				bm_stat = cs_readb(cs5535au, ACC_BM4_STATUS);
+				break;
+			case BM5_IRQ_STS:
+				bm_stat = cs_readb(cs5535au, ACC_BM5_STATUS);
+				break;
+			case BM6_IRQ_STS:
+				bm_stat = cs_readb(cs5535au, ACC_BM6_STATUS);
+				break;
+			case BM7_IRQ_STS:
+				bm_stat = cs_readb(cs5535au, ACC_BM7_STATUS);
+				break;
+			default:
+				snd_printk(KERN_ERR "Unexpected irq src\n");
+				break;
+			}
+		}
+	}
+	return IRQ_HANDLED;
+}
+
+static int snd_cs5535audio_free(cs5535audio_t *cs5535au)
+{
+	synchronize_irq(cs5535au->irq);
+	pci_set_power_state(cs5535au->pci, 3);
+
+	if (cs5535au->irq >= 0)
+		free_irq(cs5535au->irq, cs5535au);
+
+	pci_release_regions(cs5535au->pci);
+	pci_disable_device(cs5535au->pci);
+	kfree(cs5535au);
+	return 0;
+}
+
+static int snd_cs5535audio_dev_free(snd_device_t *device)
+{
+	cs5535audio_t *cs5535au = device->device_data;
+	return snd_cs5535audio_free(cs5535au);
+}
+
+static int __devinit snd_cs5535audio_create(snd_card_t *card,
+				     struct pci_dev *pci,
+				     cs5535audio_t **rcs5535au)
+{
+	cs5535audio_t *cs5535au;
+
+	int err;
+	static snd_device_ops_t ops = {
+		.dev_free =	snd_cs5535audio_dev_free,
+	};
+
+	*rcs5535au = NULL;
+	if ((err = pci_enable_device(pci)) < 0)
+		return err;
+
+	if (pci_set_dma_mask(pci, DMA_32BIT_MASK) < 0 ||
+		pci_set_consistent_dma_mask(pci, DMA_32BIT_MASK) < 0) {
+		printk(KERN_WARNING "unable to get 32bit dma\n");
+		err = -ENXIO;
+		goto pcifail;
+	}
+
+	cs5535au = kzalloc(sizeof(*cs5535au), GFP_KERNEL);
+	if (cs5535au == NULL) {
+		err = -ENOMEM;
+		goto pcifail;
+	}
+
+	spin_lock_init(&cs5535au->reg_lock);
+	cs5535au->card = card;
+	cs5535au->pci = pci;
+	cs5535au->irq = -1;
+
+	if ((err = pci_request_regions(pci, "CS5535 Audio")) < 0) {
+		kfree(cs5535au);
+		goto pcifail;
+	}
+
+	cs5535au->port = pci_resource_start(pci, 0);
+
+	if (request_irq(pci->irq, snd_cs5535audio_interrupt,
+		SA_INTERRUPT|SA_SHIRQ, "CS5535 Audio", cs5535au)) {
+		snd_printk("unable to grab IRQ %d\n", pci->irq);
+		err = -EBUSY;
+		goto sndfail;
+	}
+
+	cs5535au->irq = pci->irq;
+	pci_set_master(pci);
+
+	if ((err = snd_device_new(card, SNDRV_DEV_LOWLEVEL,
+					cs5535au, &ops)) < 0)
+		goto sndfail;
+
+	snd_card_set_dev(card, &pci->dev);
+
+	*rcs5535au = cs5535au;
+	return 0;
+
+sndfail: /* leave the device alive, just kill the snd */
+	snd_cs5535audio_free(cs5535au);
+	return err;
+
+pcifail:
+	pci_disable_device(pci);
+	return err;
+}
+
+static int __devinit snd_cs5535audio_probe(struct pci_dev *pci,
+					const struct pci_device_id *pci_id)
+{
+	static int dev;
+	snd_card_t *card;
+	cs5535audio_t *cs5535au;
+	int err;
+
+	if (dev >= SNDRV_CARDS)
+		return -ENODEV;
+	if (!enable[dev]) {
+		dev++;
+		return -ENOENT;
+	}
+
+	card = snd_card_new(index[dev], id[dev], THIS_MODULE, 0);
+	if (card == NULL)
+		return -ENOMEM;
+
+	if ((err = snd_cs5535audio_create(card, pci, &cs5535au)) < 0)
+		goto probefail_out;
+
+	if ((err = snd_cs5535audio_mixer(cs5535au)) < 0)
+		goto probefail_out;
+
+	if ((err = snd_cs5535audio_pcm(cs5535au)) < 0)
+		goto probefail_out;
+
+	strcpy(card->driver, DRIVER_NAME);
+
+	strcpy(card->shortname, "CS5535 Audio");
+	sprintf(card->longname, "%s %s at 0x%lx, irq %i",
+		card->shortname, card->driver,
+		cs5535au->port, cs5535au->irq);
+
+	if ((err = snd_card_register(card)) < 0)
+		goto probefail_out;
+
+	pci_set_drvdata(pci, card);
+	dev++;
+	return 0;
+
+probefail_out:
+	snd_card_free(card);
+	return err;
+}
+
+static void __devexit snd_cs5535audio_remove(struct pci_dev *pci)
+{
+	snd_card_free(pci_get_drvdata(pci));
+	pci_set_drvdata(pci, NULL);
+}
+
+static struct pci_driver driver = {
+	.name = DRIVER_NAME,
+	.id_table = snd_cs5535audio_ids,
+	.probe = snd_cs5535audio_probe,
+	.remove = __devexit_p(snd_cs5535audio_remove),
+};
+
+static int __init alsa_card_cs5535audio_init(void)
+{
+	return pci_module_init(&driver);
+}
+
+static void __exit alsa_card_cs5535audio_exit(void)
+{
+	pci_unregister_driver(&driver);
+}
+
+module_init(alsa_card_cs5535audio_init)
+module_exit(alsa_card_cs5535audio_exit)
+
+MODULE_AUTHOR("Jaya Kumar");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("CS5535 Audio");
+MODULE_SUPPORTED_DEVICE("CS5535 Audio");
diff --git a/sound/pci/cs5535audio/cs5535audio.h b/sound/pci/cs5535audio/cs5535audio.h
new file mode 100644
index 000000000000..e28177fb0991
--- /dev/null
+++ b/sound/pci/cs5535audio/cs5535audio.h
@@ -0,0 +1,123 @@
+#ifndef __SOUND_CS5535AUDIO_H
+#define __SOUND_CS5535AUDIO_H
+
+#define cs_writel(cs5535au, reg, val) outl(val, (int) cs5535au->port + reg)
+#define cs_writeb(cs5535au, reg, val) outb(val, (int) cs5535au->port + reg)
+#define cs_readl(cs5535au, reg)	inl((unsigned short) (cs5535au->port + reg))
+#define cs_readw(cs5535au, reg)	inw((unsigned short) (cs5535au->port + reg))
+#define cs_readb(cs5535au, reg)	inb((unsigned short) (cs5535au->port + reg))
+
+#define CS5535AUDIO_MAX_DESCRIPTORS	128
+
+/* acc_codec bar0 reg addrs */
+#define ACC_GPIO_STATUS			0x00
+#define ACC_CODEC_STATUS		0x08
+#define ACC_CODEC_CNTL			0x0C
+#define ACC_IRQ_STATUS			0x12
+#define ACC_BM0_CMD			0x20
+#define ACC_BM1_CMD			0x28
+#define ACC_BM2_CMD			0x30
+#define ACC_BM3_CMD			0x38
+#define ACC_BM4_CMD			0x40
+#define ACC_BM5_CMD			0x48
+#define ACC_BM6_CMD			0x50
+#define ACC_BM7_CMD			0x58
+#define ACC_BM0_PRD			0x24
+#define ACC_BM1_PRD			0x2C
+#define ACC_BM2_PRD			0x34
+#define ACC_BM3_PRD			0x3C
+#define ACC_BM4_PRD			0x44
+#define ACC_BM5_PRD			0x4C
+#define ACC_BM6_PRD			0x54
+#define ACC_BM7_PRD			0x5C
+#define ACC_BM0_STATUS			0x21
+#define ACC_BM1_STATUS			0x29
+#define ACC_BM2_STATUS			0x31
+#define ACC_BM3_STATUS			0x39
+#define ACC_BM4_STATUS			0x41
+#define ACC_BM5_STATUS			0x49
+#define ACC_BM6_STATUS			0x51
+#define ACC_BM7_STATUS			0x59
+#define ACC_BM0_PNTR			0x60
+#define ACC_BM1_PNTR			0x64
+#define ACC_BM2_PNTR			0x68
+#define ACC_BM3_PNTR			0x6C
+#define ACC_BM4_PNTR			0x70
+#define ACC_BM5_PNTR			0x74
+#define ACC_BM6_PNTR			0x78
+#define ACC_BM7_PNTR			0x7C
+/* acc_codec bar0 reg bits */
+/* ACC_IRQ_STATUS */
+#define IRQ_STS 			0
+#define WU_IRQ_STS 			1
+#define BM0_IRQ_STS 			2
+#define BM1_IRQ_STS 			3
+#define BM2_IRQ_STS 			4
+#define BM3_IRQ_STS 			5
+#define BM4_IRQ_STS 			6
+#define BM5_IRQ_STS		 	7
+#define BM6_IRQ_STS 			8
+#define BM7_IRQ_STS 			9
+/* ACC_BMX_STATUS */
+#define EOP				(1<<0)
+#define BM_EOP_ERR			(1<<1)
+/* ACC_BMX_CTL */
+#define BM_CTL_EN			0x00000001
+#define BM_CTL_PAUSE			0x00000011
+#define BM_CTL_DIS			0x00000000
+#define BM_CTL_BYTE_ORD_LE		0x00000000
+#define BM_CTL_BYTE_ORD_BE		0x00000100
+/* cs5535 specific ac97 codec register defines */
+#define CMD_MASK			0xFF00FFFF
+#define CMD_NEW				0x00010000
+#define STS_NEW				0x00020000
+#define PRM_RDY_STS			0x00800000
+#define ACC_CODEC_CNTL_WR_CMD		(~0x80000000)
+#define ACC_CODEC_CNTL_RD_CMD		0x80000000
+#define PRD_JMP				0x2000
+#define PRD_EOP				0x4000
+#define PRD_EOT				0x8000
+
+typedef struct _snd_cs5535audio cs5535audio_t;
+typedef struct snd_cs5535audio_dma cs5535audio_dma_t;
+typedef struct snd_cs5535audio_dma_ops cs5535audio_dma_ops_t;
+
+enum { CS5535AUDIO_DMA_PLAYBACK, CS5535AUDIO_DMA_CAPTURE, NUM_CS5535AUDIO_DMAS };
+struct snd_cs5535audio_dma_ops {
+	int type;
+	void (*enable_dma)(cs5535audio_t *cs5535au);
+	void (*disable_dma)(cs5535audio_t *cs5535au);
+	void (*pause_dma)(cs5535audio_t *cs5535au);
+	void (*setup_prd)(cs5535audio_t *cs5535au, u32 prd_addr);
+	u32 (*read_dma_pntr)(cs5535audio_t *cs5535au);
+};
+
+typedef struct cs5535audio_dma_desc {
+	u32 addr;
+	u16 size;
+	u16 ctlreserved;
+} cs5535audio_dma_desc_t;
+
+struct snd_cs5535audio_dma {
+	const cs5535audio_dma_ops_t *ops;
+	struct snd_dma_buffer desc_buf;
+	snd_pcm_substream_t *substream;
+	unsigned int buf_addr, buf_bytes;
+	unsigned int period_bytes, periods;
+};
+
+struct _snd_cs5535audio {
+	snd_card_t *card;
+	ac97_t *ac97;
+	int irq;
+	struct pci_dev *pci;
+	unsigned long port;
+	spinlock_t reg_lock;
+	snd_pcm_substream_t *playback_substream;
+	snd_pcm_substream_t *capture_substream;
+	cs5535audio_dma_t dmas[NUM_CS5535AUDIO_DMAS];
+};
+
+int __devinit snd_cs5535audio_pcm(cs5535audio_t *cs5535audio);
+#endif /* __SOUND_CS5535AUDIO_H */
+
diff --git a/sound/pci/cs5535audio/cs5535audio_pcm.c b/sound/pci/cs5535audio/cs5535audio_pcm.c
new file mode 100644
index 000000000000..5802ed9d57be
--- /dev/null
+++ b/sound/pci/cs5535audio/cs5535audio_pcm.c
@@ -0,0 +1,430 @@
+/*
+ * Driver for audio on multifunction CS5535 companion device
+ * Copyright (C) Jaya Kumar
+ *
+ * Based on Jaroslav Kysela and Takashi Iwai's examples.
+ * This work was sponsored by CIS(M) Sdn Bhd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * todo: add be fmt support, spdif, pm
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <sound/driver.h>
+#include <sound/core.h>
+#include <sound/control.h>
+#include <sound/initval.h>
+#include <sound/asoundef.h>
+#include <sound/pcm.h>
+#include <sound/pcm_params.h>
+#include <sound/ac97_codec.h>
+#include "cs5535audio.h"
+
+static snd_pcm_hardware_t snd_cs5535audio_playback =
+{
+	.info =			(
+				SNDRV_PCM_INFO_MMAP |
+				SNDRV_PCM_INFO_INTERLEAVED |
+		 		SNDRV_PCM_INFO_BLOCK_TRANSFER |
+		 		SNDRV_PCM_INFO_MMAP_VALID |
+		 		SNDRV_PCM_INFO_PAUSE |
+				SNDRV_PCM_INFO_SYNC_START
+				),
+	.formats =		(
+				SNDRV_PCM_FMTBIT_S16_LE
+				),
+	.rates =		(
+				SNDRV_PCM_RATE_CONTINUOUS |
+				SNDRV_PCM_RATE_8000_48000
+				),
+	.rate_min =		4000,
+	.rate_max =		48000,
+	.channels_min =		2,
+	.channels_max =		2,
+	.buffer_bytes_max =	(128*1024),
+	.period_bytes_min =	64,
+	.period_bytes_max =	(64*1024 - 16),
+	.periods_min =		1,
+	.periods_max =		CS5535AUDIO_MAX_DESCRIPTORS,
+	.fifo_size =		0,
+};
+
+static snd_pcm_hardware_t snd_cs5535audio_capture =
+{
+	.info =			(
+				SNDRV_PCM_INFO_MMAP |
+				SNDRV_PCM_INFO_INTERLEAVED |
+		 		SNDRV_PCM_INFO_BLOCK_TRANSFER |
+		 		SNDRV_PCM_INFO_MMAP_VALID |
+				SNDRV_PCM_INFO_SYNC_START
+				),
+	.formats =		(
+				SNDRV_PCM_FMTBIT_S16_LE
+				),
+	.rates =		(
+				SNDRV_PCM_RATE_CONTINUOUS |
+				SNDRV_PCM_RATE_8000_48000
+				),
+	.rate_min =		4000,
+	.rate_max =		48000,
+	.channels_min =		2,
+	.channels_max =		2,
+	.buffer_bytes_max =	(128*1024),
+	.period_bytes_min =	64,
+	.period_bytes_max =	(64*1024 - 16),
+	.periods_min =		1,
+	.periods_max =		CS5535AUDIO_MAX_DESCRIPTORS,
+	.fifo_size =		0,
+};
+
+static int snd_cs5535audio_playback_open(snd_pcm_substream_t *substream)
+{
+	int err;
+	cs5535audio_t *cs5535au = snd_pcm_substream_chip(substream);
+	snd_pcm_runtime_t *runtime = substream->runtime;
+
+	runtime->hw = snd_cs5535audio_playback;
+	cs5535au->playback_substream = substream;
+	runtime->private_data = &(cs5535au->dmas[CS5535AUDIO_DMA_PLAYBACK]);
+	snd_pcm_set_sync(substream);
+	if ((err = snd_pcm_hw_constraint_integer(runtime,
+				SNDRV_PCM_HW_PARAM_PERIODS)) < 0)
+		return err;
+
+	return 0;
+}
+
+static int snd_cs5535audio_playback_close(snd_pcm_substream_t *substream)
+{
+	return 0;
+}
+
+#define CS5535AUDIO_DESC_LIST_SIZE \
+	PAGE_ALIGN(CS5535AUDIO_MAX_DESCRIPTORS * sizeof(cs5535audio_dma_desc_t))
+
+static int cs5535audio_build_dma_packets(cs5535audio_t *cs5535au,
+					cs5535audio_dma_t *dma,
+					snd_pcm_substream_t *substream,
+					unsigned int periods,
+					unsigned int period_bytes)
+{
+	unsigned int i;
+	u32 addr, desc_addr, jmpprd_addr;
+	cs5535audio_dma_desc_t *lastdesc;
+
+	if (periods > CS5535AUDIO_MAX_DESCRIPTORS)
+		return -ENOMEM;
+
+	if (dma->desc_buf.area == NULL) {
+		if (snd_dma_alloc_pages(SNDRV_DMA_TYPE_DEV,
+					snd_dma_pci_data(cs5535au->pci),
+					CS5535AUDIO_DESC_LIST_SIZE+1,
+					&dma->desc_buf) < 0)
+			return -ENOMEM;
+		dma->period_bytes = dma->periods = 0;
+	}
+
+	if (dma->periods == periods && dma->period_bytes == period_bytes)
+		return 0;
+
+	/* the u32 cast is okay because in snd*create we succesfully told
+   	   pci alloc that we're only 32 bit capable so the uppper will be 0 */
+	addr = (u32) substream->runtime->dma_addr;
+	desc_addr = (u32) dma->desc_buf.addr;
+	for (i = 0; i < periods; i++) {
+		cs5535audio_dma_desc_t *desc =
+			&((cs5535audio_dma_desc_t *) dma->desc_buf.area)[i];
+		desc->addr = cpu_to_le32(addr);
+		desc->size = period_bytes;
+		desc->ctlreserved = PRD_EOP;
+		desc_addr += sizeof(cs5535audio_dma_desc_t);
+		addr += period_bytes;
+	}
+	/* we reserved one dummy descriptor at the end to do the PRD jump */
+	lastdesc = &((cs5535audio_dma_desc_t *) dma->desc_buf.area)[periods];
+	lastdesc->addr = cpu_to_le32((u32) dma->desc_buf.addr);
+	lastdesc->size = 0;
+	lastdesc->ctlreserved = PRD_JMP;
+	jmpprd_addr = cpu_to_le32(lastdesc->addr +
+				(sizeof(cs5535audio_dma_desc_t)*periods));
+
+	dma->period_bytes = period_bytes;
+	dma->periods = periods;
+	spin_lock_irq(&cs5535au->reg_lock);
+	dma->ops->disable_dma(cs5535au);
+	dma->ops->setup_prd(cs5535au, jmpprd_addr);
+	spin_unlock_irq(&cs5535au->reg_lock);
+	return 0;
+}
+
+static void cs5535audio_playback_enable_dma(cs5535audio_t *cs5535au)
+{
+	cs_writeb(cs5535au, ACC_BM0_CMD, BM_CTL_EN);
+}
+
+static void cs5535audio_playback_disable_dma(cs5535audio_t *cs5535au)
+{
+	cs_writeb(cs5535au, ACC_BM0_CMD, 0);
+}
+
+static void cs5535audio_playback_pause_dma(cs5535audio_t *cs5535au)
+{
+	cs_writeb(cs5535au, ACC_BM0_CMD, BM_CTL_PAUSE);
+}
+
+static void cs5535audio_playback_setup_prd(cs5535audio_t *cs5535au,
+						u32 prd_addr)
+{
+	cs_writel(cs5535au, ACC_BM0_PRD, prd_addr);
+}
+
+static u32 cs5535audio_playback_read_dma_pntr(cs5535audio_t *cs5535au)
+{
+	return cs_readl(cs5535au, ACC_BM0_PNTR);
+}
+
+static void cs5535audio_capture_enable_dma(cs5535audio_t *cs5535au)
+{
+	cs_writeb(cs5535au, ACC_BM1_CMD, BM_CTL_EN);
+}
+
+static void cs5535audio_capture_disable_dma(cs5535audio_t *cs5535au)
+{
+	cs_writeb(cs5535au, ACC_BM1_CMD, 0);
+}
+
+static void cs5535audio_capture_pause_dma(cs5535audio_t *cs5535au)
+{
+	cs_writeb(cs5535au, ACC_BM1_CMD, BM_CTL_PAUSE);
+}
+
+static void cs5535audio_capture_setup_prd(cs5535audio_t *cs5535au,
+						u32 prd_addr)
+{
+	cs_writel(cs5535au, ACC_BM1_PRD, prd_addr);
+}
+
+static u32 cs5535audio_capture_read_dma_pntr(cs5535audio_t *cs5535au)
+{
+	return cs_readl(cs5535au, ACC_BM1_PNTR);
+}
+
+static void cs5535audio_clear_dma_packets(cs5535audio_t *cs5535au,
+					cs5535audio_dma_t *dma,
+					snd_pcm_substream_t *substream)
+{
+	snd_dma_free_pages(&dma->desc_buf);
+	dma->desc_buf.area = NULL;
+}
+
+static int snd_cs5535audio_hw_params(snd_pcm_substream_t *substream,
+				 snd_pcm_hw_params_t *hw_params)
+{
+	cs5535audio_t *cs5535au = snd_pcm_substream_chip(substream);
+	cs5535audio_dma_t *dma = substream->runtime->private_data;
+	int err;
+
+	err = snd_pcm_lib_malloc_pages(substream,
+					params_buffer_bytes(hw_params));
+	if (err < 0)
+		return err;
+	dma->buf_addr = substream->runtime->dma_addr;
+	dma->buf_bytes = params_buffer_bytes(hw_params);
+
+	err = cs5535audio_build_dma_packets(cs5535au, dma, substream,
+				       params_periods(hw_params),
+				       params_period_bytes(hw_params));
+	return err;
+}
+
+static int snd_cs5535audio_hw_free(snd_pcm_substream_t *substream)
+{
+	cs5535audio_t *cs5535au = snd_pcm_substream_chip(substream);
+	cs5535audio_dma_t *dma = substream->runtime->private_data;
+
+	cs5535audio_clear_dma_packets(cs5535au, dma, substream);
+	return snd_pcm_lib_free_pages(substream);
+}
+
+static int snd_cs5535audio_playback_prepare(snd_pcm_substream_t *substream)
+{
+	cs5535audio_t *cs5535au = snd_pcm_substream_chip(substream);
+	return snd_ac97_set_rate(cs5535au->ac97, AC97_PCM_FRONT_DAC_RATE,
+					substream->runtime->rate);
+}
+
+static int snd_cs5535audio_trigger(snd_pcm_substream_t *substream, int cmd)
+{
+	cs5535audio_t *cs5535au = snd_pcm_substream_chip(substream);
+	cs5535audio_dma_t *dma = substream->runtime->private_data;
+
+	switch (cmd) {
+		case SNDRV_PCM_TRIGGER_PAUSE_PUSH:
+			spin_lock_irq(&cs5535au->reg_lock);
+			dma->ops->pause_dma(cs5535au);
+			spin_unlock_irq(&cs5535au->reg_lock);
+			break;
+		case SNDRV_PCM_TRIGGER_PAUSE_RELEASE:
+			spin_lock_irq(&cs5535au->reg_lock);
+			dma->ops->enable_dma(cs5535au);
+			spin_unlock_irq(&cs5535au->reg_lock);
+			break;
+		case SNDRV_PCM_TRIGGER_START:
+			spin_lock_irq(&cs5535au->reg_lock);
+			dma->ops->enable_dma(cs5535au);
+			spin_unlock_irq(&cs5535au->reg_lock);
+			break;
+		case SNDRV_PCM_TRIGGER_STOP:
+			spin_lock_irq(&cs5535au->reg_lock);
+			dma->ops->disable_dma(cs5535au);
+			spin_unlock_irq(&cs5535au->reg_lock);
+			break;
+		default:
+			snd_printk(KERN_ERR "unhandled trigger\n");
+			return -EINVAL;
+			break;
+	}
+	return 0;
+}
+
+static snd_pcm_uframes_t snd_cs5535audio_pcm_pointer(snd_pcm_substream_t
+							*substream)
+{
+	cs5535audio_t *cs5535au = snd_pcm_substream_chip(substream);
+	u32 curdma;
+	cs5535audio_dma_t *dma;
+
+	dma = substream->runtime->private_data;
+	curdma = dma->ops->read_dma_pntr(cs5535au);
+	if (curdma < dma->buf_addr) {
+		snd_printk(KERN_ERR "curdma=%x < %x bufaddr.\n",
+					curdma, dma->buf_addr);
+		return 0;
+	}
+	curdma -= dma->buf_addr;
+	if (curdma >= dma->buf_bytes) {
+		snd_printk(KERN_ERR "diff=%x >= %x buf_bytes.\n",
+					curdma, dma->buf_bytes);
+		return 0;
+	}
+	return bytes_to_frames(substream->runtime, curdma);
+}
+
+static int snd_cs5535audio_capture_open(snd_pcm_substream_t *substream)
+{
+	int err;
+	cs5535audio_t *cs5535au = snd_pcm_substream_chip(substream);
+	snd_pcm_runtime_t *runtime = substream->runtime;
+
+	runtime->hw = snd_cs5535audio_capture;
+	cs5535au->capture_substream = substream;
+	runtime->private_data = &(cs5535au->dmas[CS5535AUDIO_DMA_CAPTURE]);
+	snd_pcm_set_sync(substream);
+	if ((err = snd_pcm_hw_constraint_integer(runtime,
+					 SNDRV_PCM_HW_PARAM_PERIODS)) < 0)
+		return err;
+	return 0;
+}
+
+static int snd_cs5535audio_capture_close(snd_pcm_substream_t *substream)
+{
+	return 0;
+}
+
+static int snd_cs5535audio_capture_prepare(snd_pcm_substream_t *substream)
+{
+	cs5535audio_t *cs5535au = snd_pcm_substream_chip(substream);
+	return snd_ac97_set_rate(cs5535au->ac97, AC97_PCM_LR_ADC_RATE,
+					substream->runtime->rate);
+}
+
+static snd_pcm_ops_t snd_cs5535audio_playback_ops = {
+	.open =		snd_cs5535audio_playback_open,
+	.close =	snd_cs5535audio_playback_close,
+	.ioctl =	snd_pcm_lib_ioctl,
+	.hw_params =	snd_cs5535audio_hw_params,
+	.hw_free =	snd_cs5535audio_hw_free,
+	.prepare =	snd_cs5535audio_playback_prepare,
+	.trigger =	snd_cs5535audio_trigger,
+	.pointer =	snd_cs5535audio_pcm_pointer,
+};
+
+static snd_pcm_ops_t snd_cs5535audio_capture_ops = {
+	.open =		snd_cs5535audio_capture_open,
+	.close =	snd_cs5535audio_capture_close,
+	.ioctl =	snd_pcm_lib_ioctl,
+	.hw_params =	snd_cs5535audio_hw_params,
+	.hw_free =	snd_cs5535audio_hw_free,
+	.prepare =	snd_cs5535audio_capture_prepare,
+	.trigger =	snd_cs5535audio_trigger,
+	.pointer =	snd_cs5535audio_pcm_pointer,
+};
+
+static void snd_cs5535audio_pcm_free(snd_pcm_t *pcm)
+{
+	snd_pcm_lib_preallocate_free_for_all(pcm);
+}
+
+static cs5535audio_dma_ops_t snd_cs5535audio_playback_dma_ops = {
+        .type = CS5535AUDIO_DMA_PLAYBACK,
+        .enable_dma = cs5535audio_playback_enable_dma,
+        .disable_dma = cs5535audio_playback_disable_dma,
+        .setup_prd = cs5535audio_playback_setup_prd,
+        .pause_dma = cs5535audio_playback_pause_dma,
+        .read_dma_pntr = cs5535audio_playback_read_dma_pntr,
+};
+
+static cs5535audio_dma_ops_t snd_cs5535audio_capture_dma_ops = {
+        .type = CS5535AUDIO_DMA_CAPTURE,
+        .enable_dma = cs5535audio_capture_enable_dma,
+        .disable_dma = cs5535audio_capture_disable_dma,
+        .setup_prd = cs5535audio_capture_setup_prd,
+        .pause_dma = cs5535audio_capture_pause_dma,
+        .read_dma_pntr = cs5535audio_capture_read_dma_pntr,
+};
+
+int __devinit snd_cs5535audio_pcm(cs5535audio_t *cs5535au)
+{
+	snd_pcm_t *pcm;
+	int err;
+
+	err = snd_pcm_new(cs5535au->card, "CS5535 Audio", 0, 1, 1, &pcm);
+	if (err < 0)
+		return err;
+
+	cs5535au->dmas[CS5535AUDIO_DMA_PLAYBACK].ops =
+					&snd_cs5535audio_playback_dma_ops;
+	cs5535au->dmas[CS5535AUDIO_DMA_CAPTURE].ops =
+					&snd_cs5535audio_capture_dma_ops;
+	snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK,
+					&snd_cs5535audio_playback_ops);
+	snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_CAPTURE,
+					&snd_cs5535audio_capture_ops);
+
+	pcm->private_data = cs5535au;
+	pcm->private_free = snd_cs5535audio_pcm_free;
+	pcm->info_flags = 0;
+	strcpy(pcm->name, "CS5535 Audio");
+
+	snd_pcm_lib_preallocate_pages_for_all(pcm, SNDRV_DMA_TYPE_DEV,
+					snd_dma_pci_data(cs5535au->pci),
+					64*1024, 128*1024);
+
+	return 0;
+}
+
-- 
cgit v1.2.3-71-gd317


From 4c98cfef2efa6b6662ac28c4f0069964bbd9fdf9 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Tue, 29 Nov 2005 09:09:32 +0100
Subject: [ALSA] PATCH] Add PM support to PnP drivers

Add suspend/resume callback to pnp_driver and pnp_card_driver.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 drivers/pnp/card.c   | 25 +++++++++++++++++++++++++
 drivers/pnp/driver.c | 20 ++++++++++++++++++++
 include/linux/pnp.h  |  5 +++++
 3 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pnp/card.c b/drivers/pnp/card.c
index bd7c966ea2d7..0ecbe4edbec1 100644
--- a/drivers/pnp/card.c
+++ b/drivers/pnp/card.c
@@ -69,6 +69,7 @@ static int card_probe(struct pnp_card * card, struct pnp_card_driver * drv)
 			return 0;
 		clink->card = card;
 		clink->driver = drv;
+		clink->pm_state = PMSG_ON;
 		if (drv->probe) {
 			if (drv->probe(clink, id)>=0)
 				return 1;
@@ -333,6 +334,28 @@ void pnp_release_card_device(struct pnp_dev * dev)
 	up_write(&dev->dev.bus->subsys.rwsem);
 }
 
+/*
+ * suspend/resume callbacks
+ */
+static int card_suspend(struct pnp_dev *dev, pm_message_t state)
+{
+	struct pnp_card_link *link = dev->card_link;
+	if (link->pm_state.event == state.event)
+		return 0;
+	link->pm_state = state;
+	return link->driver->suspend(link, state);
+}
+
+static int card_resume(struct pnp_dev *dev)
+{
+	struct pnp_card_link *link = dev->card_link;
+	if (link->pm_state.event == PM_EVENT_ON)
+		return 0;
+	link->pm_state = PMSG_ON;
+	link->driver->resume(link);
+	return 0;
+}
+
 /**
  * pnp_register_card_driver - registers a PnP card driver with the PnP Layer
  * @drv: pointer to the driver to register
@@ -348,6 +371,8 @@ int pnp_register_card_driver(struct pnp_card_driver * drv)
 	drv->link.flags = drv->flags;
 	drv->link.probe = NULL;
 	drv->link.remove = &card_remove_first;
+	drv->link.suspend = drv->suspend ? card_suspend : NULL;
+	drv->link.resume = drv->resume ? card_resume : NULL;
 
 	spin_lock(&pnp_lock);
 	list_add_tail(&drv->global_list, &pnp_card_drivers);
diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c
index d3ccce706ab4..ea2cb9a8b21d 100644
--- a/drivers/pnp/driver.c
+++ b/drivers/pnp/driver.c
@@ -146,10 +146,30 @@ static int pnp_bus_match(struct device *dev, struct device_driver *drv)
 	return 1;
 }
 
+static int pnp_bus_suspend(struct device *dev, pm_message_t state)
+{
+	struct pnp_dev * pnp_dev = to_pnp_dev(dev);
+	struct pnp_driver * pnp_drv = pnp_dev->driver;
+
+	if (pnp_drv && pnp_drv->suspend)
+		return pnp_drv->suspend(pnp_dev, state);
+	return 0;
+}
+
+static void pnp_bus_resume(struct device *dev)
+{
+	struct pnp_dev * pnp_dev = to_pnp_dev(dev);
+	struct pnp_driver * pnp_drv = pnp_dev->driver;
+
+	if (pnp_drv && pnp_drv->resume)
+		pnp_drv->resume(pnp_dev);
+}
 
 struct bus_type pnp_bus_type = {
 	.name	= "pnp",
 	.match	= pnp_bus_match,
+	.suspend = pnp_bus_suspend,
+	.resume = pnp_bus_resume,
 };
 
 
diff --git a/include/linux/pnp.h b/include/linux/pnp.h
index 584d57cb393a..472319fcf631 100644
--- a/include/linux/pnp.h
+++ b/include/linux/pnp.h
@@ -162,6 +162,7 @@ struct pnp_card_link {
 	struct pnp_card * card;
 	struct pnp_card_driver * driver;
 	void * driver_data;
+	pm_message_t pm_state;
 };
 
 static inline void *pnp_get_card_drvdata (struct pnp_card_link *pcard)
@@ -294,6 +295,8 @@ struct pnp_driver {
 	unsigned int flags;
 	int  (*probe)  (struct pnp_dev *dev, const struct pnp_device_id *dev_id);
 	void (*remove) (struct pnp_dev *dev);
+	int  (*suspend) (struct pnp_dev *dev, pm_message_t state);
+	int  (*resume) (struct pnp_dev *dev);
 	struct device_driver driver;
 };
 
@@ -306,6 +309,8 @@ struct pnp_card_driver {
 	unsigned int flags;
 	int  (*probe)  (struct pnp_card_link *card, const struct pnp_card_device_id *card_id);
 	void (*remove) (struct pnp_card_link *card);
+	int  (*suspend) (struct pnp_card_link *card, pm_message_t state);
+	int  (*resume) (struct pnp_card_link *card);
 	struct pnp_driver link;
 };
 
-- 
cgit v1.2.3-71-gd317


From 68094e3251a664ee1389fcf179497237cbf78331 Mon Sep 17 00:00:00 2001
From: Pierre Ossman <drzeus-list@drzeus.cx>
Date: Tue, 29 Nov 2005 09:09:32 +0100
Subject: [ALSA] [PATCH] alsa: Improved PnP suspend support

Also use the PnP functions to start/stop the devices during the suspend so
that drivers will not have to duplicate this code.

Cc: Adam Belay <ambx1@neo.rr.com>
Cc: Jaroslav Kysela <perex@suse.cz>
Cc: Takashi Iwai <tiwai@suse.de>

Signed-off-by: Pierre Ossman <drzeus@drzeus.cx>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 drivers/pnp/driver.c  | 37 ++++++++++++++++++++----
 drivers/pnp/manager.c | 78 ++++++++++++++++++++++++++++++++++++++-------------
 include/linux/pnp.h   |  4 +++
 3 files changed, 95 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c
index ea2cb9a8b21d..15fb758a9e52 100644
--- a/drivers/pnp/driver.c
+++ b/drivers/pnp/driver.c
@@ -150,19 +150,46 @@ static int pnp_bus_suspend(struct device *dev, pm_message_t state)
 {
 	struct pnp_dev * pnp_dev = to_pnp_dev(dev);
 	struct pnp_driver * pnp_drv = pnp_dev->driver;
+	int error;
+
+	if (!pnp_drv)
+		return 0;
+
+	if (pnp_drv->suspend) {
+		error = pnp_drv->suspend(pnp_dev, state);
+		if (error)
+			return error;
+	}
+
+	if (!(pnp_drv->flags & PNP_DRIVER_RES_DO_NOT_CHANGE) &&
+	    pnp_can_disable(pnp_dev)) {
+	    	error = pnp_stop_dev(pnp_dev);
+	    	if (error)
+	    		return error;
+	}
 
-	if (pnp_drv && pnp_drv->suspend)
-		return pnp_drv->suspend(pnp_dev, state);
 	return 0;
 }
 
-static void pnp_bus_resume(struct device *dev)
+static int pnp_bus_resume(struct device *dev)
 {
 	struct pnp_dev * pnp_dev = to_pnp_dev(dev);
 	struct pnp_driver * pnp_drv = pnp_dev->driver;
+	int error;
+
+	if (!pnp_drv)
+		return 0;
+
+	if (!(pnp_drv->flags & PNP_DRIVER_RES_DO_NOT_CHANGE)) {
+		error = pnp_start_dev(pnp_dev);
+		if (error)
+			return error;
+	}
 
-	if (pnp_drv && pnp_drv->resume)
-		pnp_drv->resume(pnp_dev);
+	if (pnp_drv->resume)
+		return pnp_drv->resume(pnp_dev);
+
+	return 0;
 }
 
 struct bus_type pnp_bus_type = {
diff --git a/drivers/pnp/manager.c b/drivers/pnp/manager.c
index 261668618b2d..c4256aa32bcb 100644
--- a/drivers/pnp/manager.c
+++ b/drivers/pnp/manager.c
@@ -469,6 +469,53 @@ int pnp_auto_config_dev(struct pnp_dev *dev)
 	return -EBUSY;
 }
 
+/**
+ * pnp_start_dev - low-level start of the PnP device
+ * @dev: pointer to the desired device
+ *
+ * assumes that resources have alread been allocated
+ */
+
+int pnp_start_dev(struct pnp_dev *dev)
+{
+	if (!pnp_can_write(dev)) {
+		pnp_info("Device %s does not supported activation.", dev->dev.bus_id);
+		return -EINVAL;
+	}
+
+	if (dev->protocol->set(dev, &dev->res)<0) {
+		pnp_err("Failed to activate device %s.", dev->dev.bus_id);
+		return -EIO;
+	}
+
+	pnp_info("Device %s activated.", dev->dev.bus_id);
+
+	return 0;
+}
+
+/**
+ * pnp_stop_dev - low-level disable of the PnP device
+ * @dev: pointer to the desired device
+ *
+ * does not free resources
+ */
+
+int pnp_stop_dev(struct pnp_dev *dev)
+{
+	if (!pnp_can_disable(dev)) {
+		pnp_info("Device %s does not supported disabling.", dev->dev.bus_id);
+		return -EINVAL;
+	}
+	if (dev->protocol->disable(dev)<0) {
+		pnp_err("Failed to disable device %s.", dev->dev.bus_id);
+		return -EIO;
+	}
+
+	pnp_info("Device %s disabled.", dev->dev.bus_id);
+
+	return 0;
+}
+
 /**
  * pnp_activate_dev - activates a PnP device for use
  * @dev: pointer to the desired device
@@ -477,6 +524,8 @@ int pnp_auto_config_dev(struct pnp_dev *dev)
  */
 int pnp_activate_dev(struct pnp_dev *dev)
 {
+	int error;
+
 	if (!dev)
 		return -EINVAL;
 	if (dev->active) {
@@ -487,18 +536,11 @@ int pnp_activate_dev(struct pnp_dev *dev)
 	if (pnp_auto_config_dev(dev))
 		return -EBUSY;
 
-	if (!pnp_can_write(dev)) {
-		pnp_info("Device %s does not supported activation.", dev->dev.bus_id);
-		return -EINVAL;
-	}
-
-	if (dev->protocol->set(dev, &dev->res)<0) {
-		pnp_err("Failed to activate device %s.", dev->dev.bus_id);
-		return -EIO;
-	}
+	error = pnp_start_dev(dev);
+	if (error)
+		return error;
 
 	dev->active = 1;
-	pnp_info("Device %s activated.", dev->dev.bus_id);
 
 	return 1;
 }
@@ -511,23 +553,19 @@ int pnp_activate_dev(struct pnp_dev *dev)
  */
 int pnp_disable_dev(struct pnp_dev *dev)
 {
+	int error;
+
         if (!dev)
                 return -EINVAL;
 	if (!dev->active) {
 		return 0; /* the device is already disabled */
 	}
 
-	if (!pnp_can_disable(dev)) {
-		pnp_info("Device %s does not supported disabling.", dev->dev.bus_id);
-		return -EINVAL;
-	}
-	if (dev->protocol->disable(dev)<0) {
-		pnp_err("Failed to disable device %s.", dev->dev.bus_id);
-		return -EIO;
-	}
+	error = pnp_stop_dev(dev);
+	if (error)
+		return error;
 
 	dev->active = 0;
-	pnp_info("Device %s disabled.", dev->dev.bus_id);
 
 	/* release the resources so that other devices can use them */
 	down(&pnp_res_mutex);
@@ -558,6 +596,8 @@ EXPORT_SYMBOL(pnp_manual_config_dev);
 #if 0
 EXPORT_SYMBOL(pnp_auto_config_dev);
 #endif
+EXPORT_SYMBOL(pnp_start_dev);
+EXPORT_SYMBOL(pnp_stop_dev);
 EXPORT_SYMBOL(pnp_activate_dev);
 EXPORT_SYMBOL(pnp_disable_dev);
 EXPORT_SYMBOL(pnp_resource_change);
diff --git a/include/linux/pnp.h b/include/linux/pnp.h
index 472319fcf631..93b0959eb40f 100644
--- a/include/linux/pnp.h
+++ b/include/linux/pnp.h
@@ -385,6 +385,8 @@ void pnp_init_resource_table(struct pnp_resource_table *table);
 int pnp_manual_config_dev(struct pnp_dev *dev, struct pnp_resource_table *res, int mode);
 int pnp_auto_config_dev(struct pnp_dev *dev);
 int pnp_validate_config(struct pnp_dev *dev);
+int pnp_start_dev(struct pnp_dev *dev);
+int pnp_stop_dev(struct pnp_dev *dev);
 int pnp_activate_dev(struct pnp_dev *dev);
 int pnp_disable_dev(struct pnp_dev *dev);
 void pnp_resource_change(struct resource *resource, unsigned long start, unsigned long size);
@@ -428,6 +430,8 @@ static inline void pnp_init_resource_table(struct pnp_resource_table *table) { }
 static inline int pnp_manual_config_dev(struct pnp_dev *dev, struct pnp_resource_table *res, int mode) { return -ENODEV; }
 static inline int pnp_auto_config_dev(struct pnp_dev *dev) { return -ENODEV; }
 static inline int pnp_validate_config(struct pnp_dev *dev) { return -ENODEV; }
+static inline int pnp_start_dev(struct pnp_dev *dev) { return -ENODEV; }
+static inline int pnp_stop_dev(struct pnp_dev *dev) { return -ENODEV; }
 static inline int pnp_activate_dev(struct pnp_dev *dev) { return -ENODEV; }
 static inline int pnp_disable_dev(struct pnp_dev *dev) { return -ENODEV; }
 static inline void pnp_resource_change(struct resource *resource, unsigned long start, unsigned long size) { }
-- 
cgit v1.2.3-71-gd317


From 4d399cae3f5ec1f59b9e88084aae09c4f00760c9 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 3 Jan 2006 13:19:13 +0100
Subject: remove pointers to the defunct UDF mailing list

This patch removes pointers to the defunct UDF mailing list.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 MAINTAINERS               | 1 -
 fs/udf/balloc.c           | 5 -----
 fs/udf/crc.c              | 5 -----
 fs/udf/dir.c              | 5 -----
 fs/udf/directory.c        | 5 -----
 fs/udf/file.c             | 5 -----
 fs/udf/fsync.c            | 5 -----
 fs/udf/ialloc.c           | 5 -----
 fs/udf/inode.c            | 5 -----
 fs/udf/lowlevel.c         | 5 -----
 fs/udf/misc.c             | 5 -----
 fs/udf/namei.c            | 5 -----
 fs/udf/partition.c        | 5 -----
 fs/udf/super.c            | 5 -----
 fs/udf/symlink.c          | 5 -----
 fs/udf/truncate.c         | 5 -----
 fs/udf/unicode.c          | 5 -----
 include/linux/udf_fs.h    | 5 -----
 include/linux/udf_fs_i.h  | 5 -----
 include/linux/udf_fs_sb.h | 5 -----
 20 files changed, 96 deletions(-)

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 6af683025ae0..bbe5f04f59ec 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2587,7 +2587,6 @@ S:	Maintained
 UDF FILESYSTEM
 P:	Ben Fennema
 M:	bfennema@falcon.csc.calpoly.edu
-L:	linux_udf@hpesjro.fc.hp.com
 W:	http://linux-udf.sourceforge.net
 S:	Maintained
 
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index b9ded26b10a9..6598a5037ac8 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -4,11 +4,6 @@
  * PURPOSE
  *	Block allocation handling routines for the OSTA-UDF(tm) filesystem.
  *
- * CONTACTS
- *	E-mail regarding any portion of the Linux UDF file system should be
- *	directed to the development team mailing list (run by majordomo):
- *		linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *	This file is distributed under the terms of the GNU General Public
  *	License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/crc.c b/fs/udf/crc.c
index d95c6e38a455..1b82a4adc2f7 100644
--- a/fs/udf/crc.c
+++ b/fs/udf/crc.c
@@ -14,11 +14,6 @@
  *
  *	AT&T gives permission for the free use of the CRC source code.
  *
- * CONTACTS
- *	E-mail regarding any portion of the Linux UDF file system should be
- *	directed to the development team mailing list (run by majordomo):
- *		linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *	This file is distributed under the terms of the GNU General Public
  *	License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 82440b731142..f5222527fe39 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -4,11 +4,6 @@
  * PURPOSE
  *  Directory handling routines for the OSTA-UDF(tm) filesystem.
  *
- * CONTACTS
- *	E-mail regarding any portion of the Linux UDF file system should be
- *	directed to the development team mailing list (run by majordomo):
- *		linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *	This file is distributed under the terms of the GNU General Public
  *	License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 9a61ecc5451b..fe751a2a0e47 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -4,11 +4,6 @@
  * PURPOSE
  *	Directory related functions
  *
- * CONTACTS
- *	E-mail regarding any portion of the Linux UDF file system should be
- *	directed to the development team mailing list (run by majordomo):
- *		linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *	This file is distributed under the terms of the GNU General Public
  *	License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 01f520c71dc1..8a388289040d 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -4,11 +4,6 @@
  * PURPOSE
  *  File handling routines for the OSTA-UDF(tm) filesystem.
  *
- * CONTACTS
- *  E-mail regarding any portion of the Linux UDF file system should be
- *  directed to the development team mailing list (run by majordomo):
- *    linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *  This file is distributed under the terms of the GNU General Public
  *  License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/fsync.c b/fs/udf/fsync.c
index 2dde6b888c2b..5887d78cde43 100644
--- a/fs/udf/fsync.c
+++ b/fs/udf/fsync.c
@@ -4,11 +4,6 @@
  * PURPOSE
  *  Fsync handling routines for the OSTA-UDF(tm) filesystem.
  *
- * CONTACTS
- *  E-mail regarding any portion of the Linux UDF file system should be
- *  directed to the development team mailing list (run by majordomo):
- *      linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *  This file is distributed under the terms of the GNU General Public
  *  License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index a7e5d40f1ebc..c9b707b470ca 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -4,11 +4,6 @@
  * PURPOSE
  *	Inode allocation handling routines for the OSTA-UDF(tm) filesystem.
  *
- * CONTACTS
- *	E-mail regarding any portion of the Linux UDF file system should be
- *	directed to the development team mailing list (run by majordomo):
- *		linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *	This file is distributed under the terms of the GNU General Public
  *	License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index b83890beaaac..4014f17d382e 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -4,11 +4,6 @@
  * PURPOSE
  *  Inode handling routines for the OSTA-UDF(tm) filesystem.
  *
- * CONTACTS
- *  E-mail regarding any portion of the Linux UDF file system should be
- *  directed to the development team mailing list (run by majordomo):
- *    linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *  This file is distributed under the terms of the GNU General Public
  *  License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 2da5087dfe05..084216107667 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -4,11 +4,6 @@
  * PURPOSE
  *  Low Level Device Routines for the UDF filesystem
  *
- * CONTACTS
- *	E-mail regarding any portion of the Linux UDF file system should be
- *	directed to the development team mailing list (run by majordomo):
- *		linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *	This file is distributed under the terms of the GNU General Public
  *	License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index fd321f9ace83..cc8ca3254db1 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -4,11 +4,6 @@
  * PURPOSE
  *	Miscellaneous routines for the OSTA-UDF(tm) filesystem.
  *
- * CONTACTS
- *	E-mail regarding any portion of the Linux UDF file system should be
- *	directed to the development team mailing list (run by majordomo):
- *		linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *	This file is distributed under the terms of the GNU General Public
  *	License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index ac191ed7df0a..ca732e79c48b 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -4,11 +4,6 @@
  * PURPOSE
  *      Inode name handling routines for the OSTA-UDF(tm) filesystem.
  *
- * CONTACTS
- *      E-mail regarding any portion of the Linux UDF file system should be
- *      directed to the development team mailing list (run by majordomo):
- *              linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *      This file is distributed under the terms of the GNU General Public
  *      License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 4d36f264be0d..dabf2b841db8 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -4,11 +4,6 @@
  * PURPOSE
  *      Partition handling routines for the OSTA-UDF(tm) filesystem.
  *
- * CONTACTS
- *      E-mail regarding any portion of the Linux UDF file system should be
- *      directed to the development team mailing list (run by majordomo):
- *              linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *      This file is distributed under the terms of the GNU General Public
  *      License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 15bd4f24c5b7..4a6f49adc609 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -14,11 +14,6 @@
  *    http://www.ecma.ch/
  *    http://www.iso.org/
  *
- * CONTACTS
- *  E-mail regarding any portion of the Linux UDF file system should be
- *  directed to the development team mailing list (run by majordomo):
- *	  linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *  This file is distributed under the terms of the GNU General Public
  *  License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 43f3051ef756..674bb40edc83 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -4,11 +4,6 @@
  * PURPOSE
  *	Symlink handling routines for the OSTA-UDF(tm) filesystem.
  *
- * CONTACTS
- *	E-mail regarding any portion of the Linux UDF file system should be
- *	directed to the development team mailing list (run by majordomo):
- *		linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *	This file is distributed under the terms of the GNU General Public
  *	License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 7dc8a5572ca1..e1b0e8cfecb4 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -4,11 +4,6 @@
  * PURPOSE
  *	Truncate handling routines for the OSTA-UDF(tm) filesystem.
  *
- * CONTACTS
- *	E-mail regarding any portion of the Linux UDF file system should be
- *	directed to the development team mailing list (run by majordomo):
- *		linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *	This file is distributed under the terms of the GNU General Public
  *	License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 5a80efd8debc..706c92e1dcc9 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -11,11 +11,6 @@
  *	UTF-8 is explained in the IETF RFC XXXX.
  *		ftp://ftp.internic.net/rfc/rfcxxxx.txt
  *
- * CONTACTS
- *	E-mail regarding any portion of the Linux UDF file system should be
- *	directed to the development team's mailing list (run by majordomo):
- *		linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *	This file is distributed under the terms of the GNU General Public
  *	License (GPL). Copies of the GPL can be obtained from:
diff --git a/include/linux/udf_fs.h b/include/linux/udf_fs.h
index 46e2bb945353..36c684e1b110 100644
--- a/include/linux/udf_fs.h
+++ b/include/linux/udf_fs.h
@@ -13,11 +13,6 @@
  *    http://www.osta.org/ *    http://www.ecma.ch/
  *    http://www.iso.org/
  *
- * CONTACTS
- *	E-mail regarding any portion of the Linux UDF file system should be
- *	directed to the development team mailing list (run by majordomo):
- *		linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *	This file is distributed under the terms of the GNU General Public
  *	License (GPL). Copies of the GPL can be obtained from:
diff --git a/include/linux/udf_fs_i.h b/include/linux/udf_fs_i.h
index 62b15a4214e6..1e7508420fcf 100644
--- a/include/linux/udf_fs_i.h
+++ b/include/linux/udf_fs_i.h
@@ -3,11 +3,6 @@
  *
  * This file is intended for the Linux kernel/module. 
  *
- * CONTACTS
- *	E-mail regarding any portion of the Linux UDF file system should be
- *	directed to the development team mailing list (run by majordomo):
- *		linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *	This file is distributed under the terms of the GNU General Public
  *	License (GPL). Copies of the GPL can be obtained from:
diff --git a/include/linux/udf_fs_sb.h b/include/linux/udf_fs_sb.h
index 1966a6dbb4b6..b15ff2e99c91 100644
--- a/include/linux/udf_fs_sb.h
+++ b/include/linux/udf_fs_sb.h
@@ -3,11 +3,6 @@
  * 
  * This include file is for the Linux kernel/module.
  *
- * CONTACTS
- *	E-mail regarding any portion of the Linux UDF file system should be
- *	directed to the development team mailing list (run by majordomo):
- *		linux_udf@hpesjro.fc.hp.com
- *
  * COPYRIGHT
  *	This file is distributed under the terms of the GNU General Public
  *	License (GPL). Copies of the GPL can be obtained from:
-- 
cgit v1.2.3-71-gd317


From 7063fbf2261194f72ee75afca67b3b38b554b5fa Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 15 Dec 2005 14:29:43 -0800
Subject: [PATCH] configfs: User-driven configuration filesystem

Configfs, a file system for userspace-driven kernel object configuration.
The OCFS2 stack makes extensive use of this for propagation of cluster
configuration information into kernel.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 Documentation/filesystems/00-INDEX                 |    2 +
 Documentation/filesystems/configfs/configfs.txt    |  434 ++++++++
 .../filesystems/configfs/configfs_example.c        |  474 +++++++++
 MAINTAINERS                                        |    5 +
 fs/Kconfig                                         |   14 +
 fs/Makefile                                        |    1 +
 fs/configfs/Makefile                               |    7 +
 fs/configfs/configfs_internal.h                    |  142 +++
 fs/configfs/dir.c                                  | 1102 ++++++++++++++++++++
 fs/configfs/file.c                                 |  360 +++++++
 fs/configfs/inode.c                                |  162 +++
 fs/configfs/item.c                                 |  227 ++++
 fs/configfs/mount.c                                |  159 +++
 fs/configfs/symlink.c                              |  281 +++++
 include/linux/configfs.h                           |  205 ++++
 15 files changed, 3575 insertions(+)
 create mode 100644 Documentation/filesystems/configfs/configfs.txt
 create mode 100644 Documentation/filesystems/configfs/configfs_example.c
 create mode 100644 fs/configfs/Makefile
 create mode 100644 fs/configfs/configfs_internal.h
 create mode 100644 fs/configfs/dir.c
 create mode 100644 fs/configfs/file.c
 create mode 100644 fs/configfs/inode.c
 create mode 100644 fs/configfs/item.c
 create mode 100644 fs/configfs/mount.c
 create mode 100644 fs/configfs/symlink.c
 create mode 100644 include/linux/configfs.h

(limited to 'include/linux')

diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index bcfbab899b37..628f8a7adb85 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -12,6 +12,8 @@ cifs.txt
 	- description of the CIFS filesystem
 coda.txt
 	- description of the CODA filesystem.
+configfs/
+	- directory containing configfs documentation and example code.
 cramfs.txt
 	- info on the cram filesystem for small storage (ROMs etc)
 devfs/
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
new file mode 100644
index 000000000000..c4ff96b7c4e0
--- /dev/null
+++ b/Documentation/filesystems/configfs/configfs.txt
@@ -0,0 +1,434 @@
+
+configfs - Userspace-driven kernel object configuation.
+
+Joel Becker <joel.becker@oracle.com>
+
+Updated: 31 March 2005
+
+Copyright (c) 2005 Oracle Corporation,
+	Joel Becker <joel.becker@oracle.com>
+
+
+[What is configfs?]
+
+configfs is a ram-based filesystem that provides the converse of
+sysfs's functionality.  Where sysfs is a filesystem-based view of
+kernel objects, configfs is a filesystem-based manager of kernel
+objects, or config_items.
+
+With sysfs, an object is created in kernel (for example, when a device
+is discovered) and it is registered with sysfs.  Its attributes then
+appear in sysfs, allowing userspace to read the attributes via
+readdir(3)/read(2).  It may allow some attributes to be modified via
+write(2).  The important point is that the object is created and
+destroyed in kernel, the kernel controls the lifecycle of the sysfs
+representation, and sysfs is merely a window on all this.
+
+A configfs config_item is created via an explicit userspace operation:
+mkdir(2).  It is destroyed via rmdir(2).  The attributes appear at
+mkdir(2) time, and can be read or modified via read(2) and write(2).
+As with sysfs, readdir(3) queries the list of items and/or attributes.
+symlink(2) can be used to group items together.  Unlike sysfs, the
+lifetime of the representation is completely driven by userspace.  The
+kernel modules backing the items must respond to this.
+
+Both sysfs and configfs can and should exist together on the same
+system.  One is not a replacement for the other.
+
+[Using configfs]
+
+configfs can be compiled as a module or into the kernel.  You can access
+it by doing
+
+	mount -t configfs none /config
+
+The configfs tree will be empty unless client modules are also loaded.
+These are modules that register their item types with configfs as
+subsystems.  Once a client subsystem is loaded, it will appear as a
+subdirectory (or more than one) under /config.  Like sysfs, the
+configfs tree is always there, whether mounted on /config or not.
+
+An item is created via mkdir(2).  The item's attributes will also
+appear at this time.  readdir(3) can determine what the attributes are,
+read(2) can query their default values, and write(2) can store new
+values.  Like sysfs, attributes should be ASCII text files, preferably
+with only one value per file.  The same efficiency caveats from sysfs
+apply.  Don't mix more than one attribute in one attribute file.
+
+Like sysfs, configfs expects write(2) to store the entire buffer at
+once.  When writing to configfs attributes, userspace processes should
+first read the entire file, modify the portions they wish to change, and
+then write the entire buffer back.  Attribute files have a maximum size
+of one page (PAGE_SIZE, 4096 on i386).
+
+When an item needs to be destroyed, remove it with rmdir(2).  An
+item cannot be destroyed if any other item has a link to it (via
+symlink(2)).  Links can be removed via unlink(2).
+
+[Configuring FakeNBD: an Example]
+
+Imagine there's a Network Block Device (NBD) driver that allows you to
+access remote block devices.  Call it FakeNBD.  FakeNBD uses configfs
+for its configuration.  Obviously, there will be a nice program that
+sysadmins use to configure FakeNBD, but somehow that program has to tell
+the driver about it.  Here's where configfs comes in.
+
+When the FakeNBD driver is loaded, it registers itself with configfs.
+readdir(3) sees this just fine:
+
+	# ls /config
+	fakenbd
+
+A fakenbd connection can be created with mkdir(2).  The name is
+arbitrary, but likely the tool will make some use of the name.  Perhaps
+it is a uuid or a disk name:
+
+	# mkdir /config/fakenbd/disk1
+	# ls /config/fakenbd/disk1
+	target device rw
+
+The target attribute contains the IP address of the server FakeNBD will
+connect to.  The device attribute is the device on the server.
+Predictably, the rw attribute determines whether the connection is
+read-only or read-write.
+
+	# echo 10.0.0.1 > /config/fakenbd/disk1/target
+	# echo /dev/sda1 > /config/fakenbd/disk1/device
+	# echo 1 > /config/fakenbd/disk1/rw
+
+That's it.  That's all there is.  Now the device is configured, via the
+shell no less.
+
+[Coding With configfs]
+
+Every object in configfs is a config_item.  A config_item reflects an
+object in the subsystem.  It has attributes that match values on that
+object.  configfs handles the filesystem representation of that object
+and its attributes, allowing the subsystem to ignore all but the
+basic show/store interaction.
+
+Items are created and destroyed inside a config_group.  A group is a
+collection of items that share the same attributes and operations.
+Items are created by mkdir(2) and removed by rmdir(2), but configfs
+handles that.  The group has a set of operations to perform these tasks
+
+A subsystem is the top level of a client module.  During initialization,
+the client module registers the subsystem with configfs, the subsystem
+appears as a directory at the top of the configfs filesystem.  A
+subsystem is also a config_group, and can do everything a config_group
+can.
+
+[struct config_item]
+
+	struct config_item {
+		char                    *ci_name;
+		char                    ci_namebuf[UOBJ_NAME_LEN];
+		struct kref             ci_kref;
+		struct list_head        ci_entry;
+		struct config_item      *ci_parent;
+		struct config_group     *ci_group;
+		struct config_item_type *ci_type;
+		struct dentry           *ci_dentry;
+	};
+
+	void config_item_init(struct config_item *);
+	void config_item_init_type_name(struct config_item *,
+					const char *name,
+					struct config_item_type *type);
+	struct config_item *config_item_get(struct config_item *);
+	void config_item_put(struct config_item *);
+
+Generally, struct config_item is embedded in a container structure, a
+structure that actually represents what the subsystem is doing.  The
+config_item portion of that structure is how the object interacts with
+configfs.
+
+Whether statically defined in a source file or created by a parent
+config_group, a config_item must have one of the _init() functions
+called on it.  This initializes the reference count and sets up the
+appropriate fields.
+
+All users of a config_item should have a reference on it via
+config_item_get(), and drop the reference when they are done via
+config_item_put().
+
+By itself, a config_item cannot do much more than appear in configfs.
+Usually a subsystem wants the item to display and/or store attributes,
+among other things.  For that, it needs a type.
+
+[struct config_item_type]
+
+	struct configfs_item_operations {
+		void (*release)(struct config_item *);
+		ssize_t (*show_attribute)(struct config_item *,
+					  struct configfs_attribute *,
+					  char *);
+		ssize_t (*store_attribute)(struct config_item *,
+					   struct configfs_attribute *,
+					   const char *, size_t);
+		int (*allow_link)(struct config_item *src,
+				  struct config_item *target);
+		int (*drop_link)(struct config_item *src,
+				 struct config_item *target);
+	};
+
+	struct config_item_type {
+		struct module                           *ct_owner;
+		struct configfs_item_operations         *ct_item_ops;
+		struct configfs_group_operations        *ct_group_ops;
+		struct configfs_attribute               **ct_attrs;
+	};
+
+The most basic function of a config_item_type is to define what
+operations can be performed on a config_item.  All items that have been
+allocated dynamically will need to provide the ct_item_ops->release()
+method.  This method is called when the config_item's reference count
+reaches zero.  Items that wish to display an attribute need to provide
+the ct_item_ops->show_attribute() method.  Similarly, storing a new
+attribute value uses the store_attribute() method.
+
+[struct configfs_attribute]
+
+	struct configfs_attribute {
+		char                    *ca_name;
+		struct module           *ca_owner;
+		mode_t                  ca_mode;
+	};
+
+When a config_item wants an attribute to appear as a file in the item's
+configfs directory, it must define a configfs_attribute describing it.
+It then adds the attribute to the NULL-terminated array
+config_item_type->ct_attrs.  When the item appears in configfs, the
+attribute file will appear with the configfs_attribute->ca_name
+filename.  configfs_attribute->ca_mode specifies the file permissions.
+
+If an attribute is readable and the config_item provides a
+ct_item_ops->show_attribute() method, that method will be called
+whenever userspace asks for a read(2) on the attribute.  The converse
+will happen for write(2).
+
+[struct config_group]
+
+A config_item cannot live in a vaccum.  The only way one can be created
+is via mkdir(2) on a config_group.  This will trigger creation of a
+child item.
+
+	struct config_group {
+		struct config_item		cg_item;
+		struct list_head		cg_children;
+		struct configfs_subsystem 	*cg_subsys;
+		struct config_group		**default_groups;
+	};
+
+	void config_group_init(struct config_group *group);
+	void config_group_init_type_name(struct config_group *group,
+					 const char *name,
+					 struct config_item_type *type);
+
+
+The config_group structure contains a config_item.  Properly configuring
+that item means that a group can behave as an item in its own right.
+However, it can do more: it can create child items or groups.  This is
+accomplished via the group operations specified on the group's
+config_item_type.
+
+	struct configfs_group_operations {
+		struct config_item *(*make_item)(struct config_group *group,
+						 const char *name);
+		struct config_group *(*make_group)(struct config_group *group,
+						   const char *name);
+		int (*commit_item)(struct config_item *item);
+		void (*drop_item)(struct config_group *group,
+				  struct config_item *item);
+	};
+
+A group creates child items by providing the
+ct_group_ops->make_item() method.  If provided, this method is called from mkdir(2) in the group's directory.  The subsystem allocates a new
+config_item (or more likely, its container structure), initializes it,
+and returns it to configfs.  Configfs will then populate the filesystem
+tree to reflect the new item.
+
+If the subsystem wants the child to be a group itself, the subsystem
+provides ct_group_ops->make_group().  Everything else behaves the same,
+using the group _init() functions on the group.
+
+Finally, when userspace calls rmdir(2) on the item or group,
+ct_group_ops->drop_item() is called.  As a config_group is also a
+config_item, it is not necessary for a seperate drop_group() method.
+The subsystem must config_item_put() the reference that was initialized
+upon item allocation.  If a subsystem has no work to do, it may omit
+the ct_group_ops->drop_item() method, and configfs will call
+config_item_put() on the item on behalf of the subsystem.
+
+IMPORTANT: drop_item() is void, and as such cannot fail.  When rmdir(2)
+is called, configfs WILL remove the item from the filesystem tree
+(assuming that it has no children to keep it busy).  The subsystem is
+responsible for responding to this.  If the subsystem has references to
+the item in other threads, the memory is safe.  It may take some time
+for the item to actually disappear from the subsystem's usage.  But it
+is gone from configfs.
+
+A config_group cannot be removed while it still has child items.  This
+is implemented in the configfs rmdir(2) code.  ->drop_item() will not be
+called, as the item has not been dropped.  rmdir(2) will fail, as the
+directory is not empty.
+
+[struct configfs_subsystem]
+
+A subsystem must register itself, ususally at module_init time.  This
+tells configfs to make the subsystem appear in the file tree.
+
+	struct configfs_subsystem {
+		struct config_group	su_group;
+		struct semaphore	su_sem;
+	};
+
+	int configfs_register_subsystem(struct configfs_subsystem *subsys);
+	void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
+
+	A subsystem consists of a toplevel config_group and a semaphore.
+The group is where child config_items are created.  For a subsystem,
+this group is usually defined statically.  Before calling
+configfs_register_subsystem(), the subsystem must have initialized the
+group via the usual group _init() functions, and it must also have
+initialized the semaphore.
+	When the register call returns, the subsystem is live, and it
+will be visible via configfs.  At that point, mkdir(2) can be called and
+the subsystem must be ready for it.
+
+[An Example]
+
+The best example of these basic concepts is the simple_children
+subsystem/group and the simple_child item in configfs_example.c  It
+shows a trivial object displaying and storing an attribute, and a simple
+group creating and destroying these children.
+
+[Hierarchy Navigation and the Subsystem Semaphore]
+
+There is an extra bonus that configfs provides.  The config_groups and
+config_items are arranged in a hierarchy due to the fact that they
+appear in a filesystem.  A subsystem is NEVER to touch the filesystem
+parts, but the subsystem might be interested in this hierarchy.  For
+this reason, the hierarchy is mirrored via the config_group->cg_children
+and config_item->ci_parent structure members.
+
+A subsystem can navigate the cg_children list and the ci_parent pointer
+to see the tree created by the subsystem.  This can race with configfs'
+management of the hierarchy, so configfs uses the subsystem semaphore to
+protect modifications.  Whenever a subsystem wants to navigate the
+hierarchy, it must do so under the protection of the subsystem
+semaphore.
+
+A subsystem will be prevented from acquiring the semaphore while a newly
+allocated item has not been linked into this hierarchy.   Similarly, it
+will not be able to acquire the semaphore while a dropping item has not
+yet been unlinked.  This means that an item's ci_parent pointer will
+never be NULL while the item is in configfs, and that an item will only
+be in its parent's cg_children list for the same duration.  This allows
+a subsystem to trust ci_parent and cg_children while they hold the
+semaphore.
+
+[Item Aggregation Via symlink(2)]
+
+configfs provides a simple group via the group->item parent/child
+relationship.  Often, however, a larger environment requires aggregation
+outside of the parent/child connection.  This is implemented via
+symlink(2).
+
+A config_item may provide the ct_item_ops->allow_link() and
+ct_item_ops->drop_link() methods.  If the ->allow_link() method exists,
+symlink(2) may be called with the config_item as the source of the link.
+These links are only allowed between configfs config_items.  Any
+symlink(2) attempt outside the configfs filesystem will be denied.
+
+When symlink(2) is called, the source config_item's ->allow_link()
+method is called with itself and a target item.  If the source item
+allows linking to target item, it returns 0.  A source item may wish to
+reject a link if it only wants links to a certain type of object (say,
+in its own subsystem).
+
+When unlink(2) is called on the symbolic link, the source item is
+notified via the ->drop_link() method.  Like the ->drop_item() method,
+this is a void function and cannot return failure.  The subsystem is
+responsible for responding to the change.
+
+A config_item cannot be removed while it links to any other item, nor
+can it be removed while an item links to it.  Dangling symlinks are not
+allowed in configfs.
+
+[Automatically Created Subgroups]
+
+A new config_group may want to have two types of child config_items.
+While this could be codified by magic names in ->make_item(), it is much
+more explicit to have a method whereby userspace sees this divergence.
+
+Rather than have a group where some items behave differently than
+others, configfs provides a method whereby one or many subgroups are
+automatically created inside the parent at its creation.  Thus,
+mkdir("parent) results in "parent", "parent/subgroup1", up through
+"parent/subgroupN".  Items of type 1 can now be created in
+"parent/subgroup1", and items of type N can be created in
+"parent/subgroupN".
+
+These automatic subgroups, or default groups, do not preclude other
+children of the parent group.  If ct_group_ops->make_group() exists,
+other child groups can be created on the parent group directly.
+
+A configfs subsystem specifies default groups by filling in the
+NULL-terminated array default_groups on the config_group structure.
+Each group in that array is populated in the configfs tree at the same
+time as the parent group.  Similarly, they are removed at the same time
+as the parent.  No extra notification is provided.  When a ->drop_item()
+method call notifies the subsystem the parent group is going away, it
+also means every default group child associated with that parent group.
+
+As a consequence of this, default_groups cannot be removed directly via
+rmdir(2).  They also are not considered when rmdir(2) on the parent
+group is checking for children.
+
+[Committable Items]
+
+NOTE: Committable items are currently unimplemented.
+
+Some config_items cannot have a valid initial state.  That is, no
+default values can be specified for the item's attributes such that the
+item can do its work.  Userspace must configure one or more attributes,
+after which the subsystem can start whatever entity this item
+represents.
+
+Consider the FakeNBD device from above.  Without a target address *and*
+a target device, the subsystem has no idea what block device to import.
+The simple example assumes that the subsystem merely waits until all the
+appropriate attributes are configured, and then connects.  This will,
+indeed, work, but now every attribute store must check if the attributes
+are initialized.  Every attribute store must fire off the connection if
+that condition is met.
+
+Far better would be an explicit action notifying the subsystem that the
+config_item is ready to go.  More importantly, an explicit action allows
+the subsystem to provide feedback as to whether the attibutes are
+initialized in a way that makes sense.  configfs provides this as
+committable items.
+
+configfs still uses only normal filesystem operations.  An item is
+committed via rename(2).  The item is moved from a directory where it
+can be modified to a directory where it cannot.
+
+Any group that provides the ct_group_ops->commit_item() method has
+committable items.  When this group appears in configfs, mkdir(2) will
+not work directly in the group.  Instead, the group will have two
+subdirectories: "live" and "pending".  The "live" directory does not
+support mkdir(2) or rmdir(2) either.  It only allows rename(2).  The
+"pending" directory does allow mkdir(2) and rmdir(2).  An item is
+created in the "pending" directory.  Its attributes can be modified at
+will.  Userspace commits the item by renaming it into the "live"
+directory.  At this point, the subsystem recieves the ->commit_item()
+callback.  If all required attributes are filled to satisfaction, the
+method returns zero and the item is moved to the "live" directory.
+
+As rmdir(2) does not work in the "live" directory, an item must be
+shutdown, or "uncommitted".  Again, this is done via rename(2), this
+time from the "live" directory back to the "pending" one.  The subsystem
+is notified by the ct_group_ops->uncommit_object() method.
+
+
diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c
new file mode 100644
index 000000000000..f3c6e4946f98
--- /dev/null
+++ b/Documentation/filesystems/configfs/configfs_example.c
@@ -0,0 +1,474 @@
+/*
+ * vim: noexpandtab ts=8 sts=0 sw=8:
+ *
+ * configfs_example.c - This file is a demonstration module containing
+ *      a number of configfs subsystems.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+
+/*
+ * 01-childless
+ *
+ * This first example is a childless subsystem.  It cannot create
+ * any config_items.  It just has attributes.
+ *
+ * Note that we are enclosing the configfs_subsystem inside a container.
+ * This is not necessary if a subsystem has no attributes directly
+ * on the subsystem.  See the next example, 02-simple-children, for
+ * such a subsystem.
+ */
+
+struct childless {
+	struct configfs_subsystem subsys;
+	int showme;
+	int storeme;
+};
+
+struct childless_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct childless *, char *);
+	ssize_t (*store)(struct childless *, const char *, size_t);
+};
+
+static inline struct childless *to_childless(struct config_item *item)
+{
+	return item ? container_of(to_configfs_subsystem(to_config_group(item)), struct childless, subsys) : NULL;
+}
+
+static ssize_t childless_showme_read(struct childless *childless,
+				     char *page)
+{
+	ssize_t pos;
+
+	pos = sprintf(page, "%d\n", childless->showme);
+	childless->showme++;
+
+	return pos;
+}
+
+static ssize_t childless_storeme_read(struct childless *childless,
+				      char *page)
+{
+	return sprintf(page, "%d\n", childless->storeme);
+}
+
+static ssize_t childless_storeme_write(struct childless *childless,
+				       const char *page,
+				       size_t count)
+{
+	unsigned long tmp;
+	char *p = (char *) page;
+
+	tmp = simple_strtoul(p, &p, 10);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp > INT_MAX)
+		return -ERANGE;
+
+	childless->storeme = tmp;
+
+	return count;
+}
+
+static ssize_t childless_description_read(struct childless *childless,
+					  char *page)
+{
+	return sprintf(page,
+"[01-childless]\n"
+"\n"
+"The childless subsystem is the simplest possible subsystem in\n"
+"configfs.  It does not support the creation of child config_items.\n"
+"It only has a few attributes.  In fact, it isn't much different\n"
+"than a directory in /proc.\n");
+}
+
+static struct childless_attribute childless_attr_showme = {
+	.attr	= { .ca_owner = THIS_MODULE, .ca_name = "showme", .ca_mode = S_IRUGO },
+	.show	= childless_showme_read,
+};
+static struct childless_attribute childless_attr_storeme = {
+	.attr	= { .ca_owner = THIS_MODULE, .ca_name = "storeme", .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= childless_storeme_read,
+	.store	= childless_storeme_write,
+};
+static struct childless_attribute childless_attr_description = {
+	.attr = { .ca_owner = THIS_MODULE, .ca_name = "description", .ca_mode = S_IRUGO },
+	.show = childless_description_read,
+};
+
+static struct configfs_attribute *childless_attrs[] = {
+	&childless_attr_showme.attr,
+	&childless_attr_storeme.attr,
+	&childless_attr_description.attr,
+	NULL,
+};
+
+static ssize_t childless_attr_show(struct config_item *item,
+				   struct configfs_attribute *attr,
+				   char *page)
+{
+	struct childless *childless = to_childless(item);
+	struct childless_attribute *childless_attr =
+		container_of(attr, struct childless_attribute, attr);
+	ssize_t ret = 0;
+
+	if (childless_attr->show)
+		ret = childless_attr->show(childless, page);
+	return ret;
+}
+
+static ssize_t childless_attr_store(struct config_item *item,
+				    struct configfs_attribute *attr,
+				    const char *page, size_t count)
+{
+	struct childless *childless = to_childless(item);
+	struct childless_attribute *childless_attr =
+		container_of(attr, struct childless_attribute, attr);
+	ssize_t ret = -EINVAL;
+
+	if (childless_attr->store)
+		ret = childless_attr->store(childless, page, count);
+	return ret;
+}
+
+static struct configfs_item_operations childless_item_ops = {
+	.show_attribute		= childless_attr_show,
+	.store_attribute	= childless_attr_store,
+};
+
+static struct config_item_type childless_type = {
+	.ct_item_ops	= &childless_item_ops,
+	.ct_attrs	= childless_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct childless childless_subsys = {
+	.subsys = {
+		.su_group = {
+			.cg_item = {
+				.ci_namebuf = "01-childless",
+				.ci_type = &childless_type,
+			},
+		},
+	},
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 02-simple-children
+ *
+ * This example merely has a simple one-attribute child.  Note that
+ * there is no extra attribute structure, as the child's attribute is
+ * known from the get-go.  Also, there is no container for the
+ * subsystem, as it has no attributes of its own.
+ */
+
+struct simple_child {
+	struct config_item item;
+	int storeme;
+};
+
+static inline struct simple_child *to_simple_child(struct config_item *item)
+{
+	return item ? container_of(item, struct simple_child, item) : NULL;
+}
+
+static struct configfs_attribute simple_child_attr_storeme = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "storeme",
+	.ca_mode = S_IRUGO | S_IWUSR,
+};
+
+static struct configfs_attribute *simple_child_attrs[] = {
+	&simple_child_attr_storeme,
+	NULL,
+};
+
+static ssize_t simple_child_attr_show(struct config_item *item,
+				      struct configfs_attribute *attr,
+				      char *page)
+{
+	ssize_t count;
+	struct simple_child *simple_child = to_simple_child(item);
+
+	count = sprintf(page, "%d\n", simple_child->storeme);
+
+	return count;
+}
+
+static ssize_t simple_child_attr_store(struct config_item *item,
+				       struct configfs_attribute *attr,
+				       const char *page, size_t count)
+{
+	struct simple_child *simple_child = to_simple_child(item);
+	unsigned long tmp;
+	char *p = (char *) page;
+
+	tmp = simple_strtoul(p, &p, 10);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp > INT_MAX)
+		return -ERANGE;
+
+	simple_child->storeme = tmp;
+
+	return count;
+}
+
+static void simple_child_release(struct config_item *item)
+{
+	kfree(to_simple_child(item));
+}
+
+static struct configfs_item_operations simple_child_item_ops = {
+	.release		= simple_child_release,
+	.show_attribute		= simple_child_attr_show,
+	.store_attribute	= simple_child_attr_store,
+};
+
+static struct config_item_type simple_child_type = {
+	.ct_item_ops	= &simple_child_item_ops,
+	.ct_attrs	= simple_child_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+
+static struct config_item *simple_children_make_item(struct config_group *group, const char *name)
+{
+	struct simple_child *simple_child;
+
+	simple_child = kmalloc(sizeof(struct simple_child), GFP_KERNEL);
+	if (!simple_child)
+		return NULL;
+
+	memset(simple_child, 0, sizeof(struct simple_child));
+
+	config_item_init_type_name(&simple_child->item, name,
+				   &simple_child_type);
+
+	simple_child->storeme = 0;
+
+	return &simple_child->item;
+}
+
+static struct configfs_attribute simple_children_attr_description = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "description",
+	.ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *simple_children_attrs[] = {
+	&simple_children_attr_description,
+	NULL,
+};
+
+static ssize_t simple_children_attr_show(struct config_item *item,
+			   		 struct configfs_attribute *attr,
+			   		 char *page)
+{
+	return sprintf(page,
+"[02-simple-children]\n"
+"\n"
+"This subsystem allows the creation of child config_items.  These\n"
+"items have only one attribute that is readable and writeable.\n");
+}
+
+static struct configfs_item_operations simple_children_item_ops = {
+	.show_attribute	= simple_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations simple_children_group_ops = {
+	.make_item	= simple_children_make_item,
+};
+
+static struct config_item_type simple_children_type = {
+	.ct_item_ops	= &simple_children_item_ops,
+	.ct_group_ops	= &simple_children_group_ops,
+	.ct_attrs	= simple_children_attrs,
+};
+
+static struct configfs_subsystem simple_children_subsys = {
+	.su_group = {
+		.cg_item = {
+			.ci_namebuf = "02-simple-children",
+			.ci_type = &simple_children_type,
+		},
+	},
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 03-group-children
+ *
+ * This example reuses the simple_children group from above.  However,
+ * the simple_children group is not the subsystem itself, it is a
+ * child of the subsystem.  Creation of a group in the subsystem creates
+ * a new simple_children group.  That group can then have simple_child
+ * children of its own.
+ */
+
+struct simple_children {
+	struct config_group group;
+};
+
+static struct config_group *group_children_make_group(struct config_group *group, const char *name)
+{
+	struct simple_children *simple_children;
+
+	simple_children = kmalloc(sizeof(struct simple_children),
+				  GFP_KERNEL);
+	if (!simple_children)
+		return NULL;
+
+	memset(simple_children, 0, sizeof(struct simple_children));
+
+	config_group_init_type_name(&simple_children->group, name,
+				    &simple_children_type);
+
+	return &simple_children->group;
+}
+
+static struct configfs_attribute group_children_attr_description = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "description",
+	.ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *group_children_attrs[] = {
+	&group_children_attr_description,
+	NULL,
+};
+
+static ssize_t group_children_attr_show(struct config_item *item,
+			   		struct configfs_attribute *attr,
+			   		char *page)
+{
+	return sprintf(page,
+"[03-group-children]\n"
+"\n"
+"This subsystem allows the creation of child config_groups.  These\n"
+"groups are like the subsystem simple-children.\n");
+}
+
+static struct configfs_item_operations group_children_item_ops = {
+	.show_attribute	= group_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations group_children_group_ops = {
+	.make_group	= group_children_make_group,
+};
+
+static struct config_item_type group_children_type = {
+	.ct_item_ops	= &group_children_item_ops,
+	.ct_group_ops	= &group_children_group_ops,
+	.ct_attrs	= group_children_attrs,
+};
+
+static struct configfs_subsystem group_children_subsys = {
+	.su_group = {
+		.cg_item = {
+			.ci_namebuf = "03-group-children",
+			.ci_type = &group_children_type,
+		},
+	},
+};
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * We're now done with our subsystem definitions.
+ * For convenience in this module, here's a list of them all.  It
+ * allows the init function to easily register them.  Most modules
+ * will only have one subsystem, and will only call register_subsystem
+ * on it directly.
+ */
+static struct configfs_subsystem *example_subsys[] = {
+	&childless_subsys.subsys,
+	&simple_children_subsys,
+	&group_children_subsys,
+	NULL,
+};
+
+static int __init configfs_example_init(void)
+{
+	int ret;
+	int i;
+	struct configfs_subsystem *subsys;
+
+	for (i = 0; example_subsys[i]; i++) {
+		subsys = example_subsys[i];
+
+		config_group_init(&subsys->su_group);
+		init_MUTEX(&subsys->su_sem);
+		ret = configfs_register_subsystem(subsys);
+		if (ret) {
+			printk(KERN_ERR "Error %d while registering subsystem %s\n",
+			       ret,
+			       subsys->su_group.cg_item.ci_namebuf);
+			goto out_unregister;
+		}
+	}
+
+	return 0;
+
+out_unregister:
+	for (; i >= 0; i--) {
+		configfs_unregister_subsystem(example_subsys[i]);
+	}
+
+	return ret;
+}
+
+static void __exit configfs_example_exit(void)
+{
+	int i;
+
+	for (i = 0; example_subsys[i]; i++) {
+		configfs_unregister_subsystem(example_subsys[i]);
+	}
+}
+
+module_init(configfs_example_init);
+module_exit(configfs_example_exit);
+MODULE_LICENSE("GPL");
diff --git a/MAINTAINERS b/MAINTAINERS
index 6af683025ae0..86ee06f43794 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -554,6 +554,11 @@ W:	http://us1.samba.org/samba/Linux_CIFS_client.html
 T:	git kernel.org:/pub/scm/linux/kernel/git/sfrench/cifs-2.6.git
 S:	Supported	
 
+CONFIGFS
+P:	Joel Becker
+M:	Joel Becker <joel.becker@oracle.com>
+S:	Supported
+
 CIRRUS LOGIC GENERIC FBDEV DRIVER
 P:	Jeff Garzik
 M:	jgarzik@pobox.com
diff --git a/fs/Kconfig b/fs/Kconfig
index d5255e627b5f..ba1dbe2b2202 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -841,6 +841,20 @@ config RELAYFS_FS
 
 	  If unsure, say N.
 
+config CONFIGFS_FS
+	tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  configfs is a ram-based filesystem that provides the converse
+	  of sysfs's functionality. Where sysfs is a filesystem-based
+	  view of kernel objects, configfs is a filesystem-based manager
+	  of kernel objects, or config_items.
+
+	  Both sysfs and configfs can and should exist together on the
+	  same system. One is not a replacement for the other.
+
+	  If unsure, say N.
+
 endmenu
 
 menu "Miscellaneous filesystems"
diff --git a/fs/Makefile b/fs/Makefile
index 4c2655759078..ff3d48a744f5 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -101,3 +101,4 @@ obj-$(CONFIG_BEFS_FS)		+= befs/
 obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
+obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
diff --git a/fs/configfs/Makefile b/fs/configfs/Makefile
new file mode 100644
index 000000000000..00ffb278e98c
--- /dev/null
+++ b/fs/configfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the configfs virtual filesystem
+#
+
+obj-$(CONFIG_CONFIGFS_FS)	+= configfs.o
+
+configfs-objs	:= inode.o file.o dir.o symlink.o mount.o item.o
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
new file mode 100644
index 000000000000..8899d9c5f6bf
--- /dev/null
+++ b/fs/configfs/configfs_internal.h
@@ -0,0 +1,142 @@
+/* -*- mode: c; c-basic-offset:8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * configfs_internal.h - Internal stuff for configfs
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/slab.h>
+#include <linux/list.h>
+
+struct configfs_dirent {
+	atomic_t		s_count;
+	struct list_head	s_sibling;
+	struct list_head	s_children;
+	struct list_head	s_links;
+	void 			* s_element;
+	int			s_type;
+	umode_t			s_mode;
+	struct dentry		* s_dentry;
+};
+
+#define CONFIGFS_ROOT		0x0001
+#define CONFIGFS_DIR		0x0002
+#define CONFIGFS_ITEM_ATTR 	0x0004
+#define CONFIGFS_ITEM_LINK 	0x0020
+#define CONFIGFS_USET_DIR	0x0040
+#define CONFIGFS_USET_DEFAULT	0x0080
+#define CONFIGFS_USET_DROPPING	0x0100
+#define CONFIGFS_NOT_PINNED	(CONFIGFS_ITEM_ATTR)
+
+extern struct vfsmount * configfs_mount;
+
+extern int configfs_is_root(struct config_item *item);
+
+extern struct inode * configfs_new_inode(mode_t mode);
+extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+
+extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
+extern int configfs_make_dirent(struct configfs_dirent *,
+				struct dentry *, void *, umode_t, int);
+
+extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int);
+extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
+
+extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
+extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
+
+extern int configfs_pin_fs(void);
+extern void configfs_release_fs(void);
+
+extern struct rw_semaphore configfs_rename_sem;
+extern struct super_block * configfs_sb;
+extern struct file_operations configfs_dir_operations;
+extern struct file_operations configfs_file_operations;
+extern struct file_operations bin_fops;
+extern struct inode_operations configfs_dir_inode_operations;
+extern struct inode_operations configfs_symlink_inode_operations;
+
+extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
+			    const char *symname);
+extern int configfs_unlink(struct inode *dir, struct dentry *dentry);
+
+struct configfs_symlink {
+	struct list_head sl_list;
+	struct config_item *sl_target;
+};
+
+extern int configfs_create_link(struct configfs_symlink *sl,
+				struct dentry *parent,
+				struct dentry *dentry);
+
+static inline struct config_item * to_item(struct dentry * dentry)
+{
+	struct configfs_dirent * sd = dentry->d_fsdata;
+	return ((struct config_item *) sd->s_element);
+}
+
+static inline struct configfs_attribute * to_attr(struct dentry * dentry)
+{
+	struct configfs_dirent * sd = dentry->d_fsdata;
+	return ((struct configfs_attribute *) sd->s_element);
+}
+
+static inline struct config_item *configfs_get_config_item(struct dentry *dentry)
+{
+	struct config_item * item = NULL;
+
+	spin_lock(&dcache_lock);
+	if (!d_unhashed(dentry)) {
+		struct configfs_dirent * sd = dentry->d_fsdata;
+		if (sd->s_type & CONFIGFS_ITEM_LINK) {
+			struct configfs_symlink * sl = sd->s_element;
+			item = config_item_get(sl->sl_target);
+		} else
+			item = config_item_get(sd->s_element);
+	}
+	spin_unlock(&dcache_lock);
+
+	return item;
+}
+
+static inline void release_configfs_dirent(struct configfs_dirent * sd)
+{
+	if (!(sd->s_type & CONFIGFS_ROOT))
+		kfree(sd);
+}
+
+static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd)
+{
+	if (sd) {
+		WARN_ON(!atomic_read(&sd->s_count));
+		atomic_inc(&sd->s_count);
+	}
+	return sd;
+}
+
+static inline void configfs_put(struct configfs_dirent * sd)
+{
+	WARN_ON(!atomic_read(&sd->s_count));
+	if (atomic_dec_and_test(&sd->s_count))
+		release_configfs_dirent(sd);
+}
+
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
new file mode 100644
index 000000000000..e48b539243a1
--- /dev/null
+++ b/fs/configfs/dir.c
@@ -0,0 +1,1102 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.c - Operations for configfs directories.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#undef DEBUG
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+DECLARE_RWSEM(configfs_rename_sem);
+
+static void configfs_d_iput(struct dentry * dentry,
+			    struct inode * inode)
+{
+	struct configfs_dirent * sd = dentry->d_fsdata;
+
+	if (sd) {
+		BUG_ON(sd->s_dentry != dentry);
+		sd->s_dentry = NULL;
+		configfs_put(sd);
+	}
+	iput(inode);
+}
+
+/*
+ * We _must_ delete our dentries on last dput, as the chain-to-parent
+ * behavior is required to clear the parents of default_groups.
+ */
+static int configfs_d_delete(struct dentry *dentry)
+{
+	return 1;
+}
+
+static struct dentry_operations configfs_dentry_ops = {
+	.d_iput		= configfs_d_iput,
+	/* simple_delete_dentry() isn't exported */
+	.d_delete	= configfs_d_delete,
+};
+
+/*
+ * Allocates a new configfs_dirent and links it to the parent configfs_dirent
+ */
+static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * parent_sd,
+						void * element)
+{
+	struct configfs_dirent * sd;
+
+	sd = kmalloc(sizeof(*sd), GFP_KERNEL);
+	if (!sd)
+		return NULL;
+
+	memset(sd, 0, sizeof(*sd));
+	atomic_set(&sd->s_count, 1);
+	INIT_LIST_HEAD(&sd->s_links);
+	INIT_LIST_HEAD(&sd->s_children);
+	list_add(&sd->s_sibling, &parent_sd->s_children);
+	sd->s_element = element;
+
+	return sd;
+}
+
+int configfs_make_dirent(struct configfs_dirent * parent_sd,
+			 struct dentry * dentry, void * element,
+			 umode_t mode, int type)
+{
+	struct configfs_dirent * sd;
+
+	sd = configfs_new_dirent(parent_sd, element);
+	if (!sd)
+		return -ENOMEM;
+
+	sd->s_mode = mode;
+	sd->s_type = type;
+	sd->s_dentry = dentry;
+	if (dentry) {
+		dentry->d_fsdata = configfs_get(sd);
+		dentry->d_op = &configfs_dentry_ops;
+	}
+
+	return 0;
+}
+
+static int init_dir(struct inode * inode)
+{
+	inode->i_op = &configfs_dir_inode_operations;
+	inode->i_fop = &configfs_dir_operations;
+
+	/* directory inodes start off with i_nlink == 2 (for "." entry) */
+	inode->i_nlink++;
+	return 0;
+}
+
+static int init_file(struct inode * inode)
+{
+	inode->i_size = PAGE_SIZE;
+	inode->i_fop = &configfs_file_operations;
+	return 0;
+}
+
+static int init_symlink(struct inode * inode)
+{
+	inode->i_op = &configfs_symlink_inode_operations;
+	return 0;
+}
+
+static int create_dir(struct config_item * k, struct dentry * p,
+		      struct dentry * d)
+{
+	int error;
+	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
+
+	error = configfs_create(d, mode, init_dir);
+	if (!error) {
+		error = configfs_make_dirent(p->d_fsdata, d, k, mode,
+					   CONFIGFS_DIR);
+		if (!error) {
+			p->d_inode->i_nlink++;
+			(d)->d_op = &configfs_dentry_ops;
+		}
+	}
+	return error;
+}
+
+
+/**
+ *	configfs_create_dir - create a directory for an config_item.
+ *	@item:		config_itemwe're creating directory for.
+ *	@dentry:	config_item's dentry.
+ */
+
+static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
+{
+	struct dentry * parent;
+	int error = 0;
+
+	BUG_ON(!item);
+
+	if (item->ci_parent)
+		parent = item->ci_parent->ci_dentry;
+	else if (configfs_mount && configfs_mount->mnt_sb)
+		parent = configfs_mount->mnt_sb->s_root;
+	else
+		return -EFAULT;
+
+	error = create_dir(item,parent,dentry);
+	if (!error)
+		item->ci_dentry = dentry;
+	return error;
+}
+
+int configfs_create_link(struct configfs_symlink *sl,
+			 struct dentry *parent,
+			 struct dentry *dentry)
+{
+	int err = 0;
+	umode_t mode = S_IFLNK | S_IRWXUGO;
+
+	err = configfs_create(dentry, mode, init_symlink);
+	if (!err) {
+		err = configfs_make_dirent(parent->d_fsdata, dentry, sl,
+					 mode, CONFIGFS_ITEM_LINK);
+		if (!err)
+			dentry->d_op = &configfs_dentry_ops;
+	}
+	return err;
+}
+
+static void remove_dir(struct dentry * d)
+{
+	struct dentry * parent = dget(d->d_parent);
+	struct configfs_dirent * sd;
+
+	sd = d->d_fsdata;
+ 	list_del_init(&sd->s_sibling);
+	configfs_put(sd);
+	if (d->d_inode)
+		simple_rmdir(parent->d_inode,d);
+
+	pr_debug(" o %s removing done (%d)\n",d->d_name.name,
+		 atomic_read(&d->d_count));
+
+	dput(parent);
+}
+
+/**
+ * configfs_remove_dir - remove an config_item's directory.
+ * @item:	config_item we're removing.
+ *
+ * The only thing special about this is that we remove any files in
+ * the directory before we remove the directory, and we've inlined
+ * what used to be configfs_rmdir() below, instead of calling separately.
+ */
+
+static void configfs_remove_dir(struct config_item * item)
+{
+	struct dentry * dentry = dget(item->ci_dentry);
+
+	if (!dentry)
+		return;
+
+	remove_dir(dentry);
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+
+/* attaches attribute's configfs_dirent to the dentry corresponding to the
+ * attribute file
+ */
+static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry)
+{
+	struct configfs_attribute * attr = sd->s_element;
+	int error;
+
+	error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file);
+	if (error)
+		return error;
+
+	dentry->d_op = &configfs_dentry_ops;
+	dentry->d_fsdata = configfs_get(sd);
+	sd->s_dentry = dentry;
+	d_rehash(dentry);
+
+	return 0;
+}
+
+static struct dentry * configfs_lookup(struct inode *dir,
+				       struct dentry *dentry,
+				       struct nameidata *nd)
+{
+	struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
+	struct configfs_dirent * sd;
+	int found = 0;
+	int err = 0;
+
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (sd->s_type & CONFIGFS_NOT_PINNED) {
+			const unsigned char * name = configfs_get_name(sd);
+
+			if (strcmp(name, dentry->d_name.name))
+				continue;
+
+			found = 1;
+			err = configfs_attach_attr(sd, dentry);
+			break;
+		}
+	}
+
+	if (!found) {
+		/*
+		 * If it doesn't exist and it isn't a NOT_PINNED item,
+		 * it must be negative.
+		 */
+		return simple_lookup(dir, dentry, nd);
+	}
+
+	return ERR_PTR(err);
+}
+
+/*
+ * Only subdirectories count here.  Files (CONFIGFS_NOT_PINNED) are
+ * attributes and are removed by rmdir().  We recurse, taking i_sem
+ * on all children that are candidates for default detach.  If the
+ * result is clean, then configfs_detach_group() will handle dropping
+ * i_sem.  If there is an error, the caller will clean up the i_sem
+ * holders via configfs_detach_rollback().
+ */
+static int configfs_detach_prep(struct dentry *dentry)
+{
+	struct configfs_dirent *parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *sd;
+	int ret;
+
+	ret = -EBUSY;
+	if (!list_empty(&parent_sd->s_links))
+		goto out;
+
+	ret = 0;
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (sd->s_type & CONFIGFS_NOT_PINNED)
+			continue;
+		if (sd->s_type & CONFIGFS_USET_DEFAULT) {
+			down(&sd->s_dentry->d_inode->i_sem);
+			/* Mark that we've taken i_sem */
+			sd->s_type |= CONFIGFS_USET_DROPPING;
+
+			ret = configfs_detach_prep(sd->s_dentry);
+			if (!ret)
+			       	continue;
+		} else
+			ret = -ENOTEMPTY;
+
+		break;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Walk the tree, dropping i_sem wherever CONFIGFS_USET_DROPPING is
+ * set.
+ */
+static void configfs_detach_rollback(struct dentry *dentry)
+{
+	struct configfs_dirent *parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *sd;
+
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (sd->s_type & CONFIGFS_USET_DEFAULT) {
+			configfs_detach_rollback(sd->s_dentry);
+
+			if (sd->s_type & CONFIGFS_USET_DROPPING) {
+				sd->s_type &= ~CONFIGFS_USET_DROPPING;
+				up(&sd->s_dentry->d_inode->i_sem);
+			}
+		}
+	}
+}
+
+static void detach_attrs(struct config_item * item)
+{
+	struct dentry * dentry = dget(item->ci_dentry);
+	struct configfs_dirent * parent_sd;
+	struct configfs_dirent * sd, * tmp;
+
+	if (!dentry)
+		return;
+
+	pr_debug("configfs %s: dropping attrs for  dir\n",
+		 dentry->d_name.name);
+
+	parent_sd = dentry->d_fsdata;
+	list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED))
+			continue;
+		list_del_init(&sd->s_sibling);
+		configfs_drop_dentry(sd, dentry);
+		configfs_put(sd);
+	}
+
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+static int populate_attrs(struct config_item *item)
+{
+	struct config_item_type *t = item->ci_type;
+	struct configfs_attribute *attr;
+	int error = 0;
+	int i;
+
+	if (!t)
+		return -EINVAL;
+	if (t->ct_attrs) {
+		for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) {
+			if ((error = configfs_create_file(item, attr)))
+				break;
+		}
+	}
+
+	if (error)
+		detach_attrs(item);
+
+	return error;
+}
+
+static int configfs_attach_group(struct config_item *parent_item,
+				 struct config_item *item,
+				 struct dentry *dentry);
+static void configfs_detach_group(struct config_item *item);
+
+static void detach_groups(struct config_group *group)
+{
+	struct dentry * dentry = dget(group->cg_item.ci_dentry);
+	struct dentry *child;
+	struct configfs_dirent *parent_sd;
+	struct configfs_dirent *sd, *tmp;
+
+	if (!dentry)
+		return;
+
+	parent_sd = dentry->d_fsdata;
+	list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element ||
+		    !(sd->s_type & CONFIGFS_USET_DEFAULT))
+			continue;
+
+		child = sd->s_dentry;
+
+		configfs_detach_group(sd->s_element);
+		child->d_inode->i_flags |= S_DEAD;
+
+		/*
+		 * From rmdir/unregister, a configfs_detach_prep() pass
+		 * has taken our i_sem for us.  Drop it.
+		 * From mkdir/register cleanup, there is no sem held.
+		 */
+		if (sd->s_type & CONFIGFS_USET_DROPPING)
+			up(&child->d_inode->i_sem);
+
+		d_delete(child);
+		dput(child);
+	}
+
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+/*
+ * This fakes mkdir(2) on a default_groups[] entry.  It
+ * creates a dentry, attachs it, and then does fixup
+ * on the sd->s_type.
+ *
+ * We could, perhaps, tweak our parent's ->mkdir for a minute and
+ * try using vfs_mkdir.  Just a thought.
+ */
+static int create_default_group(struct config_group *parent_group,
+				struct config_group *group)
+{
+	int ret;
+	struct qstr name;
+	struct configfs_dirent *sd;
+	/* We trust the caller holds a reference to parent */
+	struct dentry *child, *parent = parent_group->cg_item.ci_dentry;
+
+	if (!group->cg_item.ci_name)
+		group->cg_item.ci_name = group->cg_item.ci_namebuf;
+	name.name = group->cg_item.ci_name;
+	name.len = strlen(name.name);
+	name.hash = full_name_hash(name.name, name.len);
+
+	ret = -ENOMEM;
+	child = d_alloc(parent, &name);
+	if (child) {
+		d_add(child, NULL);
+
+		ret = configfs_attach_group(&parent_group->cg_item,
+					    &group->cg_item, child);
+		if (!ret) {
+			sd = child->d_fsdata;
+			sd->s_type |= CONFIGFS_USET_DEFAULT;
+		} else {
+			d_delete(child);
+			dput(child);
+		}
+	}
+
+	return ret;
+}
+
+static int populate_groups(struct config_group *group)
+{
+	struct config_group *new_group;
+	struct dentry *dentry = group->cg_item.ci_dentry;
+	int ret = 0;
+	int i;
+
+	if (group && group->default_groups) {
+		/* FYI, we're faking mkdir here
+		 * I'm not sure we need this semaphore, as we're called
+		 * from our parent's mkdir.  That holds our parent's
+		 * i_sem, so afaik lookup cannot continue through our
+		 * parent to find us, let alone mess with our tree.
+		 * That said, taking our i_sem is closer to mkdir
+		 * emulation, and shouldn't hurt. */
+		down(&dentry->d_inode->i_sem);
+
+		for (i = 0; group->default_groups[i]; i++) {
+			new_group = group->default_groups[i];
+
+			ret = create_default_group(group, new_group);
+			if (ret)
+				break;
+		}
+
+		up(&dentry->d_inode->i_sem);
+	}
+
+	if (ret)
+		detach_groups(group);
+
+	return ret;
+}
+
+/*
+ * All of link_obj/unlink_obj/link_group/unlink_group require that
+ * subsys->su_sem is held.
+ */
+
+static void unlink_obj(struct config_item *item)
+{
+	struct config_group *group;
+
+	group = item->ci_group;
+	if (group) {
+		list_del_init(&item->ci_entry);
+
+		item->ci_group = NULL;
+		item->ci_parent = NULL;
+		config_item_put(item);
+
+		config_group_put(group);
+	}
+}
+
+static void link_obj(struct config_item *parent_item, struct config_item *item)
+{
+	/* Parent seems redundant with group, but it makes certain
+	 * traversals much nicer. */
+	item->ci_parent = parent_item;
+	item->ci_group = config_group_get(to_config_group(parent_item));
+	list_add_tail(&item->ci_entry, &item->ci_group->cg_children);
+
+	config_item_get(item);
+}
+
+static void unlink_group(struct config_group *group)
+{
+	int i;
+	struct config_group *new_group;
+
+	if (group->default_groups) {
+		for (i = 0; group->default_groups[i]; i++) {
+			new_group = group->default_groups[i];
+			unlink_group(new_group);
+		}
+	}
+
+	group->cg_subsys = NULL;
+	unlink_obj(&group->cg_item);
+}
+
+static void link_group(struct config_group *parent_group, struct config_group *group)
+{
+	int i;
+	struct config_group *new_group;
+	struct configfs_subsystem *subsys = NULL; /* gcc is a turd */
+
+	link_obj(&parent_group->cg_item, &group->cg_item);
+
+	if (parent_group->cg_subsys)
+		subsys = parent_group->cg_subsys;
+	else if (configfs_is_root(&parent_group->cg_item))
+		subsys = to_configfs_subsystem(group);
+	else
+		BUG();
+	group->cg_subsys = subsys;
+
+	if (group->default_groups) {
+		for (i = 0; group->default_groups[i]; i++) {
+			new_group = group->default_groups[i];
+			link_group(group, new_group);
+		}
+	}
+}
+
+/*
+ * The goal is that configfs_attach_item() (and
+ * configfs_attach_group()) can be called from either the VFS or this
+ * module.  That is, they assume that the items have been created,
+ * the dentry allocated, and the dcache is all ready to go.
+ *
+ * If they fail, they must clean up after themselves as if they
+ * had never been called.  The caller (VFS or local function) will
+ * handle cleaning up the dcache bits.
+ *
+ * configfs_detach_group() and configfs_detach_item() behave similarly on
+ * the way out.  They assume that the proper semaphores are held, they
+ * clean up the configfs items, and they expect their callers will
+ * handle the dcache bits.
+ */
+static int configfs_attach_item(struct config_item *parent_item,
+				struct config_item *item,
+				struct dentry *dentry)
+{
+	int ret;
+
+	ret = configfs_create_dir(item, dentry);
+	if (!ret) {
+		ret = populate_attrs(item);
+		if (ret) {
+			configfs_remove_dir(item);
+			d_delete(dentry);
+		}
+	}
+
+	return ret;
+}
+
+static void configfs_detach_item(struct config_item *item)
+{
+	detach_attrs(item);
+	configfs_remove_dir(item);
+}
+
+static int configfs_attach_group(struct config_item *parent_item,
+				 struct config_item *item,
+				 struct dentry *dentry)
+{
+	int ret;
+	struct configfs_dirent *sd;
+
+	ret = configfs_attach_item(parent_item, item, dentry);
+	if (!ret) {
+		sd = dentry->d_fsdata;
+		sd->s_type |= CONFIGFS_USET_DIR;
+
+		ret = populate_groups(to_config_group(item));
+		if (ret) {
+			configfs_detach_item(item);
+			d_delete(dentry);
+		}
+	}
+
+	return ret;
+}
+
+static void configfs_detach_group(struct config_item *item)
+{
+	detach_groups(to_config_group(item));
+	configfs_detach_item(item);
+}
+
+/*
+ * Drop the initial reference from make_item()/make_group()
+ * This function assumes that reference is held on item
+ * and that item holds a valid reference to the parent.  Also, it
+ * assumes the caller has validated ci_type.
+ */
+static void client_drop_item(struct config_item *parent_item,
+			     struct config_item *item)
+{
+	struct config_item_type *type;
+
+	type = parent_item->ci_type;
+	BUG_ON(!type);
+
+	if (type->ct_group_ops && type->ct_group_ops->drop_item)
+		type->ct_group_ops->drop_item(to_config_group(parent_item),
+						item);
+	else
+		config_item_put(item);
+}
+
+
+static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	int ret;
+	struct config_group *group;
+	struct config_item *item;
+	struct config_item *parent_item;
+	struct configfs_subsystem *subsys;
+	struct configfs_dirent *sd;
+	struct config_item_type *type;
+	struct module *owner;
+	char *name;
+
+	if (dentry->d_parent == configfs_sb->s_root)
+		return -EPERM;
+
+	sd = dentry->d_parent->d_fsdata;
+	if (!(sd->s_type & CONFIGFS_USET_DIR))
+		return -EPERM;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+	subsys = to_config_group(parent_item)->cg_subsys;
+	BUG_ON(!subsys);
+
+	if (!type || !type->ct_group_ops ||
+	    (!type->ct_group_ops->make_group &&
+	     !type->ct_group_ops->make_item)) {
+		config_item_put(parent_item);
+		return -EPERM;  /* What lack-of-mkdir returns */
+	}
+
+	name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL);
+	if (!name) {
+		config_item_put(parent_item);
+		return -ENOMEM;
+	}
+	snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
+
+	down(&subsys->su_sem);
+	group = NULL;
+	item = NULL;
+	if (type->ct_group_ops->make_group) {
+		group = type->ct_group_ops->make_group(to_config_group(parent_item), name);
+		if (group) {
+			link_group(to_config_group(parent_item), group);
+			item = &group->cg_item;
+		}
+	} else {
+		item = type->ct_group_ops->make_item(to_config_group(parent_item), name);
+		if (item)
+			link_obj(parent_item, item);
+	}
+	up(&subsys->su_sem);
+
+	kfree(name);
+	if (!item) {
+		config_item_put(parent_item);
+		return -ENOMEM;
+	}
+
+	ret = -EINVAL;
+	type = item->ci_type;
+	if (type) {
+		owner = type->ct_owner;
+		if (try_module_get(owner)) {
+			if (group) {
+				ret = configfs_attach_group(parent_item,
+							    item,
+							    dentry);
+			} else {
+				ret = configfs_attach_item(parent_item,
+							   item,
+							   dentry);
+			}
+
+			if (ret) {
+				down(&subsys->su_sem);
+				if (group)
+					unlink_group(group);
+				else
+					unlink_obj(item);
+				client_drop_item(parent_item, item);
+				up(&subsys->su_sem);
+
+				config_item_put(parent_item);
+				module_put(owner);
+			}
+		}
+	}
+
+	return ret;
+}
+
+static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct config_item *parent_item;
+	struct config_item *item;
+	struct configfs_subsystem *subsys;
+	struct configfs_dirent *sd;
+	struct module *owner = NULL;
+	int ret;
+
+	if (dentry->d_parent == configfs_sb->s_root)
+		return -EPERM;
+
+	sd = dentry->d_fsdata;
+	if (sd->s_type & CONFIGFS_USET_DEFAULT)
+		return -EPERM;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	subsys = to_config_group(parent_item)->cg_subsys;
+	BUG_ON(!subsys);
+
+	if (!parent_item->ci_type) {
+		config_item_put(parent_item);
+		return -EINVAL;
+	}
+
+	ret = configfs_detach_prep(dentry);
+	if (ret) {
+		configfs_detach_rollback(dentry);
+		config_item_put(parent_item);
+		return ret;
+	}
+
+	item = configfs_get_config_item(dentry);
+
+	/* Drop reference from above, item already holds one. */
+	config_item_put(parent_item);
+
+	if (item->ci_type)
+		owner = item->ci_type->ct_owner;
+
+	if (sd->s_type & CONFIGFS_USET_DIR) {
+		configfs_detach_group(item);
+
+		down(&subsys->su_sem);
+		unlink_group(to_config_group(item));
+	} else {
+		configfs_detach_item(item);
+
+		down(&subsys->su_sem);
+		unlink_obj(item);
+	}
+
+	client_drop_item(parent_item, item);
+	up(&subsys->su_sem);
+
+	/* Drop our reference from above */
+	config_item_put(item);
+
+	module_put(owner);
+
+	return 0;
+}
+
+struct inode_operations configfs_dir_inode_operations = {
+	.mkdir		= configfs_mkdir,
+	.rmdir		= configfs_rmdir,
+	.symlink	= configfs_symlink,
+	.unlink		= configfs_unlink,
+	.lookup		= configfs_lookup,
+};
+
+#if 0
+int configfs_rename_dir(struct config_item * item, const char *new_name)
+{
+	int error = 0;
+	struct dentry * new_dentry, * parent;
+
+	if (!strcmp(config_item_name(item), new_name))
+		return -EINVAL;
+
+	if (!item->parent)
+		return -EINVAL;
+
+	down_write(&configfs_rename_sem);
+	parent = item->parent->dentry;
+
+	down(&parent->d_inode->i_sem);
+
+	new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
+	if (!IS_ERR(new_dentry)) {
+  		if (!new_dentry->d_inode) {
+			error = config_item_set_name(item, "%s", new_name);
+			if (!error) {
+				d_add(new_dentry, NULL);
+				d_move(item->dentry, new_dentry);
+			}
+			else
+				d_delete(new_dentry);
+		} else
+			error = -EEXIST;
+		dput(new_dentry);
+	}
+	up(&parent->d_inode->i_sem);
+	up_write(&configfs_rename_sem);
+
+	return error;
+}
+#endif
+
+static int configfs_dir_open(struct inode *inode, struct file *file)
+{
+	struct dentry * dentry = file->f_dentry;
+	struct configfs_dirent * parent_sd = dentry->d_fsdata;
+
+	down(&dentry->d_inode->i_sem);
+	file->private_data = configfs_new_dirent(parent_sd, NULL);
+	up(&dentry->d_inode->i_sem);
+
+	return file->private_data ? 0 : -ENOMEM;
+
+}
+
+static int configfs_dir_close(struct inode *inode, struct file *file)
+{
+	struct dentry * dentry = file->f_dentry;
+	struct configfs_dirent * cursor = file->private_data;
+
+	down(&dentry->d_inode->i_sem);
+	list_del_init(&cursor->s_sibling);
+	up(&dentry->d_inode->i_sem);
+
+	release_configfs_dirent(cursor);
+
+	return 0;
+}
+
+/* Relationship between s_mode and the DT_xxx types */
+static inline unsigned char dt_type(struct configfs_dirent *sd)
+{
+	return (sd->s_mode >> 12) & 15;
+}
+
+static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_dentry;
+	struct configfs_dirent * parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *cursor = filp->private_data;
+	struct list_head *p, *q = &cursor->s_sibling;
+	ino_t ino;
+	int i = filp->f_pos;
+
+	switch (i) {
+		case 0:
+			ino = dentry->d_inode->i_ino;
+			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+				break;
+			filp->f_pos++;
+			i++;
+			/* fallthrough */
+		case 1:
+			ino = parent_ino(dentry);
+			if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
+				break;
+			filp->f_pos++;
+			i++;
+			/* fallthrough */
+		default:
+			if (filp->f_pos == 2) {
+				list_del(q);
+				list_add(q, &parent_sd->s_children);
+			}
+			for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
+				struct configfs_dirent *next;
+				const char * name;
+				int len;
+
+				next = list_entry(p, struct configfs_dirent,
+						   s_sibling);
+				if (!next->s_element)
+					continue;
+
+				name = configfs_get_name(next);
+				len = strlen(name);
+				if (next->s_dentry)
+					ino = next->s_dentry->d_inode->i_ino;
+				else
+					ino = iunique(configfs_sb, 2);
+
+				if (filldir(dirent, name, len, filp->f_pos, ino,
+						 dt_type(next)) < 0)
+					return 0;
+
+				list_del(q);
+				list_add(q, p);
+				p = q;
+				filp->f_pos++;
+			}
+	}
+	return 0;
+}
+
+static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
+{
+	struct dentry * dentry = file->f_dentry;
+
+	down(&dentry->d_inode->i_sem);
+	switch (origin) {
+		case 1:
+			offset += file->f_pos;
+		case 0:
+			if (offset >= 0)
+				break;
+		default:
+			up(&file->f_dentry->d_inode->i_sem);
+			return -EINVAL;
+	}
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		if (file->f_pos >= 2) {
+			struct configfs_dirent *sd = dentry->d_fsdata;
+			struct configfs_dirent *cursor = file->private_data;
+			struct list_head *p;
+			loff_t n = file->f_pos - 2;
+
+			list_del(&cursor->s_sibling);
+			p = sd->s_children.next;
+			while (n && p != &sd->s_children) {
+				struct configfs_dirent *next;
+				next = list_entry(p, struct configfs_dirent,
+						   s_sibling);
+				if (next->s_element)
+					n--;
+				p = p->next;
+			}
+			list_add_tail(&cursor->s_sibling, p);
+		}
+	}
+	up(&dentry->d_inode->i_sem);
+	return offset;
+}
+
+struct file_operations configfs_dir_operations = {
+	.open		= configfs_dir_open,
+	.release	= configfs_dir_close,
+	.llseek		= configfs_dir_lseek,
+	.read		= generic_read_dir,
+	.readdir	= configfs_readdir,
+};
+
+int configfs_register_subsystem(struct configfs_subsystem *subsys)
+{
+	int err;
+	struct config_group *group = &subsys->su_group;
+	struct qstr name;
+	struct dentry *dentry;
+	struct configfs_dirent *sd;
+
+	err = configfs_pin_fs();
+	if (err)
+		return err;
+
+	if (!group->cg_item.ci_name)
+		group->cg_item.ci_name = group->cg_item.ci_namebuf;
+
+	sd = configfs_sb->s_root->d_fsdata;
+	link_group(to_config_group(sd->s_element), group);
+
+	down(&configfs_sb->s_root->d_inode->i_sem);
+
+	name.name = group->cg_item.ci_name;
+	name.len = strlen(name.name);
+	name.hash = full_name_hash(name.name, name.len);
+
+	err = -ENOMEM;
+	dentry = d_alloc(configfs_sb->s_root, &name);
+	if (!dentry)
+		goto out_release;
+
+	d_add(dentry, NULL);
+
+	err = configfs_attach_group(sd->s_element, &group->cg_item,
+				    dentry);
+	if (!err)
+		dentry = NULL;
+	else
+		d_delete(dentry);
+
+	up(&configfs_sb->s_root->d_inode->i_sem);
+
+	if (dentry) {
+	    dput(dentry);
+out_release:
+	    unlink_group(group);
+	    configfs_release_fs();
+	}
+
+	return err;
+}
+
+void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
+{
+	struct config_group *group = &subsys->su_group;
+	struct dentry *dentry = group->cg_item.ci_dentry;
+
+	if (dentry->d_parent != configfs_sb->s_root) {
+		printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n");
+		return;
+	}
+
+	down(&configfs_sb->s_root->d_inode->i_sem);
+	down(&dentry->d_inode->i_sem);
+	if (configfs_detach_prep(dentry)) {
+		printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
+	}
+	configfs_detach_group(&group->cg_item);
+	dentry->d_inode->i_flags |= S_DEAD;
+	up(&dentry->d_inode->i_sem);
+
+	d_delete(dentry);
+
+	up(&configfs_sb->s_root->d_inode->i_sem);
+
+	dput(dentry);
+
+	unlink_group(group);
+	configfs_release_fs();
+}
+
+EXPORT_SYMBOL(configfs_register_subsystem);
+EXPORT_SYMBOL(configfs_unregister_subsystem);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
new file mode 100644
index 000000000000..af1ffc9a15c0
--- /dev/null
+++ b/fs/configfs/file.c
@@ -0,0 +1,360 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.c - operations for regular (text) files.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/dnotify.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+
+struct configfs_buffer {
+	size_t			count;
+	loff_t			pos;
+	char			* page;
+	struct configfs_item_operations	* ops;
+	struct semaphore	sem;
+	int			needs_read_fill;
+};
+
+
+/**
+ *	fill_read_buffer - allocate and fill buffer from item.
+ *	@dentry:	dentry pointer.
+ *	@buffer:	data buffer for file.
+ *
+ *	Allocate @buffer->page, if it hasn't been already, then call the
+ *	config_item's show() method to fill the buffer with this attribute's
+ *	data.
+ *	This is called only once, on the file's first read.
+ */
+static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer)
+{
+	struct configfs_attribute * attr = to_attr(dentry);
+	struct config_item * item = to_item(dentry->d_parent);
+	struct configfs_item_operations * ops = buffer->ops;
+	int ret = 0;
+	ssize_t count;
+
+	if (!buffer->page)
+		buffer->page = (char *) get_zeroed_page(GFP_KERNEL);
+	if (!buffer->page)
+		return -ENOMEM;
+
+	count = ops->show_attribute(item,attr,buffer->page);
+	buffer->needs_read_fill = 0;
+	BUG_ON(count > (ssize_t)PAGE_SIZE);
+	if (count >= 0)
+		buffer->count = count;
+	else
+		ret = count;
+	return ret;
+}
+
+
+/**
+ *	flush_read_buffer - push buffer to userspace.
+ *	@buffer:	data buffer for file.
+ *	@userbuf:	user-passed buffer.
+ *	@count:		number of bytes requested.
+ *	@ppos:		file position.
+ *
+ *	Copy the buffer we filled in fill_read_buffer() to userspace.
+ *	This is done at the reader's leisure, copying and advancing
+ *	the amount they specify each time.
+ *	This may be called continuously until the buffer is empty.
+ */
+static int flush_read_buffer(struct configfs_buffer * buffer, char __user * buf,
+			     size_t count, loff_t * ppos)
+{
+	int error;
+
+	if (*ppos > buffer->count)
+		return 0;
+
+	if (count > (buffer->count - *ppos))
+		count = buffer->count - *ppos;
+
+	error = copy_to_user(buf,buffer->page + *ppos,count);
+	if (!error)
+		*ppos += count;
+	return error ? -EFAULT : count;
+}
+
+/**
+ *	configfs_read_file - read an attribute.
+ *	@file:	file pointer.
+ *	@buf:	buffer to fill.
+ *	@count:	number of bytes to read.
+ *	@ppos:	starting offset in file.
+ *
+ *	Userspace wants to read an attribute file. The attribute descriptor
+ *	is in the file's ->d_fsdata. The target item is in the directory's
+ *	->d_fsdata.
+ *
+ *	We call fill_read_buffer() to allocate and fill the buffer from the
+ *	item's show() method exactly once (if the read is happening from
+ *	the beginning of the file). That should fill the entire buffer with
+ *	all the data the item has to offer for that attribute.
+ *	We then call flush_read_buffer() to copy the buffer to userspace
+ *	in the increments specified.
+ */
+
+static ssize_t
+configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct configfs_buffer * buffer = file->private_data;
+	ssize_t retval = 0;
+
+	down(&buffer->sem);
+	if (buffer->needs_read_fill) {
+		if ((retval = fill_read_buffer(file->f_dentry,buffer)))
+			goto out;
+	}
+	pr_debug("%s: count = %d, ppos = %lld, buf = %s\n",
+		 __FUNCTION__,count,*ppos,buffer->page);
+	retval = flush_read_buffer(buffer,buf,count,ppos);
+out:
+	up(&buffer->sem);
+	return retval;
+}
+
+
+/**
+ *	fill_write_buffer - copy buffer from userspace.
+ *	@buffer:	data buffer for file.
+ *	@userbuf:	data from user.
+ *	@count:		number of bytes in @userbuf.
+ *
+ *	Allocate @buffer->page if it hasn't been already, then
+ *	copy the user-supplied buffer into it.
+ */
+
+static int
+fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count)
+{
+	int error;
+
+	if (!buffer->page)
+		buffer->page = (char *)get_zeroed_page(GFP_KERNEL);
+	if (!buffer->page)
+		return -ENOMEM;
+
+	if (count > PAGE_SIZE)
+		count = PAGE_SIZE;
+	error = copy_from_user(buffer->page,buf,count);
+	buffer->needs_read_fill = 1;
+	return error ? -EFAULT : count;
+}
+
+
+/**
+ *	flush_write_buffer - push buffer to config_item.
+ *	@file:		file pointer.
+ *	@buffer:	data buffer for file.
+ *
+ *	Get the correct pointers for the config_item and the attribute we're
+ *	dealing with, then call the store() method for the attribute,
+ *	passing the buffer that we acquired in fill_write_buffer().
+ */
+
+static int
+flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count)
+{
+	struct configfs_attribute * attr = to_attr(dentry);
+	struct config_item * item = to_item(dentry->d_parent);
+	struct configfs_item_operations * ops = buffer->ops;
+
+	return ops->store_attribute(item,attr,buffer->page,count);
+}
+
+
+/**
+ *	configfs_write_file - write an attribute.
+ *	@file:	file pointer
+ *	@buf:	data to write
+ *	@count:	number of bytes
+ *	@ppos:	starting offset
+ *
+ *	Similar to configfs_read_file(), though working in the opposite direction.
+ *	We allocate and fill the data from the user in fill_write_buffer(),
+ *	then push it to the config_item in flush_write_buffer().
+ *	There is no easy way for us to know if userspace is only doing a partial
+ *	write, so we don't support them. We expect the entire buffer to come
+ *	on the first write.
+ *	Hint: if you're writing a value, first read the file, modify only the
+ *	the value you're changing, then write entire buffer back.
+ */
+
+static ssize_t
+configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct configfs_buffer * buffer = file->private_data;
+
+	down(&buffer->sem);
+	count = fill_write_buffer(buffer,buf,count);
+	if (count > 0)
+		count = flush_write_buffer(file->f_dentry,buffer,count);
+	if (count > 0)
+		*ppos += count;
+	up(&buffer->sem);
+	return count;
+}
+
+static int check_perm(struct inode * inode, struct file * file)
+{
+	struct config_item *item = configfs_get_config_item(file->f_dentry->d_parent);
+	struct configfs_attribute * attr = to_attr(file->f_dentry);
+	struct configfs_buffer * buffer;
+	struct configfs_item_operations * ops = NULL;
+	int error = 0;
+
+	if (!item || !attr)
+		goto Einval;
+
+	/* Grab the module reference for this attribute if we have one */
+	if (!try_module_get(attr->ca_owner)) {
+		error = -ENODEV;
+		goto Done;
+	}
+
+	if (item->ci_type)
+		ops = item->ci_type->ct_item_ops;
+	else
+		goto Eaccess;
+
+	/* File needs write support.
+	 * The inode's perms must say it's ok,
+	 * and we must have a store method.
+	 */
+	if (file->f_mode & FMODE_WRITE) {
+
+		if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute)
+			goto Eaccess;
+
+	}
+
+	/* File needs read support.
+	 * The inode's perms must say it's ok, and we there
+	 * must be a show method for it.
+	 */
+	if (file->f_mode & FMODE_READ) {
+		if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute)
+			goto Eaccess;
+	}
+
+	/* No error? Great, allocate a buffer for the file, and store it
+	 * it in file->private_data for easy access.
+	 */
+	buffer = kmalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
+	if (buffer) {
+		memset(buffer,0,sizeof(struct configfs_buffer));
+		init_MUTEX(&buffer->sem);
+		buffer->needs_read_fill = 1;
+		buffer->ops = ops;
+		file->private_data = buffer;
+	} else
+		error = -ENOMEM;
+	goto Done;
+
+ Einval:
+	error = -EINVAL;
+	goto Done;
+ Eaccess:
+	error = -EACCES;
+	module_put(attr->ca_owner);
+ Done:
+	if (error && item)
+		config_item_put(item);
+	return error;
+}
+
+static int configfs_open_file(struct inode * inode, struct file * filp)
+{
+	return check_perm(inode,filp);
+}
+
+static int configfs_release(struct inode * inode, struct file * filp)
+{
+	struct config_item * item = to_item(filp->f_dentry->d_parent);
+	struct configfs_attribute * attr = to_attr(filp->f_dentry);
+	struct module * owner = attr->ca_owner;
+	struct configfs_buffer * buffer = filp->private_data;
+
+	if (item)
+		config_item_put(item);
+	/* After this point, attr should not be accessed. */
+	module_put(owner);
+
+	if (buffer) {
+		if (buffer->page)
+			free_page((unsigned long)buffer->page);
+		kfree(buffer);
+	}
+	return 0;
+}
+
+struct file_operations configfs_file_operations = {
+	.read		= configfs_read_file,
+	.write		= configfs_write_file,
+	.llseek		= generic_file_llseek,
+	.open		= configfs_open_file,
+	.release	= configfs_release,
+};
+
+
+int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type)
+{
+	struct configfs_dirent * parent_sd = dir->d_fsdata;
+	umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
+	int error = 0;
+
+	down(&dir->d_inode->i_sem);
+	error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
+	up(&dir->d_inode->i_sem);
+
+	return error;
+}
+
+
+/**
+ *	configfs_create_file - create an attribute file for an item.
+ *	@item:	item we're creating for.
+ *	@attr:	atrribute descriptor.
+ */
+
+int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr)
+{
+	BUG_ON(!item || !item->ci_dentry || !attr);
+
+	return configfs_add_file(item->ci_dentry, attr,
+				 CONFIGFS_ITEM_ATTR);
+}
+
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
new file mode 100644
index 000000000000..6b274c6d428f
--- /dev/null
+++ b/fs/configfs/inode.c
@@ -0,0 +1,162 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.c - basic inode and dentry operations.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please see Documentation/filesystems/configfs.txt for more information.
+ */
+
+#undef DEBUG
+
+#include <linux/pagemap.h>
+#include <linux/namei.h>
+#include <linux/backing-dev.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+extern struct super_block * configfs_sb;
+
+static struct address_space_operations configfs_aops = {
+	.readpage	= simple_readpage,
+	.prepare_write	= simple_prepare_write,
+	.commit_write	= simple_commit_write
+};
+
+static struct backing_dev_info configfs_backing_dev_info = {
+	.ra_pages	= 0,	/* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+struct inode * configfs_new_inode(mode_t mode)
+{
+	struct inode * inode = new_inode(configfs_sb);
+	if (inode) {
+		inode->i_mode = mode;
+		inode->i_uid = 0;
+		inode->i_gid = 0;
+		inode->i_blksize = PAGE_CACHE_SIZE;
+		inode->i_blocks = 0;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_mapping->a_ops = &configfs_aops;
+		inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
+	}
+	return inode;
+}
+
+int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
+{
+	int error = 0;
+	struct inode * inode = NULL;
+	if (dentry) {
+		if (!dentry->d_inode) {
+			if ((inode = configfs_new_inode(mode))) {
+				if (dentry->d_parent && dentry->d_parent->d_inode) {
+					struct inode *p_inode = dentry->d_parent->d_inode;
+					p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
+				}
+				goto Proceed;
+			}
+			else
+				error = -ENOMEM;
+		} else
+			error = -EEXIST;
+	} else
+		error = -ENOENT;
+	goto Done;
+
+ Proceed:
+	if (init)
+		error = init(inode);
+	if (!error) {
+		d_instantiate(dentry, inode);
+		if (S_ISDIR(mode) || S_ISLNK(mode))
+			dget(dentry);  /* pin link and directory dentries in core */
+	} else
+		iput(inode);
+ Done:
+	return error;
+}
+
+/*
+ * Get the name for corresponding element represented by the given configfs_dirent
+ */
+const unsigned char * configfs_get_name(struct configfs_dirent *sd)
+{
+	struct attribute * attr;
+
+	if (!sd || !sd->s_element)
+		BUG();
+
+	/* These always have a dentry, so use that */
+	if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK))
+		return sd->s_dentry->d_name.name;
+
+	if (sd->s_type & CONFIGFS_ITEM_ATTR) {
+		attr = sd->s_element;
+		return attr->name;
+	}
+	return NULL;
+}
+
+
+/*
+ * Unhashes the dentry corresponding to given configfs_dirent
+ * Called with parent inode's i_sem held.
+ */
+void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
+{
+	struct dentry * dentry = sd->s_dentry;
+
+	if (dentry) {
+		spin_lock(&dcache_lock);
+		if (!(d_unhashed(dentry) && dentry->d_inode)) {
+			dget_locked(dentry);
+			__d_drop(dentry);
+			spin_unlock(&dcache_lock);
+			simple_unlink(parent->d_inode, dentry);
+		} else
+			spin_unlock(&dcache_lock);
+	}
+}
+
+void configfs_hash_and_remove(struct dentry * dir, const char * name)
+{
+	struct configfs_dirent * sd;
+	struct configfs_dirent * parent_sd = dir->d_fsdata;
+
+	down(&dir->d_inode->i_sem);
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element)
+			continue;
+		if (!strcmp(configfs_get_name(sd), name)) {
+			list_del_init(&sd->s_sibling);
+			configfs_drop_dentry(sd, dir);
+			configfs_put(sd);
+			break;
+		}
+	}
+	up(&dir->d_inode->i_sem);
+}
+
+
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
new file mode 100644
index 000000000000..e07485ac50ad
--- /dev/null
+++ b/fs/configfs/item.c
@@ -0,0 +1,227 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * item.c - library routines for handling generic config items
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on kobject:
+ * 	kobject is Copyright (c) 2002-2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please see the file Documentation/filesystems/configfs.txt for
+ * critical information about using the config_item interface.
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+static inline struct config_item * to_item(struct list_head * entry)
+{
+	return container_of(entry,struct config_item,ci_entry);
+}
+
+/* Evil kernel */
+static void config_item_release(struct kref *kref);
+
+/**
+ *	config_item_init - initialize item.
+ *	@item:	item in question.
+ */
+void config_item_init(struct config_item * item)
+{
+	kref_init(&item->ci_kref);
+	INIT_LIST_HEAD(&item->ci_entry);
+}
+
+/**
+ *	config_item_set_name - Set the name of an item
+ *	@item:	item.
+ *	@name:	name.
+ *
+ *	If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a
+ *	dynamically allocated string that @item->ci_name points to.
+ *	Otherwise, use the static @item->ci_namebuf array.
+ */
+
+int config_item_set_name(struct config_item * item, const char * fmt, ...)
+{
+	int error = 0;
+	int limit = CONFIGFS_ITEM_NAME_LEN;
+	int need;
+	va_list args;
+	char * name;
+
+	/*
+	 * First, try the static array
+	 */
+	va_start(args,fmt);
+	need = vsnprintf(item->ci_namebuf,limit,fmt,args);
+	va_end(args);
+	if (need < limit)
+		name = item->ci_namebuf;
+	else {
+		/*
+		 * Need more space? Allocate it and try again
+		 */
+		limit = need + 1;
+		name = kmalloc(limit,GFP_KERNEL);
+		if (!name) {
+			error = -ENOMEM;
+			goto Done;
+		}
+		va_start(args,fmt);
+		need = vsnprintf(name,limit,fmt,args);
+		va_end(args);
+
+		/* Still? Give up. */
+		if (need >= limit) {
+			kfree(name);
+			error = -EFAULT;
+			goto Done;
+		}
+	}
+
+	/* Free the old name, if necessary. */
+	if (item->ci_name && item->ci_name != item->ci_namebuf)
+		kfree(item->ci_name);
+
+	/* Now, set the new name */
+	item->ci_name = name;
+ Done:
+	return error;
+}
+
+EXPORT_SYMBOL(config_item_set_name);
+
+void config_item_init_type_name(struct config_item *item,
+				const char *name,
+				struct config_item_type *type)
+{
+	config_item_set_name(item, name);
+	item->ci_type = type;
+	config_item_init(item);
+}
+EXPORT_SYMBOL(config_item_init_type_name);
+
+void config_group_init_type_name(struct config_group *group, const char *name,
+			 struct config_item_type *type)
+{
+	config_item_set_name(&group->cg_item, name);
+	group->cg_item.ci_type = type;
+	config_group_init(group);
+}
+EXPORT_SYMBOL(config_group_init_type_name);
+
+struct config_item * config_item_get(struct config_item * item)
+{
+	if (item)
+		kref_get(&item->ci_kref);
+	return item;
+}
+
+/**
+ *	config_item_cleanup - free config_item resources.
+ *	@item:	item.
+ */
+
+void config_item_cleanup(struct config_item * item)
+{
+	struct config_item_type * t = item->ci_type;
+	struct config_group * s = item->ci_group;
+	struct config_item * parent = item->ci_parent;
+
+	pr_debug("config_item %s: cleaning up\n",config_item_name(item));
+	if (item->ci_name != item->ci_namebuf)
+		kfree(item->ci_name);
+	item->ci_name = NULL;
+	if (t && t->ct_item_ops && t->ct_item_ops->release)
+		t->ct_item_ops->release(item);
+	if (s)
+		config_group_put(s);
+	if (parent)
+		config_item_put(parent);
+}
+
+static void config_item_release(struct kref *kref)
+{
+	config_item_cleanup(container_of(kref, struct config_item, ci_kref));
+}
+
+/**
+ *	config_item_put - decrement refcount for item.
+ *	@item:	item.
+ *
+ *	Decrement the refcount, and if 0, call config_item_cleanup().
+ */
+void config_item_put(struct config_item * item)
+{
+	if (item)
+		kref_put(&item->ci_kref, config_item_release);
+}
+
+
+/**
+ *	config_group_init - initialize a group for use
+ *	@k:	group
+ */
+
+void config_group_init(struct config_group *group)
+{
+	config_item_init(&group->cg_item);
+	INIT_LIST_HEAD(&group->cg_children);
+}
+
+
+/**
+ *	config_group_find_obj - search for item in group.
+ *	@group:	group we're looking in.
+ *	@name:	item's name.
+ *
+ *	Lock group via @group->cg_subsys, and iterate over @group->cg_list,
+ *	looking for a matching config_item. If matching item is found
+ *	take a reference and return the item.
+ */
+
+struct config_item * config_group_find_obj(struct config_group * group, const char * name)
+{
+	struct list_head * entry;
+	struct config_item * ret = NULL;
+
+        /* XXX LOCKING! */
+	list_for_each(entry,&group->cg_children) {
+		struct config_item * item = to_item(entry);
+		if (config_item_name(item) &&
+                    !strcmp(config_item_name(item), name)) {
+			ret = config_item_get(item);
+			break;
+		}
+	}
+	return ret;
+}
+
+
+EXPORT_SYMBOL(config_item_init);
+EXPORT_SYMBOL(config_group_init);
+EXPORT_SYMBOL(config_item_get);
+EXPORT_SYMBOL(config_item_put);
+
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
new file mode 100644
index 000000000000..1a2f6f6a4d91
--- /dev/null
+++ b/fs/configfs/mount.c
@@ -0,0 +1,159 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * mount.c - operations for initializing and mounting configfs.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+/* Random magic number */
+#define CONFIGFS_MAGIC 0x62656570
+
+struct vfsmount * configfs_mount = NULL;
+struct super_block * configfs_sb = NULL;
+static int configfs_mnt_count = 0;
+
+static struct super_operations configfs_ops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+};
+
+static struct config_group configfs_root_group = {
+	.cg_item = {
+		.ci_namebuf	= "root",
+		.ci_name	= configfs_root_group.cg_item.ci_namebuf,
+	},
+};
+
+int configfs_is_root(struct config_item *item)
+{
+	return item == &configfs_root_group.cg_item;
+}
+
+static struct configfs_dirent configfs_root = {
+	.s_sibling	= LIST_HEAD_INIT(configfs_root.s_sibling),
+	.s_children	= LIST_HEAD_INIT(configfs_root.s_children),
+	.s_element	= &configfs_root_group.cg_item,
+	.s_type		= CONFIGFS_ROOT,
+};
+
+static int configfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode;
+	struct dentry *root;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = CONFIGFS_MAGIC;
+	sb->s_op = &configfs_ops;
+	configfs_sb = sb;
+
+	inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO);
+	if (inode) {
+		inode->i_op = &configfs_dir_inode_operations;
+		inode->i_fop = &configfs_dir_operations;
+		/* directory inodes start off with i_nlink == 2 (for "." entry) */
+		inode->i_nlink++;
+	} else {
+		pr_debug("configfs: could not get root inode\n");
+		return -ENOMEM;
+	}
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		pr_debug("%s: could not get root dentry!\n",__FUNCTION__);
+		iput(inode);
+		return -ENOMEM;
+	}
+	config_group_init(&configfs_root_group);
+	configfs_root_group.cg_item.ci_dentry = root;
+	root->d_fsdata = &configfs_root;
+	sb->s_root = root;
+	return 0;
+}
+
+static struct super_block *configfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return get_sb_single(fs_type, flags, data, configfs_fill_super);
+}
+
+static struct file_system_type configfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "configfs",
+	.get_sb		= configfs_get_sb,
+	.kill_sb	= kill_litter_super,
+};
+
+int configfs_pin_fs(void)
+{
+	return simple_pin_fs("configfs", &configfs_mount,
+			     &configfs_mnt_count);
+}
+
+void configfs_release_fs(void)
+{
+	simple_release_fs(&configfs_mount, &configfs_mnt_count);
+}
+
+
+static decl_subsys(config, NULL, NULL);
+
+static int __init configfs_init(void)
+{
+	int err;
+
+	kset_set_kset_s(&config_subsys, kernel_subsys);
+	err = subsystem_register(&config_subsys);
+	if (err)
+		return err;
+
+	err = register_filesystem(&configfs_fs_type);
+	if (err) {
+		printk(KERN_ERR "configfs: Unable to register filesystem!\n");
+		subsystem_unregister(&config_subsys);
+	}
+
+	return err;
+}
+
+static void __exit configfs_exit(void)
+{
+	unregister_filesystem(&configfs_fs_type);
+	subsystem_unregister(&config_subsys);
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("0.0.1");
+MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration.");
+
+module_init(configfs_init);
+module_exit(configfs_exit);
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
new file mode 100644
index 000000000000..50f5840521a9
--- /dev/null
+++ b/fs/configfs/symlink.c
@@ -0,0 +1,281 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * symlink.c - operations for configfs symlinks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+static int item_depth(struct config_item * item)
+{
+	struct config_item * p = item;
+	int depth = 0;
+	do { depth++; } while ((p = p->ci_parent) && !configfs_is_root(p));
+	return depth;
+}
+
+static int item_path_length(struct config_item * item)
+{
+	struct config_item * p = item;
+	int length = 1;
+	do {
+		length += strlen(config_item_name(p)) + 1;
+		p = p->ci_parent;
+	} while (p && !configfs_is_root(p));
+	return length;
+}
+
+static void fill_item_path(struct config_item * item, char * buffer, int length)
+{
+	struct config_item * p;
+
+	--length;
+	for (p = item; p && !configfs_is_root(p); p = p->ci_parent) {
+		int cur = strlen(config_item_name(p));
+
+		/* back up enough to print this bus id with '/' */
+		length -= cur;
+		strncpy(buffer + length,config_item_name(p),cur);
+		*(buffer + --length) = '/';
+	}
+}
+
+static int create_link(struct config_item *parent_item,
+ 		       struct config_item *item,
+		       struct dentry *dentry)
+{
+	struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata;
+	struct configfs_symlink *sl;
+	int ret;
+
+	ret = -ENOMEM;
+	sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
+	if (sl) {
+		sl->sl_target = config_item_get(item);
+		/* FIXME: needs a lock, I'd bet */
+		list_add(&sl->sl_list, &target_sd->s_links);
+		ret = configfs_create_link(sl, parent_item->ci_dentry,
+					   dentry);
+		if (ret) {
+			list_del_init(&sl->sl_list);
+			config_item_put(item);
+			kfree(sl);
+		}
+	}
+
+	return ret;
+}
+
+
+static int get_target(const char *symname, struct nameidata *nd,
+		      struct config_item **target)
+{
+	int ret;
+
+	ret = path_lookup(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, nd);
+	if (!ret) {
+		if (nd->dentry->d_sb == configfs_sb) {
+			*target = configfs_get_config_item(nd->dentry);
+			if (!*target) {
+				ret = -ENOENT;
+				path_release(nd);
+			}
+		} else
+			ret = -EPERM;
+	}
+
+	return ret;
+}
+
+
+int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	int ret;
+	struct nameidata nd;
+	struct config_item *parent_item;
+	struct config_item *target_item;
+	struct config_item_type *type;
+
+	ret = -EPERM;  /* What lack-of-symlink returns */
+	if (dentry->d_parent == configfs_sb->s_root)
+		goto out;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+
+	if (!type || !type->ct_item_ops ||
+	    !type->ct_item_ops->allow_link)
+		goto out_put;
+
+	ret = get_target(symname, &nd, &target_item);
+	if (ret)
+		goto out_put;
+
+	ret = type->ct_item_ops->allow_link(parent_item, target_item);
+	if (!ret)
+		ret = create_link(parent_item, target_item, dentry);
+
+	config_item_put(target_item);
+	path_release(&nd);
+
+out_put:
+	config_item_put(parent_item);
+
+out:
+	return ret;
+}
+
+int configfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct configfs_dirent *sd = dentry->d_fsdata;
+	struct configfs_symlink *sl;
+	struct config_item *parent_item;
+	struct config_item_type *type;
+	int ret;
+
+	ret = -EPERM;  /* What lack-of-symlink returns */
+	if (!(sd->s_type & CONFIGFS_ITEM_LINK))
+		goto out;
+
+	if (dentry->d_parent == configfs_sb->s_root)
+		BUG();
+
+	sl = sd->s_element;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+
+	list_del_init(&sd->s_sibling);
+	configfs_drop_dentry(sd, dentry->d_parent);
+	dput(dentry);
+	configfs_put(sd);
+
+	/*
+	 * drop_link() must be called before
+	 * list_del_init(&sl->sl_list), so that the order of
+	 * drop_link(this, target) and drop_item(target) is preserved.
+	 */
+	if (type && type->ct_item_ops &&
+	    type->ct_item_ops->drop_link)
+		type->ct_item_ops->drop_link(parent_item,
+					       sl->sl_target);
+
+	/* FIXME: Needs lock */
+	list_del_init(&sl->sl_list);
+
+	/* Put reference from create_link() */
+	config_item_put(sl->sl_target);
+	kfree(sl);
+
+	config_item_put(parent_item);
+
+	ret = 0;
+
+out:
+	return ret;
+}
+
+static int configfs_get_target_path(struct config_item * item, struct config_item * target,
+				   char *path)
+{
+	char * s;
+	int depth, size;
+
+	depth = item_depth(item);
+	size = item_path_length(target) + depth * 3 - 1;
+	if (size > PATH_MAX)
+		return -ENAMETOOLONG;
+
+	pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size);
+
+	for (s = path; depth--; s += 3)
+		strcpy(s,"../");
+
+	fill_item_path(target, path, size);
+	pr_debug("%s: path = '%s'\n", __FUNCTION__, path);
+
+	return 0;
+}
+
+static int configfs_getlink(struct dentry *dentry, char * path)
+{
+	struct config_item *item, *target_item;
+	int error = 0;
+
+	item = configfs_get_config_item(dentry->d_parent);
+	if (!item)
+		return -EINVAL;
+
+	target_item = configfs_get_config_item(dentry);
+	if (!target_item) {
+		config_item_put(item);
+		return -EINVAL;
+	}
+
+	down_read(&configfs_rename_sem);
+	error = configfs_get_target_path(item, target_item, path);
+	up_read(&configfs_rename_sem);
+
+	config_item_put(item);
+	config_item_put(target_item);
+	return error;
+
+}
+
+static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	int error = -ENOMEM;
+	unsigned long page = get_zeroed_page(GFP_KERNEL);
+
+	if (page) {
+		error = configfs_getlink(dentry, (char *)page);
+		if (!error) {
+			nd_set_link(nd, (char *)page);
+			return (void *)page;
+		}
+	}
+
+	nd_set_link(nd, ERR_PTR(error));
+	return NULL;
+}
+
+static void configfs_put_link(struct dentry *dentry, struct nameidata *nd,
+			      void *cookie)
+{
+	if (cookie) {
+		unsigned long page = (unsigned long)cookie;
+		free_page(page);
+	}
+}
+
+struct inode_operations configfs_symlink_inode_operations = {
+	.follow_link = configfs_follow_link,
+	.readlink = generic_readlink,
+	.put_link = configfs_put_link,
+};
+
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
new file mode 100644
index 000000000000..acffb8c9073a
--- /dev/null
+++ b/include/linux/configfs.h
@@ -0,0 +1,205 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * configfs.h - definitions for the device driver filesystem
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * Based on kobject.h:
+ *      Copyright (c) 2002-2003	Patrick Mochel
+ *      Copyright (c) 2002-2003	Open Source Development Labs
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please read Documentation/filesystems/configfs.txt before using the
+ * configfs interface, ESPECIALLY the parts about reference counts and
+ * item destructors.
+ */
+
+#ifndef _CONFIGFS_H_
+#define _CONFIGFS_H_
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/kref.h>
+
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+#define CONFIGFS_ITEM_NAME_LEN	20
+
+struct module;
+
+struct configfs_item_operations;
+struct configfs_group_operations;
+struct configfs_attribute;
+struct configfs_subsystem;
+
+struct config_item {
+	char			*ci_name;
+	char			ci_namebuf[CONFIGFS_ITEM_NAME_LEN];
+	struct kref		ci_kref;
+	struct list_head	ci_entry;
+	struct config_item	*ci_parent;
+	struct config_group	*ci_group;
+	struct config_item_type	*ci_type;
+	struct dentry		*ci_dentry;
+};
+
+extern int config_item_set_name(struct config_item *, const char *, ...);
+
+static inline char *config_item_name(struct config_item * item)
+{
+	return item->ci_name;
+}
+
+extern void config_item_init(struct config_item *);
+extern void config_item_init_type_name(struct config_item *item,
+				       const char *name,
+				       struct config_item_type *type);
+extern void config_item_cleanup(struct config_item *);
+
+extern struct config_item * config_item_get(struct config_item *);
+extern void config_item_put(struct config_item *);
+
+struct config_item_type {
+	struct module				*ct_owner;
+	struct configfs_item_operations		*ct_item_ops;
+	struct configfs_group_operations	*ct_group_ops;
+	struct configfs_attribute		**ct_attrs;
+};
+
+
+/**
+ *	group - a group of config_items of a specific type, belonging
+ *	to a specific subsystem.
+ */
+
+struct config_group {
+	struct config_item		cg_item;
+	struct list_head		cg_children;
+	struct configfs_subsystem 	*cg_subsys;
+	struct config_group		**default_groups;
+};
+
+
+extern void config_group_init(struct config_group *group);
+extern void config_group_init_type_name(struct config_group *group,
+					const char *name,
+					struct config_item_type *type);
+
+
+static inline struct config_group *to_config_group(struct config_item *item)
+{
+	return item ? container_of(item,struct config_group,cg_item) : NULL;
+}
+
+static inline struct config_group *config_group_get(struct config_group *group)
+{
+	return group ? to_config_group(config_item_get(&group->cg_item)) : NULL;
+}
+
+static inline void config_group_put(struct config_group *group)
+{
+	config_item_put(&group->cg_item);
+}
+
+extern struct config_item *config_group_find_obj(struct config_group *, const char *);
+
+
+struct configfs_attribute {
+	char			*ca_name;
+	struct module 		*ca_owner;
+	mode_t			ca_mode;
+};
+
+
+/*
+ * If allow_link() exists, the item can symlink(2) out to other
+ * items.  If the item is a group, it may support mkdir(2).
+ * Groups supply one of make_group() and make_item().  If the
+ * group supports make_group(), one can create group children.  If it
+ * supports make_item(), one can create config_item children.  If it has
+ * default_groups on group->default_groups, it has automatically created
+ * group children.  default_groups may coexist alongsize make_group() or
+ * make_item(), but if the group wishes to have only default_groups
+ * children (disallowing mkdir(2)), it need not provide either function.
+ * If the group has commit(), it supports pending and commited (active)
+ * items.
+ */
+struct configfs_item_operations {
+	void (*release)(struct config_item *);
+	ssize_t	(*show_attribute)(struct config_item *, struct configfs_attribute *,char *);
+	ssize_t	(*store_attribute)(struct config_item *,struct configfs_attribute *,const char *, size_t);
+	int (*allow_link)(struct config_item *src, struct config_item *target);
+	int (*drop_link)(struct config_item *src, struct config_item *target);
+};
+
+struct configfs_group_operations {
+	struct config_item *(*make_item)(struct config_group *group, const char *name);
+	struct config_group *(*make_group)(struct config_group *group, const char *name);
+	int (*commit_item)(struct config_item *item);
+	void (*drop_item)(struct config_group *group, struct config_item *item);
+};
+
+
+
+/**
+ * Use these macros to make defining attributes easier. See include/linux/device.h
+ * for examples..
+ */
+
+#if 0
+#define __ATTR(_name,_mode,_show,_store) { \
+	.attr = {.ca_name = __stringify(_name), .ca_mode = _mode, .ca_owner = THIS_MODULE },	\
+	.show	= _show,					\
+	.store	= _store,					\
+}
+
+#define __ATTR_RO(_name) { \
+	.attr	= { .ca_name = __stringify(_name), .ca_mode = 0444, .ca_owner = THIS_MODULE },	\
+	.show	= _name##_show,	\
+}
+
+#define __ATTR_NULL { .attr = { .name = NULL } }
+
+#define attr_name(_attr) (_attr).attr.name
+#endif
+
+
+struct configfs_subsystem {
+	struct config_group	su_group;
+	struct semaphore	su_sem;
+};
+
+static inline struct configfs_subsystem *to_configfs_subsystem(struct config_group *group)
+{
+	return group ?
+		container_of(group, struct configfs_subsystem, su_group) :
+		NULL;
+}
+
+int configfs_register_subsystem(struct configfs_subsystem *subsys);
+void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
+
+#endif  /* __KERNEL__ */
+
+#endif /* _CONFIGFS_H_ */
-- 
cgit v1.2.3-71-gd317


From 994fc28c7b1e697ac56befe4aecabf23f0689f46 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Thu, 15 Dec 2005 14:28:17 -0800
Subject: [PATCH] add AOP_TRUNCATED_PAGE, prepend AOP_ to WRITEPAGE_ACTIVATE

readpage(), prepare_write(), and commit_write() callers are updated to
understand the special return code AOP_TRUNCATED_PAGE in the style of
writepage() and WRITEPAGE_ACTIVATE.  AOP_TRUNCATED_PAGE tells the caller that
the callee has unlocked the page and that the operation should be tried again
with a new page.  OCFS2 uses this to detect and work around a lock inversion in
its aop methods.  There should be no change in behaviour for methods that don't
return AOP_TRUNCATED_PAGE.

WRITEPAGE_ACTIVATE is also prepended with AOP_ for consistency and they are
made enums so that kerneldoc can be used to document their semantics.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 drivers/block/loop.c      | 23 +++++++++++----
 drivers/block/rd.c        |  4 +--
 fs/mpage.c                |  2 +-
 include/linux/fs.h        | 31 ++++++++++++++++++++
 include/linux/writeback.h |  6 ----
 mm/filemap.c              | 73 ++++++++++++++++++++++++++++++++---------------
 mm/readahead.c            | 15 ++++++----
 mm/shmem.c                |  2 +-
 mm/vmscan.c               |  2 +-
 9 files changed, 113 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 96c664af8d06..a452b13620a2 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -213,7 +213,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 	struct address_space_operations *aops = mapping->a_ops;
 	pgoff_t index;
 	unsigned offset, bv_offs;
-	int len, ret = 0;
+	int len, ret;
 
 	down(&mapping->host->i_sem);
 	index = pos >> PAGE_CACHE_SHIFT;
@@ -232,9 +232,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 		page = grab_cache_page(mapping, index);
 		if (unlikely(!page))
 			goto fail;
-		if (unlikely(aops->prepare_write(file, page, offset,
-				offset + size)))
+		ret = aops->prepare_write(file, page, offset,
+					  offset + size);
+		if (unlikely(ret)) {
+			if (ret == AOP_TRUNCATED_PAGE) {
+				page_cache_release(page);
+				continue;
+			}
 			goto unlock;
+		}
 		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
 				bvec->bv_page, bv_offs, size, IV);
 		if (unlikely(transfer_result)) {
@@ -251,9 +257,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 			kunmap_atomic(kaddr, KM_USER0);
 		}
 		flush_dcache_page(page);
-		if (unlikely(aops->commit_write(file, page, offset,
-				offset + size)))
+		ret = aops->commit_write(file, page, offset,
+					 offset + size);
+		if (unlikely(ret)) {
+			if (ret == AOP_TRUNCATED_PAGE) {
+				page_cache_release(page);
+				continue;
+			}
 			goto unlock;
+		}
 		if (unlikely(transfer_result))
 			goto unlock;
 		bv_offs += size;
@@ -264,6 +276,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 		unlock_page(page);
 		page_cache_release(page);
 	}
+	ret = 0;
 out:
 	up(&mapping->host->i_sem);
 	return ret;
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
index 68c60a5bcdab..ffd6abd6d5a0 100644
--- a/drivers/block/rd.c
+++ b/drivers/block/rd.c
@@ -154,7 +154,7 @@ static int ramdisk_commit_write(struct file *file, struct page *page,
 
 /*
  * ->writepage to the the blockdev's mapping has to redirty the page so that the
- * VM doesn't go and steal it.  We return WRITEPAGE_ACTIVATE so that the VM
+ * VM doesn't go and steal it.  We return AOP_WRITEPAGE_ACTIVATE so that the VM
  * won't try to (pointlessly) write the page again for a while.
  *
  * Really, these pages should not be on the LRU at all.
@@ -165,7 +165,7 @@ static int ramdisk_writepage(struct page *page, struct writeback_control *wbc)
 		make_page_uptodate(page);
 	SetPageDirty(page);
 	if (wbc->for_reclaim)
-		return WRITEPAGE_ACTIVATE;
+		return AOP_WRITEPAGE_ACTIVATE;
 	unlock_page(page);
 	return 0;
 }
diff --git a/fs/mpage.c b/fs/mpage.c
index c5adcdddf3cc..f1d2d02bd4c8 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -721,7 +721,7 @@ retry:
 						&last_block_in_bio, &ret, wbc,
 						page->mapping->a_ops->writepage);
 			}
-			if (unlikely(ret == WRITEPAGE_ACTIVATE))
+			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
 				unlock_page(page);
 			if (ret || (--(wbc->nr_to_write) <= 0))
 				done = 1;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cc35b6ac778d..ed9a41a71e8b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -302,6 +302,37 @@ struct iattr {
  */
 #include <linux/quota.h>
 
+/** 
+ * enum positive_aop_returns - aop return codes with specific semantics
+ *
+ * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
+ * 			    completed, that the page is still locked, and
+ * 			    should be considered active.  The VM uses this hint
+ * 			    to return the page to the active list -- it won't
+ * 			    be a candidate for writeback again in the near
+ * 			    future.  Other callers must be careful to unlock
+ * 			    the page if they get this return.  Returned by
+ * 			    writepage(); 
+ *
+ * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
+ *  			unlocked it and the page might have been truncated.
+ *  			The caller should back up to acquiring a new page and
+ *  			trying again.  The aop will be taking reasonable
+ *  			precautions not to livelock.  If the caller held a page
+ *  			reference, it should drop it before retrying.  Returned
+ *  			by readpage(), prepare_write(), and commit_write().
+ *
+ * address_space_operation functions return these large constants to indicate
+ * special semantics to the caller.  These are much larger than the bytes in a
+ * page to allow for functions that return the number of bytes operated on in a
+ * given page.
+ */
+
+enum positive_aop_returns {
+	AOP_WRITEPAGE_ACTIVATE	= 0x80000,
+	AOP_TRUNCATED_PAGE	= 0x80001,
+};
+
 /*
  * oh the beauties of C type declarations.
  */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 343d883d69c5..64a36ba43b2f 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -59,12 +59,6 @@ struct writeback_control {
 	unsigned for_reclaim:1;			/* Invoked from the page allocator */
 };
 
-/*
- * ->writepage() return values (make these much larger than a pagesize, in
- * case some fs is returning number-of-bytes-written from writepage)
- */
-#define WRITEPAGE_ACTIVATE	0x80000	/* IO was not started: activate page */
-
 /*
  * fs/fs-writeback.c
  */	
diff --git a/mm/filemap.c b/mm/filemap.c
index 33a28bfde158..6e1d08a2b8b9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -831,8 +831,13 @@ readpage:
 		/* Start the actual read. The read will unlock the page. */
 		error = mapping->a_ops->readpage(filp, page);
 
-		if (unlikely(error))
+		if (unlikely(error)) {
+			if (error == AOP_TRUNCATED_PAGE) {
+				page_cache_release(page);
+				goto find_page;
+			}
 			goto readpage_error;
+		}
 
 		if (!PageUptodate(page)) {
 			lock_page(page);
@@ -1152,26 +1157,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
 {
 	struct address_space *mapping = file->f_mapping;
 	struct page *page; 
-	int error;
+	int ret;
 
-	page = page_cache_alloc_cold(mapping);
-	if (!page)
-		return -ENOMEM;
+	do {
+		page = page_cache_alloc_cold(mapping);
+		if (!page)
+			return -ENOMEM;
+
+		ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
+		if (ret == 0)
+			ret = mapping->a_ops->readpage(file, page);
+		else if (ret == -EEXIST)
+			ret = 0; /* losing race to add is OK */
 
-	error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
-	if (!error) {
-		error = mapping->a_ops->readpage(file, page);
 		page_cache_release(page);
-		return error;
-	}
 
-	/*
-	 * We arrive here in the unlikely event that someone 
-	 * raced with us and added our page to the cache first
-	 * or we are out of memory for radix-tree nodes.
-	 */
-	page_cache_release(page);
-	return error == -EEXIST ? 0 : error;
+	} while (ret == AOP_TRUNCATED_PAGE);
+		
+	return ret;
 }
 
 #define MMAP_LOTSAMISS  (100)
@@ -1331,10 +1334,14 @@ page_not_uptodate:
 		goto success;
 	}
 
-	if (!mapping->a_ops->readpage(file, page)) {
+	error = mapping->a_ops->readpage(file, page);
+	if (!error) {
 		wait_on_page_locked(page);
 		if (PageUptodate(page))
 			goto success;
+	} else if (error == AOP_TRUNCATED_PAGE) {
+		page_cache_release(page);
+		goto retry_find;
 	}
 
 	/*
@@ -1358,10 +1365,14 @@ page_not_uptodate:
 		goto success;
 	}
 	ClearPageError(page);
-	if (!mapping->a_ops->readpage(file, page)) {
+	error = mapping->a_ops->readpage(file, page);
+	if (!error) {
 		wait_on_page_locked(page);
 		if (PageUptodate(page))
 			goto success;
+	} else if (error == AOP_TRUNCATED_PAGE) {
+		page_cache_release(page);
+		goto retry_find;
 	}
 
 	/*
@@ -1444,10 +1455,14 @@ page_not_uptodate:
 		goto success;
 	}
 
-	if (!mapping->a_ops->readpage(file, page)) {
+	error = mapping->a_ops->readpage(file, page);
+	if (!error) {
 		wait_on_page_locked(page);
 		if (PageUptodate(page))
 			goto success;
+	} else if (error == AOP_TRUNCATED_PAGE) {
+		page_cache_release(page);
+		goto retry_find;
 	}
 
 	/*
@@ -1470,10 +1485,14 @@ page_not_uptodate:
 	}
 
 	ClearPageError(page);
-	if (!mapping->a_ops->readpage(file, page)) {
+	error = mapping->a_ops->readpage(file, page);
+	if (!error) {
 		wait_on_page_locked(page);
 		if (PageUptodate(page))
 			goto success;
+	} else if (error == AOP_TRUNCATED_PAGE) {
+		page_cache_release(page);
+		goto retry_find;
 	}
 
 	/*
@@ -1934,12 +1953,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		status = a_ops->prepare_write(file, page, offset, offset+bytes);
 		if (unlikely(status)) {
 			loff_t isize = i_size_read(inode);
+
+			if (status != AOP_TRUNCATED_PAGE)
+				unlock_page(page);
+			page_cache_release(page);
+			if (status == AOP_TRUNCATED_PAGE)
+				continue;
 			/*
 			 * prepare_write() may have instantiated a few blocks
 			 * outside i_size.  Trim these off again.
 			 */
-			unlock_page(page);
-			page_cache_release(page);
 			if (pos + bytes > isize)
 				vmtruncate(inode, isize);
 			break;
@@ -1952,6 +1975,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 						cur_iov, iov_base, bytes);
 		flush_dcache_page(page);
 		status = a_ops->commit_write(file, page, offset, offset+bytes);
+		if (status == AOP_TRUNCATED_PAGE) {
+			page_cache_release(page);
+			continue;
+		}
 		if (likely(copied > 0)) {
 			if (!status)
 				status = copied;
diff --git a/mm/readahead.c b/mm/readahead.c
index 72e7adbb87c7..8d6eeaaa6296 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 {
 	unsigned page_idx;
 	struct pagevec lru_pvec;
-	int ret = 0;
+	int ret;
 
 	if (mapping->a_ops->readpages) {
 		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
@@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 		list_del(&page->lru);
 		if (!add_to_page_cache(page, mapping,
 					page->index, GFP_KERNEL)) {
-			mapping->a_ops->readpage(filp, page);
-			if (!pagevec_add(&lru_pvec, page))
-				__pagevec_lru_add(&lru_pvec);
-		} else {
-			page_cache_release(page);
+			ret = mapping->a_ops->readpage(filp, page);
+			if (ret != AOP_TRUNCATED_PAGE) {
+				if (!pagevec_add(&lru_pvec, page))
+					__pagevec_lru_add(&lru_pvec);
+				continue;
+			} /* else fall through to release */
 		}
+		page_cache_release(page);
 	}
 	pagevec_lru_add(&lru_pvec);
+	ret = 0;
 out:
 	return ret;
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index dc25565a61e9..d9fc277940da 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -855,7 +855,7 @@ unlock:
 	swap_free(swap);
 redirty:
 	set_page_dirty(page);
-	return WRITEPAGE_ACTIVATE;	/* Return with the page locked */
+	return AOP_WRITEPAGE_ACTIVATE;	/* Return with the page locked */
 }
 
 #ifdef CONFIG_NUMA
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b0cd81c32de6..795a050fe471 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -367,7 +367,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 		res = mapping->a_ops->writepage(page, &wbc);
 		if (res < 0)
 			handle_write_error(mapping, page, res);
-		if (res == WRITEPAGE_ACTIVATE) {
+		if (res == AOP_WRITEPAGE_ACTIVATE) {
 			ClearPageReclaim(page);
 			return PAGE_ACTIVATE;
 		}
-- 
cgit v1.2.3-71-gd317


From df71837d5024e2524cd51c93621e558aa7dd9f3f Mon Sep 17 00:00:00 2001
From: Trent Jaeger <tjaeger@cse.psu.edu>
Date: Tue, 13 Dec 2005 23:12:27 -0800
Subject: [LSM-IPSec]: Security association restriction.

This patch series implements per packet access control via the
extension of the Linux Security Modules (LSM) interface by hooks in
the XFRM and pfkey subsystems that leverage IPSec security
associations to label packets.  Extensions to the SELinux LSM are
included that leverage the patch for this purpose.

This patch implements the changes necessary to the XFRM subsystem,
pfkey interface, ipv4/ipv6, and xfrm_user interface to restrict a
socket to use only authorized security associations (or no security
association) to send/receive network packets.

Patch purpose:

The patch is designed to enable access control per packets based on
the strongly authenticated IPSec security association.  Such access
controls augment the existing ones based on network interface and IP
address.  The former are very coarse-grained, and the latter can be
spoofed.  By using IPSec, the system can control access to remote
hosts based on cryptographic keys generated using the IPSec mechanism.
This enables access control on a per-machine basis or per-application
if the remote machine is running the same mechanism and trusted to
enforce the access control policy.

Patch design approach:

The overall approach is that policy (xfrm_policy) entries set by
user-level programs (e.g., setkey for ipsec-tools) are extended with a
security context that is used at policy selection time in the XFRM
subsystem to restrict the sockets that can send/receive packets via
security associations (xfrm_states) that are built from those
policies.

A presentation available at
www.selinux-symposium.org/2005/presentations/session2/2-3-jaeger.pdf
from the SELinux symposium describes the overall approach.

Patch implementation details:

On output, the policy retrieved (via xfrm_policy_lookup or
xfrm_sk_policy_lookup) must be authorized for the security context of
the socket and the same security context is required for resultant
security association (retrieved or negotiated via racoon in
ipsec-tools).  This is enforced in xfrm_state_find.

On input, the policy retrieved must also be authorized for the socket
(at __xfrm_policy_check), and the security context of the policy must
also match the security association being used.

The patch has virtually no impact on packets that do not use IPSec.
The existing Netfilter (outgoing) and LSM rcv_skb hooks are used as
before.

Also, if IPSec is used without security contexts, the impact is
minimal.  The LSM must allow such policies to be selected for the
combination of socket and remote machine, but subsequent IPSec
processing proceeds as in the original case.

Testing:

The pfkey interface is tested using the ipsec-tools.  ipsec-tools have
been modified (a separate ipsec-tools patch is available for version
0.5) that supports assignment of xfrm_policy entries and security
associations with security contexts via setkey and the negotiation
using the security contexts via racoon.

The xfrm_user interface is tested via ad hoc programs that set
security contexts.  These programs are also available from me, and
contain programs for setting, getting, and deleting policy for testing
this interface.  Testing of sa functions was done by tracing kernel
behavior.

Signed-off-by: Trent Jaeger <tjaeger@cse.psu.edu>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pfkeyv2.h  |  13 +++-
 include/linux/security.h | 132 +++++++++++++++++++++++++++++++
 include/linux/xfrm.h     |  29 +++++++
 include/net/flow.h       |   7 +-
 include/net/xfrm.h       |  27 ++++++-
 net/core/flow.c          |   8 +-
 net/key/af_key.c         | 197 +++++++++++++++++++++++++++++++++++++++++++++--
 net/xfrm/xfrm_policy.c   |  88 +++++++++++++--------
 net/xfrm/xfrm_state.c    |   9 ++-
 net/xfrm/xfrm_user.c     | 148 +++++++++++++++++++++++++++++++++--
 security/Kconfig         |  13 ++++
 security/dummy.c         |  45 ++++++++++-
 12 files changed, 655 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pfkeyv2.h b/include/linux/pfkeyv2.h
index 724066778aff..6351c4055ace 100644
--- a/include/linux/pfkeyv2.h
+++ b/include/linux/pfkeyv2.h
@@ -216,6 +216,16 @@ struct sadb_x_nat_t_port {
 } __attribute__((packed));
 /* sizeof(struct sadb_x_nat_t_port) == 8 */
 
+/* Generic LSM security context */
+struct sadb_x_sec_ctx {
+	uint16_t	sadb_x_sec_len;
+	uint16_t	sadb_x_sec_exttype;
+	uint8_t		sadb_x_ctx_alg;  /* LSMs: e.g., selinux == 1 */
+	uint8_t		sadb_x_ctx_doi;
+	uint16_t	sadb_x_ctx_len;
+} __attribute__((packed));
+/* sizeof(struct sadb_sec_ctx) = 8 */
+
 /* Message types */
 #define SADB_RESERVED		0
 #define SADB_GETSPI		1
@@ -325,7 +335,8 @@ struct sadb_x_nat_t_port {
 #define SADB_X_EXT_NAT_T_SPORT		21
 #define SADB_X_EXT_NAT_T_DPORT		22
 #define SADB_X_EXT_NAT_T_OA		23
-#define SADB_EXT_MAX			23
+#define SADB_X_EXT_SEC_CTX		24
+#define SADB_EXT_MAX			24
 
 /* Identity Extension values */
 #define SADB_IDENTTYPE_RESERVED	0
diff --git a/include/linux/security.h b/include/linux/security.h
index f7e0ae018712..ef753654daa5 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -59,6 +59,12 @@ struct sk_buff;
 struct sock;
 struct sockaddr;
 struct socket;
+struct flowi;
+struct dst_entry;
+struct xfrm_selector;
+struct xfrm_policy;
+struct xfrm_state;
+struct xfrm_user_sec_ctx;
 
 extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb);
 extern int cap_netlink_recv(struct sk_buff *skb);
@@ -788,6 +794,52 @@ struct swap_info_struct;
  *      which is used to copy security attributes between local stream sockets.
  * @sk_free_security:
  *	Deallocate security structure.
+ * @sk_getsid:
+ *	Retrieve the LSM-specific sid for the sock to enable caching of network
+ *	authorizations.
+ *
+ * Security hooks for XFRM operations.
+ *
+ * @xfrm_policy_alloc_security:
+ *	@xp contains the xfrm_policy being added to Security Policy Database
+ *	used by the XFRM system.
+ *	@sec_ctx contains the security context information being provided by
+ *	the user-level policy update program (e.g., setkey).
+ *	Allocate a security structure to the xp->selector.security field.
+ *	The security field is initialized to NULL when the xfrm_policy is
+ *	allocated.
+ *	Return 0 if operation was successful (memory to allocate, legal context)
+ * @xfrm_policy_clone_security:
+ *	@old contains an existing xfrm_policy in the SPD.
+ *	@new contains a new xfrm_policy being cloned from old.
+ *	Allocate a security structure to the new->selector.security field
+ *	that contains the information from the old->selector.security field.
+ *	Return 0 if operation was successful (memory to allocate).
+ * @xfrm_policy_free_security:
+ *	@xp contains the xfrm_policy
+ *	Deallocate xp->selector.security.
+ * @xfrm_state_alloc_security:
+ *	@x contains the xfrm_state being added to the Security Association
+ *	Database by the XFRM system.
+ *	@sec_ctx contains the security context information being provided by
+ *	the user-level SA generation program (e.g., setkey or racoon).
+ *	Allocate a security structure to the x->sel.security field.  The
+ *	security field is initialized to NULL when the xfrm_state is
+ *	allocated.
+ *	Return 0 if operation was successful (memory to allocate, legal context).
+ * @xfrm_state_free_security:
+ *	@x contains the xfrm_state.
+ *	Deallocate x>sel.security.
+ * @xfrm_policy_lookup:
+ *	@xp contains the xfrm_policy for which the access control is being
+ *	checked.
+ *	@sk_sid contains the sock security label that is used to authorize
+ *	access to the policy xp.
+ *	@dir contains the direction of the flow (input or output).
+ *	Check permission when a sock selects a xfrm_policy for processing
+ *	XFRMs on a packet.  The hook is called when selecting either a
+ *	per-socket policy or a generic xfrm policy.
+ *	Return 0 if permission is granted.
  *
  * Security hooks affecting all Key Management operations
  *
@@ -1237,8 +1289,18 @@ struct security_operations {
 	int (*socket_getpeersec) (struct socket *sock, char __user *optval, int __user *optlen, unsigned len);
 	int (*sk_alloc_security) (struct sock *sk, int family, gfp_t priority);
 	void (*sk_free_security) (struct sock *sk);
+	unsigned int (*sk_getsid) (struct sock *sk, struct flowi *fl, u8 dir);
 #endif	/* CONFIG_SECURITY_NETWORK */
 
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+	int (*xfrm_policy_alloc_security) (struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx);
+	int (*xfrm_policy_clone_security) (struct xfrm_policy *old, struct xfrm_policy *new);
+	void (*xfrm_policy_free_security) (struct xfrm_policy *xp);
+	int (*xfrm_state_alloc_security) (struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx);
+	void (*xfrm_state_free_security) (struct xfrm_state *x);
+	int (*xfrm_policy_lookup)(struct xfrm_policy *xp, u32 sk_sid, u8 dir);
+#endif	/* CONFIG_SECURITY_NETWORK_XFRM */
+
 	/* key management security hooks */
 #ifdef CONFIG_KEYS
 	int (*key_alloc)(struct key *key);
@@ -2679,6 +2741,11 @@ static inline void security_sk_free(struct sock *sk)
 {
 	return security_ops->sk_free_security(sk);
 }
+
+static inline unsigned int security_sk_sid(struct sock *sk, struct flowi *fl, u8 dir)
+{
+	return security_ops->sk_getsid(sk, fl, dir);
+}
 #else	/* CONFIG_SECURITY_NETWORK */
 static inline int security_unix_stream_connect(struct socket * sock,
 					       struct socket * other, 
@@ -2795,8 +2862,73 @@ static inline int security_sk_alloc(struct sock *sk, int family, gfp_t priority)
 static inline void security_sk_free(struct sock *sk)
 {
 }
+
+static inline unsigned int security_sk_sid(struct sock *sk, struct flowi *fl, u8 dir)
+{
+	return 0;
+}
 #endif	/* CONFIG_SECURITY_NETWORK */
 
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx)
+{
+	return security_ops->xfrm_policy_alloc_security(xp, sec_ctx);
+}
+
+static inline int security_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new)
+{
+	return security_ops->xfrm_policy_clone_security(old, new);
+}
+
+static inline void security_xfrm_policy_free(struct xfrm_policy *xp)
+{
+	security_ops->xfrm_policy_free_security(xp);
+}
+
+static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
+{
+	return security_ops->xfrm_state_alloc_security(x, sec_ctx);
+}
+
+static inline void security_xfrm_state_free(struct xfrm_state *x)
+{
+	security_ops->xfrm_state_free_security(x);
+}
+
+static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
+{
+	return security_ops->xfrm_policy_lookup(xp, sk_sid, dir);
+}
+#else	/* CONFIG_SECURITY_NETWORK_XFRM */
+static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx)
+{
+	return 0;
+}
+
+static inline int security_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new)
+{
+	return 0;
+}
+
+static inline void security_xfrm_policy_free(struct xfrm_policy *xp)
+{
+}
+
+static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
+{
+	return 0;
+}
+
+static inline void security_xfrm_state_free(struct xfrm_state *x)
+{
+}
+
+static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
+{
+	return 0;
+}
+#endif	/* CONFIG_SECURITY_NETWORK_XFRM */
+
 #ifdef CONFIG_KEYS
 #ifdef CONFIG_SECURITY
 static inline int security_key_alloc(struct key *key)
diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 0fb077d68441..82fbb758e28f 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -27,6 +27,22 @@ struct xfrm_id
 	__u8		proto;
 };
 
+struct xfrm_sec_ctx {
+	__u8	ctx_doi;
+	__u8	ctx_alg;
+	__u16	ctx_len;
+	__u32	ctx_sid;
+	char	ctx_str[0];
+};
+
+/* Security Context Domains of Interpretation */
+#define XFRM_SC_DOI_RESERVED 0
+#define XFRM_SC_DOI_LSM 1
+
+/* Security Context Algorithms */
+#define XFRM_SC_ALG_RESERVED 0
+#define XFRM_SC_ALG_SELINUX 1
+
 /* Selector, used as selector both on policy rules (SPD) and SAs. */
 
 struct xfrm_selector
@@ -146,6 +162,18 @@ enum {
 
 #define XFRM_NR_MSGTYPES (XFRM_MSG_MAX + 1 - XFRM_MSG_BASE)
 
+/*
+ * Generic LSM security context for comunicating to user space
+ * NOTE: Same format as sadb_x_sec_ctx
+ */
+struct xfrm_user_sec_ctx {
+	__u16			len;
+	__u16			exttype;
+	__u8			ctx_alg;  /* LSMs: e.g., selinux == 1 */
+	__u8			ctx_doi;
+	__u16			ctx_len;
+};
+
 struct xfrm_user_tmpl {
 	struct xfrm_id		id;
 	__u16			family;
@@ -176,6 +204,7 @@ enum xfrm_attr_type_t {
 	XFRMA_TMPL,		/* 1 or more struct xfrm_user_tmpl */
 	XFRMA_SA,
 	XFRMA_POLICY,
+	XFRMA_SEC_CTX,		/* struct xfrm_sec_ctx */
 	__XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
diff --git a/include/net/flow.h b/include/net/flow.h
index 9a5c94b1a0ec..ec7eb86eb203 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -84,11 +84,12 @@ struct flowi {
 #define FLOW_DIR_OUT	1
 #define FLOW_DIR_FWD	2
 
-typedef void (*flow_resolve_t)(struct flowi *key, u16 family, u8 dir,
+struct sock;
+typedef void (*flow_resolve_t)(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
 			       void **objp, atomic_t **obj_refp);
 
-extern void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
-			       flow_resolve_t resolver);
+extern void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
+	 		       flow_resolve_t resolver);
 extern void flow_cache_flush(void);
 extern atomic_t flow_cache_genid;
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 1cdb87912137..487abca3ca6f 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -144,6 +144,9 @@ struct xfrm_state
 	 * transformer. */
 	struct xfrm_type	*type;
 
+	/* Security context */
+	struct xfrm_sec_ctx	*security;
+
 	/* Private data of this transformer, format is opaque,
 	 * interpreted by xfrm_type methods. */
 	void			*data;
@@ -298,6 +301,7 @@ struct xfrm_policy
 	__u8			flags;
 	__u8			dead;
 	__u8			xfrm_nr;
+	struct xfrm_sec_ctx	*security;
 	struct xfrm_tmpl       	xfrm_vec[XFRM_MAX_DEPTH];
 };
 
@@ -510,6 +514,25 @@ xfrm_selector_match(struct xfrm_selector *sel, struct flowi *fl,
 	return 0;
 }
 
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+/*	If neither has a context --> match
+ * 	Otherwise, both must have a context and the sids, doi, alg must match
+ */
+static inline int xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2)
+{
+	return ((!s1 && !s2) ||
+		(s1 && s2 &&
+		 (s1->ctx_sid == s2->ctx_sid) &&
+		 (s1->ctx_doi == s2->ctx_doi) &&
+		 (s1->ctx_alg == s2->ctx_alg)));
+}
+#else
+static inline int xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2)
+{
+	return 1;
+}
+#endif
+
 /* A struct encoding bundle of transformations to apply to some set of flow.
  *
  * dst->child points to the next element of bundle.
@@ -878,8 +901,8 @@ static inline int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, unsig
 struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp);
 extern int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*), void *);
 int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);
-struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
-				      int delete);
+struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel,
+					  struct xfrm_sec_ctx *ctx, int delete);
 struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete);
 void xfrm_policy_flush(void);
 u32 xfrm_get_acqseq(void);
diff --git a/net/core/flow.c b/net/core/flow.c
index 7e95b39de9fd..c4f25385029f 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -23,6 +23,7 @@
 #include <net/flow.h>
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
+#include <linux/security.h>
 
 struct flow_cache_entry {
 	struct flow_cache_entry	*next;
@@ -30,6 +31,7 @@ struct flow_cache_entry {
 	u8			dir;
 	struct flowi		key;
 	u32			genid;
+	u32			sk_sid;
 	void			*object;
 	atomic_t		*object_ref;
 };
@@ -162,7 +164,7 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
 	return 0;
 }
 
-void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
+void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
 			flow_resolve_t resolver)
 {
 	struct flow_cache_entry *fle, **head;
@@ -186,6 +188,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
 	for (fle = *head; fle; fle = fle->next) {
 		if (fle->family == family &&
 		    fle->dir == dir &&
+		    fle->sk_sid == sk_sid &&
 		    flow_key_compare(key, &fle->key) == 0) {
 			if (fle->genid == atomic_read(&flow_cache_genid)) {
 				void *ret = fle->object;
@@ -210,6 +213,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
 			*head = fle;
 			fle->family = family;
 			fle->dir = dir;
+			fle->sk_sid = sk_sid;
 			memcpy(&fle->key, key, sizeof(*key));
 			fle->object = NULL;
 			flow_count(cpu)++;
@@ -221,7 +225,7 @@ nocache:
 		void *obj;
 		atomic_t *obj_ref;
 
-		resolver(key, family, dir, &obj, &obj_ref);
+		resolver(key, sk_sid, family, dir, &obj, &obj_ref);
 
 		if (fle) {
 			fle->genid = atomic_read(&flow_cache_genid);
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 39031684b65c..d32f7791f1e4 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -336,6 +336,7 @@ static u8 sadb_ext_min_len[] = {
 	[SADB_X_EXT_NAT_T_SPORT]	= (u8) sizeof(struct sadb_x_nat_t_port),
 	[SADB_X_EXT_NAT_T_DPORT]	= (u8) sizeof(struct sadb_x_nat_t_port),
 	[SADB_X_EXT_NAT_T_OA]		= (u8) sizeof(struct sadb_address),
+	[SADB_X_EXT_SEC_CTX]		= (u8) sizeof(struct sadb_x_sec_ctx),
 };
 
 /* Verify sadb_address_{len,prefixlen} against sa_family.  */
@@ -383,6 +384,55 @@ static int verify_address_len(void *p)
 	return 0;
 }
 
+static inline int pfkey_sec_ctx_len(struct sadb_x_sec_ctx *sec_ctx)
+{
+	int len = 0;
+
+	len += sizeof(struct sadb_x_sec_ctx);
+	len += sec_ctx->sadb_x_ctx_len;
+	len += sizeof(uint64_t) - 1;
+	len /= sizeof(uint64_t);
+
+	return len;
+}
+
+static inline int verify_sec_ctx_len(void *p)
+{
+	struct sadb_x_sec_ctx *sec_ctx = (struct sadb_x_sec_ctx *)p;
+	int len;
+
+	if (sec_ctx->sadb_x_ctx_len > PAGE_SIZE)
+		return -EINVAL;
+
+	len = pfkey_sec_ctx_len(sec_ctx);
+
+	if (sec_ctx->sadb_x_sec_len != len)
+		return -EINVAL;
+
+	return 0;
+}
+
+static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(struct sadb_x_sec_ctx *sec_ctx)
+{
+	struct xfrm_user_sec_ctx *uctx = NULL;
+	int ctx_size = sec_ctx->sadb_x_ctx_len;
+
+	uctx = kmalloc((sizeof(*uctx)+ctx_size), GFP_KERNEL);
+
+	if (!uctx)
+		return NULL;
+
+	uctx->len = pfkey_sec_ctx_len(sec_ctx);
+	uctx->exttype = sec_ctx->sadb_x_sec_exttype;
+	uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi;
+	uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg;
+	uctx->ctx_len = sec_ctx->sadb_x_ctx_len;
+	memcpy(uctx + 1, sec_ctx + 1,
+	       uctx->ctx_len);
+
+	return uctx;
+}
+
 static int present_and_same_family(struct sadb_address *src,
 				   struct sadb_address *dst)
 {
@@ -438,6 +488,10 @@ static int parse_exthdrs(struct sk_buff *skb, struct sadb_msg *hdr, void **ext_h
 				if (verify_address_len(p))
 					return -EINVAL;
 			}				
+			if (ext_type == SADB_X_EXT_SEC_CTX) {
+				if (verify_sec_ctx_len(p))
+					return -EINVAL;
+			}
 			ext_hdrs[ext_type-1] = p;
 		}
 		p   += ext_len;
@@ -586,6 +640,9 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
 	struct sadb_key *key;
 	struct sadb_x_sa2 *sa2;
 	struct sockaddr_in *sin;
+	struct sadb_x_sec_ctx *sec_ctx;
+	struct xfrm_sec_ctx *xfrm_ctx;
+	int ctx_size = 0;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	struct sockaddr_in6 *sin6;
 #endif
@@ -609,6 +666,12 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
 			sizeof(struct sadb_address)*2 + 
 				sockaddr_size*2 +
 					sizeof(struct sadb_x_sa2);
+
+	if ((xfrm_ctx = x->security)) {
+		ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len);
+		size += sizeof(struct sadb_x_sec_ctx) + ctx_size;
+	}
+
 	/* identity & sensitivity */
 
 	if ((x->props.family == AF_INET &&
@@ -899,6 +962,20 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
 		n_port->sadb_x_nat_t_port_reserved = 0;
 	}
 
+	/* security context */
+	if (xfrm_ctx) {
+		sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb,
+				sizeof(struct sadb_x_sec_ctx) + ctx_size);
+		sec_ctx->sadb_x_sec_len =
+		  (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t);
+		sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
+		sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
+		sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
+		sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
+		memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
+		       xfrm_ctx->ctx_len);
+	}
+
 	return skb;
 }
 
@@ -909,6 +986,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
 	struct sadb_lifetime *lifetime;
 	struct sadb_sa *sa;
 	struct sadb_key *key;
+	struct sadb_x_sec_ctx *sec_ctx;
 	uint16_t proto;
 	int err;
 	
@@ -993,6 +1071,21 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
 		x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime;
 		x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime;
 	}
+
+	sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
+	if (sec_ctx != NULL) {
+		struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+
+		if (!uctx)
+			goto out;
+
+		err = security_xfrm_state_alloc(x, uctx);
+		kfree(uctx);
+
+		if (err)
+			goto out;
+	}
+
 	key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1];
 	if (sa->sadb_sa_auth) {
 		int keysize = 0;
@@ -1720,6 +1813,18 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
 	return 0;
 }
 
+static inline int pfkey_xfrm_policy2sec_ctx_size(struct xfrm_policy *xp)
+{
+  struct xfrm_sec_ctx *xfrm_ctx = xp->security;
+
+	if (xfrm_ctx) {
+		int len = sizeof(struct sadb_x_sec_ctx);
+		len += xfrm_ctx->ctx_len;
+		return PFKEY_ALIGN8(len);
+	}
+	return 0;
+}
+
 static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp)
 {
 	int sockaddr_size = pfkey_sockaddr_size(xp->family);
@@ -1733,7 +1838,8 @@ static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp)
 		(sockaddr_size * 2) +
 		sizeof(struct sadb_x_policy) +
 		(xp->xfrm_nr * (sizeof(struct sadb_x_ipsecrequest) +
-				(socklen * 2)));
+				(socklen * 2))) +
+		pfkey_xfrm_policy2sec_ctx_size(xp);
 }
 
 static struct sk_buff * pfkey_xfrm_policy2msg_prep(struct xfrm_policy *xp)
@@ -1757,6 +1863,8 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
 	struct sadb_lifetime *lifetime;
 	struct sadb_x_policy *pol;
 	struct sockaddr_in   *sin;
+	struct sadb_x_sec_ctx *sec_ctx;
+	struct xfrm_sec_ctx *xfrm_ctx;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	struct sockaddr_in6  *sin6;
 #endif
@@ -1941,6 +2049,21 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
 			}
 		}
 	}
+
+	/* security context */
+	if ((xfrm_ctx = xp->security)) {
+		int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp);
+
+		sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, ctx_size);
+		sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t);
+		sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
+		sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
+		sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
+		sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
+		memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
+		       xfrm_ctx->ctx_len);
+	}
+
 	hdr->sadb_msg_len = size / sizeof(uint64_t);
 	hdr->sadb_msg_reserved = atomic_read(&xp->refcnt);
 }
@@ -1976,12 +2099,13 @@ out:
 
 static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
 {
-	int err;
+	int err = 0;
 	struct sadb_lifetime *lifetime;
 	struct sadb_address *sa;
 	struct sadb_x_policy *pol;
 	struct xfrm_policy *xp;
 	struct km_event c;
+	struct sadb_x_sec_ctx *sec_ctx;
 
 	if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
 				     ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -2028,6 +2152,22 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 	if (xp->selector.dport)
 		xp->selector.dport_mask = ~0;
 
+	sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
+	if (sec_ctx != NULL) {
+		struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+
+		if (!uctx) {
+			err = -ENOBUFS;
+			goto out;
+		}
+
+		err = security_xfrm_policy_alloc(xp, uctx);
+		kfree(uctx);
+
+		if (err)
+			goto out;
+	}
+
 	xp->lft.soft_byte_limit = XFRM_INF;
 	xp->lft.hard_byte_limit = XFRM_INF;
 	xp->lft.soft_packet_limit = XFRM_INF;
@@ -2051,10 +2191,9 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 
 	err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp,
 				 hdr->sadb_msg_type != SADB_X_SPDUPDATE);
-	if (err) {
-		kfree(xp);
-		return err;
-	}
+
+	if (err)
+		goto out;
 
 	if (hdr->sadb_msg_type == SADB_X_SPDUPDATE)
 		c.event = XFRM_MSG_UPDPOLICY;
@@ -2069,6 +2208,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 	return 0;
 
 out:
+	security_xfrm_policy_free(xp);
 	kfree(xp);
 	return err;
 }
@@ -2078,9 +2218,10 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
 	int err;
 	struct sadb_address *sa;
 	struct sadb_x_policy *pol;
-	struct xfrm_policy *xp;
+	struct xfrm_policy *xp, tmp;
 	struct xfrm_selector sel;
 	struct km_event c;
+	struct sadb_x_sec_ctx *sec_ctx;
 
 	if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
 				     ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -2109,7 +2250,24 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
 	if (sel.dport)
 		sel.dport_mask = ~0;
 
-	xp = xfrm_policy_bysel(pol->sadb_x_policy_dir-1, &sel, 1);
+	sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
+	memset(&tmp, 0, sizeof(struct xfrm_policy));
+
+	if (sec_ctx != NULL) {
+		struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+
+		if (!uctx)
+			return -ENOMEM;
+
+		err = security_xfrm_policy_alloc(&tmp, uctx);
+		kfree(uctx);
+
+		if (err)
+			return err;
+	}
+
+	xp = xfrm_policy_bysel_ctx(pol->sadb_x_policy_dir-1, &sel, tmp.security, 1);
+	security_xfrm_policy_free(&tmp);
 	if (xp == NULL)
 		return -ENOENT;
 
@@ -2660,6 +2818,7 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt,
 {
 	struct xfrm_policy *xp;
 	struct sadb_x_policy *pol = (struct sadb_x_policy*)data;
+	struct sadb_x_sec_ctx *sec_ctx;
 
 	switch (family) {
 	case AF_INET:
@@ -2709,10 +2868,32 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt,
 	    (*dir = parse_ipsecrequests(xp, pol)) < 0)
 		goto out;
 
+	/* security context too */
+	if (len >= (pol->sadb_x_policy_len*8 +
+	    sizeof(struct sadb_x_sec_ctx))) {
+		char *p = (char *)pol;
+		struct xfrm_user_sec_ctx *uctx;
+
+		p += pol->sadb_x_policy_len*8;
+		sec_ctx = (struct sadb_x_sec_ctx *)p;
+		if (len < pol->sadb_x_policy_len*8 +
+		    sec_ctx->sadb_x_sec_len)
+			goto out;
+		if ((*dir = verify_sec_ctx_len(p)))
+			goto out;
+		uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+		*dir = security_xfrm_policy_alloc(xp, uctx);
+		kfree(uctx);
+
+		if (*dir)
+			goto out;
+	}
+
 	*dir = pol->sadb_x_policy_dir-1;
 	return xp;
 
 out:
+	security_xfrm_policy_free(xp);
 	kfree(xp);
 	return NULL;
 }
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d19e274b9c4a..64a447375fdb 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -10,7 +10,7 @@
  * 	YOSHIFUJI Hideaki
  * 		Split up af-specific portion
  *	Derek Atkins <derek@ihtfp.com>		Add the post_input processor
- * 	
+ *
  */
 
 #include <asm/bug.h>
@@ -256,6 +256,7 @@ void __xfrm_policy_destroy(struct xfrm_policy *policy)
 	if (del_timer(&policy->timer))
 		BUG();
 
+	security_xfrm_policy_free(policy);
 	kfree(policy);
 }
 EXPORT_SYMBOL(__xfrm_policy_destroy);
@@ -350,7 +351,8 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 
 	write_lock_bh(&xfrm_policy_lock);
 	for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) {
-		if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0) {
+		if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0 &&
+		    xfrm_sec_ctx_match(pol->security, policy->security)) {
 			if (excl) {
 				write_unlock_bh(&xfrm_policy_lock);
 				return -EEXIST;
@@ -416,14 +418,15 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 }
 EXPORT_SYMBOL(xfrm_policy_insert);
 
-struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
-				      int delete)
+struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel,
+					  struct xfrm_sec_ctx *ctx, int delete)
 {
 	struct xfrm_policy *pol, **p;
 
 	write_lock_bh(&xfrm_policy_lock);
 	for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
-		if (memcmp(sel, &pol->selector, sizeof(*sel)) == 0) {
+		if ((memcmp(sel, &pol->selector, sizeof(*sel)) == 0) &&
+		    (xfrm_sec_ctx_match(ctx, pol->security))) {
 			xfrm_pol_hold(pol);
 			if (delete)
 				*p = pol->next;
@@ -438,7 +441,7 @@ struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
 	}
 	return pol;
 }
-EXPORT_SYMBOL(xfrm_policy_bysel);
+EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
 
 struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete)
 {
@@ -519,7 +522,7 @@ EXPORT_SYMBOL(xfrm_policy_walk);
 
 /* Find policy to apply to this flow. */
 
-static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
+static void xfrm_policy_lookup(struct flowi *fl, u32 sk_sid, u16 family, u8 dir,
 			       void **objp, atomic_t **obj_refp)
 {
 	struct xfrm_policy *pol;
@@ -533,9 +536,12 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
 			continue;
 
 		match = xfrm_selector_match(sel, fl, family);
+
 		if (match) {
-			xfrm_pol_hold(pol);
-			break;
+ 			if (!security_xfrm_policy_lookup(pol, sk_sid, dir)) {
+				xfrm_pol_hold(pol);
+				break;
+			}
 		}
 	}
 	read_unlock_bh(&xfrm_policy_lock);
@@ -543,15 +549,37 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
 		*obj_refp = &pol->refcnt;
 }
 
-static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl)
+static inline int policy_to_flow_dir(int dir)
+{
+	if (XFRM_POLICY_IN == FLOW_DIR_IN &&
+ 	    XFRM_POLICY_OUT == FLOW_DIR_OUT &&
+ 	    XFRM_POLICY_FWD == FLOW_DIR_FWD)
+ 		return dir;
+ 	switch (dir) {
+ 	default:
+ 	case XFRM_POLICY_IN:
+ 		return FLOW_DIR_IN;
+ 	case XFRM_POLICY_OUT:
+ 		return FLOW_DIR_OUT;
+ 	case XFRM_POLICY_FWD:
+ 		return FLOW_DIR_FWD;
+	};
+}
+
+static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl, u32 sk_sid)
 {
 	struct xfrm_policy *pol;
 
 	read_lock_bh(&xfrm_policy_lock);
 	if ((pol = sk->sk_policy[dir]) != NULL) {
-		int match = xfrm_selector_match(&pol->selector, fl,
+ 		int match = xfrm_selector_match(&pol->selector, fl,
 						sk->sk_family);
+ 		int err = 0;
+
 		if (match)
+		  err = security_xfrm_policy_lookup(pol, sk_sid, policy_to_flow_dir(dir));
+
+ 		if (match && !err)
 			xfrm_pol_hold(pol);
 		else
 			pol = NULL;
@@ -624,6 +652,10 @@ static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir)
 
 	if (newp) {
 		newp->selector = old->selector;
+		if (security_xfrm_policy_clone(old, newp)) {
+			kfree(newp);
+			return NULL;  /* ENOMEM */
+		}
 		newp->lft = old->lft;
 		newp->curlft = old->curlft;
 		newp->action = old->action;
@@ -735,22 +767,6 @@ xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
 	return err;
 }
 
-static inline int policy_to_flow_dir(int dir)
-{
-	if (XFRM_POLICY_IN == FLOW_DIR_IN &&
-	    XFRM_POLICY_OUT == FLOW_DIR_OUT &&
-	    XFRM_POLICY_FWD == FLOW_DIR_FWD)
-		return dir;
-	switch (dir) {
-	default:
-	case XFRM_POLICY_IN:
-		return FLOW_DIR_IN;
-	case XFRM_POLICY_OUT:
-		return FLOW_DIR_OUT;
-	case XFRM_POLICY_FWD:
-		return FLOW_DIR_FWD;
-	};
-}
 
 static int stale_bundle(struct dst_entry *dst);
 
@@ -769,19 +785,20 @@ int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
 	int err;
 	u32 genid;
 	u16 family = dst_orig->ops->family;
+	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
+	u32 sk_sid = security_sk_sid(sk, fl, dir);
 restart:
 	genid = atomic_read(&flow_cache_genid);
 	policy = NULL;
 	if (sk && sk->sk_policy[1])
-		policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
+		policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, sk_sid);
 
 	if (!policy) {
 		/* To accelerate a bit...  */
 		if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT])
 			return 0;
 
-		policy = flow_cache_lookup(fl, family,
-					   policy_to_flow_dir(XFRM_POLICY_OUT),
+		policy = flow_cache_lookup(fl, sk_sid, family, dir,
 					   xfrm_policy_lookup);
 	}
 
@@ -962,16 +979,20 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 {
 	struct xfrm_policy *pol;
 	struct flowi fl;
+	u8 fl_dir = policy_to_flow_dir(dir);
+	u32 sk_sid;
 
 	if (_decode_session(skb, &fl, family) < 0)
 		return 0;
 
+	sk_sid = security_sk_sid(sk, &fl, fl_dir);
+
 	/* First, check used SA against their selectors. */
 	if (skb->sp) {
 		int i;
 
 		for (i=skb->sp->len-1; i>=0; i--) {
-		  struct sec_decap_state *xvec = &(skb->sp->x[i]);
+			struct sec_decap_state *xvec = &(skb->sp->x[i]);
 			if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family))
 				return 0;
 
@@ -986,11 +1007,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 
 	pol = NULL;
 	if (sk && sk->sk_policy[dir])
-		pol = xfrm_sk_policy_lookup(sk, dir, &fl);
+		pol = xfrm_sk_policy_lookup(sk, dir, &fl, sk_sid);
 
 	if (!pol)
-		pol = flow_cache_lookup(&fl, family,
-					policy_to_flow_dir(dir),
+		pol = flow_cache_lookup(&fl, sk_sid, family, fl_dir,
 					xfrm_policy_lookup);
 
 	if (!pol)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 479effc97666..e12d0be5f976 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -10,7 +10,7 @@
  * 		Split up af-specific functions
  *	Derek Atkins <derek@ihtfp.com>
  *		Add UDP Encapsulation
- * 	
+ *
  */
 
 #include <linux/workqueue.h>
@@ -70,6 +70,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
 		x->type->destructor(x);
 		xfrm_put_type(x->type);
 	}
+	security_xfrm_state_free(x);
 	kfree(x);
 }
 
@@ -343,7 +344,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
 			      selector.
 			 */
 			if (x->km.state == XFRM_STATE_VALID) {
-				if (!xfrm_selector_match(&x->sel, fl, family))
+				if (!xfrm_selector_match(&x->sel, fl, family) ||
+				    !xfrm_sec_ctx_match(pol->security, x->security))
 					continue;
 				if (!best ||
 				    best->km.dying > x->km.dying ||
@@ -354,7 +356,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
 				acquire_in_progress = 1;
 			} else if (x->km.state == XFRM_STATE_ERROR ||
 				   x->km.state == XFRM_STATE_EXPIRED) {
-				if (xfrm_selector_match(&x->sel, fl, family))
+ 				if (xfrm_selector_match(&x->sel, fl, family) &&
+				    xfrm_sec_ctx_match(pol->security, x->security))
 					error = -ESRCH;
 			}
 		}
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 0cdd9a07e043..92e2b804c606 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -7,7 +7,7 @@
  * 	Kazunori MIYAZAWA @USAGI
  * 	Kunihiro Ishiguro <kunihiro@ipinfusion.com>
  * 		IPv6 support
- * 	
+ *
  */
 
 #include <linux/module.h>
@@ -88,6 +88,34 @@ static int verify_encap_tmpl(struct rtattr **xfrma)
 	return 0;
 }
 
+
+static inline int verify_sec_ctx_len(struct rtattr **xfrma)
+{
+	struct rtattr *rt = xfrma[XFRMA_SEC_CTX - 1];
+	struct xfrm_user_sec_ctx *uctx;
+	int len = 0;
+
+	if (!rt)
+		return 0;
+
+	if (rt->rta_len < sizeof(*uctx))
+		return -EINVAL;
+
+	uctx = RTA_DATA(rt);
+
+	if (uctx->ctx_len > PAGE_SIZE)
+		return -EINVAL;
+
+	len += sizeof(struct xfrm_user_sec_ctx);
+	len += uctx->ctx_len;
+
+	if (uctx->len != len)
+		return -EINVAL;
+
+	return 0;
+}
+
+
 static int verify_newsa_info(struct xfrm_usersa_info *p,
 			     struct rtattr **xfrma)
 {
@@ -145,6 +173,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 		goto out;
 	if ((err = verify_encap_tmpl(xfrma)))
 		goto out;
+	if ((err = verify_sec_ctx_len(xfrma)))
+		goto out;
 
 	err = -EINVAL;
 	switch (p->mode) {
@@ -209,6 +239,30 @@ static int attach_encap_tmpl(struct xfrm_encap_tmpl **encapp, struct rtattr *u_a
 	return 0;
 }
 
+
+static inline int xfrm_user_sec_ctx_size(struct xfrm_policy *xp)
+{
+	struct xfrm_sec_ctx *xfrm_ctx = xp->security;
+	int len = 0;
+
+	if (xfrm_ctx) {
+		len += sizeof(struct xfrm_user_sec_ctx);
+		len += xfrm_ctx->ctx_len;
+	}
+	return len;
+}
+
+static int attach_sec_ctx(struct xfrm_state *x, struct rtattr *u_arg)
+{
+	struct xfrm_user_sec_ctx *uctx;
+
+	if (!u_arg)
+		return 0;
+
+	uctx = RTA_DATA(u_arg);
+	return security_xfrm_state_alloc(x, uctx);
+}
+
 static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
 {
 	memcpy(&x->id, &p->id, sizeof(x->id));
@@ -253,6 +307,9 @@ static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p,
 	if (err)
 		goto error;
 
+	if ((err = attach_sec_ctx(x, xfrma[XFRMA_SEC_CTX-1])))
+		goto error;
+
 	x->km.seq = p->seq;
 
 	return x;
@@ -272,11 +329,11 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 	int err;
 	struct km_event c;
 
-	err = verify_newsa_info(p, (struct rtattr **) xfrma);
+	err = verify_newsa_info(p, (struct rtattr **)xfrma);
 	if (err)
 		return err;
 
-	x = xfrm_state_construct(p, (struct rtattr **) xfrma, &err);
+	x = xfrm_state_construct(p, (struct rtattr **)xfrma, &err);
 	if (!x)
 		return err;
 
@@ -390,6 +447,19 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr)
 	if (x->encap)
 		RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap);
 
+	if (x->security) {
+		int ctx_size = sizeof(struct xfrm_sec_ctx) +
+				x->security->ctx_len;
+		struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size);
+		struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
+
+		uctx->exttype = XFRMA_SEC_CTX;
+		uctx->len = ctx_size;
+		uctx->ctx_doi = x->security->ctx_doi;
+		uctx->ctx_alg = x->security->ctx_alg;
+		uctx->ctx_len = x->security->ctx_len;
+		memcpy(uctx + 1, x->security->ctx_str, x->security->ctx_len);
+	}
 	nlh->nlmsg_len = skb->tail - b;
 out:
 	sp->this_idx++;
@@ -603,6 +673,18 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p)
 	return verify_policy_dir(p->dir);
 }
 
+static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct rtattr **xfrma)
+{
+	struct rtattr *rt = xfrma[XFRMA_SEC_CTX-1];
+	struct xfrm_user_sec_ctx *uctx;
+
+	if (!rt)
+		return 0;
+
+	uctx = RTA_DATA(rt);
+	return security_xfrm_policy_alloc(pol, uctx);
+}
+
 static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
 			   int nr)
 {
@@ -681,7 +763,10 @@ static struct xfrm_policy *xfrm_policy_construct(struct xfrm_userpolicy_info *p,
 	}
 
 	copy_from_user_policy(xp, p);
-	err = copy_from_user_tmpl(xp, xfrma);
+
+	if (!(err = copy_from_user_tmpl(xp, xfrma)))
+		err = copy_from_user_sec_ctx(xp, xfrma);
+
 	if (err) {
 		*errp = err;
 		kfree(xp);
@@ -700,10 +785,13 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 	int excl;
 
 	err = verify_newpolicy_info(p);
+	if (err)
+		return err;
+	err = verify_sec_ctx_len((struct rtattr **)xfrma);
 	if (err)
 		return err;
 
-	xp = xfrm_policy_construct(p, (struct rtattr **) xfrma, &err);
+	xp = xfrm_policy_construct(p, (struct rtattr **)xfrma, &err);
 	if (!xp)
 		return err;
 
@@ -761,6 +849,27 @@ rtattr_failure:
 	return -1;
 }
 
+static int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *skb)
+{
+	if (xp->security) {
+		int ctx_size = sizeof(struct xfrm_sec_ctx) +
+				xp->security->ctx_len;
+		struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size);
+		struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
+
+		uctx->exttype = XFRMA_SEC_CTX;
+		uctx->len = ctx_size;
+		uctx->ctx_doi = xp->security->ctx_doi;
+		uctx->ctx_alg = xp->security->ctx_alg;
+		uctx->ctx_len = xp->security->ctx_len;
+		memcpy(uctx + 1, xp->security->ctx_str, xp->security->ctx_len);
+	}
+	return 0;
+
+ rtattr_failure:
+	return -1;
+}
+
 static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr)
 {
 	struct xfrm_dump_info *sp = ptr;
@@ -782,6 +891,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr
 	copy_to_user_policy(xp, p, dir);
 	if (copy_to_user_tmpl(xp, skb) < 0)
 		goto nlmsg_failure;
+	if (copy_to_user_sec_ctx(xp, skb))
+		goto nlmsg_failure;
 
 	nlh->nlmsg_len = skb->tail - b;
 out:
@@ -852,8 +963,25 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 
 	if (p->index)
 		xp = xfrm_policy_byid(p->dir, p->index, delete);
-	else
-		xp = xfrm_policy_bysel(p->dir, &p->sel, delete);
+	else {
+		struct rtattr **rtattrs = (struct rtattr **)xfrma;
+		struct rtattr *rt = rtattrs[XFRMA_SEC_CTX-1];
+		struct xfrm_policy tmp;
+
+		err = verify_sec_ctx_len(rtattrs);
+		if (err)
+			return err;
+
+		memset(&tmp, 0, sizeof(struct xfrm_policy));
+		if (rt) {
+			struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
+
+			if ((err = security_xfrm_policy_alloc(&tmp, uctx)))
+				return err;
+		}
+		xp = xfrm_policy_bysel_ctx(p->dir, &p->sel, tmp.security, delete);
+		security_xfrm_policy_free(&tmp);
+	}
 	if (xp == NULL)
 		return -ENOENT;
 
@@ -1224,6 +1352,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
 
 	if (copy_to_user_tmpl(xp, skb) < 0)
 		goto nlmsg_failure;
+	if (copy_to_user_sec_ctx(xp, skb))
+		goto nlmsg_failure;
 
 	nlh->nlmsg_len = skb->tail - b;
 	return skb->len;
@@ -1241,6 +1371,7 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
 
 	len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
 	len += NLMSG_SPACE(sizeof(struct xfrm_user_acquire));
+	len += RTA_SPACE(xfrm_user_sec_ctx_size(xp));
 	skb = alloc_skb(len, GFP_ATOMIC);
 	if (skb == NULL)
 		return -ENOMEM;
@@ -1324,6 +1455,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
 	copy_to_user_policy(xp, &upe->pol, dir);
 	if (copy_to_user_tmpl(xp, skb) < 0)
 		goto nlmsg_failure;
+	if (copy_to_user_sec_ctx(xp, skb))
+		goto nlmsg_failure;
 	upe->hard = !!hard;
 
 	nlh->nlmsg_len = skb->tail - b;
@@ -1341,6 +1474,7 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve
 
 	len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
 	len += NLMSG_SPACE(sizeof(struct xfrm_user_polexpire));
+	len += RTA_SPACE(xfrm_user_sec_ctx_size(xp));
 	skb = alloc_skb(len, GFP_ATOMIC);
 	if (skb == NULL)
 		return -ENOMEM;
diff --git a/security/Kconfig b/security/Kconfig
index 64d3f1e9ca85..34f593410d57 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -54,6 +54,19 @@ config SECURITY_NETWORK
 	  implement socket and networking access controls.
 	  If you are unsure how to answer this question, answer N.
 
+config SECURITY_NETWORK_XFRM
+	bool "XFRM (IPSec) Networking Security Hooks"
+	depends on XFRM && SECURITY_NETWORK
+	help
+	  This enables the XFRM (IPSec) networking security hooks.
+	  If enabled, a security module can use these hooks to
+	  implement per-packet access controls based on labels
+	  derived from IPSec policy.  Non-IPSec communications are
+	  designated as unlabelled, and only sockets authorized
+	  to communicate unlabelled data can send without using
+	  IPSec.
+	  If you are unsure how to answer this question, answer N.
+
 config SECURITY_CAPABILITIES
 	tristate "Default Linux Capabilities"
 	depends on SECURITY
diff --git a/security/dummy.c b/security/dummy.c
index 3ca5f2b828a0..a15c54709fde 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -776,8 +776,42 @@ static inline int dummy_sk_alloc_security (struct sock *sk, int family, gfp_t pr
 static inline void dummy_sk_free_security (struct sock *sk)
 {
 }
+
+static unsigned int dummy_sk_getsid(struct sock *sk, struct flowi *fl, u8 dir)
+{
+	return 0;
+}
 #endif	/* CONFIG_SECURITY_NETWORK */
 
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+static int dummy_xfrm_policy_alloc_security(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx)
+{
+	return 0;
+}
+
+static inline int dummy_xfrm_policy_clone_security(struct xfrm_policy *old, struct xfrm_policy *new)
+{
+	return 0;
+}
+
+static void dummy_xfrm_policy_free_security(struct xfrm_policy *xp)
+{
+}
+
+static int dummy_xfrm_state_alloc_security(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
+{
+	return 0;
+}
+
+static void dummy_xfrm_state_free_security(struct xfrm_state *x)
+{
+}
+
+static int dummy_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
+{
+	return 0;
+}
+#endif /* CONFIG_SECURITY_NETWORK_XFRM */
 static int dummy_register_security (const char *name, struct security_operations *ops)
 {
 	return -EINVAL;
@@ -970,7 +1004,16 @@ void security_fixup_ops (struct security_operations *ops)
 	set_to_dummy_if_null(ops, socket_getpeersec);
 	set_to_dummy_if_null(ops, sk_alloc_security);
 	set_to_dummy_if_null(ops, sk_free_security);
-#endif	/* CONFIG_SECURITY_NETWORK */
+	set_to_dummy_if_null(ops, sk_getsid);
+ #endif	/* CONFIG_SECURITY_NETWORK */
+#ifdef  CONFIG_SECURITY_NETWORK_XFRM
+	set_to_dummy_if_null(ops, xfrm_policy_alloc_security);
+	set_to_dummy_if_null(ops, xfrm_policy_clone_security);
+	set_to_dummy_if_null(ops, xfrm_policy_free_security);
+	set_to_dummy_if_null(ops, xfrm_state_alloc_security);
+	set_to_dummy_if_null(ops, xfrm_state_free_security);
+	set_to_dummy_if_null(ops, xfrm_policy_lookup);
+#endif	/* CONFIG_SECURITY_NETWORK_XFRM */
 #ifdef CONFIG_KEYS
 	set_to_dummy_if_null(ops, key_alloc);
 	set_to_dummy_if_null(ops, key_free);
-- 
cgit v1.2.3-71-gd317


From 89cee8b1cbb9dac40c92ef1968aea2b45f82fd18 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 13 Dec 2005 23:14:27 -0800
Subject: [IPV4]: Safer reassembly

Another spin of Herbert Xu's "safer ip reassembly" patch
for 2.6.16.

(The original patch is here:
http://marc.theaimsgroup.com/?l=linux-netdev&m=112281936522415&w=2
and my only contribution is to have tested it.)

This patch (optionally) does additional checks before accepting IP
fragments, which can greatly reduce the possibility of reassembling
fragments which originated from different IP datagrams.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Arthur Kepner <akepner@sgi.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 23 ++++++++++++
 include/linux/sysctl.h                 |  1 +
 include/net/inetpeer.h                 |  1 +
 include/net/ip.h                       |  2 +
 net/ipv4/inetpeer.c                    |  1 +
 net/ipv4/ip_fragment.c                 | 68 +++++++++++++++++++++++++++++++++-
 net/ipv4/ip_output.c                   |  1 +
 net/ipv4/sysctl_net_ipv4.c             | 10 +++++
 8 files changed, 106 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index ebc09a159f62..2b7cf19a06ad 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -46,6 +46,29 @@ ipfrag_secret_interval - INTEGER
 	for the hash secret) for IP fragments.
 	Default: 600
 
+ipfrag_max_dist - INTEGER
+	ipfrag_max_dist is a non-negative integer value which defines the 
+	maximum "disorder" which is allowed among fragments which share a 
+	common IP source address. Note that reordering of packets is 
+	not unusual, but if a large number of fragments arrive from a source 
+	IP address while a particular fragment queue remains incomplete, it 
+	probably indicates that one or more fragments belonging to that queue 
+	have been lost. When ipfrag_max_dist is positive, an additional check 
+	is done on fragments before they are added to a reassembly queue - if 
+	ipfrag_max_dist (or more) fragments have arrived from a particular IP 
+	address between additions to any IP fragment queue using that source 
+	address, it's presumed that one or more fragments in the queue are 
+	lost. The existing fragment queue will be dropped, and a new one 
+	started. An ipfrag_max_dist value of zero disables this check.
+
+	Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can
+	result in unnecessarily dropping fragment queues when normal
+	reordering of packets occurs, which could lead to poor application 
+	performance. Using a very large value, e.g. 50000, increases the 
+	likelihood of incorrectly reassembling IP fragments that originate 
+	from different IP datagrams, which could result in data corruption.
+	Default: 64
+
 INET peer storage:
 
 inet_peer_threshold - INTEGER
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 4be34ef8c2f7..93fa765e47d3 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -390,6 +390,7 @@ enum
 	NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
 	NET_TCP_CONG_CONTROL=110,
 	NET_TCP_ABC=111,
+	NET_IPV4_IPFRAG_MAX_DIST=112,
 };
 
 enum {
diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 7fda471002b6..0965515f40cf 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -25,6 +25,7 @@ struct inet_peer
 	__u32			v4daddr;	/* peer's address */
 	__u16			avl_height;
 	__u16			ip_id_count;	/* IP ID for the next packet */
+	atomic_t		rid;		/* Frag reception counter */
 	__u32			tcp_ts;
 	unsigned long		tcp_ts_stamp;
 };
diff --git a/include/net/ip.h b/include/net/ip.h
index e4563bbee6ea..4d6294ba038e 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -45,6 +45,7 @@ struct inet_skb_parm
 #define IPSKB_TRANSLATED	2
 #define IPSKB_FORWARDED		4
 #define IPSKB_XFRM_TUNNEL_SIZE	8
+#define IPSKB_FRAG_COMPLETE	16
 };
 
 struct ipcm_cookie
@@ -168,6 +169,7 @@ extern int sysctl_ipfrag_high_thresh;
 extern int sysctl_ipfrag_low_thresh;
 extern int sysctl_ipfrag_time;
 extern int sysctl_ipfrag_secret_interval;
+extern int sysctl_ipfrag_max_dist;
 
 /* From inetpeer.c */
 extern int inet_peer_threshold;
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 2fc3fd38924f..ce5fe3f74a3d 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -401,6 +401,7 @@ struct inet_peer *inet_getpeer(__u32 daddr, int create)
 		return NULL;
 	n->v4daddr = daddr;
 	atomic_set(&n->refcnt, 1);
+	atomic_set(&n->rid, 0);
 	n->ip_id_count = secure_ip_id(daddr);
 	n->tcp_ts_stamp = 0;
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8ce0ce2ee48e..ce2b70ce4018 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -22,6 +22,7 @@
  *		Patrick McHardy :	LRU queue of frag heads for evictor.
  */
 
+#include <linux/compiler.h>
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/types.h>
@@ -38,6 +39,7 @@
 #include <net/ip.h>
 #include <net/icmp.h>
 #include <net/checksum.h>
+#include <net/inetpeer.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/inet.h>
@@ -56,6 +58,8 @@
 int sysctl_ipfrag_high_thresh = 256*1024;
 int sysctl_ipfrag_low_thresh = 192*1024;
 
+int sysctl_ipfrag_max_dist = 64;
+
 /* Important NOTE! Fragment queue must be destroyed before MSL expires.
  * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL.
  */
@@ -89,8 +93,10 @@ struct ipq {
 	spinlock_t	lock;
 	atomic_t	refcnt;
 	struct timer_list timer;	/* when will this queue expire?		*/
-	int		iif;
 	struct timeval	stamp;
+	int             iif;
+	unsigned int    rid;
+	struct inet_peer *peer;
 };
 
 /* Hash table. */
@@ -195,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work)
 	BUG_TRAP(qp->last_in&COMPLETE);
 	BUG_TRAP(del_timer(&qp->timer) == 0);
 
+	if (qp->peer)
+		inet_putpeer(qp->peer);
+
 	/* Release all fragment data. */
 	fp = qp->fragments;
 	while (fp) {
@@ -353,6 +362,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
 	qp->meat = 0;
 	qp->fragments = NULL;
 	qp->iif = 0;
+	qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL;
 
 	/* Initialize a timer for this entry. */
 	init_timer(&qp->timer);
@@ -398,6 +408,56 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
 	return ip_frag_create(hash, iph, user);
 }
 
+/* Is the fragment too far ahead to be part of ipq? */
+static inline int ip_frag_too_far(struct ipq *qp)
+{
+	struct inet_peer *peer = qp->peer;
+	unsigned int max = sysctl_ipfrag_max_dist;
+	unsigned int start, end;
+
+	int rc;
+
+	if (!peer || !max)
+		return 0;
+
+	start = qp->rid;
+	end = atomic_inc_return(&peer->rid);
+	qp->rid = end;
+
+	rc = qp->fragments && (end - start) > max;
+
+	if (rc) {
+		IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+	}
+
+	return rc;
+}
+
+static int ip_frag_reinit(struct ipq *qp)
+{
+	struct sk_buff *fp;
+
+	if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) {
+		atomic_inc(&qp->refcnt);
+		return -ETIMEDOUT;
+	}
+
+	fp = qp->fragments;
+	do {
+		struct sk_buff *xp = fp->next;
+		frag_kfree_skb(fp, NULL);
+		fp = xp;
+	} while (fp);
+
+	qp->last_in = 0;
+	qp->len = 0;
+	qp->meat = 0;
+	qp->fragments = NULL;
+	qp->iif = 0;
+
+	return 0;
+}
+
 /* Add new segment to existing queue. */
 static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
@@ -408,6 +468,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	if (qp->last_in & COMPLETE)
 		goto err;
 
+	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
+	    unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) {
+		ipq_kill(qp);
+		goto err;
+	}
+
  	offset = ntohs(skb->nh.iph->frag_off);
 	flags = offset & ~IP_OFFSET;
 	offset &= IP_OFFSET;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index eba64e2bd397..2a830de3a699 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -445,6 +445,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 
 	hlen = iph->ihl * 4;
 	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
+	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 
 	/* When frag_list is given, use it. First, check its validity:
 	 * some transformers could create wrong frag_list or break existing
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 01444a02b48b..dbf82955aabe 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -22,6 +22,7 @@
 extern int sysctl_ip_nonlocal_bind;
 
 #ifdef CONFIG_SYSCTL
+static int zero;
 static int tcp_retr1_max = 255; 
 static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -613,6 +614,15 @@ ctl_table ipv4_table[] = {
 		.proc_handler	= &proc_dointvec_jiffies,
 		.strategy	= &sysctl_jiffies
 	},
+	{
+		.ctl_name	= NET_IPV4_IPFRAG_MAX_DIST,
+		.procname	= "ipfrag_max_dist",
+		.data		= &sysctl_ipfrag_max_dist,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero
+	},
 	{
 		.ctl_name	= NET_TCP_NO_METRICS_SAVE,
 		.procname	= "tcp_no_metrics_save",
-- 
cgit v1.2.3-71-gd317


From ca304b6104ffdd120bb6687a88a0625e58bc71cd Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:15:40 -0800
Subject: [IPV6]: Introduce inet6_rsk()

And inet6_rsk_offset in inet_request_sock, for the same reasons as
inet_sock's pinfo6 member.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ip.h               |  4 ++++
 include/linux/ipv6.h             | 39 +++++++++++++++++++++++++++++++++------
 net/ipv4/inet_diag.c             |  8 ++++----
 net/ipv6/inet6_connection_sock.c |  4 ++--
 net/ipv6/tcp_ipv6.c              | 19 +++++++++----------
 5 files changed, 52 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ip.h b/include/linux/ip.h
index 33e8a19a1a0f..5a560daeade5 100644
--- a/include/linux/ip.h
+++ b/include/linux/ip.h
@@ -110,6 +110,10 @@ struct ip_options {
 
 struct inet_request_sock {
 	struct request_sock	req;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	u16			inet6_rsk_offset;
+	/* 2 bytes hole, try to pack */
+#endif
 	u32			loc_addr;
 	u32			rmt_addr;
 	u16			rmt_port;
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index e0b922785d98..7d3e86d9576e 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -199,18 +199,17 @@ static inline int inet6_iif(const struct sk_buff *skb)
 	return IP6CB(skb)->iif;
 }
 
-struct tcp6_request_sock {
-	struct tcp_request_sock	req;
+struct inet6_request_sock {
 	struct in6_addr		loc_addr;
 	struct in6_addr		rmt_addr;
 	struct sk_buff		*pktopts;
 	int			iif;
 };
 
-static inline struct tcp6_request_sock *tcp6_rsk(const struct request_sock *sk)
-{
-	return (struct tcp6_request_sock *)sk;
-}
+struct tcp6_request_sock {
+	struct tcp_request_sock	  tcp6rsk_tcp;
+	struct inet6_request_sock tcp6rsk_inet6;
+};
 
 /**
  * struct ipv6_pinfo - ipv6 private area
@@ -304,6 +303,28 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
 	return inet_sk(__sk)->pinet6;
 }
 
+static inline struct inet6_request_sock *
+			inet6_rsk(const struct request_sock *rsk)
+{
+	return (struct inet6_request_sock *)(((u8 *)rsk) +
+					     inet_rsk(rsk)->inet6_rsk_offset);
+}
+
+static inline u32 inet6_rsk_offset(struct request_sock *rsk)
+{
+	return rsk->rsk_ops->obj_size - sizeof(struct inet6_request_sock);
+}
+
+static inline struct request_sock *inet6_reqsk_alloc(struct request_sock_ops *ops)
+{
+	struct request_sock *req = reqsk_alloc(ops);
+
+	if (req != NULL)
+		inet_rsk(req)->inet6_rsk_offset = inet6_rsk_offset(req);
+
+	return req;
+}
+
 static inline struct raw6_sock *raw6_sk(const struct sock *sk)
 {
 	return (struct raw6_sock *)sk;
@@ -361,6 +382,12 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
 	return NULL;
 }
 
+static inline struct inet6_request_sock *
+			inet6_rsk(const struct request_sock *rsk)
+{
+	return NULL;
+}
+
 static inline struct raw6_sock *raw6_sk(const struct sock *sk)
 {
 	return NULL;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 39061ed53cfd..3ce73b141d7e 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -489,9 +489,9 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 	if (r->idiag_family == AF_INET6) {
 		ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
-			       &tcp6_rsk(req)->loc_addr);
+			       &inet6_rsk(req)->loc_addr);
 		ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
-			       &tcp6_rsk(req)->rmt_addr);
+			       &inet6_rsk(req)->rmt_addr);
 	}
 #endif
 	nlh->nlmsg_len = skb->tail - b;
@@ -553,13 +553,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 				entry.saddr =
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 					(entry.family == AF_INET6) ?
-					tcp6_rsk(req)->loc_addr.s6_addr32 :
+					inet6_rsk(req)->loc_addr.s6_addr32 :
 #endif
 					&ireq->loc_addr;
 				entry.daddr = 
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 					(entry.family == AF_INET6) ?
-					tcp6_rsk(req)->rmt_addr.s6_addr32 :
+					inet6_rsk(req)->rmt_addr.s6_addr32 :
 #endif
 					&ireq->rmt_addr;
 				entry.dport = ntohs(ireq->rmt_port);
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 04ff44344f90..fe874eeaa40c 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -61,7 +61,7 @@ struct request_sock *inet6_csk_search_req(const struct sock *sk,
 						     lopt->nr_table_entries)];
 	     (req = *prev) != NULL;
 	     prev = &req->dl_next) {
-		const struct tcp6_request_sock *treq = tcp6_rsk(req);
+		const struct inet6_request_sock *treq = inet6_rsk(req);
 
 		if (inet_rsk(req)->rmt_port == rport &&
 		    req->rsk_ops->family == AF_INET6 &&
@@ -85,7 +85,7 @@ void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
-	const u32 h = inet6_synq_hash(&tcp6_rsk(req)->rmt_addr,
+	const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr,
 				      inet_rsk(req)->rmt_port,
 				      lopt->hash_rnd, lopt->nr_table_entries);
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5a10d30cec4a..c2472d771664 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -656,7 +656,7 @@ out:
 static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 			      struct dst_entry *dst)
 {
-	struct tcp6_request_sock *treq = tcp6_rsk(req);
+	struct inet6_request_sock *treq = inet6_rsk(req);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sk_buff * skb;
 	struct ipv6_txoptions *opt = NULL;
@@ -722,8 +722,8 @@ done:
 
 static void tcp_v6_reqsk_destructor(struct request_sock *req)
 {
-	if (tcp6_rsk(req)->pktopts)
-		kfree_skb(tcp6_rsk(req)->pktopts);
+	if (inet6_rsk(req)->pktopts)
+		kfree_skb(inet6_rsk(req)->pktopts);
 }
 
 static struct request_sock_ops tcp6_request_sock_ops = {
@@ -956,7 +956,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
  */
 static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 {
-	struct tcp6_request_sock *treq;
+	struct inet6_request_sock *treq;
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct tcp_options_received tmp_opt;
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -981,7 +981,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
 		goto drop;
 
-	req = reqsk_alloc(&tcp6_request_sock_ops);
+	req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
 	if (req == NULL)
 		goto drop;
 
@@ -994,7 +994,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 	tcp_openreq_init(req, &tmp_opt, skb);
 
-	treq = tcp6_rsk(req);
+	treq = inet6_rsk(req);
 	ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr);
 	ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr);
 	TCP_ECN_create_request(req, skb->h.th);
@@ -1035,7 +1035,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 					  struct request_sock *req,
 					  struct dst_entry *dst)
 {
-	struct tcp6_request_sock *treq = tcp6_rsk(req);
+	struct inet6_request_sock *treq = inet6_rsk(req);
 	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
 	struct tcp6_sock *newtcp6sk;
 	struct inet_sock *newinet;
@@ -1723,14 +1723,13 @@ static int tcp_v6_destroy_sock(struct sock *sk)
 static void get_openreq6(struct seq_file *seq, 
 			 struct sock *sk, struct request_sock *req, int i, int uid)
 {
-	struct in6_addr *dest, *src;
 	int ttd = req->expires - jiffies;
+	struct in6_addr *src = &inet6_rsk(req)->loc_addr;
+	struct in6_addr *dest = &inet6_rsk(req)->rmt_addr;
 
 	if (ttd < 0)
 		ttd = 0;
 
-	src = &tcp6_rsk(req)->loc_addr;
-	dest = &tcp6_rsk(req)->rmt_addr;
 	seq_printf(seq,
 		   "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
 		   "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n",
-- 
cgit v1.2.3-71-gd317


From 8292a17a399ffb7c5c8b083db4ad994e090055f7 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:15:52 -0800
Subject: [ICSK]: Rename struct tcp_func to struct inet_connection_sock_af_ops

And move it to struct inet_connection_sock. DCCP will use it in the
upcoming changesets.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h                |  2 --
 include/net/inet_connection_sock.h | 26 ++++++++++++++++++++
 include/net/tcp.h                  | 50 +-------------------------------------
 include/net/transp_v6.h            |  2 +-
 net/ipv4/syncookies.c              |  4 +--
 net/ipv4/tcp.c                     |  8 +++---
 net/ipv4/tcp_input.c               | 11 +++++----
 net/ipv4/tcp_ipv4.c                | 11 ++++-----
 net/ipv4/tcp_minisocks.c           |  8 +++---
 net/ipv4/tcp_output.c              | 17 ++++++-------
 net/ipv6/ipv6_sockglue.c           |  2 +-
 net/ipv6/tcp_ipv6.c                | 28 ++++++++++-----------
 12 files changed, 71 insertions(+), 98 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 0e1da6602e05..4e1434007f44 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -295,8 +295,6 @@ struct tcp_sock {
 
 	struct sk_buff_head	out_of_order_queue; /* Out of order segments go here */
 
-	struct tcp_func		*af_specific;	/* Operations which are AF_INET{4,6} specific	*/
-
  	__u32	rcv_wnd;	/* Current receiver window		*/
 	__u32	rcv_wup;	/* rcv_nxt on last window update sent	*/
 	__u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index ccc81a1c550c..9e20d201e951 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -15,6 +15,7 @@
 #ifndef _INET_CONNECTION_SOCK_H
 #define _INET_CONNECTION_SOCK_H
 
+#include <linux/compiler.h>
 #include <linux/ip.h>
 #include <linux/string.h>
 #include <linux/timer.h>
@@ -29,6 +30,29 @@ struct inet_bind_bucket;
 struct inet_hashinfo;
 struct tcp_congestion_ops;
 
+/*
+ * Pointers to address related TCP functions
+ * (i.e. things that depend on the address family)
+ */
+struct inet_connection_sock_af_ops {
+	int	    (*queue_xmit)(struct sk_buff *skb, int ipfragok);
+	void	    (*send_check)(struct sock *sk, int len,
+				  struct sk_buff *skb);
+	int	    (*rebuild_header)(struct sock *sk);
+	int	    (*conn_request)(struct sock *sk, struct sk_buff *skb);
+	struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb,
+				      struct request_sock *req,
+				      struct dst_entry *dst);
+	int	    (*remember_stamp)(struct sock *sk);
+	__u16	    net_header_len;
+	int	    (*setsockopt)(struct sock *sk, int level, int optname, 
+				  char __user *optval, int optlen);
+	int	    (*getsockopt)(struct sock *sk, int level, int optname, 
+				  char __user *optval, int __user *optlen);
+	void	    (*addr2sockaddr)(struct sock *sk, struct sockaddr *);
+	int sockaddr_len;
+};
+
 /** inet_connection_sock - INET connection oriented sock
  *
  * @icsk_accept_queue:	   FIFO of established children 
@@ -37,6 +61,7 @@ struct tcp_congestion_ops;
  * @icsk_retransmit_timer: Resend (no ack)
  * @icsk_rto:		   Retransmit timeout
  * @icsk_ca_ops		   Pluggable congestion control hook
+ * @icsk_af_ops		   Operations which are AF_INET{4,6} specific
  * @icsk_ca_state:	   Congestion control state
  * @icsk_retransmits:	   Number of unrecovered [RTO] timeouts
  * @icsk_pending:	   Scheduled timer event
@@ -55,6 +80,7 @@ struct inet_connection_sock {
  	struct timer_list	  icsk_delack_timer;
 	__u32			  icsk_rto;
 	struct tcp_congestion_ops *icsk_ca_ops;
+	struct inet_connection_sock_af_ops *icsk_af_ops;
 	__u8			  icsk_ca_state;
 	__u8			  icsk_retransmits;
 	__u8			  icsk_pending;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d78025f9fbea..83b117a25c2a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -224,53 +224,6 @@ extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
 extern int tcp_memory_pressure;
 
-/*
- *	Pointers to address related TCP functions
- *	(i.e. things that depend on the address family)
- */
-
-struct tcp_func {
-	int			(*queue_xmit)		(struct sk_buff *skb,
-							 int ipfragok);
-
-	void			(*send_check)		(struct sock *sk,
-							 struct tcphdr *th,
-							 int len,
-							 struct sk_buff *skb);
-
-	int			(*rebuild_header)	(struct sock *sk);
-
-	int			(*conn_request)		(struct sock *sk,
-							 struct sk_buff *skb);
-
-	struct sock *		(*syn_recv_sock)	(struct sock *sk,
-							 struct sk_buff *skb,
-							 struct request_sock *req,
-							 struct dst_entry *dst);
-    
-	int			(*remember_stamp)	(struct sock *sk);
-
-	__u16			net_header_len;
-
-	int			(*setsockopt)		(struct sock *sk, 
-							 int level, 
-							 int optname, 
-							 char __user *optval, 
-							 int optlen);
-
-	int			(*getsockopt)		(struct sock *sk, 
-							 int level, 
-							 int optname, 
-							 char __user *optval, 
-							 int __user *optlen);
-
-
-	void			(*addr2sockaddr)	(struct sock *sk,
-							 struct sockaddr *);
-
-	int sockaddr_len;
-};
-
 /*
  * The next routines deal with comparing 32 bit unsigned ints
  * and worry about wraparound (automatic with unsigned arithmetic).
@@ -405,8 +358,7 @@ extern void			tcp_parse_options(struct sk_buff *skb,
  *	TCP v4 functions exported for the inet6 API
  */
 
-extern void		       	tcp_v4_send_check(struct sock *sk, 
-						  struct tcphdr *th, int len, 
+extern void		       	tcp_v4_send_check(struct sock *sk, int len,
 						  struct sk_buff *skb);
 
 extern int			tcp_v4_conn_request(struct sock *sk,
diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h
index 4e86f2de6638..61f724c1036f 100644
--- a/include/net/transp_v6.h
+++ b/include/net/transp_v6.h
@@ -44,7 +44,7 @@ extern int			datagram_send_ctl(struct msghdr *msg,
 /*
  *	address family specific functions
  */
-extern struct tcp_func	ipv4_specific;
+extern struct inet_connection_sock_af_ops ipv4_specific;
 
 extern int inet6_destroy_sock(struct sock *sk);
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index a34e60ea48a1..e20be3331f67 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -173,10 +173,10 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 					   struct request_sock *req,
 					   struct dst_entry *dst)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sock *child;
 
-	child = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
+	child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
 	if (child)
 		inet_csk_reqsk_queue_add(sk, req, child);
 	else
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ef98b14ac56d..eacfe6a3442c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1696,8 +1696,8 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 	int err = 0;
 
 	if (level != SOL_TCP)
-		return tp->af_specific->setsockopt(sk, level, optname,
-						   optval, optlen);
+		return icsk->icsk_af_ops->setsockopt(sk, level, optname,
+						     optval, optlen);
 
 	/* This is a string value all the others are int's */
 	if (optname == TCP_CONGESTION) {
@@ -1939,8 +1939,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 	int val, len;
 
 	if (level != SOL_TCP)
-		return tp->af_specific->getsockopt(sk, level, optname,
-						   optval, optlen);
+		return icsk->icsk_af_ops->getsockopt(sk, level, optname,
+						     optval, optlen);
 
 	if (get_user(len, optlen))
 		return -EFAULT;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bf2e23086bce..7de6184d4bd8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4071,8 +4071,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		mb();
 		tcp_set_state(sk, TCP_ESTABLISHED);
 
+		icsk = inet_csk(sk);
+
 		/* Make sure socket is routed, for correct metrics.  */
-		tp->af_specific->rebuild_header(sk);
+		icsk->icsk_af_ops->rebuild_header(sk);
 
 		tcp_init_metrics(sk);
 
@@ -4098,8 +4100,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 			sk_wake_async(sk, 0, POLL_OUT);
 		}
 
-		icsk = inet_csk(sk);
-
 		if (sk->sk_write_pending ||
 		    icsk->icsk_accept_queue.rskq_defer_accept ||
 		    icsk->icsk_ack.pingpong) {
@@ -4220,6 +4220,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			  struct tcphdr *th, unsigned len)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	int queued = 0;
 
 	tp->rx_opt.saw_tstamp = 0;
@@ -4236,7 +4237,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			goto discard;
 
 		if(th->syn) {
-			if(tp->af_specific->conn_request(sk, skb) < 0)
+			if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
 				return 1;
 
 			/* Now we have several options: In theory there is 
@@ -4349,7 +4350,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				/* Make sure socket is routed, for
 				 * correct metrics.
 				 */
-				tp->af_specific->rebuild_header(sk);
+				icsk->icsk_af_ops->rebuild_header(sk);
 
 				tcp_init_metrics(sk);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2aa19c89a94a..704cf2105795 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -86,8 +86,7 @@ int sysctl_tcp_low_latency;
 /* Socket used for sending RSTs */
 static struct socket *tcp_socket;
 
-void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
-		       struct sk_buff *skb);
+void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
 
 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
 	.lhash_lock	= RW_LOCK_UNLOCKED,
@@ -645,10 +644,10 @@ out:
 }
 
 /* This routine computes an IPv4 TCP checksum. */
-void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
-		       struct sk_buff *skb)
+void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
 {
 	struct inet_sock *inet = inet_sk(sk);
+	struct tcphdr *th = skb->h.th;
 
 	if (skb->ip_summed == CHECKSUM_HW) {
 		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
@@ -1383,7 +1382,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
 	return 0;
 }
 
-struct tcp_func ipv4_specific = {
+struct inet_connection_sock_af_ops ipv4_specific = {
 	.queue_xmit	=	ip_queue_xmit,
 	.send_check	=	tcp_v4_send_check,
 	.rebuild_header	=	inet_sk_rebuild_header,
@@ -1434,7 +1433,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	sk->sk_write_space = sk_stream_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
-	tp->af_specific = &ipv4_specific;
+	icsk->icsk_af_ops = &ipv4_specific;
 
 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1b66a2ac4321..9c029683a626 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -274,18 +274,18 @@ kill:
 void tcp_time_wait(struct sock *sk, int state, int timeo)
 {
 	struct inet_timewait_sock *tw = NULL;
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_sock *tp = tcp_sk(sk);
 	int recycle_ok = 0;
 
 	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
-		recycle_ok = tp->af_specific->remember_stamp(sk);
+		recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);
 
 	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
 		tw = inet_twsk_alloc(sk, state);
 
 	if (tw != NULL) {
 		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
-		const struct inet_connection_sock *icsk = inet_csk(sk);
 		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
 
 		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
@@ -456,7 +456,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 			   struct request_sock **prev)
 {
 	struct tcphdr *th = skb->h.th;
-	struct tcp_sock *tp = tcp_sk(sk);
 	u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
 	int paws_reject = 0;
 	struct tcp_options_received tmp_opt;
@@ -613,7 +612,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 		 * ESTABLISHED STATE. If it will be dropped after
 		 * socket is created, wait for troubles.
 		 */
-		child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
+		child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb,
+								 req, NULL);
 		if (child == NULL)
 			goto listen_overflow;
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b7325e0b406a..af1946c52c37 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -371,7 +371,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		TCP_ECN_send(sk, tp, skb, tcp_header_size);
 	}
 
-	tp->af_specific->send_check(sk, th, skb->len, skb);
+	icsk->icsk_af_ops->send_check(sk, skb->len, skb);
 
 	if (likely(tcb->flags & TCPCB_FLAG_ACK))
 		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
@@ -381,7 +381,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	TCP_INC_STATS(TCP_MIB_OUTSEGS);
 
-	err = tp->af_specific->queue_xmit(skb, 0);
+	err = icsk->icsk_af_ops->queue_xmit(skb, 0);
 	if (unlikely(err <= 0))
 		return err;
 
@@ -638,12 +638,11 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int mss_now;
-
 	/* Calculate base mss without TCP options:
 	   It is MMS_S - sizeof(tcphdr) of rfc1122
 	 */
-	mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
+	int mss_now = (pmtu - inet_csk(sk)->icsk_af_ops->net_header_len -
+		       sizeof(struct tcphdr));
 
 	/* Clamp it (mss_clamp does not include tcp options) */
 	if (mss_now > tp->rx_opt.mss_clamp)
@@ -705,9 +704,9 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
 	xmit_size_goal = mss_now;
 
 	if (doing_tso) {
-		xmit_size_goal = 65535 -
-			tp->af_specific->net_header_len -
-			tp->ext_header_len - tp->tcp_header_len;
+		xmit_size_goal = (65535 -
+				  inet_csk(sk)->icsk_af_ops->net_header_len -
+				  tp->ext_header_len - tp->tcp_header_len);
 
 		if (tp->max_window &&
 		    (xmit_size_goal > (tp->max_window >> 1)))
@@ -1422,7 +1421,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	   (sysctl_tcp_retrans_collapse != 0))
 		tcp_retrans_try_collapse(sk, skb, cur_mss);
 
-	if(tp->af_specific->rebuild_header(sk))
+	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
 		return -EHOSTUNREACH; /* Routing failure or similar. */
 
 	/* Some Solaris stacks overoptimize and ignore the FIN on a
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 3620718defe6..b6b63fa8454c 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -170,7 +170,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
 				sock_prot_inc_use(&tcp_prot);
 				local_bh_enable();
 				sk->sk_prot = &tcp_prot;
-				tp->af_specific = &ipv4_specific;
+				inet_csk(sk)->icsk_af_ops = &ipv4_specific;
 				sk->sk_socket->ops = &inet_stream_ops;
 				sk->sk_family = PF_INET;
 				tcp_sync_mss(sk, tp->pmtu_cookie);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index c2472d771664..8ce8a1359d2b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -68,14 +68,14 @@
 
 static void	tcp_v6_send_reset(struct sk_buff *skb);
 static void	tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req);
-static void	tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, 
+static void	tcp_v6_send_check(struct sock *sk, int len, 
 				  struct sk_buff *skb);
 
 static int	tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
 static int	tcp_v6_xmit(struct sk_buff *skb, int ipfragok);
 
-static struct tcp_func ipv6_mapped;
-static struct tcp_func ipv6_specific;
+static struct inet_connection_sock_af_ops ipv6_mapped;
+static struct inet_connection_sock_af_ops ipv6_specific;
 
 int inet6_csk_bind_conflict(const struct sock *sk,
 			    const struct inet_bind_bucket *tb)
@@ -107,9 +107,7 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
 static void tcp_v6_hash(struct sock *sk)
 {
 	if (sk->sk_state != TCP_CLOSE) {
-		struct tcp_sock *tp = tcp_sk(sk);
-
-		if (tp->af_specific == &ipv6_mapped) {
+		if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) {
 			tcp_prot.hash(sk);
 			return;
 		}
@@ -417,14 +415,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		sin.sin_port = usin->sin6_port;
 		sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
 
-		tp->af_specific = &ipv6_mapped;
+		inet_csk(sk)->icsk_af_ops = &ipv6_mapped;
 		sk->sk_backlog_rcv = tcp_v4_do_rcv;
 
 		err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
 
 		if (err) {
 			tp->ext_header_len = exthdrlen;
-			tp->af_specific = &ipv6_specific;
+			inet_csk(sk)->icsk_af_ops = &ipv6_specific;
 			sk->sk_backlog_rcv = tcp_v6_do_rcv;
 			goto failure;
 		} else {
@@ -751,10 +749,10 @@ static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb)
 }
 
 
-static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, 
-			      struct sk_buff *skb)
+static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct tcphdr *th = skb->h.th;
 
 	if (skb->ip_summed == CHECKSUM_HW) {
 		th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP,  0);
@@ -1070,7 +1068,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 		ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
 
-		newtp->af_specific = &ipv6_mapped;
+		inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
 		newsk->sk_backlog_rcv = tcp_v4_do_rcv;
 		newnp->pktoptions  = NULL;
 		newnp->opt	   = NULL;
@@ -1084,7 +1082,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		 */
 
 		/* It is tricky place. Until this moment IPv4 tcp
-		   worked with IPv6 af_tcp.af_specific.
+		   worked with IPv6 icsk.icsk_af_ops.
 		   Sync it now.
 		 */
 		tcp_sync_mss(newsk, newtp->pmtu_cookie);
@@ -1631,7 +1629,7 @@ static int tcp_v6_remember_stamp(struct sock *sk)
 	return 0;
 }
 
-static struct tcp_func ipv6_specific = {
+static struct inet_connection_sock_af_ops ipv6_specific = {
 	.queue_xmit	=	tcp_v6_xmit,
 	.send_check	=	tcp_v6_send_check,
 	.rebuild_header	=	tcp_v6_rebuild_header,
@@ -1650,7 +1648,7 @@ static struct tcp_func ipv6_specific = {
  *	TCP over IPv4 via INET6 API
  */
 
-static struct tcp_func ipv6_mapped = {
+static struct inet_connection_sock_af_ops ipv6_mapped = {
 	.queue_xmit	=	ip_queue_xmit,
 	.send_check	=	tcp_v4_send_check,
 	.rebuild_header	=	inet_sk_rebuild_header,
@@ -1700,7 +1698,7 @@ static int tcp_v6_init_sock(struct sock *sk)
 
 	sk->sk_state = TCP_CLOSE;
 
-	tp->af_specific = &ipv6_specific;
+	icsk->icsk_af_ops = &ipv6_specific;
 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
 	sk->sk_write_space = sk_stream_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
-- 
cgit v1.2.3-71-gd317


From 3305b80c214c642b89cd5c21af83bc91ec13f8bd Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 13 Dec 2005 23:16:37 -0800
Subject: [IP]: Simplify and consolidate MSG_PEEK error handling

When a packet is obtained from skb_recv_datagram with MSG_PEEK enabled
it is left on the socket receive queue.  This means that when we detect
a checksum error we have to be careful when trying to free the packet
as someone could have dequeued it in the time being.

Currently this delicate logic is duplicated three times between UDPv4,
UDPv6 and RAWv6.  This patch moves them into a one place and simplifies
the code somewhat.

This is based on a suggestion by Eric Dumazet.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  2 ++
 net/core/datagram.c    | 36 ++++++++++++++++++++++++++++++++++++
 net/ipv4/udp.c         | 15 +--------------
 net/ipv6/raw.c         | 16 +++-------------
 net/ipv6/udp.c         | 16 ++--------------
 5 files changed, 44 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8c5d6001a923..97f6580ce039 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1239,6 +1239,8 @@ extern int	       skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
 							int hlen,
 							struct iovec *iov);
 extern void	       skb_free_datagram(struct sock *sk, struct sk_buff *skb);
+extern void	       skb_kill_datagram(struct sock *sk, struct sk_buff *skb,
+					 unsigned int flags);
 extern unsigned int    skb_checksum(const struct sk_buff *skb, int offset,
 				    int len, unsigned int csum);
 extern int	       skb_copy_bits(const struct sk_buff *skb, int offset,
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 1bcfef51ac58..f8d322e1ea92 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -47,6 +47,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/poll.h>
 #include <linux/highmem.h>
+#include <linux/spinlock.h>
 
 #include <net/protocol.h>
 #include <linux/skbuff.h>
@@ -199,6 +200,41 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
 	kfree_skb(skb);
 }
 
+/**
+ *	skb_kill_datagram - Free a datagram skbuff forcibly
+ *	@sk: socket
+ *	@skb: datagram skbuff
+ *	@flags: MSG_ flags
+ *
+ *	This function frees a datagram skbuff that was received by
+ *	skb_recv_datagram.  The flags argument must match the one
+ *	used for skb_recv_datagram.
+ *
+ *	If the MSG_PEEK flag is set, and the packet is still on the
+ *	receive queue of the socket, it will be taken off the queue
+ *	before it is freed.
+ *
+ *	This function currently only disables BH when acquiring the
+ *	sk_receive_queue lock.  Therefore it must not be used in a
+ *	context where that lock is acquired in an IRQ context.
+ */
+
+void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
+{
+	if (flags & MSG_PEEK) {
+		spin_lock_bh(&sk->sk_receive_queue.lock);
+		if (skb == skb_peek(&sk->sk_receive_queue)) {
+			__skb_unlink(skb, &sk->sk_receive_queue);
+			atomic_dec(&skb->users);
+		}
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
+	}
+
+	kfree_skb(skb);
+}
+
+EXPORT_SYMBOL(skb_kill_datagram);
+
 /**
  *	skb_copy_datagram_iovec - Copy a datagram to an iovec.
  *	@skb: buffer to copy
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2422a5f7195d..012c4621e40a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -846,20 +846,7 @@ out:
 csum_copy_err:
 	UDP_INC_STATS_BH(UDP_MIB_INERRORS);
 
-	/* Clear queue. */
-	if (flags&MSG_PEEK) {
-		int clear = 0;
-		spin_lock_bh(&sk->sk_receive_queue.lock);
-		if (skb == skb_peek(&sk->sk_receive_queue)) {
-			__skb_unlink(skb, &sk->sk_receive_queue);
-			clear = 1;
-		}
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-		if (clear)
-			kfree_skb(skb);
-	}
-
-	skb_free_datagram(sk, skb);
+	skb_kill_datagram(sk, skb, flags);
 
 	if (noblock)
 		return -EAGAIN;	
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index a66900cda2af..66f1d12ea578 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -32,6 +32,7 @@
 #include <linux/icmpv6.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
+#include <linux/skbuff.h>
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 #include <asm/bug.h>
@@ -433,25 +434,14 @@ out:
 	return err;
 
 csum_copy_err:
-	/* Clear queue. */
-	if (flags&MSG_PEEK) {
-		int clear = 0;
-		spin_lock_bh(&sk->sk_receive_queue.lock);
-		if (skb == skb_peek(&sk->sk_receive_queue)) {
-			__skb_unlink(skb, &sk->sk_receive_queue);
-			clear = 1;
-		}
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-		if (clear)
-			kfree_skb(skb);
-	}
+	skb_kill_datagram(sk, skb, flags);
 
 	/* Error for blocking case is chosen to masquerade
 	   as some normal condition.
 	 */
 	err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
 	/* FIXME: increment a raw6 drops counter here */
-	goto out_free;
+	goto out;
 }
 
 static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5cc8731eb55b..d8538dcea813 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -36,6 +36,7 @@
 #include <linux/ipv6.h>
 #include <linux/icmpv6.h>
 #include <linux/init.h>
+#include <linux/skbuff.h>
 #include <asm/uaccess.h>
 
 #include <net/sock.h>
@@ -300,20 +301,7 @@ out:
 	return err;
 
 csum_copy_err:
-	/* Clear queue. */
-	if (flags&MSG_PEEK) {
-		int clear = 0;
-		spin_lock_bh(&sk->sk_receive_queue.lock);
-		if (skb == skb_peek(&sk->sk_receive_queue)) {
-			__skb_unlink(skb, &sk->sk_receive_queue);
-			clear = 1;
-		}
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-		if (clear)
-			kfree_skb(skb);
-	}
-
-	skb_free_datagram(sk, skb);
+	skb_kill_datagram(sk, skb, flags);
 
 	if (flags & MSG_DONTWAIT) {
 		UDP6_INC_STATS_USER(UDP_MIB_INERRORS);
-- 
cgit v1.2.3-71-gd317


From b9750ce13c08aa8a71a9b138d741f3046aefd991 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:22:54 -0800
Subject: [IPV6]: Generalise some functions

Using sk->sk_protocol instead of IPPROTO_TCP.

Will be used by DCCPv6 in the next changesets.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h                |   2 +
 include/net/inet6_connection_sock.h |  13 ++-
 net/ipv6/af_inet6.c                 |  52 ++++++++++++
 net/ipv6/inet6_connection_sock.c    | 103 ++++++++++++++++++++++++
 net/ipv6/tcp_ipv6.c                 | 153 +-----------------------------------
 5 files changed, 173 insertions(+), 150 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 7d3e86d9576e..69a0decfbdf4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -297,6 +297,8 @@ struct tcp6_sock {
 	struct ipv6_pinfo inet6;
 };
 
+extern int inet6_sk_rebuild_header(struct sock *sk);
+
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
 {
diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
index aa30ebde70dc..b33b438bffcc 100644
--- a/include/net/inet6_connection_sock.h
+++ b/include/net/inet6_connection_sock.h
@@ -15,8 +15,15 @@
 
 #include <linux/types.h>
 
-struct sock;
+struct in6_addr;
+struct inet_bind_bucket;
 struct request_sock;
+struct sk_buff;
+struct sock;
+struct sockaddr;
+
+extern int inet6_csk_bind_conflict(const struct sock *sk,
+				   const struct inet_bind_bucket *tb);
 
 extern struct request_sock *inet6_csk_search_req(const struct sock *sk,
 						 struct request_sock ***prevp,
@@ -28,4 +35,8 @@ extern struct request_sock *inet6_csk_search_req(const struct sock *sk,
 extern void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
 					   struct request_sock *req,
 					   const unsigned long timeout);
+
+extern void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr);
+
+extern int inet6_csk_xmit(struct sk_buff *skb, int ipfragok);
 #endif /* _INET6_CONNECTION_SOCK_H */
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d9546380fa04..fd040e9a1f47 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -609,6 +609,58 @@ inet6_unregister_protosw(struct inet_protosw *p)
 	}
 }
 
+int inet6_sk_rebuild_header(struct sock *sk)
+{
+	int err;
+	struct dst_entry *dst;
+	struct ipv6_pinfo *np = inet6_sk(sk);
+
+	dst = __sk_dst_check(sk, np->dst_cookie);
+
+	if (dst == NULL) {
+		struct inet_sock *inet = inet_sk(sk);
+		struct in6_addr *final_p = NULL, final;
+		struct flowi fl;
+
+		memset(&fl, 0, sizeof(fl));
+		fl.proto = sk->sk_protocol;
+		ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+		ipv6_addr_copy(&fl.fl6_src, &np->saddr);
+		fl.fl6_flowlabel = np->flow_label;
+		fl.oif = sk->sk_bound_dev_if;
+		fl.fl_ip_dport = inet->dport;
+		fl.fl_ip_sport = inet->sport;
+
+		if (np->opt && np->opt->srcrt) {
+			struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+			ipv6_addr_copy(&final, &fl.fl6_dst);
+			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+			final_p = &final;
+		}
+
+		err = ip6_dst_lookup(sk, &dst, &fl);
+		if (err) {
+			sk->sk_route_caps = 0;
+			return err;
+		}
+		if (final_p)
+			ipv6_addr_copy(&fl.fl6_dst, final_p);
+
+		if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
+			sk->sk_err_soft = -err;
+			return err;
+		}
+
+		ip6_dst_store(sk, dst, NULL);
+		sk->sk_route_caps = dst->dev->features &
+			~(NETIF_F_IP_CSUM | NETIF_F_TSO);
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header);
+
 int
 snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
 {
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index fe874eeaa40c..792f90f0f9ec 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -21,8 +21,34 @@
 
 #include <net/addrconf.h>
 #include <net/inet_connection_sock.h>
+#include <net/inet_ecn.h>
+#include <net/inet_hashtables.h>
+#include <net/ip6_route.h>
 #include <net/sock.h>
 
+int inet6_csk_bind_conflict(const struct sock *sk,
+			    const struct inet_bind_bucket *tb)
+{
+	const struct sock *sk2;
+	const struct hlist_node *node;
+
+	/* We must walk the whole port owner list in this case. -DaveM */
+	sk_for_each_bound(sk2, node, &tb->owners) {
+		if (sk != sk2 &&
+		    (!sk->sk_bound_dev_if ||
+		     !sk2->sk_bound_dev_if ||
+		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
+		    (!sk->sk_reuse || !sk2->sk_reuse ||
+		     sk2->sk_state == TCP_LISTEN) &&
+		     ipv6_rcv_saddr_equal(sk, sk2))
+			break;
+	}
+
+	return node != NULL;
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict);
+
 /*
  * request_sock (formerly open request) hash tables.
  */
@@ -94,3 +120,80 @@ void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
 }
 
 EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add);
+
+void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
+
+	sin6->sin6_family = AF_INET6;
+	ipv6_addr_copy(&sin6->sin6_addr, &np->daddr);
+	sin6->sin6_port	= inet_sk(sk)->dport;
+	/* We do not store received flowlabel for TCP */
+	sin6->sin6_flowinfo = 0;
+	sin6->sin6_scope_id = 0;
+	if (sk->sk_bound_dev_if &&
+	    ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+		sin6->sin6_scope_id = sk->sk_bound_dev_if;
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr);
+
+int inet6_csk_xmit(struct sk_buff *skb, int ipfragok)
+{
+	struct sock *sk = skb->sk;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct flowi fl;
+	struct dst_entry *dst;
+	struct in6_addr *final_p = NULL, final;
+
+	memset(&fl, 0, sizeof(fl));
+	fl.proto = sk->sk_protocol;
+	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+	ipv6_addr_copy(&fl.fl6_src, &np->saddr);
+	fl.fl6_flowlabel = np->flow_label;
+	IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
+	fl.oif = sk->sk_bound_dev_if;
+	fl.fl_ip_sport = inet->sport;
+	fl.fl_ip_dport = inet->dport;
+
+	if (np->opt && np->opt->srcrt) {
+		struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
+		ipv6_addr_copy(&final, &fl.fl6_dst);
+		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+		final_p = &final;
+	}
+
+	dst = __sk_dst_check(sk, np->dst_cookie);
+
+	if (dst == NULL) {
+		int err = ip6_dst_lookup(sk, &dst, &fl);
+
+		if (err) {
+			sk->sk_err_soft = -err;
+			return err;
+		}
+
+		if (final_p)
+			ipv6_addr_copy(&fl.fl6_dst, final_p);
+
+		if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
+			sk->sk_route_caps = 0;
+			return err;
+		}
+
+		ip6_dst_store(sk, dst, NULL);
+		sk->sk_route_caps = dst->dev->features &
+			~(NETIF_F_IP_CSUM | NETIF_F_TSO);
+	}
+
+	skb->dst = dst_clone(dst);
+
+	/* Restore final destination back after routing done */
+	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+
+	return ip6_xmit(sk, skb, &fl, np->opt, 0);
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_xmit);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 8ce8a1359d2b..2f932ce72610 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -72,32 +72,10 @@ static void	tcp_v6_send_check(struct sock *sk, int len,
 				  struct sk_buff *skb);
 
 static int	tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
-static int	tcp_v6_xmit(struct sk_buff *skb, int ipfragok);
 
 static struct inet_connection_sock_af_ops ipv6_mapped;
 static struct inet_connection_sock_af_ops ipv6_specific;
 
-int inet6_csk_bind_conflict(const struct sock *sk,
-			    const struct inet_bind_bucket *tb)
-{
-	const struct sock *sk2;
-	const struct hlist_node *node;
-
-	/* We must walk the whole port owner list in this case. -DaveM */
-	sk_for_each_bound(sk2, node, &tb->owners) {
-		if (sk != sk2 &&
-		    (!sk->sk_bound_dev_if ||
-		     !sk2->sk_bound_dev_if ||
-		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
-		    (!sk->sk_reuse || !sk2->sk_reuse ||
-		     sk2->sk_state == TCP_LISTEN) &&
-		     ipv6_rcv_saddr_equal(sk, sk2))
-			break;
-	}
-
-	return node != NULL;
-}
-
 static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
 {
 	return inet_csk_get_port(&tcp_hashinfo, sk, snum,
@@ -1500,129 +1478,6 @@ do_time_wait:
 	goto discard_it;
 }
 
-static int tcp_v6_rebuild_header(struct sock *sk)
-{
-	int err;
-	struct dst_entry *dst;
-	struct ipv6_pinfo *np = inet6_sk(sk);
-
-	dst = __sk_dst_check(sk, np->dst_cookie);
-
-	if (dst == NULL) {
-		struct inet_sock *inet = inet_sk(sk);
-		struct in6_addr *final_p = NULL, final;
-		struct flowi fl;
-
-		memset(&fl, 0, sizeof(fl));
-		fl.proto = IPPROTO_TCP;
-		ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
-		ipv6_addr_copy(&fl.fl6_src, &np->saddr);
-		fl.fl6_flowlabel = np->flow_label;
-		fl.oif = sk->sk_bound_dev_if;
-		fl.fl_ip_dport = inet->dport;
-		fl.fl_ip_sport = inet->sport;
-
-		if (np->opt && np->opt->srcrt) {
-			struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
-			ipv6_addr_copy(&final, &fl.fl6_dst);
-			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-			final_p = &final;
-		}
-
-		err = ip6_dst_lookup(sk, &dst, &fl);
-		if (err) {
-			sk->sk_route_caps = 0;
-			return err;
-		}
-		if (final_p)
-			ipv6_addr_copy(&fl.fl6_dst, final_p);
-
-		if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
-			sk->sk_err_soft = -err;
-			return err;
-		}
-
-		ip6_dst_store(sk, dst, NULL);
-		sk->sk_route_caps = dst->dev->features &
-			~(NETIF_F_IP_CSUM | NETIF_F_TSO);
-	}
-
-	return 0;
-}
-
-static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok)
-{
-	struct sock *sk = skb->sk;
-	struct inet_sock *inet = inet_sk(sk);
-	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct flowi fl;
-	struct dst_entry *dst;
-	struct in6_addr *final_p = NULL, final;
-
-	memset(&fl, 0, sizeof(fl));
-	fl.proto = IPPROTO_TCP;
-	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
-	ipv6_addr_copy(&fl.fl6_src, &np->saddr);
-	fl.fl6_flowlabel = np->flow_label;
-	IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
-	fl.oif = sk->sk_bound_dev_if;
-	fl.fl_ip_sport = inet->sport;
-	fl.fl_ip_dport = inet->dport;
-
-	if (np->opt && np->opt->srcrt) {
-		struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
-	}
-
-	dst = __sk_dst_check(sk, np->dst_cookie);
-
-	if (dst == NULL) {
-		int err = ip6_dst_lookup(sk, &dst, &fl);
-
-		if (err) {
-			sk->sk_err_soft = -err;
-			return err;
-		}
-
-		if (final_p)
-			ipv6_addr_copy(&fl.fl6_dst, final_p);
-
-		if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
-			sk->sk_route_caps = 0;
-			return err;
-		}
-
-		ip6_dst_store(sk, dst, NULL);
-		sk->sk_route_caps = dst->dev->features &
-			~(NETIF_F_IP_CSUM | NETIF_F_TSO);
-	}
-
-	skb->dst = dst_clone(dst);
-
-	/* Restore final destination back after routing done */
-	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
-
-	return ip6_xmit(sk, skb, &fl, np->opt, 0);
-}
-
-static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
-{
-	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
-
-	sin6->sin6_family = AF_INET6;
-	ipv6_addr_copy(&sin6->sin6_addr, &np->daddr);
-	sin6->sin6_port	= inet_sk(sk)->dport;
-	/* We do not store received flowlabel for TCP */
-	sin6->sin6_flowinfo = 0;
-	sin6->sin6_scope_id = 0;
-	if (sk->sk_bound_dev_if &&
-	    ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
-		sin6->sin6_scope_id = sk->sk_bound_dev_if;
-}
-
 static int tcp_v6_remember_stamp(struct sock *sk)
 {
 	/* Alas, not yet... */
@@ -1630,9 +1485,9 @@ static int tcp_v6_remember_stamp(struct sock *sk)
 }
 
 static struct inet_connection_sock_af_ops ipv6_specific = {
-	.queue_xmit	=	tcp_v6_xmit,
+	.queue_xmit	=	inet6_csk_xmit,
 	.send_check	=	tcp_v6_send_check,
-	.rebuild_header	=	tcp_v6_rebuild_header,
+	.rebuild_header	=	inet6_sk_rebuild_header,
 	.conn_request	=	tcp_v6_conn_request,
 	.syn_recv_sock	=	tcp_v6_syn_recv_sock,
 	.remember_stamp	=	tcp_v6_remember_stamp,
@@ -1640,7 +1495,7 @@ static struct inet_connection_sock_af_ops ipv6_specific = {
 
 	.setsockopt	=	ipv6_setsockopt,
 	.getsockopt	=	ipv6_getsockopt,
-	.addr2sockaddr	=	v6_addr2sockaddr,
+	.addr2sockaddr	=	inet6_csk_addr2sockaddr,
 	.sockaddr_len	=	sizeof(struct sockaddr_in6)
 };
 
@@ -1659,7 +1514,7 @@ static struct inet_connection_sock_af_ops ipv6_mapped = {
 
 	.setsockopt	=	ipv6_setsockopt,
 	.getsockopt	=	ipv6_getsockopt,
-	.addr2sockaddr	=	v6_addr2sockaddr,
+	.addr2sockaddr	=	inet6_csk_addr2sockaddr,
 	.sockaddr_len	=	sizeof(struct sockaddr_in6)
 };
 
-- 
cgit v1.2.3-71-gd317


From 0fa1a53e1f055a6c790f40e7728f42a825b29248 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:23:09 -0800
Subject: [IPV6]: Introduce inet6_timewait_sock

Out of tcp6_timewait_sock, that now is just an aggregation of
inet_timewait_sock and inet6_timewait_sock, using tw_ipv6_offset in struct
inet_timewait_sock, that is common to the IPv6 transport protocols that use
timewait sockets, like DCCP and TCP.

tw_ipv6_offset plays the struct inet_sock pinfo6 role, i.e. for the generic
code to find the IPv6 area in a timewait sock.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h             | 32 +++++++++++++++++++++-----------
 include/net/inet6_hashtables.h   |  6 +++---
 include/net/inet_timewait_sock.h |  3 ++-
 net/ipv4/inet_diag.c             |  6 +++---
 net/ipv4/tcp_minisocks.c         |  8 +++++---
 net/ipv6/addrconf.c              |  2 +-
 net/ipv6/tcp_ipv6.c              | 12 ++++++------
 7 files changed, 41 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 69a0decfbdf4..7d3908594fac 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -348,26 +348,36 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to,
 
 #include <linux/tcp.h>
 
+struct inet6_timewait_sock {
+	struct in6_addr tw_v6_daddr;
+	struct in6_addr	tw_v6_rcv_saddr;
+};
+
 struct tcp6_timewait_sock {
-	struct tcp_timewait_sock tw_v6_sk;
-	struct in6_addr		 tw_v6_daddr;
-	struct in6_addr		 tw_v6_rcv_saddr;
+	struct tcp_timewait_sock   tcp6tw_tcp;
+	struct inet6_timewait_sock tcp6tw_inet6;
 };
 
-static inline struct tcp6_timewait_sock *tcp6_twsk(const struct sock *sk)
+static inline u16 inet6_tw_offset(const struct proto *prot)
+{
+	return prot->twsk_obj_size - sizeof(struct inet6_timewait_sock);
+}
+
+static inline struct inet6_timewait_sock *inet6_twsk(const struct sock *sk)
 {
-	return (struct tcp6_timewait_sock *)sk;
+	return (struct inet6_timewait_sock *)(((u8 *)sk) +
+					      inet_twsk(sk)->tw_ipv6_offset);
 }
 
-static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk)
+static inline struct in6_addr *__inet6_rcv_saddr(const struct sock *sk)
 {
 	return likely(sk->sk_state != TCP_TIME_WAIT) ?
-		&inet6_sk(sk)->rcv_saddr : &tcp6_twsk(sk)->tw_v6_rcv_saddr;
+		&inet6_sk(sk)->rcv_saddr : &inet6_twsk(sk)->tw_v6_rcv_saddr;
 }
 
-static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk)
+static inline struct in6_addr *inet6_rcv_saddr(const struct sock *sk)
 {
-	return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL;
+	return sk->sk_family == AF_INET6 ? __inet6_rcv_saddr(sk) : NULL;
 }
 
 static inline int inet_v6_ipv6only(const struct sock *sk)
@@ -395,8 +405,8 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk)
 	return NULL;
 }
 
-#define __tcp_v6_rcv_saddr(__sk)	NULL
-#define tcp_v6_rcv_saddr(__sk)		NULL
+#define __inet6_rcv_saddr(__sk)	NULL
+#define inet6_rcv_saddr(__sk)	NULL
 #define tcp_twsk_ipv6only(__sk)		0
 #define inet_v6_ipv6only(__sk)		0
 #endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index a4a204f99ea6..25f708ff020e 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -110,10 +110,10 @@ static inline struct sock *
 
 		if(*((__u32 *)&(tw->tw_dport))	== ports	&&
 		   sk->sk_family		== PF_INET6) {
-			const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk);
+			const struct inet6_timewait_sock *tw6 = inet6_twsk(sk);
 
-			if (ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr)	&&
-			    ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr)	&&
+			if (ipv6_addr_equal(&tw6->tw_v6_daddr, saddr)	&&
+			    ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr)	&&
 			    (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif))
 				goto hit;
 		}
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 28f7b2103505..ca240f856c46 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -127,7 +127,8 @@ struct inet_timewait_sock {
 	__u16			tw_num;
 	/* And these are ours. */
 	__u8			tw_ipv6only:1;
-	/* 31 bits hole, try to pack */
+	/* 15 bits hole, try to pack */
+	__u16			tw_ipv6_offset;
 	int			tw_timeout;
 	unsigned long		tw_ttd;
 	struct inet_bind_bucket	*tw_tb;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 3ce73b141d7e..c49908192047 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -112,12 +112,12 @@ static int inet_diag_fill(struct sk_buff *skb, struct sock *sk,
 		r->idiag_inode = 0;
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 		if (r->idiag_family == AF_INET6) {
-			const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk);
+			const struct inet6_timewait_sock *tw6 = inet6_twsk(sk);
 
 			ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
-				       &tcp6tw->tw_v6_rcv_saddr);
+				       &tw6->tw_v6_rcv_saddr);
 			ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
-				       &tcp6tw->tw_v6_daddr);
+				       &tw6->tw_v6_daddr);
 		}
 #endif
 		nlh->nlmsg_len = skb->tail - b;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 9c029683a626..2b9b7f6c7f7c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -298,10 +298,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 		if (tw->tw_family == PF_INET6) {
 			struct ipv6_pinfo *np = inet6_sk(sk);
-			struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
+			struct inet6_timewait_sock *tw6;
 
-			ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr);
-			ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr);
+			tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
+			tw6 = inet6_twsk((struct sock *)tw);
+			ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
+			ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
 			tw->tw_ipv6only = np->ipv6only;
 		}
 #endif
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a60585fd85ad..704fb73e6c5f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1195,7 +1195,7 @@ struct inet6_ifaddr * ipv6_get_ifaddr(struct in6_addr *addr, struct net_device *
 int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
 {
 	const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr;
-	const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2);
+	const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
 	u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr;
 	u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
 	int sk_ipv6only = ipv6_only_sock(sk);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2f932ce72610..cb880079daf3 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -138,14 +138,14 @@ static int __tcp_v6_check_established(struct sock *sk, const __u16 lport,
 
 	/* Check TIME-WAIT sockets first. */
 	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
-		const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2);
+		const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2);
 
 		tw = inet_twsk(sk2);
 
 		if(*((__u32 *)&(tw->tw_dport))	== ports	&&
 		   sk2->sk_family		== PF_INET6	&&
-		   ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr)	&&
-		   ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr)	&&
+		   ipv6_addr_equal(&tw6->tw_v6_daddr, saddr)	&&
+		   ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr)	&&
 		   sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
 			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
 			struct tcp_sock *tp = tcp_sk(sk);
@@ -1663,14 +1663,14 @@ static void get_timewait6_sock(struct seq_file *seq,
 {
 	struct in6_addr *dest, *src;
 	__u16 destp, srcp;
-	struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
+	struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw);
 	int ttd = tw->tw_ttd - jiffies;
 
 	if (ttd < 0)
 		ttd = 0;
 
-	dest = &tcp6tw->tw_v6_daddr;
-	src  = &tcp6tw->tw_v6_rcv_saddr;
+	dest = &tw6->tw_v6_daddr;
+	src  = &tw6->tw_v6_rcv_saddr;
 	destp = ntohs(tw->tw_dport);
 	srcp  = ntohs(tw->tw_sport);
 
-- 
cgit v1.2.3-71-gd317


From 6d6ee43e0b8b8d4847627fd43739b98ec2b9404f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:25:19 -0800
Subject: [TWSK]: Introduce struct timewait_sock_ops

So that we can share several timewait sockets related functions and
make the timewait mini sockets infrastructure closer to the request
mini sockets one.

Next changesets will take advantage of this, moving more code out of
TCP and DCCP v4 and v6 to common infrastructure.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h             |  3 +-
 include/net/inet_timewait_sock.h |  3 +-
 include/net/sock.h               |  4 +--
 include/net/tcp.h                |  3 ++
 include/net/timewait_sock.h      | 31 ++++++++++++++++++
 net/core/sock.c                  | 21 ++++++------
 net/dccp/ipv4.c                  |  9 ++++-
 net/dccp/ipv6.c                  |  6 +++-
 net/ipv4/inet_timewait_sock.c    |  5 +--
 net/ipv4/tcp_ipv4.c              | 71 ++++++++++++++++++++++++----------------
 net/ipv6/tcp_ipv6.c              | 25 +++++---------
 11 files changed, 118 insertions(+), 63 deletions(-)
 create mode 100644 include/net/timewait_sock.h

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 7d3908594fac..a0d04891fe12 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -360,7 +360,8 @@ struct tcp6_timewait_sock {
 
 static inline u16 inet6_tw_offset(const struct proto *prot)
 {
-	return prot->twsk_obj_size - sizeof(struct inet6_timewait_sock);
+	return prot->twsk_prot->twsk_obj_size -
+			sizeof(struct inet6_timewait_sock);
 }
 
 static inline struct inet6_timewait_sock *inet6_twsk(const struct sock *sk)
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index ca240f856c46..e396a65473d7 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -26,6 +26,7 @@
 
 #include <net/sock.h>
 #include <net/tcp_states.h>
+#include <net/timewait_sock.h>
 
 #include <asm/atomic.h>
 
@@ -200,7 +201,7 @@ static inline void inet_twsk_put(struct inet_timewait_sock *tw)
 		printk(KERN_DEBUG "%s timewait_sock %p released\n",
 		       tw->tw_prot->name, tw);
 #endif
-		kmem_cache_free(tw->tw_prot->twsk_slab, tw);
+		kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
 		module_put(owner);
 	}
 }
diff --git a/include/net/sock.h b/include/net/sock.h
index 0fbae85c6d55..91d28957dc10 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -493,6 +493,7 @@ extern void sk_stream_kill_queues(struct sock *sk);
 extern int sk_wait_data(struct sock *sk, long *timeo);
 
 struct request_sock_ops;
+struct timewait_sock_ops;
 
 /* Networking protocol blocks we attach to sockets.
  * socket layer -> transport layer interface
@@ -557,11 +558,10 @@ struct proto {
 	kmem_cache_t		*slab;
 	unsigned int		obj_size;
 
-	kmem_cache_t		*twsk_slab;
-	unsigned int		twsk_obj_size;
 	atomic_t		*orphan_count;
 
 	struct request_sock_ops	*rsk_prot;
+	struct timewait_sock_ops *twsk_prot;
 
 	struct module		*owner;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 83b117a25c2a..176221cd0cce 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -287,6 +287,9 @@ extern int			tcp_rcv_established(struct sock *sk,
 
 extern void			tcp_rcv_space_adjust(struct sock *sk);
 
+extern int			tcp_twsk_unique(struct sock *sk,
+						struct sock *sktw, void *twp);
+
 static inline void tcp_dec_quickack_mode(struct sock *sk,
 					 const unsigned int pkts)
 {
diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h
new file mode 100644
index 000000000000..2544281e1d5e
--- /dev/null
+++ b/include/net/timewait_sock.h
@@ -0,0 +1,31 @@
+/*
+ * NET		Generic infrastructure for Network protocols.
+ *
+ * Authors:	Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#ifndef _TIMEWAIT_SOCK_H
+#define _TIMEWAIT_SOCK_H
+
+#include <linux/slab.h>
+#include <net/sock.h>
+
+struct timewait_sock_ops {
+	kmem_cache_t	*twsk_slab;
+	unsigned int	twsk_obj_size;
+	int		(*twsk_unique)(struct sock *sk,
+				       struct sock *sktw, void *twp);
+};
+
+static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
+{
+	if (sk->sk_prot->twsk_prot->twsk_unique != NULL)
+		return sk->sk_prot->twsk_prot->twsk_unique(sk, sktw, twp);
+	return 0;
+}
+
+#endif /* _TIMEWAIT_SOCK_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 13cc3be4f056..6465b0e4c8cb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1488,7 +1488,7 @@ int proto_register(struct proto *prot, int alloc_slab)
 			}
 		}
 
-		if (prot->twsk_obj_size) {
+		if (prot->twsk_prot != NULL) {
 			static const char mask[] = "tw_sock_%s";
 
 			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
@@ -1497,11 +1497,12 @@ int proto_register(struct proto *prot, int alloc_slab)
 				goto out_free_request_sock_slab;
 
 			sprintf(timewait_sock_slab_name, mask, prot->name);
-			prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name,
-							    prot->twsk_obj_size,
-							    0, SLAB_HWCACHE_ALIGN,
-							    NULL, NULL);
-			if (prot->twsk_slab == NULL)
+			prot->twsk_prot->twsk_slab =
+				kmem_cache_create(timewait_sock_slab_name,
+						  prot->twsk_prot->twsk_obj_size,
+						  0, SLAB_HWCACHE_ALIGN,
+						  NULL, NULL);
+			if (prot->twsk_prot->twsk_slab == NULL)
 				goto out_free_timewait_sock_slab_name;
 		}
 	}
@@ -1548,12 +1549,12 @@ void proto_unregister(struct proto *prot)
 		prot->rsk_prot->slab = NULL;
 	}
 
-	if (prot->twsk_slab != NULL) {
-		const char *name = kmem_cache_name(prot->twsk_slab);
+	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
+		const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
 
-		kmem_cache_destroy(prot->twsk_slab);
+		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
 		kfree(name);
-		prot->twsk_slab = NULL;
+		prot->twsk_prot->twsk_slab = NULL;
 	}
 }
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index bc28d71905e2..e11cda0cb6b3 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -20,6 +20,7 @@
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
 #include <net/sock.h>
+#include <net/timewait_sock.h>
 #include <net/tcp_states.h>
 #include <net/xfrm.h>
 
@@ -1309,6 +1310,10 @@ static struct request_sock_ops dccp_request_sock_ops = {
 	.send_reset	= dccp_v4_ctl_send_reset,
 };
 
+static struct timewait_sock_ops dccp_timewait_sock_ops = {
+	.twsk_obj_size	= sizeof(struct inet_timewait_sock),
+};
+
 struct proto dccp_prot = {
 	.name			= "DCCP",
 	.owner			= THIS_MODULE,
@@ -1332,5 +1337,7 @@ struct proto dccp_prot = {
 	.max_header		= MAX_DCCP_HEADER,
 	.obj_size		= sizeof(struct dccp_sock),
 	.rsk_prot		= &dccp_request_sock_ops,
-	.twsk_obj_size		= sizeof(struct inet_timewait_sock),
+	.twsk_prot		= &dccp_timewait_sock_ops,
 };
+
+EXPORT_SYMBOL_GPL(dccp_prot);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index a7d2aee5b3af..4d078f5b911b 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -652,6 +652,10 @@ static struct request_sock_ops dccp6_request_sock_ops = {
 	.send_reset	= dccp_v6_ctl_send_reset,
 };
 
+static struct timewait_sock_ops dccp6_timewait_sock_ops = {
+	.twsk_obj_size	= sizeof(struct dccp6_timewait_sock),
+};
+
 static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
@@ -1359,7 +1363,7 @@ static struct proto dccp_v6_prot = {
 	.max_header		= MAX_DCCP_HEADER,
 	.obj_size		= sizeof(struct dccp6_sock),
 	.rsk_prot		= &dccp6_request_sock_ops,
-	.twsk_obj_size		= sizeof(struct dccp6_timewait_sock),
+	.twsk_prot		= &dccp6_timewait_sock_ops,
 };
 
 static struct inet6_protocol dccp_v6_protocol = {
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index a010e9a68811..417f126c749e 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -90,8 +90,9 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
 
 struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
 {
-	struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab,
-							 SLAB_ATOMIC);
+	struct inet_timewait_sock *tw =
+		kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
+				 SLAB_ATOMIC);
 	if (tw != NULL) {
 		const struct inet_sock *inet = inet_sk(sk);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0b5ab04d3c5a..6728772a943a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -69,6 +69,7 @@
 #include <net/transp_v6.h>
 #include <net/ipv6.h>
 #include <net/inet_common.h>
+#include <net/timewait_sock.h>
 #include <net/xfrm.h>
 
 #include <linux/inet.h>
@@ -118,6 +119,39 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 					  skb->h.th->source);
 }
 
+int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
+{
+	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* With PAWS, it is safe from the viewpoint
+	   of data integrity. Even without PAWS it is safe provided sequence
+	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
+
+	   Actually, the idea is close to VJ's one, only timestamp cache is
+	   held not per host, but per port pair and TW bucket is used as state
+	   holder.
+
+	   If TW bucket has been already destroyed we fall back to VJ's scheme
+	   and use initial timestamp retrieved from peer table.
+	 */
+	if (tcptw->tw_ts_recent_stamp &&
+	    (twp == NULL || (sysctl_tcp_tw_reuse &&
+			     xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
+		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+		if (tp->write_seq == 0)
+			tp->write_seq = 1;
+		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
+		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+		sock_hold(sktw);
+		return 1;
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(tcp_twsk_unique);
+
 /* called with local bh disabled */
 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 				      struct inet_timewait_sock **twp)
@@ -142,35 +176,9 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 		tw = inet_twsk(sk2);
 
 		if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
-			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
-			struct tcp_sock *tp = tcp_sk(sk);
-
-			/* With PAWS, it is safe from the viewpoint
-			   of data integrity. Even without PAWS it
-			   is safe provided sequence spaces do not
-			   overlap i.e. at data rates <= 80Mbit/sec.
-
-			   Actually, the idea is close to VJ's one,
-			   only timestamp cache is held not per host,
-			   but per port pair and TW bucket is used
-			   as state holder.
-
-			   If TW bucket has been already destroyed we
-			   fall back to VJ's scheme and use initial
-			   timestamp retrieved from peer table.
-			 */
-			if (tcptw->tw_ts_recent_stamp &&
-			    (!twp || (sysctl_tcp_tw_reuse &&
-				      xtime.tv_sec -
-				      tcptw->tw_ts_recent_stamp > 1))) {
-				tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
-				if (tp->write_seq == 0)
-					tp->write_seq = 1;
-				tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
-				tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
-				sock_hold(sk2);
+			if (twsk_unique(sk, sk2, twp))
 				goto unique;
-			} else
+			else
 				goto not_unique;
 		}
 	}
@@ -869,6 +877,11 @@ struct request_sock_ops tcp_request_sock_ops = {
 	.send_reset	=	tcp_v4_send_reset,
 };
 
+static struct timewait_sock_ops tcp_timewait_sock_ops = {
+	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
+	.twsk_unique	= tcp_twsk_unique,
+};
+
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 {
 	struct inet_request_sock *ireq;
@@ -1979,7 +1992,7 @@ struct proto tcp_prot = {
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp_sock),
-	.twsk_obj_size		= sizeof(struct tcp_timewait_sock),
+	.twsk_prot		= &tcp_timewait_sock_ops,
 	.rsk_prot		= &tcp_request_sock_ops,
 };
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e5c8a669e84e..514b57bb80b7 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -60,6 +60,7 @@
 #include <net/addrconf.h>
 #include <net/snmp.h>
 #include <net/dsfield.h>
+#include <net/timewait_sock.h>
 
 #include <asm/uaccess.h>
 
@@ -147,22 +148,9 @@ static int __tcp_v6_check_established(struct sock *sk, const __u16 lport,
 		   ipv6_addr_equal(&tw6->tw_v6_daddr, saddr)	&&
 		   ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr)	&&
 		   sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
-			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
-			struct tcp_sock *tp = tcp_sk(sk);
-
-			if (tcptw->tw_ts_recent_stamp &&
-			    (!twp ||
-			     (sysctl_tcp_tw_reuse &&
-			      xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
-				/* See comment in tcp_ipv4.c */
-				tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
-				if (!tp->write_seq)
-					tp->write_seq = 1;
-				tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
-				tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
-				sock_hold(sk2);
+			if (twsk_unique(sk, sk2, twp))
 				goto unique;
-			} else
+			else
 				goto not_unique;
 		}
 	}
@@ -711,6 +699,11 @@ static struct request_sock_ops tcp6_request_sock_ops = {
 	.send_reset	=	tcp_v6_send_reset
 };
 
+static struct timewait_sock_ops tcp6_timewait_sock_ops = {
+	.twsk_obj_size	= sizeof(struct tcp6_timewait_sock),
+	.twsk_unique	= tcp_twsk_unique,
+};
+
 static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
@@ -1752,7 +1745,7 @@ struct proto tcpv6_prot = {
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp6_sock),
-	.twsk_obj_size		= sizeof(struct tcp6_timewait_sock),
+	.twsk_prot		= &tcp6_timewait_sock_ops,
 	.rsk_prot		= &tcp6_request_sock_ops,
 };
 
-- 
cgit v1.2.3-71-gd317


From a7f5e7f164788a22eb5d3de8e2d3cee1bf58fdca Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:25:31 -0800
Subject: [INET]: Generalise tcp_v4_hash_connect

Renaming it to inet_hash_connect, making it possible to ditch
dccp_v4_hash_connect and share the same code with TCP instead.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/char/random.c         |   6 +-
 include/linux/random.h        |   2 +-
 include/net/inet_hashtables.h |   3 +
 net/dccp/ipv4.c               | 160 +------------------------------------
 net/ipv4/inet_hashtables.c    | 178 ++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c           | 173 +---------------------------------------
 6 files changed, 186 insertions(+), 336 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 7999da25fe40..79b59d986af4 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1554,10 +1554,8 @@ __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr,
 
 EXPORT_SYMBOL(secure_tcp_sequence_number);
 
-
-
-/* Generate secure starting point for ephemeral TCP port search */
-u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport)
+/* Generate secure starting point for ephemeral IPV4 transport port search */
+u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport)
 {
 	struct keydata *keyptr = get_keyptr();
 	u32 hash[4];
diff --git a/include/linux/random.h b/include/linux/random.h
index 7b2adb3322d5..01424a8e621c 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -52,7 +52,7 @@ extern void get_random_bytes(void *buf, int nbytes);
 void generate_random_uuid(unsigned char uuid_out[16]);
 
 extern __u32 secure_ip_id(__u32 daddr);
-extern u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport);
+extern u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport);
 extern u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, 
 				       __u16 dport);
 extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr,
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 07840baa9341..c83baa79f66e 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -434,4 +434,7 @@ static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo,
 
 	return sk;
 }
+
+extern int inet_hash_connect(struct inet_timewait_death_row *death_row,
+			     struct sock *sk);
 #endif /* _INET_HASHTABLES_H */
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index e11cda0cb6b3..671fbf3b2379 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -54,164 +54,6 @@ void dccp_unhash(struct sock *sk)
 
 EXPORT_SYMBOL_GPL(dccp_unhash);
 
-/* called with local bh disabled */
-static int __dccp_v4_check_established(struct sock *sk, const __u16 lport,
-				      struct inet_timewait_sock **twp)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	const u32 daddr = inet->rcv_saddr;
-	const u32 saddr = inet->daddr;
-	const int dif = sk->sk_bound_dev_if;
-	INET_ADDR_COOKIE(acookie, saddr, daddr)
-	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
-	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
-	struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash);
-	const struct sock *sk2;
-	const struct hlist_node *node;
-	struct inet_timewait_sock *tw;
-
-	prefetch(head->chain.first);
-	write_lock(&head->lock);
-
-	/* Check TIME-WAIT sockets first. */
-	sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) {
-		tw = inet_twsk(sk2);
-
-		if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
-			goto not_unique;
-	}
-	tw = NULL;
-
-	/* And established part... */
-	sk_for_each(sk2, node, &head->chain) {
-		if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
-			goto not_unique;
-	}
-
-	/* Must record num and sport now. Otherwise we will see
-	 * in hash table socket with a funny identity. */
-	inet->num = lport;
-	inet->sport = htons(lport);
-	sk->sk_hash = hash;
-	BUG_TRAP(sk_unhashed(sk));
-	__sk_add_node(sk, &head->chain);
-	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(&head->lock);
-
-	if (twp != NULL) {
-		*twp = tw;
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-	} else if (tw != NULL) {
-		/* Silly. Should hash-dance instead... */
-		inet_twsk_deschedule(tw, &dccp_death_row);
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-
-		inet_twsk_put(tw);
-	}
-
-	return 0;
-
-not_unique:
-	write_unlock(&head->lock);
-	return -EADDRNOTAVAIL;
-}
-
-/*
- * Bind a port for a connect operation and hash it.
- */
-static int dccp_v4_hash_connect(struct sock *sk)
-{
-	const unsigned short snum = inet_sk(sk)->num;
- 	struct inet_bind_hashbucket *head;
- 	struct inet_bind_bucket *tb;
-	int ret;
-
- 	if (snum == 0) {
- 		int low = sysctl_local_port_range[0];
- 		int high = sysctl_local_port_range[1];
- 		int remaining = (high - low) + 1;
- 		int rover = net_random() % (high - low) + low;
-		struct hlist_node *node;
- 		struct inet_timewait_sock *tw = NULL;
-
- 		local_bh_disable();
- 		do {
- 			head = &dccp_hashinfo.bhash[inet_bhashfn(rover,
-						    dccp_hashinfo.bhash_size)];
- 			spin_lock(&head->lock);
-
- 			/* Does not bother with rcv_saddr checks,
- 			 * because the established check is already
- 			 * unique enough.
- 			 */
-			inet_bind_bucket_for_each(tb, node, &head->chain) {
- 				if (tb->port == rover) {
- 					BUG_TRAP(!hlist_empty(&tb->owners));
- 					if (tb->fastreuse >= 0)
- 						goto next_port;
- 					if (!__dccp_v4_check_established(sk,
-									 rover,
-									 &tw))
- 						goto ok;
- 					goto next_port;
- 				}
- 			}
-
- 			tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep,
-						     head, rover);
- 			if (tb == NULL) {
- 				spin_unlock(&head->lock);
- 				break;
- 			}
- 			tb->fastreuse = -1;
- 			goto ok;
-
- 		next_port:
- 			spin_unlock(&head->lock);
- 			if (++rover > high)
- 				rover = low;
- 		} while (--remaining > 0);
-
- 		local_bh_enable();
-
- 		return -EADDRNOTAVAIL;
-
-ok:
- 		/* All locks still held and bhs disabled */
- 		inet_bind_hash(sk, tb, rover);
-		if (sk_unhashed(sk)) {
- 			inet_sk(sk)->sport = htons(rover);
- 			__inet_hash(&dccp_hashinfo, sk, 0);
- 		}
- 		spin_unlock(&head->lock);
-
- 		if (tw != NULL) {
- 			inet_twsk_deschedule(tw, &dccp_death_row);
- 			inet_twsk_put(tw);
- 		}
-
-		ret = 0;
-		goto out;
- 	}
-
- 	head = &dccp_hashinfo.bhash[inet_bhashfn(snum,
-						 dccp_hashinfo.bhash_size)];
- 	tb   = inet_csk(sk)->icsk_bind_hash;
-	spin_lock_bh(&head->lock);
-	if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) {
-		__inet_hash(&dccp_hashinfo, sk, 0);
-		spin_unlock_bh(&head->lock);
-		return 0;
-	} else {
-		spin_unlock(&head->lock);
-		/* No definite answer... Walk to established hash table */
-		ret = __dccp_v4_check_established(sk, snum, NULL);
-out:
-		local_bh_enable();
-		return ret;
-	}
-}
-
 int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct inet_sock *inet = inet_sk(sk);
@@ -272,7 +114,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	 * complete initialization after this.
 	 */
 	dccp_set_state(sk, DCCP_REQUESTING);
-	err = dccp_v4_hash_connect(sk);
+	err = inet_hash_connect(&dccp_death_row, sk);
 	if (err != 0)
 		goto failure;
 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index e8d29fe736d2..33228115cda4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -15,12 +15,14 @@
 
 #include <linux/config.h>
 #include <linux/module.h>
+#include <linux/random.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/wait.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/inet_hashtables.h>
+#include <net/ip.h>
 
 /*
  * Allocate and initialize a new local port bind bucket.
@@ -163,3 +165,179 @@ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 dad
 }
 
 EXPORT_SYMBOL_GPL(__inet_lookup_listener);
+
+/* called with local bh disabled */
+static int __inet_check_established(struct inet_timewait_death_row *death_row,
+				    struct sock *sk, __u16 lport,
+				    struct inet_timewait_sock **twp)
+{
+	struct inet_hashinfo *hinfo = death_row->hashinfo;
+	struct inet_sock *inet = inet_sk(sk);
+	u32 daddr = inet->rcv_saddr;
+	u32 saddr = inet->daddr;
+	int dif = sk->sk_bound_dev_if;
+	INET_ADDR_COOKIE(acookie, saddr, daddr)
+	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
+	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+	struct sock *sk2;
+	const struct hlist_node *node;
+	struct inet_timewait_sock *tw;
+
+	prefetch(head->chain.first);
+	write_lock(&head->lock);
+
+	/* Check TIME-WAIT sockets first. */
+	sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
+		tw = inet_twsk(sk2);
+
+		if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
+			if (twsk_unique(sk, sk2, twp))
+				goto unique;
+			else
+				goto not_unique;
+		}
+	}
+	tw = NULL;
+
+	/* And established part... */
+	sk_for_each(sk2, node, &head->chain) {
+		if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
+			goto not_unique;
+	}
+
+unique:
+	/* Must record num and sport now. Otherwise we will see
+	 * in hash table socket with a funny identity. */
+	inet->num = lport;
+	inet->sport = htons(lport);
+	sk->sk_hash = hash;
+	BUG_TRAP(sk_unhashed(sk));
+	__sk_add_node(sk, &head->chain);
+	sock_prot_inc_use(sk->sk_prot);
+	write_unlock(&head->lock);
+
+	if (twp) {
+		*twp = tw;
+		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+	} else if (tw) {
+		/* Silly. Should hash-dance instead... */
+		inet_twsk_deschedule(tw, death_row);
+		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+
+		inet_twsk_put(tw);
+	}
+
+	return 0;
+
+not_unique:
+	write_unlock(&head->lock);
+	return -EADDRNOTAVAIL;
+}
+
+static inline u32 inet_sk_port_offset(const struct sock *sk)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr, 
+					  inet->dport);
+}
+
+/*
+ * Bind a port for a connect operation and hash it.
+ */
+int inet_hash_connect(struct inet_timewait_death_row *death_row,
+		      struct sock *sk)
+{
+	struct inet_hashinfo *hinfo = death_row->hashinfo;
+	const unsigned short snum = inet_sk(sk)->num;
+ 	struct inet_bind_hashbucket *head;
+ 	struct inet_bind_bucket *tb;
+	int ret;
+
+ 	if (!snum) {
+ 		int low = sysctl_local_port_range[0];
+ 		int high = sysctl_local_port_range[1];
+		int range = high - low;
+ 		int i;
+		int port;
+		static u32 hint;
+		u32 offset = hint + inet_sk_port_offset(sk);
+		struct hlist_node *node;
+ 		struct inet_timewait_sock *tw = NULL;
+
+ 		local_bh_disable();
+		for (i = 1; i <= range; i++) {
+			port = low + (i + offset) % range;
+ 			head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
+ 			spin_lock(&head->lock);
+
+ 			/* Does not bother with rcv_saddr checks,
+ 			 * because the established check is already
+ 			 * unique enough.
+ 			 */
+			inet_bind_bucket_for_each(tb, node, &head->chain) {
+ 				if (tb->port == port) {
+ 					BUG_TRAP(!hlist_empty(&tb->owners));
+ 					if (tb->fastreuse >= 0)
+ 						goto next_port;
+ 					if (!__inet_check_established(death_row,
+								      sk, port,
+								      &tw))
+ 						goto ok;
+ 					goto next_port;
+ 				}
+ 			}
+
+ 			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port);
+ 			if (!tb) {
+ 				spin_unlock(&head->lock);
+ 				break;
+ 			}
+ 			tb->fastreuse = -1;
+ 			goto ok;
+
+ 		next_port:
+ 			spin_unlock(&head->lock);
+ 		}
+ 		local_bh_enable();
+
+ 		return -EADDRNOTAVAIL;
+
+ok:
+		hint += i;
+
+ 		/* Head lock still held and bh's disabled */
+ 		inet_bind_hash(sk, tb, port);
+		if (sk_unhashed(sk)) {
+ 			inet_sk(sk)->sport = htons(port);
+ 			__inet_hash(hinfo, sk, 0);
+ 		}
+ 		spin_unlock(&head->lock);
+
+ 		if (tw) {
+ 			inet_twsk_deschedule(tw, death_row);;
+ 			inet_twsk_put(tw);
+ 		}
+
+		ret = 0;
+		goto out;
+ 	}
+
+ 	head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
+ 	tb  = inet_csk(sk)->icsk_bind_hash;
+	spin_lock_bh(&head->lock);
+	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+		__inet_hash(hinfo, sk, 0);
+		spin_unlock_bh(&head->lock);
+		return 0;
+	} else {
+		spin_unlock(&head->lock);
+		/* No definite answer... Walk to established hash table */
+		ret = __inet_check_established(death_row, sk, snum, NULL);
+out:
+		local_bh_enable();
+		return ret;
+	}
+}
+
+EXPORT_SYMBOL_GPL(inet_hash_connect);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6728772a943a..c2fe61becd61 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -152,177 +152,6 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 
 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 
-/* called with local bh disabled */
-static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
-				      struct inet_timewait_sock **twp)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	u32 daddr = inet->rcv_saddr;
-	u32 saddr = inet->daddr;
-	int dif = sk->sk_bound_dev_if;
-	INET_ADDR_COOKIE(acookie, saddr, daddr)
-	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
-	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
-	struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
-	struct sock *sk2;
-	const struct hlist_node *node;
-	struct inet_timewait_sock *tw;
-
-	prefetch(head->chain.first);
-	write_lock(&head->lock);
-
-	/* Check TIME-WAIT sockets first. */
-	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
-		tw = inet_twsk(sk2);
-
-		if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
-			if (twsk_unique(sk, sk2, twp))
-				goto unique;
-			else
-				goto not_unique;
-		}
-	}
-	tw = NULL;
-
-	/* And established part... */
-	sk_for_each(sk2, node, &head->chain) {
-		if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
-			goto not_unique;
-	}
-
-unique:
-	/* Must record num and sport now. Otherwise we will see
-	 * in hash table socket with a funny identity. */
-	inet->num = lport;
-	inet->sport = htons(lport);
-	sk->sk_hash = hash;
-	BUG_TRAP(sk_unhashed(sk));
-	__sk_add_node(sk, &head->chain);
-	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(&head->lock);
-
-	if (twp) {
-		*twp = tw;
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-	} else if (tw) {
-		/* Silly. Should hash-dance instead... */
-		inet_twsk_deschedule(tw, &tcp_death_row);
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-
-		inet_twsk_put(tw);
-	}
-
-	return 0;
-
-not_unique:
-	write_unlock(&head->lock);
-	return -EADDRNOTAVAIL;
-}
-
-static inline u32 connect_port_offset(const struct sock *sk)
-{
-	const struct inet_sock *inet = inet_sk(sk);
-
-	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
-					 inet->dport);
-}
-
-/*
- * Bind a port for a connect operation and hash it.
- */
-static inline int tcp_v4_hash_connect(struct sock *sk)
-{
-	const unsigned short snum = inet_sk(sk)->num;
- 	struct inet_bind_hashbucket *head;
- 	struct inet_bind_bucket *tb;
-	int ret;
-
- 	if (!snum) {
- 		int low = sysctl_local_port_range[0];
- 		int high = sysctl_local_port_range[1];
-		int range = high - low;
- 		int i;
-		int port;
-		static u32 hint;
-		u32 offset = hint + connect_port_offset(sk);
-		struct hlist_node *node;
- 		struct inet_timewait_sock *tw = NULL;
-
- 		local_bh_disable();
-		for (i = 1; i <= range; i++) {
-			port = low + (i + offset) % range;
- 			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
- 			spin_lock(&head->lock);
-
- 			/* Does not bother with rcv_saddr checks,
- 			 * because the established check is already
- 			 * unique enough.
- 			 */
-			inet_bind_bucket_for_each(tb, node, &head->chain) {
- 				if (tb->port == port) {
- 					BUG_TRAP(!hlist_empty(&tb->owners));
- 					if (tb->fastreuse >= 0)
- 						goto next_port;
- 					if (!__tcp_v4_check_established(sk,
-									port,
-									&tw))
- 						goto ok;
- 					goto next_port;
- 				}
- 			}
-
- 			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
- 			if (!tb) {
- 				spin_unlock(&head->lock);
- 				break;
- 			}
- 			tb->fastreuse = -1;
- 			goto ok;
-
- 		next_port:
- 			spin_unlock(&head->lock);
- 		}
- 		local_bh_enable();
-
- 		return -EADDRNOTAVAIL;
-
-ok:
-		hint += i;
-
- 		/* Head lock still held and bh's disabled */
- 		inet_bind_hash(sk, tb, port);
-		if (sk_unhashed(sk)) {
- 			inet_sk(sk)->sport = htons(port);
- 			__inet_hash(&tcp_hashinfo, sk, 0);
- 		}
- 		spin_unlock(&head->lock);
-
- 		if (tw) {
- 			inet_twsk_deschedule(tw, &tcp_death_row);;
- 			inet_twsk_put(tw);
- 		}
-
-		ret = 0;
-		goto out;
- 	}
-
- 	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
- 	tb  = inet_csk(sk)->icsk_bind_hash;
-	spin_lock_bh(&head->lock);
-	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		__inet_hash(&tcp_hashinfo, sk, 0);
-		spin_unlock_bh(&head->lock);
-		return 0;
-	} else {
-		spin_unlock(&head->lock);
-		/* No definite answer... Walk to established hash table */
-		ret = __tcp_v4_check_established(sk, snum, NULL);
-out:
-		local_bh_enable();
-		return ret;
-	}
-}
-
 /* This will initiate an outgoing connection. */
 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
@@ -403,7 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	 * complete initialization after this.
 	 */
 	tcp_set_state(sk, TCP_SYN_SENT);
-	err = tcp_v4_hash_connect(sk);
+	err = inet_hash_connect(&tcp_death_row, sk);
 	if (err)
 		goto failure;
 
-- 
cgit v1.2.3-71-gd317


From d8313f5ca2b1f86b7df6c99fc4b3fffa1f84e92b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:25:44 -0800
Subject: [INET6]: Generalise tcp_v6_hash_connect

Renaming it to inet6_hash_connect, making it possible to ditch
dccp_v6_hash_connect and share the same code with TCP instead.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/char/random.c       |   4 +-
 include/linux/random.h      |   4 +-
 include/net/ipv6.h          |   3 +
 net/dccp/ipv6.c             | 171 +----------------------------------------
 net/ipv6/inet6_hashtables.c | 183 +++++++++++++++++++++++++++++++++++++++++++-
 net/ipv6/tcp_ipv6.c         | 173 +----------------------------------------
 6 files changed, 190 insertions(+), 348 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 79b59d986af4..bdfdfd28594d 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1573,7 +1573,7 @@ u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport)
 }
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dport)
+u32 secure_ipv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dport)
 {
 	struct keydata *keyptr = get_keyptr();
 	u32 hash[12];
@@ -1584,7 +1584,7 @@ u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dp
 
 	return twothirdsMD4Transform(daddr, hash);
 }
-EXPORT_SYMBOL(secure_tcpv6_port_ephemeral);
+EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
 #endif
 
 #if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE)
diff --git a/include/linux/random.h b/include/linux/random.h
index 01424a8e621c..5d6456bcdeba 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -53,8 +53,8 @@ void generate_random_uuid(unsigned char uuid_out[16]);
 
 extern __u32 secure_ip_id(__u32 daddr);
 extern u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport);
-extern u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, 
-				       __u16 dport);
+extern u32 secure_ipv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, 
+				      __u16 dport);
 extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr,
 					__u16 sport, __u16 dport);
 extern __u32 secure_tcpv6_sequence_number(__u32 *saddr, __u32 *daddr,
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 851376108ac2..e3d5d7bc8837 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -527,6 +527,9 @@ extern int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 extern int inet6_ioctl(struct socket *sock, unsigned int cmd, 
 		       unsigned long arg);
 
+extern int inet6_hash_connect(struct inet_timewait_death_row *death_row,
+			      struct sock *sk);
+
 /*
  * reassembly.c
  */
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 4d078f5b911b..71bf04eb21e1 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -84,175 +84,6 @@ static __u32 dccp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
 						   dh->dccph_sport);
 }
 
-static int __dccp_v6_check_established(struct sock *sk, const __u16 lport,
-				       struct inet_timewait_sock **twp)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	const struct ipv6_pinfo *np = inet6_sk(sk);
-	const struct in6_addr *daddr = &np->rcv_saddr;
-	const struct in6_addr *saddr = &np->daddr;
-	const int dif = sk->sk_bound_dev_if;
-	const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
-	const unsigned int hash = inet6_ehashfn(daddr, inet->num,
-						saddr, inet->dport);
-	struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash);
-	struct sock *sk2;
-	const struct hlist_node *node;
-	struct inet_timewait_sock *tw;
-
-	prefetch(head->chain.first);
-	write_lock(&head->lock);
-
-	/* Check TIME-WAIT sockets first. */
-	sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) {
-		const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2);
-
-		tw = inet_twsk(sk2);
-
-		if(*((__u32 *)&(tw->tw_dport))	== ports	 &&
-		   sk2->sk_family		== PF_INET6	 &&
-		   ipv6_addr_equal(&tw6->tw_v6_daddr, saddr)	 &&
-		   ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) &&
-		   sk2->sk_bound_dev_if == sk->sk_bound_dev_if)
-			goto not_unique;
-	}
-	tw = NULL;
-
-	/* And established part... */
-	sk_for_each(sk2, node, &head->chain) {
-		if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
-			goto not_unique;
-	}
-
-	BUG_TRAP(sk_unhashed(sk));
-	__sk_add_node(sk, &head->chain);
-	sk->sk_hash = hash;
-	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(&head->lock);
-
-	if (twp) {
-		*twp = tw;
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-	} else if (tw) {
-		/* Silly. Should hash-dance instead... */
-		inet_twsk_deschedule(tw, &dccp_death_row);
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-
-		inet_twsk_put(tw);
-	}
-	return 0;
-
-not_unique:
-	write_unlock(&head->lock);
-	return -EADDRNOTAVAIL;
-}
-
-static inline u32 dccp_v6_port_offset(const struct sock *sk)
-{
-	const struct inet_sock *inet = inet_sk(sk);
-	const struct ipv6_pinfo *np = inet6_sk(sk);
-
-	return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32,
-					   np->daddr.s6_addr32,
-					   inet->dport);
-}
-
-static int dccp_v6_hash_connect(struct sock *sk)
-{
-	const unsigned short snum = inet_sk(sk)->num;
- 	struct inet_bind_hashbucket *head;
- 	struct inet_bind_bucket *tb;
-	int ret;
-
- 	if (snum == 0) {
- 		int low = sysctl_local_port_range[0];
- 		int high = sysctl_local_port_range[1];
-		int range = high - low;
- 		int i;
-		int port;
-		static u32 hint;
-		u32 offset = hint + dccp_v6_port_offset(sk);
-		struct hlist_node *node;
- 		struct inet_timewait_sock *tw = NULL;
-
- 		local_bh_disable();
-		for (i = 1; i <= range; i++) {
-			port = low + (i + offset) % range;
- 			head = &dccp_hashinfo.bhash[inet_bhashfn(port,
-						    dccp_hashinfo.bhash_size)];
- 			spin_lock(&head->lock);
-
- 			/* Does not bother with rcv_saddr checks,
- 			 * because the established check is already
- 			 * unique enough.
- 			 */
-			inet_bind_bucket_for_each(tb, node, &head->chain) {
- 				if (tb->port == port) {
- 					BUG_TRAP(!hlist_empty(&tb->owners));
- 					if (tb->fastreuse >= 0)
- 						goto next_port;
- 					if (!__dccp_v6_check_established(sk,
-									 port,
-									 &tw))
- 						goto ok;
- 					goto next_port;
- 				}
- 			}
-
- 			tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep,
-						     head, port);
- 			if (!tb) {
- 				spin_unlock(&head->lock);
- 				break;
- 			}
- 			tb->fastreuse = -1;
- 			goto ok;
-
- 		next_port:
- 			spin_unlock(&head->lock);
- 		}
- 		local_bh_enable();
-
- 		return -EADDRNOTAVAIL;
-ok:
-		hint += i;
-
- 		/* Head lock still held and bh's disabled */
- 		inet_bind_hash(sk, tb, port);
-		if (sk_unhashed(sk)) {
- 			inet_sk(sk)->sport = htons(port);
- 			__inet6_hash(&dccp_hashinfo, sk);
- 		}
- 		spin_unlock(&head->lock);
-
- 		if (tw) {
- 			inet_twsk_deschedule(tw, &dccp_death_row);
- 			inet_twsk_put(tw);
- 		}
-
-		ret = 0;
-		goto out;
- 	}
-
- 	head = &dccp_hashinfo.bhash[inet_bhashfn(snum,
-						 dccp_hashinfo.bhash_size)];
- 	tb   = inet_csk(sk)->icsk_bind_hash;
-	spin_lock_bh(&head->lock);
-
-	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		__inet6_hash(&dccp_hashinfo, sk);
-		spin_unlock_bh(&head->lock);
-		return 0;
-	} else {
-		spin_unlock(&head->lock);
-		/* No definite answer... Walk to established hash table */
-		ret = __dccp_v6_check_established(sk, snum, NULL);
-out:
-		local_bh_enable();
-		return ret;
-	}
-}
-
 static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, 
 			   int addr_len)
 {
@@ -403,7 +234,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	inet->dport = usin->sin6_port;
 
 	dccp_set_state(sk, DCCP_REQUESTING);
-	err = dccp_v6_hash_connect(sk);
+	err = inet6_hash_connect(&dccp_death_row, sk);
 	if (err)
 		goto late_failure;
 	/* FIXME */
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 01d5f46d4e40..4154f3a8b6cf 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -5,7 +5,8 @@
  *
  *		Generic INET6 transport hashtables
  *
- * Authors:	Lotsa people, from code originally in tcp
+ * Authors:	Lotsa people, from code originally in tcp, generalised here
+ * 		by Arnaldo Carvalho de Melo <acme@mandriva.com>
  *
  *	This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -14,12 +15,13 @@
  */
 
 #include <linux/config.h>
-
 #include <linux/module.h>
+#include <linux/random.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/inet_hashtables.h>
 #include <net/inet6_hashtables.h>
+#include <net/ip.h>
 
 struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo,
 				   const struct in6_addr *daddr,
@@ -79,3 +81,180 @@ struct sock *inet6_lookup(struct inet_hashinfo *hashinfo,
 }
 
 EXPORT_SYMBOL_GPL(inet6_lookup);
+
+static int __inet6_check_established(struct inet_timewait_death_row *death_row,
+				     struct sock *sk, const __u16 lport,
+				     struct inet_timewait_sock **twp)
+{
+	struct inet_hashinfo *hinfo = death_row->hashinfo;
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct ipv6_pinfo *np = inet6_sk(sk);
+	const struct in6_addr *daddr = &np->rcv_saddr;
+	const struct in6_addr *saddr = &np->daddr;
+	const int dif = sk->sk_bound_dev_if;
+	const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+	const unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr,
+						inet->dport);
+	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+	struct sock *sk2;
+	const struct hlist_node *node;
+	struct inet_timewait_sock *tw;
+
+	prefetch(head->chain.first);
+	write_lock(&head->lock);
+
+	/* Check TIME-WAIT sockets first. */
+	sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
+		const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2);
+
+		tw = inet_twsk(sk2);
+
+		if(*((__u32 *)&(tw->tw_dport)) == ports		 &&
+		   sk2->sk_family	       == PF_INET6	 &&
+		   ipv6_addr_equal(&tw6->tw_v6_daddr, saddr)	 &&
+		   ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) &&
+		   sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
+			if (twsk_unique(sk, sk2, twp))
+				goto unique;
+			else
+				goto not_unique;
+		}
+	}
+	tw = NULL;
+
+	/* And established part... */
+	sk_for_each(sk2, node, &head->chain) {
+		if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
+			goto not_unique;
+	}
+
+unique:
+	BUG_TRAP(sk_unhashed(sk));
+	__sk_add_node(sk, &head->chain);
+	sk->sk_hash = hash;
+	sock_prot_inc_use(sk->sk_prot);
+	write_unlock(&head->lock);
+
+	if (twp != NULL) {
+		*twp = tw;
+		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+	} else if (tw != NULL) {
+		/* Silly. Should hash-dance instead... */
+		inet_twsk_deschedule(tw, death_row);
+		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+
+		inet_twsk_put(tw);
+	}
+	return 0;
+
+not_unique:
+	write_unlock(&head->lock);
+	return -EADDRNOTAVAIL;
+}
+
+static inline u32 inet6_sk_port_offset(const struct sock *sk)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct ipv6_pinfo *np = inet6_sk(sk);
+	return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32,
+					  np->daddr.s6_addr32,
+					  inet->dport);
+}
+
+int inet6_hash_connect(struct inet_timewait_death_row *death_row,
+		       struct sock *sk)
+{
+	struct inet_hashinfo *hinfo = death_row->hashinfo;
+	const unsigned short snum = inet_sk(sk)->num;
+ 	struct inet_bind_hashbucket *head;
+ 	struct inet_bind_bucket *tb;
+	int ret;
+
+ 	if (snum == 0) {
+ 		const int low = sysctl_local_port_range[0];
+ 		const int high = sysctl_local_port_range[1];
+		const int range = high - low;
+ 		int i, port;
+		static u32 hint;
+		const u32 offset = hint + inet6_sk_port_offset(sk);
+		struct hlist_node *node;
+ 		struct inet_timewait_sock *tw = NULL;
+
+ 		local_bh_disable();
+		for (i = 1; i <= range; i++) {
+			port = low + (i + offset) % range;
+ 			head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
+ 			spin_lock(&head->lock);
+
+ 			/* Does not bother with rcv_saddr checks,
+ 			 * because the established check is already
+ 			 * unique enough.
+ 			 */
+			inet_bind_bucket_for_each(tb, node, &head->chain) {
+ 				if (tb->port == port) {
+ 					BUG_TRAP(!hlist_empty(&tb->owners));
+ 					if (tb->fastreuse >= 0)
+ 						goto next_port;
+ 					if (!__inet6_check_established(death_row,
+								       sk, port,
+								       &tw))
+ 						goto ok;
+ 					goto next_port;
+ 				}
+ 			}
+
+ 			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+						     head, port);
+ 			if (!tb) {
+ 				spin_unlock(&head->lock);
+ 				break;
+ 			}
+ 			tb->fastreuse = -1;
+ 			goto ok;
+
+ 		next_port:
+ 			spin_unlock(&head->lock);
+ 		}
+ 		local_bh_enable();
+
+ 		return -EADDRNOTAVAIL;
+
+ok:
+		hint += i;
+
+ 		/* Head lock still held and bh's disabled */
+ 		inet_bind_hash(sk, tb, port);
+		if (sk_unhashed(sk)) {
+ 			inet_sk(sk)->sport = htons(port);
+ 			__inet6_hash(hinfo, sk);
+ 		}
+ 		spin_unlock(&head->lock);
+
+ 		if (tw) {
+ 			inet_twsk_deschedule(tw, death_row);
+ 			inet_twsk_put(tw);
+ 		}
+
+		ret = 0;
+		goto out;
+ 	}
+
+ 	head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
+ 	tb   = inet_csk(sk)->icsk_bind_hash;
+	spin_lock_bh(&head->lock);
+
+	if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) {
+		__inet6_hash(hinfo, sk);
+		spin_unlock_bh(&head->lock);
+		return 0;
+	} else {
+		spin_unlock(&head->lock);
+		/* No definite answer... Walk to established hash table */
+		ret = __inet6_check_established(death_row, sk, snum, NULL);
+out:
+		local_bh_enable();
+		return ret;
+	}
+}
+
+EXPORT_SYMBOL_GPL(inet6_hash_connect);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 514b57bb80b7..a682eb9093e1 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -119,177 +119,6 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
 	}
 }
 
-static int __tcp_v6_check_established(struct sock *sk, const __u16 lport,
-				      struct inet_timewait_sock **twp)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	const struct ipv6_pinfo *np = inet6_sk(sk);
-	const struct in6_addr *daddr = &np->rcv_saddr;
-	const struct in6_addr *saddr = &np->daddr;
-	const int dif = sk->sk_bound_dev_if;
-	const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
-	unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport);
-	struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
-	struct sock *sk2;
-	const struct hlist_node *node;
-	struct inet_timewait_sock *tw;
-
-	prefetch(head->chain.first);
-	write_lock(&head->lock);
-
-	/* Check TIME-WAIT sockets first. */
-	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
-		const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2);
-
-		tw = inet_twsk(sk2);
-
-		if(*((__u32 *)&(tw->tw_dport))	== ports	&&
-		   sk2->sk_family		== PF_INET6	&&
-		   ipv6_addr_equal(&tw6->tw_v6_daddr, saddr)	&&
-		   ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr)	&&
-		   sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
-			if (twsk_unique(sk, sk2, twp))
-				goto unique;
-			else
-				goto not_unique;
-		}
-	}
-	tw = NULL;
-
-	/* And established part... */
-	sk_for_each(sk2, node, &head->chain) {
-		if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
-			goto not_unique;
-	}
-
-unique:
-	BUG_TRAP(sk_unhashed(sk));
-	__sk_add_node(sk, &head->chain);
-	sk->sk_hash = hash;
-	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(&head->lock);
-
-	if (twp) {
-		*twp = tw;
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-	} else if (tw) {
-		/* Silly. Should hash-dance instead... */
-		inet_twsk_deschedule(tw, &tcp_death_row);
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-
-		inet_twsk_put(tw);
-	}
-	return 0;
-
-not_unique:
-	write_unlock(&head->lock);
-	return -EADDRNOTAVAIL;
-}
-
-static inline u32 tcpv6_port_offset(const struct sock *sk)
-{
-	const struct inet_sock *inet = inet_sk(sk);
-	const struct ipv6_pinfo *np = inet6_sk(sk);
-
-	return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32,
-					   np->daddr.s6_addr32,
-					   inet->dport);
-}
-
-static int tcp_v6_hash_connect(struct sock *sk)
-{
-	unsigned short snum = inet_sk(sk)->num;
- 	struct inet_bind_hashbucket *head;
- 	struct inet_bind_bucket *tb;
-	int ret;
-
- 	if (!snum) {
- 		int low = sysctl_local_port_range[0];
- 		int high = sysctl_local_port_range[1];
-		int range = high - low;
- 		int i;
-		int port;
-		static u32 hint;
-		u32 offset = hint + tcpv6_port_offset(sk);
-		struct hlist_node *node;
- 		struct inet_timewait_sock *tw = NULL;
-
- 		local_bh_disable();
-		for (i = 1; i <= range; i++) {
-			port = low + (i + offset) % range;
- 			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
- 			spin_lock(&head->lock);
-
- 			/* Does not bother with rcv_saddr checks,
- 			 * because the established check is already
- 			 * unique enough.
- 			 */
-			inet_bind_bucket_for_each(tb, node, &head->chain) {
- 				if (tb->port == port) {
- 					BUG_TRAP(!hlist_empty(&tb->owners));
- 					if (tb->fastreuse >= 0)
- 						goto next_port;
- 					if (!__tcp_v6_check_established(sk,
-									port,
-									&tw))
- 						goto ok;
- 					goto next_port;
- 				}
- 			}
-
- 			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
- 			if (!tb) {
- 				spin_unlock(&head->lock);
- 				break;
- 			}
- 			tb->fastreuse = -1;
- 			goto ok;
-
- 		next_port:
- 			spin_unlock(&head->lock);
- 		}
- 		local_bh_enable();
-
- 		return -EADDRNOTAVAIL;
-
-ok:
-		hint += i;
-
- 		/* Head lock still held and bh's disabled */
- 		inet_bind_hash(sk, tb, port);
-		if (sk_unhashed(sk)) {
- 			inet_sk(sk)->sport = htons(port);
- 			__inet6_hash(&tcp_hashinfo, sk);
- 		}
- 		spin_unlock(&head->lock);
-
- 		if (tw) {
- 			inet_twsk_deschedule(tw, &tcp_death_row);
- 			inet_twsk_put(tw);
- 		}
-
-		ret = 0;
-		goto out;
- 	}
-
- 	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
- 	tb   = inet_csk(sk)->icsk_bind_hash;
-	spin_lock_bh(&head->lock);
-
-	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		__inet6_hash(&tcp_hashinfo, sk);
-		spin_unlock_bh(&head->lock);
-		return 0;
-	} else {
-		spin_unlock(&head->lock);
-		/* No definite answer... Walk to established hash table */
-		ret = __tcp_v6_check_established(sk, snum, NULL);
-out:
-		local_bh_enable();
-		return ret;
-	}
-}
-
 static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, 
 			  int addr_len)
 {
@@ -450,7 +279,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	inet->dport = usin->sin6_port;
 
 	tcp_set_state(sk, TCP_SYN_SENT);
-	err = tcp_v6_hash_connect(sk);
+	err = inet6_hash_connect(&tcp_death_row, sk);
 	if (err)
 		goto late_failure;
 
-- 
cgit v1.2.3-71-gd317


From 22712813620fa8e682dbfb253a60ca0131da1e07 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:25:56 -0800
Subject: [TCP]: Move the TCPF_ enum to tcp_states.h

Upcoming patches will make, for instance, ip_sockglue.c need just this enum
and not all of tcp.h.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 16 ----------------
 include/net/tcp_states.h | 16 ++++++++++++++++
 2 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4e1434007f44..da38eea1994b 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -55,22 +55,6 @@ struct tcphdr {
 	__u16	urg_ptr;
 };
 
-#define TCP_ACTION_FIN	(1 << 7)
-
-enum {
-  TCPF_ESTABLISHED = (1 << 1),
-  TCPF_SYN_SENT  = (1 << 2),
-  TCPF_SYN_RECV  = (1 << 3),
-  TCPF_FIN_WAIT1 = (1 << 4),
-  TCPF_FIN_WAIT2 = (1 << 5),
-  TCPF_TIME_WAIT = (1 << 6),
-  TCPF_CLOSE     = (1 << 7),
-  TCPF_CLOSE_WAIT = (1 << 8),
-  TCPF_LAST_ACK  = (1 << 9),
-  TCPF_LISTEN    = (1 << 10),
-  TCPF_CLOSING   = (1 << 11) 
-};
-
 /*
  *	The union cast uses a gcc extension to avoid aliasing problems
  *  (union is compatible to any of its members)
diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h
index b9d4176b2d15..b0b645988bd8 100644
--- a/include/net/tcp_states.h
+++ b/include/net/tcp_states.h
@@ -31,4 +31,20 @@ enum {
 
 #define TCP_STATE_MASK	0xF
 
+#define TCP_ACTION_FIN	(1 << 7)
+
+enum {
+	TCPF_ESTABLISHED = (1 << 1),
+	TCPF_SYN_SENT	 = (1 << 2),
+	TCPF_SYN_RECV	 = (1 << 3),
+	TCPF_FIN_WAIT1	 = (1 << 4),
+	TCPF_FIN_WAIT2	 = (1 << 5),
+	TCPF_TIME_WAIT	 = (1 << 6),
+	TCPF_CLOSE	 = (1 << 7),
+	TCPF_CLOSE_WAIT	 = (1 << 8),
+	TCPF_LAST_ACK	 = (1 << 9),
+	TCPF_LISTEN	 = (1 << 10),
+	TCPF_CLOSING	 = (1 << 11) 
+};
+
 #endif	/* _LINUX_TCP_STATES_H */
-- 
cgit v1.2.3-71-gd317


From d83d8461f902c672bc1bd8fbc6a94e19f092da97 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:26:10 -0800
Subject: [IP_SOCKGLUE]: Remove most of the tcp specific calls

As DCCP needs to be called in the same spots.

Now we have a member in inet_sock (is_icsk), set at sock creation time from
struct inet_protosw->flags (if INET_PROTOSW_ICSK is set, like for TCP and
DCCP) to see if a struct sock instance is a inet_connection_sock for places
like the ones in ip_sockglue.c (v4 and v6) where we previously were looking if
sk_type was SOCK_STREAM, that is insufficient because we now use the same code
for DCCP, that has sk_type SOCK_DCCP.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dccp.h               |  4 ----
 include/linux/ip.h                 |  1 +
 include/linux/tcp.h                |  3 +--
 include/net/inet_connection_sock.h |  6 +++++-
 include/net/protocol.h             |  1 +
 net/dccp/diag.c                    |  2 +-
 net/dccp/input.c                   |  2 +-
 net/dccp/ipv4.c                    | 12 +++++++-----
 net/dccp/ipv6.c                    | 26 ++++++++++++++------------
 net/dccp/output.c                  |  7 ++++---
 net/dccp/proto.c                   |  2 +-
 net/ipv4/af_inet.c                 |  4 +++-
 net/ipv4/ip_sockglue.c             | 13 ++++++-------
 net/ipv4/tcp.c                     |  2 +-
 net/ipv4/tcp_input.c               | 10 ++++------
 net/ipv4/tcp_ipv4.c                | 12 ++++++------
 net/ipv4/tcp_output.c              | 18 ++++++++++--------
 net/ipv6/af_inet6.c                |  1 +
 net/ipv6/ipv6_sockglue.c           | 24 +++++++++++++-----------
 net/ipv6/tcp_ipv6.c                | 30 +++++++++++++++++-------------
 20 files changed, 97 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 71fab4311e92..d0bdb499cf8d 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -408,8 +408,6 @@ struct dccp_ackvec;
  * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss
  * @dccps_timestamp_time - time of latest TIMESTAMP option
  * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option
- * @dccps_ext_header_len - network protocol overhead (IP/IPv6 options)
- * @dccps_pmtu_cookie - Last pmtu seen by socket
  * @dccps_packet_size - Set thru setsockopt
  * @dccps_role - Role of this sock, one of %dccp_role
  * @dccps_ndp_count - number of Non Data Packets since last data packet
@@ -434,8 +432,6 @@ struct dccp_sock {
 	__u32				dccps_timestamp_echo;
 	__u32				dccps_packet_size;
 	unsigned long			dccps_ndp_count;
-	__u16				dccps_ext_header_len;
-	__u32				dccps_pmtu_cookie;
 	__u32				dccps_mss_cache;
 	struct dccp_options		dccps_options;
 	struct dccp_ackvec		*dccps_hc_rx_ackvec;
diff --git a/include/linux/ip.h b/include/linux/ip.h
index 5a560daeade5..6ccc596c19c8 100644
--- a/include/linux/ip.h
+++ b/include/linux/ip.h
@@ -155,6 +155,7 @@ struct inet_sock {
 	__u8			mc_ttl;		/* Multicasting TTL */
 	__u8			pmtudisc;
 	unsigned		recverr : 1,
+				is_icsk : 1,	/* inet_connection_sock? */
 				freebind : 1,
 				hdrincl : 1,
 				mc_loop : 1;
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index da38eea1994b..f2bb2396853f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -238,10 +238,9 @@ struct tcp_sock {
 	__u32	snd_wl1;	/* Sequence for window update		*/
 	__u32	snd_wnd;	/* The window we expect to receive	*/
 	__u32	max_window;	/* Maximal window ever seen from peer	*/
-	__u32	pmtu_cookie;	/* Last pmtu seen by socket		*/
 	__u32	mss_cache;	/* Cached effective mss, not including SACKS */
 	__u16	xmit_size_goal;	/* Goal for segmenting output packets	*/
-	__u16	ext_header_len;	/* Network protocol overhead (IP/IPv6 options) */
+	/* XXX Two bytes hole, try to pack */
 
 	__u32	window_clamp;	/* Maximal window to advertise		*/
 	__u32	rcv_ssthresh;	/* Current window clamp			*/
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index e50e2b890c6d..91888967d3e3 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -60,6 +60,7 @@ struct inet_connection_sock_af_ops {
  * @icsk_timeout:	   Timeout
  * @icsk_retransmit_timer: Resend (no ack)
  * @icsk_rto:		   Retransmit timeout
+ * @icsk_pmtu_cookie	   Last pmtu seen by socket
  * @icsk_ca_ops		   Pluggable congestion control hook
  * @icsk_af_ops		   Operations which are AF_INET{4,6} specific
  * @icsk_ca_state:	   Congestion control state
@@ -68,6 +69,7 @@ struct inet_connection_sock_af_ops {
  * @icsk_backoff:	   Backoff
  * @icsk_syn_retries:      Number of allowed SYN (or equivalent) retries
  * @icsk_probes_out:	   unanswered 0 window probes
+ * @icsk_ext_hdr_len:	   Network protocol overhead (IP/IPv6 options)
  * @icsk_ack:		   Delayed ACK control data
  */
 struct inet_connection_sock {
@@ -79,15 +81,17 @@ struct inet_connection_sock {
  	struct timer_list	  icsk_retransmit_timer;
  	struct timer_list	  icsk_delack_timer;
 	__u32			  icsk_rto;
+	__u32			  icsk_pmtu_cookie;
 	struct tcp_congestion_ops *icsk_ca_ops;
 	struct inet_connection_sock_af_ops *icsk_af_ops;
+	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
 	__u8			  icsk_ca_state;
 	__u8			  icsk_retransmits;
 	__u8			  icsk_pending;
 	__u8			  icsk_backoff;
 	__u8			  icsk_syn_retries;
 	__u8			  icsk_probes_out;
-	/* 2 BYTES HOLE, TRY TO PACK! */
+	__u16			  icsk_ext_hdr_len;
 	struct {
 		__u8		  pending;	 /* ACK is pending			   */
 		__u8		  quick;	 /* Scheduled number of quick acks	   */
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 357691f6a45f..a29cb29647d0 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -76,6 +76,7 @@ struct inet_protosw {
 };
 #define INET_PROTOSW_REUSE 0x01	     /* Are ports automatically reusable? */
 #define INET_PROTOSW_PERMANENT 0x02  /* Permanent protocols are unremovable. */
+#define INET_PROTOSW_ICSK      0x04  /* Is this an inet_connection_sock? */
 
 extern struct net_protocol *inet_protocol_base;
 extern struct net_protocol *inet_protos[MAX_INET_PROTOS];
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index f675d8e642d3..3f78c00e3822 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -28,7 +28,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_retransmits	= icsk->icsk_retransmits;
 	info->tcpi_probes	= icsk->icsk_probes_out;
 	info->tcpi_backoff	= icsk->icsk_backoff;
-	info->tcpi_pmtu		= dp->dccps_pmtu_cookie;
+	info->tcpi_pmtu		= icsk->icsk_pmtu_cookie;
 
 	if (dp->dccps_options.dccpo_send_ack_vector)
 		info->tcpi_options |= TCPI_OPT_SACK;
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 9a724ff2a622..55e921bdd131 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -311,7 +311,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
 			goto out_invalid_packet;
 		}
 
-		dccp_sync_mss(sk, dp->dccps_pmtu_cookie);
+		dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 
 		/*
 		 *    Step 10: Process REQUEST state (second part)
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 671fbf3b2379..c363051a7f16 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -104,9 +104,9 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	inet->dport = usin->sin_port;
 	inet->daddr = daddr;
 
-	dp->dccps_ext_header_len = 0;
+	inet_csk(sk)->icsk_ext_hdr_len = 0;
 	if (inet->opt != NULL)
-		dp->dccps_ext_header_len = inet->opt->optlen;
+		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 	/*
 	 * Socket identity is still unknown (sport may be zero).
 	 * However we set state to DCCP_REQUESTING and not releasing socket
@@ -191,7 +191,7 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
 	mtu = dst_mtu(dst);
 
 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
-	    dp->dccps_pmtu_cookie > mtu) {
+	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 		dccp_sync_mss(sk, mtu);
 
 		/*
@@ -1051,6 +1051,7 @@ struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
 int dccp_v4_init_sock(struct sock *sk)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	static int dccp_ctl_socket_init = 1;
 
 	dccp_options_init(&dp->dccps_options);
@@ -1090,10 +1091,11 @@ int dccp_v4_init_sock(struct sock *sk)
 		dccp_ctl_socket_init = 0;
 
 	dccp_init_xmit_timers(sk);
-	inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT;
+	icsk->icsk_rto = DCCP_TIMEOUT_INIT;
 	sk->sk_state = DCCP_CLOSED;
 	sk->sk_write_space = dccp_write_space;
-	inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops;
+	icsk->icsk_af_ops = &dccp_ipv4_af_ops;
+	icsk->icsk_sync_mss = dccp_sync_mss;
 	dp->dccps_mss_cache = 536;
 	dp->dccps_role = DCCP_ROLE_UNDEFINED;
 	dp->dccps_service = DCCP_SERVICE_INVALID_VALUE;
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 71bf04eb21e1..599b0be21515 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -88,6 +88,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 			   int addr_len)
 {
 	struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct dccp_sock *dp = dccp_sk(sk);
@@ -158,7 +159,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	 */
 
 	if (addr_type == IPV6_ADDR_MAPPED) {
-		u32 exthdrlen = dp->dccps_ext_header_len;
+		u32 exthdrlen = icsk->icsk_ext_hdr_len;
 		struct sockaddr_in sin;
 
 		SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
@@ -170,14 +171,14 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		sin.sin_port = usin->sin6_port;
 		sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
 
-		inet_csk(sk)->icsk_af_ops = &dccp_ipv6_mapped;
+		icsk->icsk_af_ops = &dccp_ipv6_mapped;
 		sk->sk_backlog_rcv = dccp_v4_do_rcv;
 
 		err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
 
 		if (err) {
-			dp->dccps_ext_header_len = exthdrlen;
-			inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops;
+			icsk->icsk_ext_hdr_len = exthdrlen;
+			icsk->icsk_af_ops = &dccp_ipv6_af_ops;
 			sk->sk_backlog_rcv = dccp_v6_do_rcv;
 			goto failure;
 		} else {
@@ -227,9 +228,10 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	ip6_dst_store(sk, dst, NULL);
 
-	dp->dccps_ext_header_len = 0;
+	icsk->icsk_ext_hdr_len = 0;
 	if (np->opt)
-		dp->dccps_ext_header_len = np->opt->opt_flen + np->opt->opt_nflen;
+		icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
+					  np->opt->opt_nflen);
 
 	inet->dport = usin->sin6_port;
 
@@ -292,7 +294,6 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	np = inet6_sk(sk);
 
 	if (type == ICMPV6_PKT_TOOBIG) {
-		struct dccp_sock *dp = dccp_sk(sk);
 		struct dst_entry *dst = NULL;
 
 		if (sock_owned_by_user(sk))
@@ -332,7 +333,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		} else
 			dst_hold(dst);
 
-		if (dp->dccps_pmtu_cookie > dst_mtu(dst)) {
+		if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
 			dccp_sync_mss(sk, dst_mtu(dst));
 		} /* else let the usual retransmit timer handle it */
 		dst_release(dst);
@@ -808,7 +809,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 		   worked with IPv6 icsk.icsk_af_ops.
 		   Sync it now.
 		 */
-		dccp_sync_mss(newsk, newdp->dccps_pmtu_cookie);
+		dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
 
 		return newsk;
 	}
@@ -916,10 +917,10 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 			sock_kfree_s(sk, opt, opt->tot_len);
 	}
 
-	newdp->dccps_ext_header_len = 0;
+	inet_csk(newsk)->icsk_ext_hdr_len = 0;
 	if (newnp->opt)
-		newdp->dccps_ext_header_len = newnp->opt->opt_nflen +
-					      newnp->opt->opt_flen;
+		inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
+						     newnp->opt->opt_flen);
 
 	dccp_sync_mss(newsk, dst_mtu(dst));
 
@@ -1230,6 +1231,7 @@ static struct inet_protosw dccp_v6_protosw = {
 	.prot		= &dccp_v6_prot,
 	.ops		= &inet6_dccp_ops,
 	.capability	= -1,
+	.flags		= INET_PROTOSW_ICSK,
 };
 
 static int __init dccp_v6_init(void)
diff --git a/net/dccp/output.c b/net/dccp/output.c
index c40f7f8a328b..95a3c2c6a3ce 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -134,12 +134,13 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 
 unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct dccp_sock *dp = dccp_sk(sk);
-	int mss_now = (pmtu - inet_csk(sk)->icsk_af_ops->net_header_len -
+	int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
 		       sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext));
 
 	/* Now subtract optional transport overhead */
-	mss_now -= dp->dccps_ext_header_len;
+	mss_now -= icsk->icsk_ext_hdr_len;
 
 	/*
 	 * FIXME: this should come from the CCID infrastructure, where, say,
@@ -152,7 +153,7 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
 	mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4;
 
 	/* And store cached results */
-	dp->dccps_pmtu_cookie = pmtu;
+	icsk->icsk_pmtu_cookie = pmtu;
 	dp->dccps_mss_cache = mss_now;
 
 	return mss_now;
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 51dfacd22a6e..40a4c6899051 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -712,7 +712,7 @@ static struct inet_protosw dccp_v4_protosw = {
 	.ops		= &inet_dccp_ops,
 	.capability	= -1,
 	.no_check	= 0,
-	.flags		= 0,
+	.flags		= INET_PROTOSW_ICSK,
 };
 
 /*
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d368cf249000..617e858beff1 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -302,6 +302,7 @@ lookup_protocol:
 		sk->sk_reuse = 1;
 
 	inet = inet_sk(sk);
+	inet->is_icsk = INET_PROTOSW_ICSK & answer_flags;
 
 	if (SOCK_RAW == sock->type) {
 		inet->num = protocol;
@@ -869,7 +870,8 @@ static struct inet_protosw inetsw_array[] =
                 .ops =        &inet_stream_ops,
                 .capability = -1,
                 .no_check =   0,
-                .flags =      INET_PROTOSW_PERMANENT,
+                .flags =      INET_PROTOSW_PERMANENT |
+			      INET_PROTOSW_ICSK,
         },
 
         {
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 4f2d87257309..add019c746f8 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -29,8 +29,7 @@
 #include <net/sock.h>
 #include <net/ip.h>
 #include <net/icmp.h>
-#include <net/tcp.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
 #include <linux/udp.h>
 #include <linux/igmp.h>
 #include <linux/netfilter.h>
@@ -427,8 +426,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 			err = ip_options_get_from_user(&opt, optval, optlen);
 			if (err)
 				break;
-			if (sk->sk_type == SOCK_STREAM) {
-				struct tcp_sock *tp = tcp_sk(sk);
+			if (inet->is_icsk) {
+				struct inet_connection_sock *icsk = inet_csk(sk);
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 				if (sk->sk_family == PF_INET ||
 				    (!((1 << sk->sk_state) &
@@ -436,10 +435,10 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 				     inet->daddr != LOOPBACK4_IPV6)) {
 #endif
 					if (inet->opt)
-						tp->ext_header_len -= inet->opt->optlen;
+						icsk->icsk_ext_hdr_len -= inet->opt->optlen;
 					if (opt)
-						tp->ext_header_len += opt->optlen;
-					tcp_sync_mss(sk, tp->pmtu_cookie);
+						icsk->icsk_ext_hdr_len += opt->optlen;
+					icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 				}
 #endif
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index eacfe6a3442c..00aa80e93243 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1914,7 +1914,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
 	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
 
-	info->tcpi_pmtu = tp->pmtu_cookie;
+	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
 	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
 	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
 	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7de6184d4bd8..981d1203b152 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2342,7 +2342,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
 
 			if (nwin > tp->max_window) {
 				tp->max_window = nwin;
-				tcp_sync_mss(sk, tp->pmtu_cookie);
+				tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
 			}
 		}
 	}
@@ -3967,12 +3967,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 					 struct tcphdr *th, unsigned len)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	int saved_clamp = tp->rx_opt.mss_clamp;
 
 	tcp_parse_options(skb, &tp->rx_opt, 0);
 
 	if (th->ack) {
-		struct inet_connection_sock *icsk;
 		/* rfc793:
 		 * "If the state is SYN-SENT then
 		 *    first check the ACK bit
@@ -4061,7 +4061,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
 			tp->rx_opt.sack_ok |= 2;
 
-		tcp_sync_mss(sk, tp->pmtu_cookie);
+		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);
 
 		/* Remember, tcp_poll() does not lock socket!
@@ -4071,8 +4071,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		mb();
 		tcp_set_state(sk, TCP_ESTABLISHED);
 
-		icsk = inet_csk(sk);
-
 		/* Make sure socket is routed, for correct metrics.  */
 		icsk->icsk_af_ops->rebuild_header(sk);
 
@@ -4173,7 +4171,7 @@ discard:
 		if (tp->ecn_flags&TCP_ECN_OK)
 			sock_set_flag(sk, SOCK_NO_LARGESEND);
 
-		tcp_sync_mss(sk, tp->pmtu_cookie);
+		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);
 
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c2fe61becd61..9b62d80bb20f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -220,9 +220,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	inet->dport = usin->sin_port;
 	inet->daddr = daddr;
 
-	tp->ext_header_len = 0;
+	inet_csk(sk)->icsk_ext_hdr_len = 0;
 	if (inet->opt)
-		tp->ext_header_len = inet->opt->optlen;
+		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 
 	tp->rx_opt.mss_clamp = 536;
 
@@ -275,7 +275,6 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
 {
 	struct dst_entry *dst;
 	struct inet_sock *inet = inet_sk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
 
 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 	 * send out by Linux are always <576bytes so they should go through
@@ -304,7 +303,7 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
 	mtu = dst_mtu(dst);
 
 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
-	    tp->pmtu_cookie > mtu) {
+	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 		tcp_sync_mss(sk, mtu);
 
 		/* Resend the TCP packet because it's
@@ -895,9 +894,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	ireq->opt	      = NULL;
 	newinet->mc_index     = inet_iif(skb);
 	newinet->mc_ttl	      = skb->nh.iph->ttl;
-	newtp->ext_header_len = 0;
+	inet_csk(newsk)->icsk_ext_hdr_len = 0;
 	if (newinet->opt)
-		newtp->ext_header_len = newinet->opt->optlen;
+		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
 	newinet->id = newtp->write_seq ^ jiffies;
 
 	tcp_sync_mss(newsk, dst_mtu(dst));
@@ -1266,6 +1265,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
 	icsk->icsk_af_ops = &ipv4_specific;
+	icsk->icsk_sync_mss = tcp_sync_mss;
 
 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index af1946c52c37..3a0a914de917 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -621,7 +621,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
    It is minimum of user_mss and mss received with SYN.
    It also does not include TCP options.
 
-   tp->pmtu_cookie is last pmtu, seen by this function.
+   inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
 
    tp->mss_cache is current effective sending mss, including
    all tcp options except for SACKs. It is evaluated,
@@ -631,17 +631,18 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
    NOTE1. rfc1122 clearly states that advertised MSS
    DOES NOT include either tcp or ip options.
 
-   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
-   this function.			--ANK (980731)
+   NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
+   are READ ONLY outside this function.		--ANK (980731)
  */
 
 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	/* Calculate base mss without TCP options:
 	   It is MMS_S - sizeof(tcphdr) of rfc1122
 	 */
-	int mss_now = (pmtu - inet_csk(sk)->icsk_af_ops->net_header_len -
+	int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
 		       sizeof(struct tcphdr));
 
 	/* Clamp it (mss_clamp does not include tcp options) */
@@ -649,7 +650,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 		mss_now = tp->rx_opt.mss_clamp;
 
 	/* Now subtract optional transport overhead */
-	mss_now -= tp->ext_header_len;
+	mss_now -= icsk->icsk_ext_hdr_len;
 
 	/* Then reserve room for full set of TCP options and 8 bytes of data */
 	if (mss_now < 48)
@@ -663,7 +664,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 		mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
 
 	/* And store cached results */
-	tp->pmtu_cookie = pmtu;
+	icsk->icsk_pmtu_cookie = pmtu;
 	tp->mss_cache = mss_now;
 
 	return mss_now;
@@ -693,7 +694,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
 
 	if (dst) {
 		u32 mtu = dst_mtu(dst);
-		if (mtu != tp->pmtu_cookie)
+		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
 			mss_now = tcp_sync_mss(sk, mtu);
 	}
 
@@ -706,7 +707,8 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
 	if (doing_tso) {
 		xmit_size_goal = (65535 -
 				  inet_csk(sk)->icsk_af_ops->net_header_len -
-				  tp->ext_header_len - tp->tcp_header_len);
+				  inet_csk(sk)->icsk_ext_hdr_len -
+				  tp->tcp_header_len);
 
 		if (tp->max_window &&
 		    (xmit_size_goal > (tp->max_window >> 1)))
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index bf17aab9b776..70a510ff31ee 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -167,6 +167,7 @@ lookup_protocol:
 		sk->sk_reuse = 1;
 
 	inet = inet_sk(sk);
+	inet->is_icsk = INET_PROTOSW_ICSK & answer_flags;
 
 	if (SOCK_RAW == sock->type) {
 		inet->num = protocol;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index b6b63fa8454c..c63868dd2ca2 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -163,17 +163,17 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
 			sk_refcnt_debug_dec(sk);
 
 			if (sk->sk_protocol == IPPROTO_TCP) {
-				struct tcp_sock *tp = tcp_sk(sk);
+				struct inet_connection_sock *icsk = inet_csk(sk);
 
 				local_bh_disable();
 				sock_prot_dec_use(sk->sk_prot);
 				sock_prot_inc_use(&tcp_prot);
 				local_bh_enable();
 				sk->sk_prot = &tcp_prot;
-				inet_csk(sk)->icsk_af_ops = &ipv4_specific;
+				icsk->icsk_af_ops = &ipv4_specific;
 				sk->sk_socket->ops = &inet_stream_ops;
 				sk->sk_family = PF_INET;
-				tcp_sync_mss(sk, tp->pmtu_cookie);
+				tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 			} else {
 				local_bh_disable();
 				sock_prot_dec_use(sk->sk_prot);
@@ -317,14 +317,15 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
 		}
 
 		retv = 0;
-		if (sk->sk_type == SOCK_STREAM) {
+		if (inet_sk(sk)->is_icsk) {
 			if (opt) {
-				struct tcp_sock *tp = tcp_sk(sk);
+				struct inet_connection_sock *icsk = inet_csk(sk);
 				if (!((1 << sk->sk_state) &
 				      (TCPF_LISTEN | TCPF_CLOSE))
 				    && inet_sk(sk)->daddr != LOOPBACK4_IPV6) {
-					tp->ext_header_len = opt->opt_flen + opt->opt_nflen;
-					tcp_sync_mss(sk, tp->pmtu_cookie);
+					icsk->icsk_ext_hdr_len =
+						opt->opt_flen + opt->opt_nflen;
+					icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
 				}
 			}
 			opt = xchg(&np->opt, opt);
@@ -380,14 +381,15 @@ sticky_done:
 			goto done;
 update:
 		retv = 0;
-		if (sk->sk_type == SOCK_STREAM) {
+		if (inet_sk(sk)->is_icsk) {
 			if (opt) {
-				struct tcp_sock *tp = tcp_sk(sk);
+				struct inet_connection_sock *icsk = inet_csk(sk);
 				if (!((1 << sk->sk_state) &
 				      (TCPF_LISTEN | TCPF_CLOSE))
 				    && inet_sk(sk)->daddr != LOOPBACK4_IPV6) {
-					tp->ext_header_len = opt->opt_flen + opt->opt_nflen;
-					tcp_sync_mss(sk, tp->pmtu_cookie);
+					icsk->icsk_ext_hdr_len =
+						opt->opt_flen + opt->opt_nflen;
+					icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
 				}
 			}
 			opt = xchg(&np->opt, opt);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index a682eb9093e1..2947bc56d8a0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -123,7 +123,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 			  int addr_len)
 {
 	struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
-	struct inet_sock *inet = inet_sk(sk);
+ 	struct inet_sock *inet = inet_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct in6_addr *saddr = NULL, *final_p = NULL, final;
@@ -198,7 +199,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	 */
 
 	if (addr_type == IPV6_ADDR_MAPPED) {
-		u32 exthdrlen = tp->ext_header_len;
+		u32 exthdrlen = icsk->icsk_ext_hdr_len;
 		struct sockaddr_in sin;
 
 		SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
@@ -210,14 +211,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		sin.sin_port = usin->sin6_port;
 		sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
 
-		inet_csk(sk)->icsk_af_ops = &ipv6_mapped;
+		icsk->icsk_af_ops = &ipv6_mapped;
 		sk->sk_backlog_rcv = tcp_v4_do_rcv;
 
 		err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
 
 		if (err) {
-			tp->ext_header_len = exthdrlen;
-			inet_csk(sk)->icsk_af_ops = &ipv6_specific;
+			icsk->icsk_ext_hdr_len = exthdrlen;
+			icsk->icsk_af_ops = &ipv6_specific;
 			sk->sk_backlog_rcv = tcp_v6_do_rcv;
 			goto failure;
 		} else {
@@ -270,9 +271,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	sk->sk_route_caps = dst->dev->features &
 		~(NETIF_F_IP_CSUM | NETIF_F_TSO);
 
-	tp->ext_header_len = 0;
+	icsk->icsk_ext_hdr_len = 0;
 	if (np->opt)
-		tp->ext_header_len = np->opt->opt_flen + np->opt->opt_nflen;
+		icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
+					  np->opt->opt_nflen);
 
 	tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
 
@@ -385,7 +387,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		} else
 			dst_hold(dst);
 
-		if (tp->pmtu_cookie > dst_mtu(dst)) {
+		if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
 			tcp_sync_mss(sk, dst_mtu(dst));
 			tcp_simple_retransmit(sk);
 		} /* else let the usual retransmit timer handle it */
@@ -869,7 +871,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		   worked with IPv6 icsk.icsk_af_ops.
 		   Sync it now.
 		 */
-		tcp_sync_mss(newsk, newtp->pmtu_cookie);
+		tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
 
 		return newsk;
 	}
@@ -976,10 +978,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 			sock_kfree_s(sk, opt, opt->tot_len);
 	}
 
-	newtp->ext_header_len = 0;
+	inet_csk(newsk)->icsk_ext_hdr_len = 0;
 	if (newnp->opt)
-		newtp->ext_header_len = newnp->opt->opt_nflen +
-					newnp->opt->opt_flen;
+		inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
+						     newnp->opt->opt_flen);
 
 	tcp_sync_mss(newsk, dst_mtu(dst));
 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
@@ -1361,6 +1363,7 @@ static int tcp_v6_init_sock(struct sock *sk)
 
 	icsk->icsk_af_ops = &ipv6_specific;
 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+	icsk->icsk_sync_mss = tcp_sync_mss;
 	sk->sk_write_space = sk_stream_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
@@ -1591,7 +1594,8 @@ static struct inet_protosw tcpv6_protosw = {
 	.ops		=	&inet6_stream_ops,
 	.capability	=	-1,
 	.no_check	=	0,
-	.flags		=	INET_PROTOSW_PERMANENT,
+	.flags		=	INET_PROTOSW_PERMANENT |
+				INET_PROTOSW_ICSK,
 };
 
 void __init tcpv6_init(void)
-- 
cgit v1.2.3-71-gd317


From c865e5d99e25a171e8262fc0f7ba608568633c64 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 21 Dec 2005 19:03:44 -0800
Subject: [PKT_SCHED] netem: packet corruption option

Here is a new feature for netem in 2.6.16. It adds the ability to
randomly corrupt packets with netem. A version was done by
Hagen Paul Pfeifer, but I redid it to handle the cases of backwards
compatibility with netlink interface and presence of hardware checksum
offload. It is useful for testing hardware offload in devices.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pkt_sched.h |  7 +++++++
 net/sched/sch_netem.c     | 49 ++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 53 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index e87b233615b3..d10f35338507 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -429,6 +429,7 @@ enum
 	TCA_NETEM_CORR,
 	TCA_NETEM_DELAY_DIST,
 	TCA_NETEM_REORDER,
+	TCA_NETEM_CORRUPT,
 	__TCA_NETEM_MAX,
 };
 
@@ -457,6 +458,12 @@ struct tc_netem_reorder
 	__u32	correlation;
 };
 
+struct tc_netem_corrupt
+{
+	__u32	probability;
+	__u32	correlation;
+};
+
 #define NETEM_DIST_SCALE	8192
 
 #endif
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 82fb07aa06a5..ba5283204837 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -25,7 +25,7 @@
 
 #include <net/pkt_sched.h>
 
-#define VERSION "1.1"
+#define VERSION "1.2"
 
 /*	Network Emulation Queuing algorithm.
 	====================================
@@ -65,11 +65,12 @@ struct netem_sched_data {
 	u32 jitter;
 	u32 duplicate;
 	u32 reorder;
+	u32 corrupt;
 
 	struct crndstate {
 		unsigned long last;
 		unsigned long rho;
-	} delay_cor, loss_cor, dup_cor, reorder_cor;
+	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
 
 	struct disttable {
 		u32  size;
@@ -183,6 +184,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		q->duplicate = dupsave;
 	}
 
+	/*
+	 * Randomized packet corruption.
+	 * Make copy if needed since we are modifying
+	 * If packet is going to be hardware checksummed, then
+	 * do it now in software before we mangle it.
+	 */
+	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
+		if (!(skb = skb_unshare(skb, GFP_ATOMIC))
+		    || (skb->ip_summed == CHECKSUM_HW
+			&& skb_checksum_help(skb, 0))) {
+			sch->qstats.drops++;
+			return NET_XMIT_DROP;
+		}
+
+		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
+	}
+
 	if (q->gap == 0 		/* not doing reordering */
 	    || q->counter < q->gap 	/* inside last reordering gap */
 	    || q->reorder < get_crandom(&q->reorder_cor)) {
@@ -382,6 +400,20 @@ static int get_reorder(struct Qdisc *sch, const struct rtattr *attr)
 	return 0;
 }
 
+static int get_corrupt(struct Qdisc *sch, const struct rtattr *attr)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	const struct tc_netem_corrupt *r = RTA_DATA(attr);
+
+	if (RTA_PAYLOAD(attr) != sizeof(*r))
+		return -EINVAL;
+
+	q->corrupt = r->probability;
+	init_crandom(&q->corrupt_cor, r->correlation);
+	return 0;
+}
+
+/* Parse netlink message to set options */
 static int netem_change(struct Qdisc *sch, struct rtattr *opt)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
@@ -432,13 +464,19 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
 			if (ret)
 				return ret;
 		}
+
 		if (tb[TCA_NETEM_REORDER-1]) {
 			ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]);
 			if (ret)
 				return ret;
 		}
-	}
 
+		if (tb[TCA_NETEM_CORRUPT-1]) {
+			ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT-1]);
+			if (ret)
+				return ret;
+		}
+	}
 
 	return 0;
 }
@@ -564,6 +602,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	struct tc_netem_qopt qopt;
 	struct tc_netem_corr cor;
 	struct tc_netem_reorder reorder;
+	struct tc_netem_corrupt corrupt;
 
 	qopt.latency = q->latency;
 	qopt.jitter = q->jitter;
@@ -582,6 +621,10 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	reorder.correlation = q->reorder_cor.rho;
 	RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
 
+	corrupt.probability = q->corrupt;
+	corrupt.correlation = q->corrupt_cor.rho;
+	RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
+
 	rta->rta_len = skb->tail - b;
 
 	return skb->len;
-- 
cgit v1.2.3-71-gd317


From 3821af2fe13700cab6fd67367128fa180e43f8b8 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 21 Dec 2005 19:30:53 -0800
Subject: [FLS64]: generic version

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/asm-alpha/bitops.h     | 1 +
 include/asm-arm/bitops.h       | 2 ++
 include/asm-arm26/bitops.h     | 1 +
 include/asm-cris/bitops.h      | 1 +
 include/asm-frv/bitops.h       | 1 +
 include/asm-generic/bitops.h   | 1 +
 include/asm-h8300/bitops.h     | 1 +
 include/asm-i386/bitops.h      | 1 +
 include/asm-ia64/bitops.h      | 1 +
 include/asm-m32r/bitops.h      | 1 +
 include/asm-m68k/bitops.h      | 1 +
 include/asm-m68knommu/bitops.h | 1 +
 include/asm-mips/bitops.h      | 2 +-
 include/asm-parisc/bitops.h    | 1 +
 include/asm-powerpc/bitops.h   | 1 +
 include/asm-s390/bitops.h      | 1 +
 include/asm-sh/bitops.h        | 1 +
 include/asm-sh64/bitops.h      | 1 +
 include/asm-sparc/bitops.h     | 1 +
 include/asm-sparc64/bitops.h   | 1 +
 include/asm-v850/bitops.h      | 1 +
 include/asm-x86_64/bitops.h    | 1 +
 include/asm-xtensa/bitops.h    | 1 +
 include/linux/bitops.h         | 9 +++++++++
 24 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/asm-alpha/bitops.h b/include/asm-alpha/bitops.h
index 578ed3f1a607..302201f1a097 100644
--- a/include/asm-alpha/bitops.h
+++ b/include/asm-alpha/bitops.h
@@ -321,6 +321,7 @@ static inline int fls(int word)
 #else
 #define fls	generic_fls
 #endif
+#define fls64   generic_fls64
 
 /* Compute powers of two for the given integer.  */
 static inline long floor_log2(unsigned long word)
diff --git a/include/asm-arm/bitops.h b/include/asm-arm/bitops.h
index 7399d431edfe..d02de721ecc1 100644
--- a/include/asm-arm/bitops.h
+++ b/include/asm-arm/bitops.h
@@ -332,6 +332,7 @@ static inline unsigned long __ffs(unsigned long word)
  */
 
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 /*
  * ffs: find first bit set. This is defined the same way as
@@ -351,6 +352,7 @@ static inline unsigned long __ffs(unsigned long word)
 #define fls(x) \
 	( __builtin_constant_p(x) ? generic_fls(x) : \
 	  ({ int __r; asm("clz\t%0, %1" : "=r"(__r) : "r"(x) : "cc"); 32-__r; }) )
+#define fls64(x)   generic_fls64(x)
 #define ffs(x) ({ unsigned long __t = (x); fls(__t & -__t); })
 #define __ffs(x) (ffs(x) - 1)
 #define ffz(x) __ffs( ~(x) )
diff --git a/include/asm-arm26/bitops.h b/include/asm-arm26/bitops.h
index 7d062fb2e343..15cc6f2da792 100644
--- a/include/asm-arm26/bitops.h
+++ b/include/asm-arm26/bitops.h
@@ -259,6 +259,7 @@ static inline unsigned long __ffs(unsigned long word)
  */
 
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 /*
  * ffs: find first bit set. This is defined the same way as
diff --git a/include/asm-cris/bitops.h b/include/asm-cris/bitops.h
index 1bddb3f3a289..d3eb0f1e4208 100644
--- a/include/asm-cris/bitops.h
+++ b/include/asm-cris/bitops.h
@@ -240,6 +240,7 @@ static inline int test_bit(int nr, const volatile unsigned long *addr)
  */
 
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 /*
  * hweightN - returns the hamming weight of a N-bit word
diff --git a/include/asm-frv/bitops.h b/include/asm-frv/bitops.h
index b664bd5b6663..02be7b3a8a83 100644
--- a/include/asm-frv/bitops.h
+++ b/include/asm-frv/bitops.h
@@ -228,6 +228,7 @@ found_middle:
 							\
 	bit ? 33 - bit : bit;				\
 })
+#define fls64(x)   generic_fls64(x)
 
 /*
  * Every architecture must define this function. It's the fastest
diff --git a/include/asm-generic/bitops.h b/include/asm-generic/bitops.h
index ce31b739fd80..0e6d9852008c 100644
--- a/include/asm-generic/bitops.h
+++ b/include/asm-generic/bitops.h
@@ -56,6 +56,7 @@ extern __inline__ int test_bit(int nr, const unsigned long * addr)
  */
 
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 #ifdef __KERNEL__
 
diff --git a/include/asm-h8300/bitops.h b/include/asm-h8300/bitops.h
index 5036f595f8c9..c0411ec9d651 100644
--- a/include/asm-h8300/bitops.h
+++ b/include/asm-h8300/bitops.h
@@ -406,5 +406,6 @@ found_middle:
 #endif /* __KERNEL__ */
 
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 #endif /* _H8300_BITOPS_H */
diff --git a/include/asm-i386/bitops.h b/include/asm-i386/bitops.h
index ddf1739dc7fd..4807aa1d2e3d 100644
--- a/include/asm-i386/bitops.h
+++ b/include/asm-i386/bitops.h
@@ -372,6 +372,7 @@ static inline unsigned long ffz(unsigned long word)
  */
 
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 #ifdef __KERNEL__
 
diff --git a/include/asm-ia64/bitops.h b/include/asm-ia64/bitops.h
index 7232528e2d0c..36d0fb95ea89 100644
--- a/include/asm-ia64/bitops.h
+++ b/include/asm-ia64/bitops.h
@@ -345,6 +345,7 @@ fls (int t)
 	x |= x >> 16;
 	return ia64_popcnt(x);
 }
+#define fls64(x)   generic_fls64(x)
 
 /*
  * ffs: find first bit set. This is defined the same way as the libc and compiler builtin
diff --git a/include/asm-m32r/bitops.h b/include/asm-m32r/bitops.h
index e78443981349..abea2fdd8689 100644
--- a/include/asm-m32r/bitops.h
+++ b/include/asm-m32r/bitops.h
@@ -465,6 +465,7 @@ static __inline__ unsigned long __ffs(unsigned long word)
  * fls: find last bit set.
  */
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 #ifdef __KERNEL__
 
diff --git a/include/asm-m68k/bitops.h b/include/asm-m68k/bitops.h
index b1bcf7c66516..13f4c0048463 100644
--- a/include/asm-m68k/bitops.h
+++ b/include/asm-m68k/bitops.h
@@ -310,6 +310,7 @@ static inline int fls(int x)
 
 	return 32 - cnt;
 }
+#define fls64(x)   generic_fls64(x)
 
 /*
  * Every architecture must define this function. It's the fastest
diff --git a/include/asm-m68knommu/bitops.h b/include/asm-m68knommu/bitops.h
index c42f88a9b9f9..4058dd086a02 100644
--- a/include/asm-m68knommu/bitops.h
+++ b/include/asm-m68knommu/bitops.h
@@ -499,5 +499,6 @@ found_middle:
  * fls: find last bit set.
  */
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 #endif /* _M68KNOMMU_BITOPS_H */
diff --git a/include/asm-mips/bitops.h b/include/asm-mips/bitops.h
index 5496f9064a6a..3b0c8aaf6e8b 100644
--- a/include/asm-mips/bitops.h
+++ b/include/asm-mips/bitops.h
@@ -695,7 +695,7 @@ static inline unsigned long fls(unsigned long word)
 
 	return flz(~word) + 1;
 }
-
+#define fls64(x)   generic_fls64(x)
 
 /*
  * find_next_zero_bit - find the first zero bit in a memory region
diff --git a/include/asm-parisc/bitops.h b/include/asm-parisc/bitops.h
index 55b98c67fd82..15d8c2b51584 100644
--- a/include/asm-parisc/bitops.h
+++ b/include/asm-parisc/bitops.h
@@ -263,6 +263,7 @@ static __inline__ int fls(int x)
 
 	return ret;
 }
+#define fls64(x)   generic_fls64(x)
 
 /*
  * hweightN: returns the hamming weight (i.e. the number
diff --git a/include/asm-powerpc/bitops.h b/include/asm-powerpc/bitops.h
index 5727229b0444..1996eaa8aeae 100644
--- a/include/asm-powerpc/bitops.h
+++ b/include/asm-powerpc/bitops.h
@@ -310,6 +310,7 @@ static __inline__ int fls(unsigned int x)
 	asm ("cntlzw %0,%1" : "=r" (lz) : "r" (x));
 	return 32 - lz;
 }
+#define fls64(x)   generic_fls64(x)
 
 /*
  * hweightN: returns the hamming weight (i.e. the number
diff --git a/include/asm-s390/bitops.h b/include/asm-s390/bitops.h
index b07c578b22ea..61232760cc3b 100644
--- a/include/asm-s390/bitops.h
+++ b/include/asm-s390/bitops.h
@@ -839,6 +839,7 @@ static inline int sched_find_first_bit(unsigned long *b)
  * fls: find last bit set.
  */
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 /*
  * hweightN: returns the hamming weight (i.e. the number
diff --git a/include/asm-sh/bitops.h b/include/asm-sh/bitops.h
index 5163d1ff2f1b..1c5260860045 100644
--- a/include/asm-sh/bitops.h
+++ b/include/asm-sh/bitops.h
@@ -470,6 +470,7 @@ found_middle:
  */
 
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 #endif /* __KERNEL__ */
 
diff --git a/include/asm-sh64/bitops.h b/include/asm-sh64/bitops.h
index e1ff63e09227..ce9c3ad45fe0 100644
--- a/include/asm-sh64/bitops.h
+++ b/include/asm-sh64/bitops.h
@@ -510,6 +510,7 @@ found_middle:
 
 #define ffs(x)	generic_ffs(x)
 #define fls(x)	generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 #endif /* __KERNEL__ */
 
diff --git a/include/asm-sparc/bitops.h b/include/asm-sparc/bitops.h
index bfbd795a0a80..41722b5e45ef 100644
--- a/include/asm-sparc/bitops.h
+++ b/include/asm-sparc/bitops.h
@@ -298,6 +298,7 @@ static inline int ffs(int x)
  * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
  */
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 /*
  * hweightN: returns the hamming weight (i.e. the number
diff --git a/include/asm-sparc64/bitops.h b/include/asm-sparc64/bitops.h
index 6388b8376c50..6efc0162fb09 100644
--- a/include/asm-sparc64/bitops.h
+++ b/include/asm-sparc64/bitops.h
@@ -119,6 +119,7 @@ static inline unsigned long __ffs(unsigned long word)
  */
 
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 #ifdef __KERNEL__
 
diff --git a/include/asm-v850/bitops.h b/include/asm-v850/bitops.h
index b91e799763fd..8955d2376ac8 100644
--- a/include/asm-v850/bitops.h
+++ b/include/asm-v850/bitops.h
@@ -276,6 +276,7 @@ found_middle:
 
 #define ffs(x) generic_ffs (x)
 #define fls(x) generic_fls (x)
+#define fls64(x) generic_fls64(x)
 #define __ffs(x) ffs(x)
 
 
diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h
index 05a0d374404b..94b52c8ce97f 100644
--- a/include/asm-x86_64/bitops.h
+++ b/include/asm-x86_64/bitops.h
@@ -409,6 +409,7 @@ static __inline__ int ffs(int x)
 
 /* find last set bit */
 #define fls(x) generic_fls(x)
+#define fls64(x) generic_fls64(x)
 
 #endif /* __KERNEL__ */
 
diff --git a/include/asm-xtensa/bitops.h b/include/asm-xtensa/bitops.h
index e76ee889e21d..0a2065f1a372 100644
--- a/include/asm-xtensa/bitops.h
+++ b/include/asm-xtensa/bitops.h
@@ -245,6 +245,7 @@ static __inline__ int fls (unsigned int x)
 {
 	return __cntlz(x);
 }
+#define fls64(x)   generic_fls64(x)
 
 static __inline__ int
 find_next_bit(const unsigned long *addr, int size, int offset)
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 38c2fb7ebe09..6a2a19f14bb2 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -76,6 +76,15 @@ static __inline__ int generic_fls(int x)
  */
 #include <asm/bitops.h>
 
+
+static inline int generic_fls64(__u64 x)
+{
+	__u32 h = x >> 32;
+	if (h)
+		return fls(x) + 32;
+	return fls(x);
+}
+
 static __inline__ int get_bitmask_order(unsigned int count)
 {
 	int order;
-- 
cgit v1.2.3-71-gd317


From 77d76ea310b50a9c8ff15bd290fcb4ed4961adf2 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 22 Dec 2005 12:43:42 -0800
Subject: [NET]: Small cleanup to socket initialization

sock_init can be done as a core_initcall instead of calling
it directly in init/main.c

Also I removed an out of date #ifdef.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  1 -
 include/linux/socket.h |  1 -
 init/main.c            |  4 ----
 net/socket.c           | 10 +++++-----
 4 files changed, 5 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 97f6580ce039..971677178e0c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -32,7 +32,6 @@
 
 #define HAVE_ALLOC_SKB		/* For the drivers to know */
 #define HAVE_ALIGNABLE_SKB	/* Ditto 8)		   */
-#define SLAB_SKB 		/* Slabified skbuffs 	   */
 
 #define CHECKSUM_NONE 0
 #define CHECKSUM_HW 1
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 1739c2d5b95b..9f4019156fd8 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -27,7 +27,6 @@ struct __kernel_sockaddr_storage {
 #include <linux/compiler.h>		/* __user			*/
 
 extern int sysctl_somaxconn;
-extern void sock_init(void);
 #ifdef CONFIG_PROC_FS
 struct seq_file;
 extern void socket_seq_show(struct seq_file *seq);
diff --git a/init/main.c b/init/main.c
index 27f97f9b4636..54aaf561cf66 100644
--- a/init/main.c
+++ b/init/main.c
@@ -47,7 +47,6 @@
 #include <linux/rmap.h>
 #include <linux/mempolicy.h>
 #include <linux/key.h>
-#include <net/sock.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -614,9 +613,6 @@ static void __init do_basic_setup(void)
 	sysctl_init();
 #endif
 
-	/* Networking initialization needs a process context */ 
-	sock_init();
-
 	do_initcalls();
 }
 
diff --git a/net/socket.c b/net/socket.c
index 3145103cdf54..98be7ef3c086 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2036,7 +2036,7 @@ int sock_unregister(int family)
 	return 0;
 }
 
-void __init sock_init(void)
+static int __init sock_init(void)
 {
 	/*
 	 *	Initialize sock SLAB cache.
@@ -2044,12 +2044,10 @@ void __init sock_init(void)
 	 
 	sk_init();
 
-#ifdef SLAB_SKB
 	/*
 	 *	Initialize skbuff SLAB cache 
 	 */
 	skb_init();
-#endif
 
 	/*
 	 *	Initialize the protocols module. 
@@ -2058,8 +2056,8 @@ void __init sock_init(void)
 	init_inodecache();
 	register_filesystem(&sock_fs_type);
 	sock_mnt = kern_mount(&sock_fs_type);
-	/* The real protocol initialization is performed when
-	 *  do_initcalls is run.  
+
+	/* The real protocol initialization is performed in later initcalls.
 	 */
 
 #ifdef CONFIG_NETFILTER
@@ -2067,6 +2065,8 @@ void __init sock_init(void)
 #endif
 }
 
+core_initcall(sock_init);	/* early initcall */
+
 #ifdef CONFIG_PROC_FS
 void socket_seq_show(struct seq_file *seq)
 {
-- 
cgit v1.2.3-71-gd317


From 90ddc4f0470427df306f308ad03db6b6b21644b8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 22 Dec 2005 12:49:22 -0800
Subject: [NET]: move struct proto_ops to const

I noticed that some of 'struct proto_ops' used in the kernel may share
a cache line used by locks or other heavily modified data. (default
linker alignement is 32 bytes, and L1_CACHE_LINE is 64 or 128 at
least)

This patch makes sure a 'struct proto_ops' can be declared as const,
so that all cpus can share all parts of it without false sharing.

This is not mandatory : a driver can still use a read/write structure
if it needs to (and eventually a __read_mostly)

I made a global stubstitute to change all existing occurences to make
them const.

This should reduce the possibility of false sharing on SMP, and
speedup some socket system calls.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h         |  4 ++--
 include/net/inet_common.h   |  4 ++--
 include/net/ipv6.h          |  4 ++--
 include/net/protocol.h      |  2 +-
 net/appletalk/ddp.c         |  4 ++--
 net/atm/pvc.c               |  2 +-
 net/atm/svc.c               |  2 +-
 net/ax25/af_ax25.c          |  4 ++--
 net/bluetooth/bnep/sock.c   |  2 +-
 net/bluetooth/cmtp/sock.c   |  2 +-
 net/bluetooth/hci_sock.c    |  2 +-
 net/bluetooth/hidp/sock.c   |  2 +-
 net/bluetooth/l2cap.c       |  4 ++--
 net/bluetooth/rfcomm/sock.c |  4 ++--
 net/bluetooth/sco.c         |  4 ++--
 net/dccp/proto.c            |  2 +-
 net/decnet/af_decnet.c      |  4 ++--
 net/econet/af_econet.c      |  4 ++--
 net/ipv4/af_inet.c          |  6 +++---
 net/ipv6/af_inet6.c         |  6 +++---
 net/ipx/af_ipx.c            |  4 ++--
 net/irda/af_irda.c          | 16 ++++++++--------
 net/key/af_key.c            |  4 ++--
 net/llc/af_llc.c            |  4 ++--
 net/netlink/af_netlink.c    |  4 ++--
 net/netrom/af_netrom.c      |  4 ++--
 net/packet/af_packet.c      |  8 ++++----
 net/sctp/ipv6.c             |  2 +-
 net/sctp/protocol.c         |  2 +-
 net/sunrpc/svcsock.c        |  2 +-
 net/unix/af_unix.c          |  6 +++---
 net/wanrouter/af_wanpipe.c  |  4 ++--
 net/x25/af_x25.c            |  4 ++--
 33 files changed, 66 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/net.h b/include/linux/net.h
index d6a41e6577f6..28195a2d8ff0 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -107,7 +107,7 @@ enum sock_type {
 struct socket {
 	socket_state		state;
 	unsigned long		flags;
-	struct proto_ops	*ops;
+	const struct proto_ops	*ops;
 	struct fasync_struct	*fasync_list;
 	struct file		*file;
 	struct sock		*sk;
@@ -260,7 +260,7 @@ SOCKCALL_WRAP(name, recvmsg, (struct kiocb *iocb, struct socket *sock, struct ms
 SOCKCALL_WRAP(name, mmap, (struct file *file, struct socket *sock, struct vm_area_struct *vma), \
 	      (file, sock, vma)) \
 	      \
-static struct proto_ops name##_ops = {			\
+static const struct proto_ops name##_ops = {			\
 	.family		= fam,				\
 	.owner		= THIS_MODULE,			\
 	.release	= __lock_##name##_release,	\
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index f943306ce5ff..227adcbdfec8 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -1,8 +1,8 @@
 #ifndef _INET_COMMON_H
 #define _INET_COMMON_H
 
-extern struct proto_ops		inet_stream_ops;
-extern struct proto_ops		inet_dgram_ops;
+extern const struct proto_ops		inet_stream_ops;
+extern const struct proto_ops		inet_dgram_ops;
 
 /*
  *	INET4 prototypes used by INET6
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index e3d5d7bc8837..11a725662c36 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -538,8 +538,8 @@ extern int sysctl_ip6frag_low_thresh;
 extern int sysctl_ip6frag_time;
 extern int sysctl_ip6frag_secret_interval;
 
-extern struct proto_ops inet6_stream_ops;
-extern struct proto_ops inet6_dgram_ops;
+extern const struct proto_ops inet6_stream_ops;
+extern const struct proto_ops inet6_dgram_ops;
 
 extern int ip6_mc_source(int add, int omode, struct sock *sk,
 			 struct group_source_req *pgsr);
diff --git a/include/net/protocol.h b/include/net/protocol.h
index a29cb29647d0..63f7db99c2a6 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -65,7 +65,7 @@ struct inet_protosw {
 	int		 protocol; /* This is the L4 protocol number.  */
 
 	struct proto	 *prot;
-	struct proto_ops *ops;
+	const struct proto_ops *ops;
   
 	int              capability; /* Which (if any) capability do
 				      * we need to use this socket
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 7982656b9c83..296f186802ff 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -63,7 +63,7 @@
 #include <linux/atalk.h>
 
 struct datalink_proto *ddp_dl, *aarp_dl;
-static struct proto_ops atalk_dgram_ops;
+static const struct proto_ops atalk_dgram_ops;
 
 /**************************************************************************\
 *                                                                          *
@@ -1841,7 +1841,7 @@ static struct net_proto_family atalk_family_ops = {
 	.owner		= THIS_MODULE,
 };
 
-static struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = {
 	.family		= PF_APPLETALK,
 	.owner		= THIS_MODULE,
 	.release	= atalk_release,
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 2684a92da22b..f2c541774dcd 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -102,7 +102,7 @@ static int pvc_getname(struct socket *sock,struct sockaddr *sockaddr,
 }
 
 
-static struct proto_ops pvc_proto_ops = {
+static const struct proto_ops pvc_proto_ops = {
 	.family =	PF_ATMPVC,
 	.owner =	THIS_MODULE,
 
diff --git a/net/atm/svc.c b/net/atm/svc.c
index d7b266136bf6..3a180cfd7b48 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -613,7 +613,7 @@ static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	return error;
 }
 
-static struct proto_ops svc_proto_ops = {
+static const struct proto_ops svc_proto_ops = {
 	.family =	PF_ATMSVC,
 	.owner =	THIS_MODULE,
 
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 1b683f302657..8b5d10aaba05 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -54,7 +54,7 @@
 HLIST_HEAD(ax25_list);
 DEFINE_SPINLOCK(ax25_list_lock);
 
-static struct proto_ops ax25_proto_ops;
+static const struct proto_ops ax25_proto_ops;
 
 static void ax25_free_sock(struct sock *sk)
 {
@@ -1944,7 +1944,7 @@ static struct net_proto_family ax25_family_ops = {
 	.owner	=	THIS_MODULE,
 };
 
-static struct proto_ops ax25_proto_ops = {
+static const struct proto_ops ax25_proto_ops = {
 	.family		= PF_AX25,
 	.owner		= THIS_MODULE,
 	.release	= ax25_release,
diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c
index 9778c6acd53b..ccbaf69afc5b 100644
--- a/net/bluetooth/bnep/sock.c
+++ b/net/bluetooth/bnep/sock.c
@@ -146,7 +146,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
 	return 0;
 }
 
-static struct proto_ops bnep_sock_ops = {
+static const struct proto_ops bnep_sock_ops = {
 	.family     = PF_BLUETOOTH,
 	.owner      = THIS_MODULE,
 	.release    = bnep_sock_release,
diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c
index beb045bf5714..5e22343b6090 100644
--- a/net/bluetooth/cmtp/sock.c
+++ b/net/bluetooth/cmtp/sock.c
@@ -137,7 +137,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
 	return -EINVAL;
 }
 
-static struct proto_ops cmtp_sock_ops = {
+static const struct proto_ops cmtp_sock_ops = {
 	.family		= PF_BLUETOOTH,
 	.owner		= THIS_MODULE,
 	.release	= cmtp_sock_release,
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 1d6d0a15c099..84e6c93a044a 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -575,7 +575,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char
 	return 0;
 }
 
-static struct proto_ops hci_sock_ops = {
+static const struct proto_ops hci_sock_ops = {
 	.family		= PF_BLUETOOTH,
 	.owner		= THIS_MODULE,
 	.release	= hci_sock_release,
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
index f8986f881431..8f8dd931b294 100644
--- a/net/bluetooth/hidp/sock.c
+++ b/net/bluetooth/hidp/sock.c
@@ -143,7 +143,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
 	return -EINVAL;
 }
 
-static struct proto_ops hidp_sock_ops = {
+static const struct proto_ops hidp_sock_ops = {
 	.family		= PF_BLUETOOTH,
 	.owner		= THIS_MODULE,
 	.release	= hidp_sock_release,
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index 95f33cc7a24e..7f0781e4326f 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -57,7 +57,7 @@
 
 #define VERSION "2.8"
 
-static struct proto_ops l2cap_sock_ops;
+static const struct proto_ops l2cap_sock_ops;
 
 static struct bt_sock_list l2cap_sk_list = {
 	.lock = RW_LOCK_UNLOCKED
@@ -2161,7 +2161,7 @@ static ssize_t l2cap_sysfs_show(struct class *dev, char *buf)
 
 static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL);
 
-static struct proto_ops l2cap_sock_ops = {
+static const struct proto_ops l2cap_sock_ops = {
 	.family		= PF_BLUETOOTH,
 	.owner		= THIS_MODULE,
 	.release	= l2cap_sock_release,
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 6c34261b232e..757d2dd3b02f 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -58,7 +58,7 @@
 #define BT_DBG(D...)
 #endif
 
-static struct proto_ops rfcomm_sock_ops;
+static const struct proto_ops rfcomm_sock_ops;
 
 static struct bt_sock_list rfcomm_sk_list = {
 	.lock = RW_LOCK_UNLOCKED
@@ -907,7 +907,7 @@ static ssize_t rfcomm_sock_sysfs_show(struct class *dev, char *buf)
 
 static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL);
 
-static struct proto_ops rfcomm_sock_ops = {
+static const struct proto_ops rfcomm_sock_ops = {
 	.family		= PF_BLUETOOTH,
 	.owner		= THIS_MODULE,
 	.release	= rfcomm_sock_release,
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 648181430699..6b61323ce23c 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -56,7 +56,7 @@
 
 #define VERSION "0.5"
 
-static struct proto_ops sco_sock_ops;
+static const struct proto_ops sco_sock_ops;
 
 static struct bt_sock_list sco_sk_list = {
 	.lock = RW_LOCK_UNLOCKED
@@ -914,7 +914,7 @@ static ssize_t sco_sysfs_show(struct class *dev, char *buf)
 
 static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL);
 
-static struct proto_ops sco_sock_ops = {
+static const struct proto_ops sco_sock_ops = {
 	.family		= PF_BLUETOOTH,
 	.owner		= THIS_MODULE,
 	.release	= sco_sock_release,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 40a4c6899051..e4e629ed9bf7 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -680,7 +680,7 @@ void dccp_shutdown(struct sock *sk, int how)
 
 EXPORT_SYMBOL_GPL(dccp_shutdown);
 
-static struct proto_ops inet_dccp_ops = {
+static const struct proto_ops inet_dccp_ops = {
 	.family		= PF_INET,
 	.owner		= THIS_MODULE,
 	.release	= inet_release,
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index d402e9020c68..65e3baed0251 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -149,7 +149,7 @@ static void dn_keepalive(struct sock *sk);
 #define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1)
 
 
-static struct proto_ops dn_proto_ops;
+static const struct proto_ops dn_proto_ops;
 static DEFINE_RWLOCK(dn_hash_lock);
 static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
 static struct hlist_head dn_wild_sk;
@@ -2342,7 +2342,7 @@ static struct net_proto_family	dn_family_ops = {
 	.owner	=	THIS_MODULE,
 };
 
-static struct proto_ops dn_proto_ops = {
+static const struct proto_ops dn_proto_ops = {
 	.family =	AF_DECnet,
 	.owner =	THIS_MODULE,
 	.release =	dn_release,
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index 34fdac51df96..ff58f49c8b4a 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -45,7 +45,7 @@
 #include <asm/uaccess.h>
 #include <asm/system.h>
 
-static struct proto_ops econet_ops;
+static const struct proto_ops econet_ops;
 static struct hlist_head econet_sklist;
 static DEFINE_RWLOCK(econet_lock);
 
@@ -698,7 +698,7 @@ static struct net_proto_family econet_family_ops = {
 	.owner	=	THIS_MODULE,
 };
 
-static struct proto_ops SOCKOPS_WRAPPED(econet_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(econet_ops) = {
 	.family =	PF_ECONET,
 	.owner =	THIS_MODULE,
 	.release =	econet_release,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 617e858beff1..4ed8a814c6cb 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -785,7 +785,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
-struct proto_ops inet_stream_ops = {
+const struct proto_ops inet_stream_ops = {
 	.family =	PF_INET,
 	.owner =	THIS_MODULE,
 	.release =	inet_release,
@@ -806,7 +806,7 @@ struct proto_ops inet_stream_ops = {
 	.sendpage =	tcp_sendpage
 };
 
-struct proto_ops inet_dgram_ops = {
+const struct proto_ops inet_dgram_ops = {
 	.family =	PF_INET,
 	.owner =	THIS_MODULE,
 	.release =	inet_release,
@@ -831,7 +831,7 @@ struct proto_ops inet_dgram_ops = {
  * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
  * udp_poll
  */
-static struct proto_ops inet_sockraw_ops = {
+static const struct proto_ops inet_sockraw_ops = {
 	.family =	PF_INET,
 	.owner =	THIS_MODULE,
 	.release =	inet_release,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 70a510ff31ee..7c9f19269f21 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -462,7 +462,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	return(0);
 }
 
-struct proto_ops inet6_stream_ops = {
+const struct proto_ops inet6_stream_ops = {
 	.family =	PF_INET6,
 	.owner =	THIS_MODULE,
 	.release =	inet6_release,
@@ -483,7 +483,7 @@ struct proto_ops inet6_stream_ops = {
 	.sendpage =	tcp_sendpage
 };
 
-struct proto_ops inet6_dgram_ops = {
+const struct proto_ops inet6_dgram_ops = {
 	.family =	PF_INET6,
 	.owner =	THIS_MODULE,
 	.release =	inet6_release,
@@ -511,7 +511,7 @@ static struct net_proto_family inet6_family_ops = {
 };
 
 /* Same as inet6_dgram_ops, sans udp_poll.  */
-static struct proto_ops inet6_sockraw_ops = {
+static const struct proto_ops inet6_sockraw_ops = {
 	.family =	PF_INET6,
 	.owner =	THIS_MODULE,
 	.release =	inet6_release,
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 34b3bb868409..6c464c11bb09 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -75,7 +75,7 @@ static struct datalink_proto *pEII_datalink;
 static struct datalink_proto *p8023_datalink;
 static struct datalink_proto *pSNAP_datalink;
 
-static struct proto_ops ipx_dgram_ops;
+static const struct proto_ops ipx_dgram_ops;
 
 LIST_HEAD(ipx_interfaces);
 DEFINE_SPINLOCK(ipx_interfaces_lock);
@@ -1901,7 +1901,7 @@ static struct net_proto_family ipx_family_ops = {
 	.owner		= THIS_MODULE,
 };
 
-static struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = {
 	.family		= PF_IPX,
 	.owner		= THIS_MODULE,
 	.release	= ipx_release,
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index f121f7de2032..e57683d424f7 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -62,12 +62,12 @@
 
 static int irda_create(struct socket *sock, int protocol);
 
-static struct proto_ops irda_stream_ops;
-static struct proto_ops irda_seqpacket_ops;
-static struct proto_ops irda_dgram_ops;
+static const struct proto_ops irda_stream_ops;
+static const struct proto_ops irda_seqpacket_ops;
+static const struct proto_ops irda_dgram_ops;
 
 #ifdef CONFIG_IRDA_ULTRA
-static struct proto_ops irda_ultra_ops;
+static const struct proto_ops irda_ultra_ops;
 #define ULTRA_MAX_DATA 382
 #endif /* CONFIG_IRDA_ULTRA */
 
@@ -2464,7 +2464,7 @@ static struct net_proto_family irda_family_ops = {
 	.owner	= THIS_MODULE,
 };
 
-static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = {
 	.family =	PF_IRDA,
 	.owner =	THIS_MODULE,
 	.release =	irda_release,
@@ -2485,7 +2485,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = {
 	.sendpage =	sock_no_sendpage,
 };
 
-static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = {
 	.family =	PF_IRDA,
 	.owner =	THIS_MODULE,
 	.release =	irda_release,
@@ -2506,7 +2506,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = {
 	.sendpage =	sock_no_sendpage,
 };
 
-static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = {
 	.family =	PF_IRDA,
 	.owner =	THIS_MODULE,
 	.release =	irda_release,
@@ -2528,7 +2528,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = {
 };
 
 #ifdef CONFIG_IRDA_ULTRA
-static struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = {
 	.family =	PF_IRDA,
 	.owner =	THIS_MODULE,
 	.release =	irda_release,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index d32f7791f1e4..52efd04cbedb 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -113,7 +113,7 @@ static __inline__ void pfkey_unlock_table(void)
 }
 
 
-static struct proto_ops pfkey_ops;
+static const struct proto_ops pfkey_ops;
 
 static void pfkey_insert(struct sock *sk)
 {
@@ -3127,7 +3127,7 @@ out:
 	return err;
 }
 
-static struct proto_ops pfkey_ops = {
+static const struct proto_ops pfkey_ops = {
 	.family		=	PF_KEY,
 	.owner		=	THIS_MODULE,
 	/* Operations that make no sense on pfkey sockets. */
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index b6d3df5c911c..9cf65f9d8902 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -36,7 +36,7 @@
 static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
 static u16 llc_ui_sap_link_no_max[256];
 static struct sockaddr_llc llc_ui_addrnull;
-static struct proto_ops llc_ui_ops;
+static const struct proto_ops llc_ui_ops;
 
 static int llc_ui_wait_for_conn(struct sock *sk, long timeout);
 static int llc_ui_wait_for_disc(struct sock *sk, long timeout);
@@ -1098,7 +1098,7 @@ static struct net_proto_family llc_ui_family_ops = {
 	.owner	= THIS_MODULE,
 };
 
-static struct proto_ops llc_ui_ops = {
+static const struct proto_ops llc_ui_ops = {
 	.family	     = PF_LLC,
 	.owner       = THIS_MODULE,
 	.release     = llc_ui_release,
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 96020d7087e8..7849cac14d3a 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -293,7 +293,7 @@ static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len)
 	return 0;
 }
 
-static struct proto_ops netlink_ops;
+static const struct proto_ops netlink_ops;
 
 static int netlink_insert(struct sock *sk, u32 pid)
 {
@@ -1656,7 +1656,7 @@ int netlink_unregister_notifier(struct notifier_block *nb)
 	return notifier_chain_unregister(&netlink_chain, nb);
 }
                 
-static struct proto_ops netlink_ops = {
+static const struct proto_ops netlink_ops = {
 	.family =	PF_NETLINK,
 	.owner =	THIS_MODULE,
 	.release =	netlink_release,
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index e5d82d711cae..05b653c05971 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -63,7 +63,7 @@ static unsigned short circuit = 0x101;
 static HLIST_HEAD(nr_list);
 static DEFINE_SPINLOCK(nr_list_lock);
 
-static struct proto_ops nr_proto_ops;
+static const struct proto_ops nr_proto_ops;
 
 /*
  *	Socket removal during an interrupt is now safe.
@@ -1337,7 +1337,7 @@ static struct net_proto_family nr_family_ops = {
 	.owner		=	THIS_MODULE,
 };
 
-static struct proto_ops nr_proto_ops = {
+static const struct proto_ops nr_proto_ops = {
 	.family		=	PF_NETROM,
 	.owner		=	THIS_MODULE,
 	.release	=	nr_release,
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3e2462760413..deda6fdb1e53 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -251,10 +251,10 @@ static void packet_sock_destruct(struct sock *sk)
 }
 
 
-static struct proto_ops packet_ops;
+static const struct proto_ops packet_ops;
 
 #ifdef CONFIG_SOCK_PACKET
-static struct proto_ops packet_ops_spkt;
+static const struct proto_ops packet_ops_spkt;
 
 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
 {
@@ -1784,7 +1784,7 @@ out:
 
 
 #ifdef CONFIG_SOCK_PACKET
-static struct proto_ops packet_ops_spkt = {
+static const struct proto_ops packet_ops_spkt = {
 	.family =	PF_PACKET,
 	.owner =	THIS_MODULE,
 	.release =	packet_release,
@@ -1806,7 +1806,7 @@ static struct proto_ops packet_ops_spkt = {
 };
 #endif
 
-static struct proto_ops packet_ops = {
+static const struct proto_ops packet_ops = {
 	.family =	PF_PACKET,
 	.owner =	THIS_MODULE,
 	.release =	packet_release,
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index fa3be2b8fb5f..15c05165c905 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -866,7 +866,7 @@ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt,
 	return 2;
 }
 
-static struct proto_ops inet6_seqpacket_ops = {
+static const struct proto_ops inet6_seqpacket_ops = {
 	.family     = PF_INET6,
 	.owner      = THIS_MODULE,
 	.release    = inet6_release,
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f775d78aa59d..d1b0747a5b9d 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -829,7 +829,7 @@ static struct notifier_block sctp_inetaddr_notifier = {
 };
 
 /* Socket operations.  */
-static struct proto_ops inet_seqpacket_ops = {
+static const struct proto_ops inet_seqpacket_ops = {
 	.family      = PF_INET,
 	.owner       = THIS_MODULE,
 	.release     = inet_release,       /* Needs to be wrapped... */
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index c6a51911e71e..d68eba481291 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -758,7 +758,7 @@ svc_tcp_accept(struct svc_sock *svsk)
 	struct svc_serv	*serv = svsk->sk_server;
 	struct socket	*sock = svsk->sk_sock;
 	struct socket	*newsock;
-	struct proto_ops *ops;
+	const struct proto_ops *ops;
 	struct svc_sock	*newsvsk;
 	int		err, slen;
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 04e850e04e3d..7d3fe6aebcdb 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -473,7 +473,7 @@ static int unix_dgram_connect(struct socket *, struct sockaddr *,
 static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
 				  struct msghdr *, size_t);
 
-static struct proto_ops unix_stream_ops = {
+static const struct proto_ops unix_stream_ops = {
 	.family =	PF_UNIX,
 	.owner =	THIS_MODULE,
 	.release =	unix_release,
@@ -494,7 +494,7 @@ static struct proto_ops unix_stream_ops = {
 	.sendpage =	sock_no_sendpage,
 };
 
-static struct proto_ops unix_dgram_ops = {
+static const struct proto_ops unix_dgram_ops = {
 	.family =	PF_UNIX,
 	.owner =	THIS_MODULE,
 	.release =	unix_release,
@@ -515,7 +515,7 @@ static struct proto_ops unix_dgram_ops = {
 	.sendpage =	sock_no_sendpage,
 };
 
-static struct proto_ops unix_seqpacket_ops = {
+static const struct proto_ops unix_seqpacket_ops = {
 	.family =	PF_UNIX,
 	.owner =	THIS_MODULE,
 	.release =	unix_release,
diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c
index 59fec59b2132..67948bf22dc4 100644
--- a/net/wanrouter/af_wanpipe.c
+++ b/net/wanrouter/af_wanpipe.c
@@ -181,7 +181,7 @@ struct wanpipe_opt
 #endif
 
 static int sk_count;
-extern struct proto_ops wanpipe_ops;
+extern const struct proto_ops wanpipe_ops;
 static unsigned long find_free_critical;
 
 static void wanpipe_unlink_driver(struct sock *sk);
@@ -2546,7 +2546,7 @@ static int wanpipe_connect(struct socket *sock, struct sockaddr *uaddr, int addr
 	return 0;
 }
 
-struct proto_ops wanpipe_ops = {
+const struct proto_ops wanpipe_ops = {
 	.family = 	PF_WANPIPE,
 	.owner =	THIS_MODULE,
 	.release = 	wanpipe_release,
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 020d73cc8414..ca8b3b0b920d 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -64,7 +64,7 @@ int sysctl_x25_ack_holdback_timeout    = X25_DEFAULT_T2;
 HLIST_HEAD(x25_list);
 DEFINE_RWLOCK(x25_list_lock);
 
-static struct proto_ops x25_proto_ops;
+static const struct proto_ops x25_proto_ops;
 
 static struct x25_address null_x25_address = {"               "};
 
@@ -1391,7 +1391,7 @@ static struct net_proto_family x25_family_ops = {
 	.owner	=	THIS_MODULE,
 };
 
-static struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = {
 	.family =	AF_X25,
 	.owner =	THIS_MODULE,
 	.release =	x25_release,
-- 
cgit v1.2.3-71-gd317


From 14c850212ed8f8cbb5972ad6b8812e08a0bc901c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 27 Dec 2005 02:43:12 -0200
Subject: [INET_SOCK]: Move struct inet_sock & helper functions to
 net/inet_sock.h

To help in reducing the number of include dependencies, several files were
touched as they were getting needed headers indirectly for stuff they use.

Thanks also to Alan Menegotto for pointing out that net/dccp/proto.c had
linux/dccp.h include twice.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c      |   2 +
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   2 +
 drivers/net/ns83820.c                          |   1 +
 drivers/net/sk98lin/skge.c                     |   1 +
 drivers/net/skge.c                             |   1 +
 drivers/net/tg3.c                              |   1 +
 fs/9p/trans_sock.c                             |   1 +
 fs/nfs/callback.c                              |   3 +
 include/linux/dccp.h                           |   3 +-
 include/linux/ip.h                             | 126 +---------------
 include/linux/ipv6.h                           |   7 +-
 include/linux/udp.h                            |   6 +-
 include/net/atmclip.h                          |   2 +-
 include/net/dst.h                              |   1 +
 include/net/icmp.h                             |   9 +-
 include/net/ieee80211_crypt.h                  |   9 +-
 include/net/inet_connection_sock.h             |   3 +-
 include/net/inet_ecn.h                         |   2 +
 include/net/inet_hashtables.h                  |  21 +--
 include/net/inet_sock.h                        | 193 +++++++++++++++++++++++++
 include/net/inet_timewait_sock.h               |   2 +-
 include/net/ip.h                               |  17 ++-
 include/net/ip_fib.h                           |   2 +
 include/net/ip_vs.h                            |  12 +-
 include/net/ipv6.h                             |   3 +
 include/net/ndisc.h                            |  17 ++-
 include/net/neighbour.h                        |   2 +-
 include/net/pkt_act.h                          |   1 -
 include/net/raw.h                              |   2 +
 include/net/udp.h                              |   4 +-
 include/net/xfrm.h                             |   3 +-
 net/bridge/br_netfilter.c                      |   4 +
 net/bridge/netfilter/ebt_log.c                 |   1 +
 net/core/netpoll.c                             |   1 +
 net/dccp/ccid.h                                |   2 +
 net/dccp/ipv4.c                                |   1 +
 net/dccp/ipv6.c                                |   1 +
 net/dccp/output.c                              |   1 +
 net/dccp/proto.c                               |   3 +-
 net/econet/af_econet.c                         |   1 +
 net/ipv4/af_inet.c                             |   1 +
 net/ipv4/ah4.c                                 |   1 +
 net/ipv4/arp.c                                 |   1 +
 net/ipv4/devinet.c                             |   1 +
 net/ipv4/esp4.c                                |   1 +
 net/ipv4/fib_frontend.c                        |   1 +
 net/ipv4/fib_hash.c                            |   1 +
 net/ipv4/fib_rules.c                           |   1 +
 net/ipv4/fib_semantics.c                       |   2 +
 net/ipv4/icmp.c                                |   1 +
 net/ipv4/igmp.c                                |   2 +
 net/ipv4/ip_input.c                            |   1 +
 net/ipv4/ip_options.c                          |   1 +
 net/ipv4/ip_sockglue.c                         |   1 +
 net/ipv4/ipcomp.c                              |   1 +
 net/ipv4/ipconfig.c                            |   2 +
 net/ipv4/ipmr.c                                |   1 +
 net/ipv4/ipvs/ip_vs_conn.c                     |   2 +
 net/ipv4/ipvs/ip_vs_ctl.c                      |   1 +
 net/ipv4/ipvs/ip_vs_dh.c                       |   2 +
 net/ipv4/ipvs/ip_vs_est.c                      |   3 +
 net/ipv4/ipvs/ip_vs_lblc.c                     |   2 +
 net/ipv4/ipvs/ip_vs_lblcr.c                    |   2 +
 net/ipv4/ipvs/ip_vs_proto_ah.c                 |   2 +
 net/ipv4/ipvs/ip_vs_proto_esp.c                |   2 +
 net/ipv4/ipvs/ip_vs_proto_udp.c                |   3 +
 net/ipv4/ipvs/ip_vs_sh.c                       |   2 +
 net/ipv4/ipvs/ip_vs_sync.c                     |   2 +
 net/ipv4/netfilter/ip_conntrack_amanda.c       |   2 +
 net/ipv4/netfilter/ip_conntrack_proto_gre.c    |   1 +
 net/ipv4/netfilter/ip_conntrack_proto_udp.c    |   1 +
 net/ipv4/netfilter/ip_conntrack_standalone.c   |   1 +
 net/ipv4/netfilter/ip_nat_snmp_basic.c         |   2 +
 net/ipv4/netfilter/ipt_MASQUERADE.c            |   2 +
 net/ipv4/netfilter/ipt_physdev.c               |   1 +
 net/ipv4/proc.c                                |   1 +
 net/ipv4/sysctl_net_ipv4.c                     |   1 +
 net/ipv4/udp.c                                 |   1 +
 net/ipv6/ah6.c                                 |   1 +
 net/ipv6/esp6.c                                |   1 +
 net/ipv6/ipcomp6.c                             |   1 +
 net/ipv6/netfilter/ip6_tables.c                |   1 +
 net/ipv6/netfilter/ip6t_LOG.c                  |   1 +
 net/ipv6/netfilter/ip6t_ah.c                   |   1 +
 net/ipv6/netfilter/ip6t_esp.c                  |   1 +
 net/sched/sch_teql.c                           |   1 +
 net/sctp/protocol.c                            |   1 +
 87 files changed, 355 insertions(+), 184 deletions(-)
 create mode 100644 include/net/inet_sock.h

(limited to 'include/linux')

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 475d98fa9e26..780009c7eaa6 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -47,6 +47,8 @@
 #include <linux/ip.h>
 #include <linux/in.h>
 
+#include <net/dst.h>
+
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
 MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index ef3ee035bbc8..ed0c2ead8bc1 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -43,6 +43,8 @@
 #include <linux/delay.h>
 #include <linux/completion.h>
 
+#include <net/dst.h>
+
 #include "ipoib.h"
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
diff --git a/drivers/net/ns83820.c b/drivers/net/ns83820.c
index f857ae94d261..b0c3b6ab6263 100644
--- a/drivers/net/ns83820.c
+++ b/drivers/net/ns83820.c
@@ -115,6 +115,7 @@
 #include <linux/ethtool.h>
 #include <linux/timer.h>
 #include <linux/if_vlan.h>
+#include <linux/rtnetlink.h>
 
 #include <asm/io.h>
 #include <asm/uaccess.h>
diff --git a/drivers/net/sk98lin/skge.c b/drivers/net/sk98lin/skge.c
index ae7343934758..e1a2d52cc1fe 100644
--- a/drivers/net/sk98lin/skge.c
+++ b/drivers/net/sk98lin/skge.c
@@ -107,6 +107,7 @@
 
 #include	"h/skversion.h"
 
+#include	<linux/in.h>
 #include	<linux/module.h>
 #include	<linux/moduleparam.h>
 #include	<linux/init.h>
diff --git a/drivers/net/skge.c b/drivers/net/skge.c
index 00d683063c01..d8cc3aea032a 100644
--- a/drivers/net/skge.c
+++ b/drivers/net/skge.c
@@ -25,6 +25,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/in.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 2fc9893d69e1..59d916ccc810 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -24,6 +24,7 @@
 #include <linux/compiler.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
+#include <linux/in.h>
 #include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/pci.h>
diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c
index a93c2bf94c33..6a9a75d40f73 100644
--- a/fs/9p/trans_sock.c
+++ b/fs/9p/trans_sock.c
@@ -26,6 +26,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/in.h>
 #include <linux/module.h>
 #include <linux/net.h>
 #include <linux/ipv6.h>
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index f2ca782aba33..30cae3602867 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -14,6 +14,9 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/nfs_fs.h>
+
+#include <net/inet_sock.h>
+
 #include "nfs4_fs.h"
 #include "callback.h"
 
diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index d0bdb499cf8d..088529f54965 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -192,10 +192,9 @@ enum {
 #include <linux/workqueue.h>
 
 #include <net/inet_connection_sock.h>
+#include <net/inet_sock.h>
 #include <net/inet_timewait_sock.h>
-#include <net/sock.h>
 #include <net/tcp_states.h>
-#include <net/tcp.h>
 
 enum dccp_state {
 	DCCP_OPEN	= TCP_ESTABLISHED,
diff --git a/include/linux/ip.h b/include/linux/ip.h
index 6ccc596c19c8..9e2eb9a602eb 100644
--- a/include/linux/ip.h
+++ b/include/linux/ip.h
@@ -16,6 +16,7 @@
  */
 #ifndef _LINUX_IP_H
 #define _LINUX_IP_H
+#include <linux/types.h>
 #include <asm/byteorder.h>
 
 #define IPTOS_TOS_MASK		0x1E
@@ -78,131 +79,6 @@
 #define	IPOPT_TS_TSANDADDR	1		/* timestamps and addresses */
 #define	IPOPT_TS_PRESPEC	3		/* specified modules only */
 
-#ifdef __KERNEL__
-#include <linux/config.h>
-#include <linux/types.h>
-#include <net/request_sock.h>
-#include <net/sock.h>
-#include <linux/igmp.h>
-#include <net/flow.h>
-
-struct ip_options {
-  __u32		faddr;				/* Saved first hop address */
-  unsigned char	optlen;
-  unsigned char srr;
-  unsigned char rr;
-  unsigned char ts;
-  unsigned char is_setbyuser:1,			/* Set by setsockopt?			*/
-                is_data:1,			/* Options in __data, rather than skb	*/
-                is_strictroute:1,		/* Strict source route			*/
-                srr_is_hit:1,			/* Packet destination addr was our one	*/
-                is_changed:1,			/* IP checksum more not valid		*/	
-                rr_needaddr:1,			/* Need to record addr of outgoing dev	*/
-                ts_needtime:1,			/* Need to record timestamp		*/
-                ts_needaddr:1;			/* Need to record addr of outgoing dev  */
-  unsigned char router_alert;
-  unsigned char __pad1;
-  unsigned char __pad2;
-  unsigned char __data[0];
-};
-
-#define optlength(opt) (sizeof(struct ip_options) + opt->optlen)
-
-struct inet_request_sock {
-	struct request_sock	req;
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-	u16			inet6_rsk_offset;
-	/* 2 bytes hole, try to pack */
-#endif
-	u32			loc_addr;
-	u32			rmt_addr;
-	u16			rmt_port;
-	u16			snd_wscale : 4, 
-				rcv_wscale : 4, 
-				tstamp_ok  : 1,
-				sack_ok	   : 1,
-				wscale_ok  : 1,
-				ecn_ok	   : 1,
-				acked	   : 1;
-	struct ip_options	*opt;
-};
-
-static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
-{
-	return (struct inet_request_sock *)sk;
-}
-
-struct ipv6_pinfo;
-
-struct inet_sock {
-	/* sk and pinet6 has to be the first two members of inet_sock */
-	struct sock		sk;
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-	struct ipv6_pinfo	*pinet6;
-#endif
-	/* Socket demultiplex comparisons on incoming packets. */
-	__u32			daddr;		/* Foreign IPv4 addr */
-	__u32			rcv_saddr;	/* Bound local IPv4 addr */
-	__u16			dport;		/* Destination port */
-	__u16			num;		/* Local port */
-	__u32			saddr;		/* Sending source */
-	__s16			uc_ttl;		/* Unicast TTL */
-	__u16			cmsg_flags;
-	struct ip_options	*opt;
-	__u16			sport;		/* Source port */
-	__u16			id;		/* ID counter for DF pkts */
-	__u8			tos;		/* TOS */
-	__u8			mc_ttl;		/* Multicasting TTL */
-	__u8			pmtudisc;
-	unsigned		recverr : 1,
-				is_icsk : 1,	/* inet_connection_sock? */
-				freebind : 1,
-				hdrincl : 1,
-				mc_loop : 1;
-	int			mc_index;	/* Multicast device index */
-	__u32			mc_addr;
-	struct ip_mc_socklist	*mc_list;	/* Group array */
-	/*
-	 * Following members are used to retain the infomation to build
-	 * an ip header on each ip fragmentation while the socket is corked.
-	 */
-	struct {
-		unsigned int		flags;
-		unsigned int		fragsize;
-		struct ip_options	*opt;
-		struct rtable		*rt;
-		int			length; /* Total length of all frames */
-		u32			addr;
-		struct flowi		fl;
-	} cork;
-};
-
-#define IPCORK_OPT	1	/* ip-options has been held in ipcork.opt */
-#define IPCORK_ALLFRAG	2	/* always fragment (for ipv6 for now) */
-
-static inline struct inet_sock *inet_sk(const struct sock *sk)
-{
-	return (struct inet_sock *)sk;
-}
-
-static inline void __inet_sk_copy_descendant(struct sock *sk_to,
-					     const struct sock *sk_from,
-					     const int ancestor_size)
-{
-	memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1,
-	       sk_from->sk_prot->obj_size - ancestor_size);
-}
-#if !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE))
-static inline void inet_sk_copy_descendant(struct sock *sk_to,
-					   const struct sock *sk_from)
-{
-	__inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock));
-}
-#endif
-#endif
-
-extern int inet_sk_rebuild_header(struct sock *sk);
-
 struct iphdr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
 	__u8	ihl:4,
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index a0d04891fe12..93bbed5c6cf4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -171,12 +171,13 @@ enum {
 };
 
 #ifdef __KERNEL__
-#include <linux/in6.h>          /* struct sockaddr_in6 */
 #include <linux/icmpv6.h>
-#include <net/if_inet6.h>       /* struct ipv6_mc_socklist */
 #include <linux/tcp.h>
 #include <linux/udp.h>
 
+#include <net/if_inet6.h>       /* struct ipv6_mc_socklist */
+#include <net/inet_sock.h>
+
 /* 
    This structure contains results of exthdrs parsing
    as offsets from skb->nh.
@@ -346,8 +347,6 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to,
 #define __ipv6_only_sock(sk)	(inet6_sk(sk)->ipv6only)
 #define ipv6_only_sock(sk)	((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk))
 
-#include <linux/tcp.h>
-
 struct inet6_timewait_sock {
 	struct in6_addr tw_v6_daddr;
 	struct in6_addr	tw_v6_rcv_saddr;
diff --git a/include/linux/udp.h b/include/linux/udp.h
index b60e0b4a25c4..85a55658831c 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -35,10 +35,10 @@ struct udphdr {
 #define UDP_ENCAP_ESPINUDP	2 /* draft-ietf-ipsec-udp-encaps-06 */
 
 #ifdef __KERNEL__
-
 #include <linux/config.h>
-#include <net/sock.h>
-#include <linux/ip.h>
+#include <linux/types.h>
+
+#include <net/inet_sock.h>
 
 struct udp_sock {
 	/* inet_sock has to be the first member */
diff --git a/include/net/atmclip.h b/include/net/atmclip.h
index 47048b1d179a..90fcc98e676f 100644
--- a/include/net/atmclip.h
+++ b/include/net/atmclip.h
@@ -7,7 +7,6 @@
 #define _ATMCLIP_H
 
 #include <linux/netdevice.h>
-#include <linux/skbuff.h>
 #include <linux/atm.h>
 #include <linux/atmdev.h>
 #include <linux/atmarp.h>
@@ -18,6 +17,7 @@
 #define CLIP_VCC(vcc) ((struct clip_vcc *) ((vcc)->user_back))
 #define NEIGH2ENTRY(neigh) ((struct atmarp_entry *) (neigh)->primary_key)
 
+struct sk_buff;
 
 struct clip_vcc {
 	struct atm_vcc	*vcc;		/* VCC descriptor */
diff --git a/include/net/dst.h b/include/net/dst.h
index 6c196a5baf24..bee8b84d329d 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -9,6 +9,7 @@
 #define _NET_DST_H
 
 #include <linux/config.h>
+#include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
 #include <linux/rcupdate.h>
 #include <linux/jiffies.h>
diff --git a/include/net/icmp.h b/include/net/icmp.h
index 6cdebeee5f96..e7c3f20fbafc 100644
--- a/include/net/icmp.h
+++ b/include/net/icmp.h
@@ -20,12 +20,9 @@
 
 #include <linux/config.h>
 #include <linux/icmp.h>
-#include <linux/skbuff.h>
 
-#include <net/sock.h>
-#include <net/protocol.h>
+#include <net/inet_sock.h>
 #include <net/snmp.h>
-#include <linux/ip.h>
 
 struct icmp_err {
   int		errno;
@@ -38,6 +35,10 @@ DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics);
 #define ICMP_INC_STATS_BH(field)	SNMP_INC_STATS_BH(icmp_statistics, field)
 #define ICMP_INC_STATS_USER(field) 	SNMP_INC_STATS_USER(icmp_statistics, field)
 
+struct dst_entry;
+struct net_proto_family;
+struct sk_buff;
+
 extern void	icmp_send(struct sk_buff *skb_in,  int type, int code, u32 info);
 extern int	icmp_rcv(struct sk_buff *skb);
 extern int	icmp_ioctl(struct sock *sk, int cmd, unsigned long arg);
diff --git a/include/net/ieee80211_crypt.h b/include/net/ieee80211_crypt.h
index 225fc751d464..03b766afdc39 100644
--- a/include/net/ieee80211_crypt.h
+++ b/include/net/ieee80211_crypt.h
@@ -23,12 +23,17 @@
 #ifndef IEEE80211_CRYPT_H
 #define IEEE80211_CRYPT_H
 
-#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <asm/atomic.h>
 
 enum {
 	IEEE80211_CRYPTO_TKIP_COUNTERMEASURES = (1 << 0),
 };
 
+struct sk_buff;
+struct module;
+
 struct ieee80211_crypto_ops {
 	const char *name;
 	struct list_head list;
@@ -87,6 +92,8 @@ struct ieee80211_crypt_data {
 	atomic_t refcnt;
 };
 
+struct ieee80211_device;
+
 int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops);
 int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops);
 struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name);
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 91888967d3e3..50234fa56a68 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -16,9 +16,10 @@
 #define _INET_CONNECTION_SOCK_H
 
 #include <linux/compiler.h>
-#include <linux/ip.h>
 #include <linux/string.h>
 #include <linux/timer.h>
+
+#include <net/inet_sock.h>
 #include <net/request_sock.h>
 
 #define INET_CSK_DEBUG 1
diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h
index b0c47e2eccf1..d599c6bfbb86 100644
--- a/include/net/inet_ecn.h
+++ b/include/net/inet_ecn.h
@@ -3,6 +3,8 @@
 
 #include <linux/ip.h>
 #include <linux/skbuff.h>
+
+#include <net/inet_sock.h>
 #include <net/dsfield.h>
 
 enum {
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index c83baa79f66e..135d80fd658e 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -26,6 +26,7 @@
 #include <linux/wait.h>
 
 #include <net/inet_connection_sock.h>
+#include <net/inet_sock.h>
 #include <net/route.h>
 #include <net/sock.h>
 #include <net/tcp_states.h>
@@ -128,26 +129,6 @@ struct inet_hashinfo {
 	kmem_cache_t			*bind_bucket_cachep;
 };
 
-static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport,
-			       const __u32 faddr, const __u16 fport)
-{
-	unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);
-	h ^= h >> 16;
-	h ^= h >> 8;
-	return h;
-}
-
-static inline int inet_sk_ehashfn(const struct sock *sk)
-{
-	const struct inet_sock *inet = inet_sk(sk);
-	const __u32 laddr = inet->rcv_saddr;
-	const __u16 lport = inet->num;
-	const __u32 faddr = inet->daddr;
-	const __u16 fport = inet->dport;
-
-	return inet_ehashfn(laddr, lport, faddr, fport);
-}
-
 static inline struct inet_ehash_bucket *inet_ehash_bucket(
 	struct inet_hashinfo *hashinfo,
 	unsigned int hash)
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
new file mode 100644
index 000000000000..883eb529ef8e
--- /dev/null
+++ b/include/net/inet_sock.h
@@ -0,0 +1,193 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Definitions for inet_sock
+ *
+ * Authors:	Many, reorganised here by
+ * 		Arnaldo Carvalho de Melo <acme@mandriva.com>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#ifndef _INET_SOCK_H
+#define _INET_SOCK_H
+
+#include <linux/config.h>
+
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include <net/flow.h>
+#include <net/sock.h>
+#include <net/request_sock.h>
+
+/** struct ip_options - IP Options
+ *
+ * @faddr - Saved first hop address
+ * @is_setbyuser - Set by setsockopt?
+ * @is_data - Options in __data, rather than skb
+ * @is_strictroute - Strict source route
+ * @srr_is_hit - Packet destination addr was our one
+ * @is_changed - IP checksum more not valid
+ * @rr_needaddr - Need to record addr of outgoing dev
+ * @ts_needtime - Need to record timestamp
+ * @ts_needaddr - Need to record addr of outgoing dev
+ */
+struct ip_options {
+	__u32		faddr;
+	unsigned char	optlen;
+	unsigned char	srr;
+	unsigned char	rr;
+	unsigned char	ts;
+	unsigned char	is_setbyuser:1,
+			is_data:1,
+			is_strictroute:1,
+			srr_is_hit:1,
+			is_changed:1,
+			rr_needaddr:1,
+			ts_needtime:1,
+			ts_needaddr:1;
+	unsigned char	router_alert;
+	unsigned char	__pad1;
+	unsigned char	__pad2;
+	unsigned char	__data[0];
+};
+
+#define optlength(opt) (sizeof(struct ip_options) + opt->optlen)
+
+struct inet_request_sock {
+	struct request_sock	req;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	u16			inet6_rsk_offset;
+	/* 2 bytes hole, try to pack */
+#endif
+	u32			loc_addr;
+	u32			rmt_addr;
+	u16			rmt_port;
+	u16			snd_wscale : 4, 
+				rcv_wscale : 4, 
+				tstamp_ok  : 1,
+				sack_ok	   : 1,
+				wscale_ok  : 1,
+				ecn_ok	   : 1,
+				acked	   : 1;
+	struct ip_options	*opt;
+};
+
+static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
+{
+	return (struct inet_request_sock *)sk;
+}
+
+struct ip_mc_socklist;
+struct ipv6_pinfo;
+struct rtable;
+
+/** struct inet_sock - representation of INET sockets
+ *
+ * @sk - ancestor class
+ * @pinet6 - pointer to IPv6 control block
+ * @daddr - Foreign IPv4 addr
+ * @rcv_saddr - Bound local IPv4 addr
+ * @dport - Destination port
+ * @num - Local port
+ * @saddr - Sending source
+ * @uc_ttl - Unicast TTL
+ * @sport - Source port
+ * @id - ID counter for DF pkts
+ * @tos - TOS
+ * @mc_ttl - Multicasting TTL
+ * @is_icsk - is this an inet_connection_sock?
+ * @mc_index - Multicast device index
+ * @mc_list - Group array
+ * @cork - info to build ip hdr on each ip frag while socket is corked
+ */
+struct inet_sock {
+	/* sk and pinet6 has to be the first two members of inet_sock */
+	struct sock		sk;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct ipv6_pinfo	*pinet6;
+#endif
+	/* Socket demultiplex comparisons on incoming packets. */
+	__u32			daddr;
+	__u32			rcv_saddr;
+	__u16			dport;
+	__u16			num;
+	__u32			saddr;
+	__s16			uc_ttl;
+	__u16			cmsg_flags;
+	struct ip_options	*opt;
+	__u16			sport;
+	__u16			id;
+	__u8			tos;
+	__u8			mc_ttl;
+	__u8			pmtudisc;
+	__u8			recverr:1,
+				is_icsk:1,
+				freebind:1,
+				hdrincl:1,
+				mc_loop:1;
+	int			mc_index;
+	__u32			mc_addr;
+	struct ip_mc_socklist	*mc_list;
+	struct {
+		unsigned int		flags;
+		unsigned int		fragsize;
+		struct ip_options	*opt;
+		struct rtable		*rt;
+		int			length; /* Total length of all frames */
+		u32			addr;
+		struct flowi		fl;
+	} cork;
+};
+
+#define IPCORK_OPT	1	/* ip-options has been held in ipcork.opt */
+#define IPCORK_ALLFRAG	2	/* always fragment (for ipv6 for now) */
+
+static inline struct inet_sock *inet_sk(const struct sock *sk)
+{
+	return (struct inet_sock *)sk;
+}
+
+static inline void __inet_sk_copy_descendant(struct sock *sk_to,
+					     const struct sock *sk_from,
+					     const int ancestor_size)
+{
+	memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1,
+	       sk_from->sk_prot->obj_size - ancestor_size);
+}
+#if !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE))
+static inline void inet_sk_copy_descendant(struct sock *sk_to,
+					   const struct sock *sk_from)
+{
+	__inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock));
+}
+#endif
+
+extern int inet_sk_rebuild_header(struct sock *sk);
+
+static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport,
+					const __u32 faddr, const __u16 fport)
+{
+	unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);
+	h ^= h >> 16;
+	h ^= h >> 8;
+	return h;
+}
+
+static inline int inet_sk_ehashfn(const struct sock *sk)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const __u32 laddr = inet->rcv_saddr;
+	const __u16 lport = inet->num;
+	const __u32 faddr = inet->daddr;
+	const __u16 fport = inet->dport;
+
+	return inet_ehashfn(laddr, lport, faddr, fport);
+}
+
+#endif	/* _INET_SOCK_H */
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index e396a65473d7..1da294c47522 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -17,13 +17,13 @@
 
 #include <linux/config.h>
 
-#include <linux/ip.h>
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/timer.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
 
+#include <net/inet_sock.h>
 #include <net/sock.h>
 #include <net/tcp_states.h>
 #include <net/timewait_sock.h>
diff --git a/include/net/ip.h b/include/net/ip.h
index 4d6294ba038e..f7e7fd728b67 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -24,14 +24,10 @@
 
 #include <linux/config.h>
 #include <linux/types.h>
-#include <linux/socket.h>
 #include <linux/ip.h>
 #include <linux/in.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/in_route.h>
-#include <net/route.h>
-#include <net/arp.h>
+
+#include <net/inet_sock.h>
 #include <net/snmp.h>
 
 struct sock;
@@ -75,6 +71,13 @@ extern rwlock_t ip_ra_lock;
 
 #define IP_FRAG_TIME	(30 * HZ)		/* fragment lifetime	*/
 
+struct msghdr;
+struct net_device;
+struct packet_type;
+struct rtable;
+struct sk_buff;
+struct sockaddr;
+
 extern void		ip_mc_dropsocket(struct sock *);
 extern void		ip_mc_dropdevice(struct net_device *dev);
 extern int		igmp_mc_proc_init(void);
@@ -184,6 +187,8 @@ extern int sysctl_ip_dynaddr;
 extern void ipfrag_init(void);
 
 #ifdef CONFIG_INET
+#include <net/dst.h>
+
 /* The function in 2.2 was invalid, producing wrong result for
  * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */
 static inline
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 14de4ebd1211..e000fa2cd5f6 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -238,6 +238,8 @@ extern int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
 			       struct net_device *dev, u32 *spec_dst, u32 *itag);
 extern void fib_select_multipath(const struct flowi *flp, struct fib_result *res);
 
+struct rtentry;
+
 /* Exported by fib_semantics.c */
 extern int ip_fib_check_default(u32 gw, struct net_device *dev);
 extern int fib_sync_down(u32 local, struct net_device *dev, int force);
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 3b5559a023a4..7d2674fde19a 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -251,16 +251,15 @@ struct ip_vs_daemon_user {
 #include <linux/config.h>
 #include <linux/list.h>                 /* for struct list_head */
 #include <linux/spinlock.h>             /* for struct rwlock_t */
-#include <linux/skbuff.h>               /* for struct sk_buff */
-#include <linux/ip.h>                   /* for struct iphdr */
 #include <asm/atomic.h>                 /* for struct atomic_t */
-#include <linux/netdevice.h>		/* for struct neighbour */
-#include <net/dst.h>			/* for struct dst_entry */
-#include <net/udp.h>
 #include <linux/compiler.h>
+#include <linux/timer.h>
 
+#include <net/checksum.h>
 
 #ifdef CONFIG_IP_VS_DEBUG
+#include <linux/net.h>
+
 extern int ip_vs_get_debug_level(void);
 #define IP_VS_DBG(level, msg...)			\
     do {						\
@@ -429,8 +428,11 @@ struct ip_vs_stats
 	spinlock_t              lock;           /* spin lock */
 };
 
+struct dst_entry;
+struct iphdr;
 struct ip_vs_conn;
 struct ip_vs_app;
+struct sk_buff;
 
 struct ip_vs_protocol {
 	struct ip_vs_protocol	*next;
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 11a725662c36..860bbac4c4ee 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -541,6 +541,9 @@ extern int sysctl_ip6frag_secret_interval;
 extern const struct proto_ops inet6_stream_ops;
 extern const struct proto_ops inet6_dgram_ops;
 
+struct group_source_req;
+struct group_filter;
+
 extern int ip6_mc_source(int add, int omode, struct sock *sk,
 			 struct group_source_req *pgsr);
 extern int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf);
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index f85d6e4b7442..bbac87eeb422 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -35,11 +35,20 @@ enum {
 
 #ifdef __KERNEL__
 
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
+#include <linux/config.h>
+#include <linux/compiler.h>
 #include <linux/icmpv6.h>
+#include <linux/in6.h>
+#include <linux/types.h>
+
 #include <net/neighbour.h>
-#include <asm/atomic.h>
+
+struct ctl_table;
+struct file;
+struct inet6_dev;
+struct net_device;
+struct net_proto_family;
+struct sk_buff;
 
 extern struct neigh_table nd_tbl;
 
@@ -108,7 +117,7 @@ extern int			igmp6_event_report(struct sk_buff *skb);
 extern void			igmp6_cleanup(void);
 
 #ifdef CONFIG_SYSCTL
-extern int 			ndisc_ifinfo_sysctl_change(ctl_table *ctl,
+extern int 			ndisc_ifinfo_sysctl_change(struct ctl_table *ctl,
 							   int write,
 							   struct file * filp,
 							   void __user *buffer,
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 34c07731933d..6fa9ae190741 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -49,8 +49,8 @@
 #ifdef __KERNEL__
 
 #include <asm/atomic.h>
-#include <linux/skbuff.h>
 #include <linux/netdevice.h>
+#include <linux/skbuff.h>
 #include <linux/rcupdate.h>
 #include <linux/seq_file.h>
 
diff --git a/include/net/pkt_act.h b/include/net/pkt_act.h
index bd08964b72c0..b225d8472b7e 100644
--- a/include/net/pkt_act.h
+++ b/include/net/pkt_act.h
@@ -15,7 +15,6 @@
 #include <linux/in.h>
 #include <linux/errno.h>
 #include <linux/interrupt.h>
-#include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
 #include <linux/module.h>
diff --git a/include/net/raw.h b/include/net/raw.h
index f47917469b12..e67b28a0248c 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -19,6 +19,8 @@
 
 #include <linux/config.h>
 
+#include <net/protocol.h>
+
 extern struct proto raw_prot;
 
 extern void 	raw_err(struct sock *, struct sk_buff *, u32 info);
diff --git a/include/net/udp.h b/include/net/udp.h
index 107b9d791a1f..766fba1369ce 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -22,9 +22,8 @@
 #ifndef _UDP_H
 #define _UDP_H
 
-#include <linux/udp.h>
-#include <linux/ip.h>
 #include <linux/list.h>
+#include <net/inet_sock.h>
 #include <net/sock.h>
 #include <net/snmp.h>
 #include <linux/seq_file.h>
@@ -62,6 +61,7 @@ static inline int udp_lport_inuse(u16 num)
 
 extern struct proto udp_prot;
 
+struct sk_buff;
 
 extern void	udp_err(struct sk_buff *, u32);
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 487abca3ca6f..07d7b50cdd76 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -2,11 +2,12 @@
 #define _NET_XFRM_H
 
 #include <linux/compiler.h>
+#include <linux/in.h>
 #include <linux/xfrm.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/skbuff.h>
-#include <linux/netdevice.h>
+#include <linux/socket.h>
 #include <linux/crypto.h>
 #include <linux/pfkeyv2.h>
 #include <linux/in6.h>
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 23422bd53a5e..223f8270daee 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -26,6 +26,7 @@
 #include <linux/ip.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
+#include <linux/if_arp.h>
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
 #include <linux/netfilter_bridge.h>
@@ -33,8 +34,11 @@
 #include <linux/netfilter_ipv6.h>
 #include <linux/netfilter_arp.h>
 #include <linux/in_route.h>
+
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/route.h>
+
 #include <asm/uaccess.h>
 #include <asm/checksum.h>
 #include "br_private.h"
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index c436e6c6242b..9f6e0193ae10 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -9,6 +9,7 @@
  *
  */
 
+#include <linux/in.h>
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_log.h>
 #include <linux/netfilter.h>
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 49424a42a2c0..281a632fa6a6 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -13,6 +13,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/string.h>
+#include <linux/if_arp.h>
 #include <linux/inetdevice.h>
 #include <linux/inet.h>
 #include <linux/interrupt.h>
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index c37eeeaf5c6e..de681c6ad081 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -21,6 +21,8 @@
 
 #define CCID_MAX 255
 
+struct tcp_info;
+
 struct ccid {
 	unsigned char	ccid_id;
 	const char	*ccid_name;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 99e8afa7ba1e..3f244670764a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -19,6 +19,7 @@
 
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
+#include <net/inet_sock.h>
 #include <net/sock.h>
 #include <net/timewait_sock.h>
 #include <net/tcp_states.h>
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 2e194c8f9953..c609dc78f487 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -20,6 +20,7 @@
 #include <net/addrconf.h>
 #include <net/inet_common.h>
 #include <net/inet_hashtables.h>
+#include <net/inet_sock.h>
 #include <net/inet6_connection_sock.h>
 #include <net/inet6_hashtables.h>
 #include <net/ip6_route.h>
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 95a3c2c6a3ce..efd7ffb903a1 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -15,6 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
 
+#include <net/inet_sock.h>
 #include <net/sock.h>
 
 #include "ackvec.h"
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index e4e629ed9bf7..65b11ea90d85 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -24,7 +24,7 @@
 #include <net/checksum.h>
 
 #include <net/inet_common.h>
-#include <net/ip.h>
+#include <net/inet_sock.h>
 #include <net/protocol.h>
 #include <net/sock.h>
 #include <net/xfrm.h>
@@ -34,7 +34,6 @@
 #include <linux/timer.h>
 #include <linux/delay.h>
 #include <linux/poll.h>
-#include <linux/dccp.h>
 
 #include "ccid.h"
 #include "dccp.h"
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index ff58f49c8b4a..70fb2b88da65 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -31,6 +31,7 @@
 #include <linux/if_arp.h>
 #include <linux/wireless.h>
 #include <linux/skbuff.h>
+#include <linux/udp.h>
 #include <net/sock.h>
 #include <net/inet_common.h>
 #include <linux/stat.h>
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 4ed8a814c6cb..36a6306ca5a3 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -93,6 +93,7 @@
 #include <linux/smp_lock.h>
 #include <linux/inet.h>
 #include <linux/igmp.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <net/ip.h>
 #include <net/protocol.h>
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 035ad2c9e1ba..aed537fa2c88 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -6,6 +6,7 @@
 #include <linux/crypto.h>
 #include <linux/pfkeyv2.h>
 #include <net/icmp.h>
+#include <net/protocol.h>
 #include <asm/scatterlist.h>
 
 
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index b425748f02d7..37432088fe6d 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -86,6 +86,7 @@
 #include <linux/in.h>
 #include <linux/mm.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/fddidevice.h>
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 04a6fe3e95a2..7b9bb28e2ee9 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -58,6 +58,7 @@
 #endif
 #include <linux/kmod.h>
 
+#include <net/arp.h>
 #include <net/ip.h>
 #include <net/route.h>
 #include <net/ip_fib.h>
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 1b18ce66e7b7..73bfcae8af9c 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -9,6 +9,7 @@
 #include <linux/pfkeyv2.h>
 #include <linux/random.h>
 #include <net/icmp.h>
+#include <net/protocol.h>
 #include <net/udp.h>
 
 /* decapsulation data for use when post-processing */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 19b1b984d687..18f5e509281a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -30,6 +30,7 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/skbuff.h>
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 7ea0209cb169..e2890ec8159e 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -29,6 +29,7 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 0b298bbc1518..0dd4d06e456d 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -33,6 +33,7 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 6d2a6ac070e3..ef4724de7350 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -29,6 +29,7 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
@@ -36,6 +37,7 @@
 #include <linux/netlink.h>
 #include <linux/init.h>
 
+#include <net/arp.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/route.h>
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 92e23b2ad4d2..be5a519cd2f8 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -73,6 +73,7 @@
 #include <linux/socket.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/string.h>
 #include <linux/netfilter_ipv4.h>
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 4a195c724f01..34758118c10c 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -91,6 +91,8 @@
 #include <linux/if_arp.h>
 #include <linux/rtnetlink.h>
 #include <linux/times.h>
+
+#include <net/arp.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/route.h>
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 473d0f2b2e0d..e45846ae570b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -128,6 +128,7 @@
 #include <linux/sockios.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index dbe12da8d8b3..d3f6c468faf4 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -22,6 +22,7 @@
 #include <net/sock.h>
 #include <net/ip.h>
 #include <net/icmp.h>
+#include <net/route.h>
 
 /* 
  * Write options to IP header, record destination address to
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index add019c746f8..6986e11d65cc 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -25,6 +25,7 @@
 #include <linux/skbuff.h>
 #include <linux/ip.h>
 #include <linux/icmp.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <net/sock.h>
 #include <net/ip.h>
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index fc718df17b40..d64e2ec8da7b 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -28,6 +28,7 @@
 #include <net/xfrm.h>
 #include <net/icmp.h>
 #include <net/ipcomp.h>
+#include <net/protocol.h>
 
 struct ipcomp_tfms {
 	struct list_head list;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index e8674baaa8d9..bb3613ec448c 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -42,6 +42,7 @@
 #include <linux/in.h>
 #include <linux/if.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/skbuff.h>
@@ -58,6 +59,7 @@
 #include <net/arp.h>
 #include <net/ip.h>
 #include <net/ipconfig.h>
+#include <net/route.h>
 
 #include <asm/uaccess.h>
 #include <net/checksum.h>
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 302b7eb507c9..caa3b7d2e48a 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -52,6 +52,7 @@
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <linux/skbuff.h>
+#include <net/route.h>
 #include <net/sock.h>
 #include <net/icmp.h>
 #include <net/udp.h>
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index 2a3a8c59c655..d35cea31cb55 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -24,7 +24,9 @@
  *
  */
 
+#include <linux/in.h>
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/proc_fs.h>		/* for proc_net_* */
 #include <linux/seq_file.h>
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 9bdcf31b760e..fe2c39d2a002 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -35,6 +35,7 @@
 #include <linux/netfilter_ipv4.h>
 
 #include <net/ip.h>
+#include <net/route.h>
 #include <net/sock.h>
 
 #include <asm/uaccess.h>
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
index f3bc320dce93..9fee19c4c617 100644
--- a/net/ipv4/ipvs/ip_vs_dh.c
+++ b/net/ipv4/ipvs/ip_vs_dh.c
@@ -37,8 +37,10 @@
  *
  */
 
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/skbuff.h>
 
 #include <net/ip_vs.h>
 
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
index 67b3e2fc1fa1..e7004741ac73 100644
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -13,7 +13,10 @@
  * Changes:
  *
  */
+#include <linux/config.h>
 #include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 
 #include <net/ip_vs.h>
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index b6dad7e3710c..6e5cb92a5c83 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -41,8 +41,10 @@
  * me to write this module.
  */
 
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/skbuff.h>
 
 /* for sysctl */
 #include <linux/fs.h>
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index 8c78ef76c121..32ba37ba72d8 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -39,8 +39,10 @@
  *
  */
 
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/skbuff.h>
 
 /* for sysctl */
 #include <linux/fs.h>
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
index 453e94a0bbd7..8b0505b09317 100644
--- a/net/ipv4/ipvs/ip_vs_proto_ah.c
+++ b/net/ipv4/ipvs/ip_vs_proto_ah.c
@@ -12,6 +12,8 @@
  *
  */
 
+#include <linux/in.h>
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/netfilter.h>
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
index 478e5c7c7e8e..c36ccf057a19 100644
--- a/net/ipv4/ipvs/ip_vs_proto_esp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_esp.c
@@ -12,6 +12,8 @@
  *
  */
 
+#include <linux/in.h>
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/netfilter.h>
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index 8ae5f2e0aefa..89d9175d8f28 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -15,8 +15,11 @@
  *
  */
 
+#include <linux/in.h>
+#include <linux/ip.h>
 #include <linux/kernel.h>
 #include <linux/netfilter_ipv4.h>
+#include <linux/udp.h>
 
 #include <net/ip_vs.h>
 
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
index 6f7c50e44a39..7775e6cc68be 100644
--- a/net/ipv4/ipvs/ip_vs_sh.c
+++ b/net/ipv4/ipvs/ip_vs_sh.c
@@ -34,8 +34,10 @@
  *
  */
 
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/skbuff.h>
 
 #include <net/ip_vs.h>
 
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 2e5ced3d8062..1bca714bda3d 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -21,12 +21,14 @@
 
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/inetdevice.h>
 #include <linux/net.h>
 #include <linux/completion.h>
 #include <linux/delay.h>
 #include <linux/skbuff.h>
 #include <linux/in.h>
 #include <linux/igmp.h>                 /* for ip_mc_join_group */
+#include <linux/udp.h>
 
 #include <net/ip.h>
 #include <net/sock.h>
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index e52847fa10f5..0366eedb4d70 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -18,11 +18,13 @@
  *
  */
 
+#include <linux/in.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/netfilter.h>
 #include <linux/ip.h>
 #include <linux/moduleparam.h>
+#include <linux/udp.h>
 #include <net/checksum.h>
 #include <net/udp.h>
 
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
index 744abb9d377a..57956dee60c8 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
@@ -31,6 +31,7 @@
 #include <linux/ip.h>
 #include <linux/in.h>
 #include <linux/list.h>
+#include <linux/seq_file.h>
 
 static DEFINE_RWLOCK(ip_ct_gre_lock);
 #define ASSERT_READ_LOCK(x)
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index f2dcac7c7660..46becbe4fe58 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -11,6 +11,7 @@
 #include <linux/timer.h>
 #include <linux/netfilter.h>
 #include <linux/in.h>
+#include <linux/ip.h>
 #include <linux/udp.h>
 #include <linux/seq_file.h>
 #include <net/checksum.h>
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index dd476b191f4b..a88bcc551244 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -27,6 +27,7 @@
 #endif
 #include <net/checksum.h>
 #include <net/ip.h>
+#include <net/route.h>
 
 #define ASSERT_READ_LOCK(x)
 #define ASSERT_WRITE_LOCK(x)
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index 8acb7ed40b47..4f95d477805c 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -44,6 +44,7 @@
  *
  */
 #include <linux/config.h>
+#include <linux/in.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
@@ -53,6 +54,7 @@
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
 #include <linux/netfilter_ipv4/ip_nat_helper.h>
 #include <linux/ip.h>
+#include <linux/udp.h>
 #include <net/checksum.h>
 #include <net/udp.h>
 #include <asm/uaccess.h>
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 275a174c6fe6..27860510ca6d 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -11,6 +11,7 @@
 
 #include <linux/config.h>
 #include <linux/types.h>
+#include <linux/inetdevice.h>
 #include <linux/ip.h>
 #include <linux/timer.h>
 #include <linux/module.h>
@@ -18,6 +19,7 @@
 #include <net/protocol.h>
 #include <net/ip.h>
 #include <net/checksum.h>
+#include <net/route.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_ipv4/ip_nat_rule.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c
index 1a53924041fc..03f554857a4d 100644
--- a/net/ipv4/netfilter/ipt_physdev.c
+++ b/net/ipv4/netfilter/ipt_physdev.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/netfilter_ipv4/ipt_physdev.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 0d7dc668db46..39d49dc333a7 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -38,6 +38,7 @@
 #include <net/protocol.h>
 #include <net/tcp.h>
 #include <net/udp.h>
+#include <linux/inetdevice.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <net/sock.h>
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index dbf82955aabe..16984d4a8a06 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -12,6 +12,7 @@
 #include <linux/sysctl.h>
 #include <linux/config.h>
 #include <linux/igmp.h>
+#include <linux/inetdevice.h>
 #include <net/snmp.h>
 #include <net/icmp.h>
 #include <net/ip.h>
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 67c036384e77..223abaa72bc5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -86,6 +86,7 @@
 #include <linux/module.h>
 #include <linux/socket.h>
 #include <linux/sockios.h>
+#include <linux/igmp.h>
 #include <linux/in.h>
 #include <linux/errno.h>
 #include <linux/timer.h>
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index f3629730eb15..13cc7f895583 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -33,6 +33,7 @@
 #include <linux/string.h>
 #include <net/icmp.h>
 #include <net/ipv6.h>
+#include <net/protocol.h>
 #include <net/xfrm.h>
 #include <asm/scatterlist.h>
 
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 8bfbe9970793..6de8ee1a5ad9 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -36,6 +36,7 @@
 #include <linux/random.h>
 #include <net/icmp.h>
 #include <net/ipv6.h>
+#include <net/protocol.h>
 #include <linux/icmpv6.h>
 
 static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 55917fb17094..626dd39685f2 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -47,6 +47,7 @@
 #include <linux/rtnetlink.h>
 #include <net/icmp.h>
 #include <net/ipv6.h>
+#include <net/protocol.h>
 #include <linux/ipv6.h>
 #include <linux/icmpv6.h>
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index dd80020d8740..ea43ef1d94a7 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -15,6 +15,7 @@
  *      - new extension header parser code
  */
 #include <linux/config.h>
+#include <linux/in.h>
 #include <linux/skbuff.h>
 #include <linux/kmod.h>
 #include <linux/vmalloc.h>
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 0cd1d1bd9033..ae4653bfd654 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/skbuff.h>
+#include <linux/if_arp.h>
 #include <linux/ip.h>
 #include <linux/spinlock.h>
 #include <linux/icmpv6.h>
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index dde37793d20b..268918d5deea 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -9,6 +9,7 @@
 
 #include <linux/module.h>
 #include <linux/skbuff.h>
+#include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <linux/types.h>
 #include <net/checksum.h>
diff --git a/net/ipv6/netfilter/ip6t_esp.c b/net/ipv6/netfilter/ip6t_esp.c
index 24bc0cde43a1..65937de1b58c 100644
--- a/net/ipv6/netfilter/ip6t_esp.c
+++ b/net/ipv6/netfilter/ip6t_esp.c
@@ -9,6 +9,7 @@
 
 #include <linux/module.h>
 #include <linux/skbuff.h>
+#include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <linux/types.h>
 #include <net/checksum.h>
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 6cf0342706b5..c4a2a8c4c339 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -22,6 +22,7 @@
 #include <linux/in.h>
 #include <linux/errno.h>
 #include <linux/interrupt.h>
+#include <linux/if_arp.h>
 #include <linux/if_ether.h>
 #include <linux/inet.h>
 #include <linux/netdevice.h>
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index d1b0747a5b9d..de693b43c8ea 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -54,6 +54,7 @@
 #include <net/protocol.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/route.h>
 #include <net/sctp/sctp.h>
 #include <net/addrconf.h>
 #include <net/inet_common.h>
-- 
cgit v1.2.3-71-gd317


From 17ba15fb6264f27374bc87f4c3f8519b80289d85 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Tue, 27 Dec 2005 20:57:40 -0800
Subject: [PPPOX]: Fix assignment into const proto_ops.

And actually, with this, the whole pppox layer can basically
be removed and subsumed into pppoe.c, no other pppox sub-protocol
implementation exists and we've had this thing for at least 4
years.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pppoe.c      |  9 ++++-----
 drivers/net/pppox.c      | 10 +++-------
 include/linux/if_pppox.h |  3 +--
 3 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c
index 71e303b28646..9369f811075d 100644
--- a/drivers/net/pppoe.c
+++ b/drivers/net/pppoe.c
@@ -85,7 +85,7 @@ static int pppoe_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 static int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb);
 static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb);
 
-static struct proto_ops pppoe_ops;
+static const struct proto_ops pppoe_ops;
 static DEFINE_RWLOCK(pppoe_hash_lock);
 
 static struct ppp_channel_ops pppoe_chan_ops;
@@ -1063,9 +1063,7 @@ static int __init pppoe_proc_init(void)
 static inline int pppoe_proc_init(void) { return 0; }
 #endif /* CONFIG_PROC_FS */
 
-/* ->ioctl are set at pppox_create */
-
-static struct proto_ops pppoe_ops = {
+static const struct proto_ops pppoe_ops = {
     .family		= AF_PPPOX,
     .owner		= THIS_MODULE,
     .release		= pppoe_release,
@@ -1081,7 +1079,8 @@ static struct proto_ops pppoe_ops = {
     .getsockopt		= sock_no_getsockopt,
     .sendmsg		= pppoe_sendmsg,
     .recvmsg		= pppoe_recvmsg,
-    .mmap		= sock_no_mmap
+    .mmap		= sock_no_mmap,
+    .ioctl		= pppox_ioctl,
 };
 
 static struct pppox_proto pppoe_proto = {
diff --git a/drivers/net/pppox.c b/drivers/net/pppox.c
index 0c1e114527fb..9315046b3f55 100644
--- a/drivers/net/pppox.c
+++ b/drivers/net/pppox.c
@@ -68,8 +68,7 @@ EXPORT_SYMBOL(register_pppox_proto);
 EXPORT_SYMBOL(unregister_pppox_proto);
 EXPORT_SYMBOL(pppox_unbind_sock);
 
-static int pppox_ioctl(struct socket* sock, unsigned int cmd, 
-		       unsigned long arg)
+int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
 	struct sock *sk = sock->sk;
 	struct pppox_sock *po = pppox_sk(sk);
@@ -105,6 +104,7 @@ static int pppox_ioctl(struct socket* sock, unsigned int cmd,
 	return rc;
 }
 
+EXPORT_SYMBOL(pppox_ioctl);
 
 static int pppox_create(struct socket *sock, int protocol)
 {
@@ -119,11 +119,7 @@ static int pppox_create(struct socket *sock, int protocol)
 		goto out;
 
 	rc = pppox_protos[protocol]->create(sock);
-	if (!rc) {
-		/* We get to set the ioctl handler. */
-		/* For everything else, pppox is just a shell. */
-		sock->ops->ioctl = pppox_ioctl;
-	}
+
 	module_put(pppox_protos[protocol]->owner);
 out:
 	return rc;
diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h
index e677f73f13dd..4fab3d0a4bce 100644
--- a/include/linux/if_pppox.h
+++ b/include/linux/if_pppox.h
@@ -157,8 +157,7 @@ struct pppox_proto {
 extern int register_pppox_proto(int proto_num, struct pppox_proto *pp);
 extern void unregister_pppox_proto(int proto_num);
 extern void pppox_unbind_sock(struct sock *sk);/* delete ppp-channel binding */
-extern int pppox_channel_ioctl(struct ppp_channel *pc, unsigned int cmd,
-			       unsigned long arg);
+extern int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 
 /* PPPoX socket states */
 enum {
-- 
cgit v1.2.3-71-gd317


From 4947d3ef8de7b4f42aed6ea9ba689dc8fb45b5a5 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise <bcrl@kvack.org>
Date: Tue, 3 Jan 2006 14:06:50 -0800
Subject: [NET]: Speed up __alloc_skb()

From: Benjamin LaHaise <bcrl@kvack.org>

In __alloc_skb(), the use of skb_shinfo() which casts a u8 * to the
shared info structure results in gcc being forced to do a reload of the
pointer since it has no information on possible aliasing.  Fix this by
using a pointer to refer to skb_shared_info.

By initializing skb_shared_info sequentially, the write combining buffers
can reduce the number of memory transactions to a single write.  Reorder
the initialization in __alloc_skb() to match the structure definition.
There is also an alignment issue on 64 bit systems with skb_shared_info
by converting nr_frags to a short everything packs up nicely.

Also, pass the slab cache pointer according to the fclone flag instead
of using two almost identical function calls.

This raises bw_unix performance up to a peak of 707KB/s when combined
with the spinlock patch.  It should help other networking protocols, too.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  2 +-
 net/core/skbuff.c      | 27 +++++++++++++--------------
 2 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 971677178e0c..483cfc47ec34 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -133,7 +133,7 @@ struct skb_frag_struct {
  */
 struct skb_shared_info {
 	atomic_t	dataref;
-	unsigned int	nr_frags;
+	unsigned short	nr_frags;
 	unsigned short	tso_size;
 	unsigned short	tso_segs;
 	unsigned short  ufo_size;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 83fee37de38e..070f91cfde59 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -135,17 +135,13 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 			    int fclone)
 {
+	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
 	u8 *data;
 
 	/* Get the HEAD */
-	if (fclone)
-		skb = kmem_cache_alloc(skbuff_fclone_cache,
-				       gfp_mask & ~__GFP_DMA);
-	else
-		skb = kmem_cache_alloc(skbuff_head_cache,
-				       gfp_mask & ~__GFP_DMA);
-
+	skb = kmem_cache_alloc(fclone ? skbuff_fclone_cache : skbuff_head_cache,
+				gfp_mask & ~__GFP_DMA);
 	if (!skb)
 		goto out;
 
@@ -162,6 +158,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	skb->data = data;
 	skb->tail = data;
 	skb->end  = data + size;
+	/* make sure we initialize shinfo sequentially */
+	shinfo = skb_shinfo(skb);
+	atomic_set(&shinfo->dataref, 1);
+	shinfo->nr_frags  = 0;
+	shinfo->tso_size = 0;
+	shinfo->tso_segs = 0;
+	shinfo->ufo_size = 0;
+	shinfo->ip6_frag_id = 0;
+	shinfo->frag_list = NULL;
+
 	if (fclone) {
 		struct sk_buff *child = skb + 1;
 		atomic_t *fclone_ref = (atomic_t *) (child + 1);
@@ -171,13 +177,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 
 		child->fclone = SKB_FCLONE_UNAVAILABLE;
 	}
-	atomic_set(&(skb_shinfo(skb)->dataref), 1);
-	skb_shinfo(skb)->nr_frags  = 0;
-	skb_shinfo(skb)->tso_size = 0;
-	skb_shinfo(skb)->tso_segs = 0;
-	skb_shinfo(skb)->frag_list = NULL;
-	skb_shinfo(skb)->ufo_size = 0;
-	skb_shinfo(skb)->ip6_frag_id = 0;
 out:
 	return skb;
 nodata:
-- 
cgit v1.2.3-71-gd317


From a6f6c96b65d7f65a7a7bf5cbe874eda182a6b2cc Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Tue, 3 Jan 2006 22:38:44 +0000
Subject: [MMC] Improve MMC card block size selection

Select a block size for IO based on the read and write block size
combinations, and whether the card supports partial block reads
and/or partial block writes.

If we are able to satisfy block reads but not block writes, mark
the device read only.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/mmc/mmc.c        |  10 +++
 drivers/mmc/mmc_block.c  | 175 ++++++++++++++++++++++++++++++-----------------
 include/linux/mmc/card.h |   5 ++
 3 files changed, 128 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/mmc.c b/drivers/mmc/mmc.c
index eb41391e06e9..6696f71363b9 100644
--- a/drivers/mmc/mmc.c
+++ b/drivers/mmc/mmc.c
@@ -550,6 +550,11 @@ static void mmc_decode_csd(struct mmc_card *card)
 		csd->capacity	  = (1 + m) << (e + 2);
 
 		csd->read_blkbits = UNSTUFF_BITS(resp, 80, 4);
+		csd->read_partial = UNSTUFF_BITS(resp, 79, 1);
+		csd->write_misalign = UNSTUFF_BITS(resp, 78, 1);
+		csd->read_misalign = UNSTUFF_BITS(resp, 77, 1);
+		csd->write_blkbits = UNSTUFF_BITS(resp, 22, 4);
+		csd->write_partial = UNSTUFF_BITS(resp, 21, 1);
 	} else {
 		/*
 		 * We only understand CSD structure v1.1 and v1.2.
@@ -579,6 +584,11 @@ static void mmc_decode_csd(struct mmc_card *card)
 		csd->capacity	  = (1 + m) << (e + 2);
 
 		csd->read_blkbits = UNSTUFF_BITS(resp, 80, 4);
+		csd->read_partial = UNSTUFF_BITS(resp, 79, 1);
+		csd->write_misalign = UNSTUFF_BITS(resp, 78, 1);
+		csd->read_misalign = UNSTUFF_BITS(resp, 77, 1);
+		csd->write_blkbits = UNSTUFF_BITS(resp, 22, 4);
+		csd->write_partial = UNSTUFF_BITS(resp, 21, 1);
 	}
 }
 
diff --git a/drivers/mmc/mmc_block.c b/drivers/mmc/mmc_block.c
index abcf19116d70..b9837cc5b9ac 100644
--- a/drivers/mmc/mmc_block.c
+++ b/drivers/mmc/mmc_block.c
@@ -54,6 +54,7 @@ struct mmc_blk_data {
 
 	unsigned int	usage;
 	unsigned int	block_bits;
+	unsigned int	read_only;
 };
 
 static DECLARE_MUTEX(open_lock);
@@ -85,12 +86,6 @@ static void mmc_blk_put(struct mmc_blk_data *md)
 	up(&open_lock);
 }
 
-static inline int mmc_blk_readonly(struct mmc_card *card)
-{
-	return mmc_card_readonly(card) ||
-	       !(card->csd.cmdclass & CCC_BLOCK_WRITE);
-}
-
 static int mmc_blk_open(struct inode *inode, struct file *filp)
 {
 	struct mmc_blk_data *md;
@@ -102,8 +97,7 @@ static int mmc_blk_open(struct inode *inode, struct file *filp)
 			check_disk_change(inode->i_bdev);
 		ret = 0;
 
-		if ((filp->f_mode & FMODE_WRITE) &&
-			mmc_blk_readonly(md->queue.card))
+		if ((filp->f_mode & FMODE_WRITE) && md->read_only)
 			ret = -EROFS;
 	}
 
@@ -299,6 +293,12 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 
 static unsigned long dev_use[MMC_NUM_MINORS/(8*sizeof(unsigned long))];
 
+static inline int mmc_blk_readonly(struct mmc_card *card)
+{
+	return mmc_card_readonly(card) ||
+	       !(card->csd.cmdclass & CCC_BLOCK_WRITE);
+}
+
 static struct mmc_blk_data *mmc_blk_alloc(struct mmc_card *card)
 {
 	struct mmc_blk_data *md;
@@ -310,64 +310,121 @@ static struct mmc_blk_data *mmc_blk_alloc(struct mmc_card *card)
 	__set_bit(devidx, dev_use);
 
 	md = kmalloc(sizeof(struct mmc_blk_data), GFP_KERNEL);
-	if (md) {
-		memset(md, 0, sizeof(struct mmc_blk_data));
+	if (!md) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
-		md->disk = alloc_disk(1 << MMC_SHIFT);
-		if (md->disk == NULL) {
-			kfree(md);
-			md = ERR_PTR(-ENOMEM);
-			goto out;
-		}
+	memset(md, 0, sizeof(struct mmc_blk_data));
 
-		spin_lock_init(&md->lock);
-		md->usage = 1;
+	/*
+	 * Set the read-only status based on the supported commands
+	 * and the write protect switch.
+	 */
+	md->read_only = mmc_blk_readonly(card);
 
-		ret = mmc_init_queue(&md->queue, card, &md->lock);
-		if (ret) {
-			put_disk(md->disk);
-			kfree(md);
-			md = ERR_PTR(ret);
-			goto out;
+	/*
+	 * Figure out a workable block size.  MMC cards have:
+	 *  - two block sizes, one for read and one for write.
+	 *  - may support partial reads and/or writes
+	 *    (allows block sizes smaller than specified)
+	 */
+	md->block_bits = card->csd.read_blkbits;
+	if (card->csd.write_blkbits != card->csd.read_blkbits) {
+		if (card->csd.write_blkbits < card->csd.read_blkbits &&
+		    card->csd.read_partial) {
+			/*
+			 * write block size is smaller than read block
+			 * size, but we support partial reads, so choose
+			 * the smaller write block size.
+			 */
+			md->block_bits = card->csd.write_blkbits;
+		} else if (card->csd.write_blkbits > card->csd.read_blkbits &&
+			   card->csd.write_partial) {
+			/*
+			 * read block size is smaller than write block
+			 * size, but we support partial writes.  Use read
+			 * block size.
+			 */
+		} else {
+			/*
+			 * We don't support this configuration for writes.
+			 */
+			printk(KERN_ERR "%s: unable to select block size for "
+				"writing (rb%u wb%u rp%u wp%u)\n",
+				md->disk->disk_name,
+				1 << card->csd.read_blkbits,
+				1 << card->csd.write_blkbits,
+				card->csd.read_partial,
+				card->csd.write_partial);
+			md->read_only = 1;
 		}
-		md->queue.prep_fn = mmc_blk_prep_rq;
-		md->queue.issue_fn = mmc_blk_issue_rq;
-		md->queue.data = md;
+	}
 
-		md->disk->major	= major;
-		md->disk->first_minor = devidx << MMC_SHIFT;
-		md->disk->fops = &mmc_bdops;
-		md->disk->private_data = md;
-		md->disk->queue = md->queue.queue;
-		md->disk->driverfs_dev = &card->dev;
+	/*
+	 * Refuse to allow block sizes smaller than 512 bytes.
+	 */
+	if (md->block_bits < 9) {
+		printk(KERN_ERR "%s: unable to support block size %u\n",
+			mmc_card_id(card), 1 << md->block_bits);
+		ret = -EINVAL;
+		goto err_kfree;
+	}
 
-		/*
-		 * As discussed on lkml, GENHD_FL_REMOVABLE should:
-		 *
-		 * - be set for removable media with permanent block devices
-		 * - be unset for removable block devices with permanent media
-		 *
-		 * Since MMC block devices clearly fall under the second
-		 * case, we do not set GENHD_FL_REMOVABLE.  Userspace
-		 * should use the block device creation/destruction hotplug
-		 * messages to tell when the card is present.
-		 */
+	md->disk = alloc_disk(1 << MMC_SHIFT);
+	if (md->disk == NULL) {
+		ret = -ENOMEM;
+		goto err_kfree;
+	}
 
-		sprintf(md->disk->disk_name, "mmcblk%d", devidx);
-		sprintf(md->disk->devfs_name, "mmc/blk%d", devidx);
+	spin_lock_init(&md->lock);
+	md->usage = 1;
 
-		md->block_bits = card->csd.read_blkbits;
+	ret = mmc_init_queue(&md->queue, card, &md->lock);
+	if (ret)
+		goto err_putdisk;
 
-		blk_queue_hardsect_size(md->queue.queue, 1 << md->block_bits);
+	md->queue.prep_fn = mmc_blk_prep_rq;
+	md->queue.issue_fn = mmc_blk_issue_rq;
+	md->queue.data = md;
 
-		/*
-		 * The CSD capacity field is in units of read_blkbits.
-		 * set_capacity takes units of 512 bytes.
-		 */
-		set_capacity(md->disk, card->csd.capacity << (card->csd.read_blkbits - 9));
-	}
- out:
+	md->disk->major	= major;
+	md->disk->first_minor = devidx << MMC_SHIFT;
+	md->disk->fops = &mmc_bdops;
+	md->disk->private_data = md;
+	md->disk->queue = md->queue.queue;
+	md->disk->driverfs_dev = &card->dev;
+
+	/*
+	 * As discussed on lkml, GENHD_FL_REMOVABLE should:
+	 *
+	 * - be set for removable media with permanent block devices
+	 * - be unset for removable block devices with permanent media
+	 *
+	 * Since MMC block devices clearly fall under the second
+	 * case, we do not set GENHD_FL_REMOVABLE.  Userspace
+	 * should use the block device creation/destruction hotplug
+	 * messages to tell when the card is present.
+	 */
+
+	sprintf(md->disk->disk_name, "mmcblk%d", devidx);
+	sprintf(md->disk->devfs_name, "mmc/blk%d", devidx);
+
+	blk_queue_hardsect_size(md->queue.queue, 1 << md->block_bits);
+
+	/*
+	 * The CSD capacity field is in units of read_blkbits.
+	 * set_capacity takes units of 512 bytes.
+	 */
+	set_capacity(md->disk, card->csd.capacity << (card->csd.read_blkbits - 9));
 	return md;
+
+ err_putdisk:
+	put_disk(md->disk);
+ err_kfree:
+	kfree(md);
+ out:
+	return ERR_PTR(ret);
 }
 
 static int
@@ -403,12 +460,6 @@ static int mmc_blk_probe(struct mmc_card *card)
 	if (!(card->csd.cmdclass & CCC_BLOCK_READ))
 		return -ENODEV;
 
-	if (card->csd.read_blkbits < 9) {
-		printk(KERN_WARNING "%s: read blocksize too small (%u)\n",
-			mmc_card_id(card), 1 << card->csd.read_blkbits);
-		return -ENODEV;
-	}
-
 	md = mmc_blk_alloc(card);
 	if (IS_ERR(md))
 		return PTR_ERR(md);
@@ -419,7 +470,7 @@ static int mmc_blk_probe(struct mmc_card *card)
 
 	printk(KERN_INFO "%s: %s %s %luKiB %s\n",
 		md->disk->disk_name, mmc_card_id(card), mmc_card_name(card),
-		get_capacity(md->disk) >> 1, mmc_blk_readonly(card)?"(ro)":"");
+		get_capacity(md->disk) >> 1, md->read_only ? "(ro)" : "");
 
 	mmc_set_drvdata(card, md);
 	add_disk(md->disk);
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 18fc77f682de..7f7d40684288 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -30,7 +30,12 @@ struct mmc_csd {
 	unsigned int		tacc_ns;
 	unsigned int		max_dtr;
 	unsigned int		read_blkbits;
+	unsigned int		write_blkbits;
 	unsigned int		capacity;
+	unsigned int		read_partial:1,
+				read_misalign:1,
+				write_partial:1
+				write_misalign:1;
 };
 
 struct sd_scr {
-- 
cgit v1.2.3-71-gd317


From 88df8ef59a3eb54b1e2412765ff2736d2376d1ca Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 3 Jan 2006 15:25:45 -0800
Subject: [NET]: Don't exclude broadcast addresses from
 is_multicast_ether_addr()

The check for multicast shouldn't exclude broadcast type addresses.
This reverts the incorrect change done in 2.6.13.

The broadcast address is a multicast address and should be excluded
from being a valid_ether_address for use in bridging or device address.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 5f49a30eb6f2..745c988359c0 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -63,10 +63,11 @@ static inline int is_zero_ether_addr(const u8 *addr)
  * @addr: Pointer to a six-byte array containing the Ethernet address
  *
  * Return true if the address is a multicast address.
+ * By definition the broadcast address is also a multicast address.
  */
 static inline int is_multicast_ether_addr(const u8 *addr)
 {
-	return ((addr[0] != 0xff) && (0x01 & addr[0]));
+	return (0x01 & addr[0]);
 }
 
 /**
-- 
cgit v1.2.3-71-gd317


From ce11a161c11868f268964274edc7a26a3e063e08 Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Wed, 4 Jan 2006 12:40:39 +0000
Subject: [MMC] Fix missing ','

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 include/linux/mmc/card.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 7f7d40684288..30dd978c1ec8 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -34,7 +34,7 @@ struct mmc_csd {
 	unsigned int		capacity;
 	unsigned int		read_partial:1,
 				read_misalign:1,
-				write_partial:1
+				write_partial:1,
 				write_misalign:1;
 };
 
-- 
cgit v1.2.3-71-gd317


From 26e92861be9c0da3be30718de693976b3f6a8026 Mon Sep 17 00:00:00 2001
From: Gareth Howlett <ghowlett@connecttech.com>
Date: Wed, 4 Jan 2006 17:00:42 +0000
Subject: [SERIAL] Add support for more Connect Tech PCI serial boards

I've also fixed the sort-ordering comments on this naming convention.

Signed-off-by: Stuart MacDonald <stuartm@connecttech.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/serial/8250_pci.c | 120 +++++++++++++++++++++++++++++++++++++++++++---
 include/linux/pci_ids.h   |  17 +++++++
 2 files changed, 131 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c
index 8adca0ce267f..4a589a9456f5 100644
--- a/drivers/serial/8250_pci.c
+++ b/drivers/serial/8250_pci.c
@@ -853,14 +853,15 @@ get_pci_irq(struct pci_dev *dev, struct pciserial_board *board)
  * driver_data member.
  *
  * The makeup of these names are:
- *  pbn_bn{_bt}_n_baud
+ *  pbn_bn{_bt}_n_baud{_offsetinhex}
  *
- *  bn   = PCI BAR number
- *  bt   = Index using PCI BARs
- *  n    = number of serial ports
- *  baud = baud rate
+ *  bn		= PCI BAR number
+ *  bt		= Index using PCI BARs
+ *  n		= number of serial ports
+ *  baud	= baud rate
+ *  offsetinhex	= offset for each sequential port (in hex)
  *
- * This table is sorted by (in order): baud, bt, bn, n.
+ * This table is sorted by (in order): bn, bt, baud, offsetindex, n.
  *
  * Please note: in theory if n = 1, _bt infix should make no difference.
  * ie, pbn_b0_1_115200 is the same as pbn_b0_bt_1_115200
@@ -881,6 +882,13 @@ enum pci_board_num_t {
 
 	pbn_b0_4_1152000,
 
+	pbn_b0_2_1843200,
+	pbn_b0_4_1843200,
+
+	pbn_b0_2_1843200_200,
+	pbn_b0_4_1843200_200,
+	pbn_b0_8_1843200_200,
+
 	pbn_b0_bt_1_115200,
 	pbn_b0_bt_2_115200,
 	pbn_b0_bt_8_115200,
@@ -904,6 +912,8 @@ enum pci_board_num_t {
 	pbn_b1_4_921600,
 	pbn_b1_8_921600,
 
+	pbn_b1_2_1250000,
+
 	pbn_b1_bt_2_921600,
 
 	pbn_b1_1_1382400,
@@ -1029,6 +1039,38 @@ static struct pciserial_board pci_boards[] __devinitdata = {
 		.uart_offset	= 8,
 	},
 
+	[pbn_b0_2_1843200] = {
+		.flags		= FL_BASE0,
+		.num_ports	= 2,
+		.base_baud	= 1843200,
+		.uart_offset	= 8,
+	},
+	[pbn_b0_4_1843200] = {
+		.flags		= FL_BASE0,
+		.num_ports	= 4,
+		.base_baud	= 1843200,
+		.uart_offset	= 8,
+	},
+
+	[pbn_b0_2_1843200_200] = {
+		.flags		= FL_BASE0,
+		.num_ports	= 2,
+		.base_baud	= 1843200,
+		.uart_offset	= 0x200,
+	},
+	[pbn_b0_4_1843200_200] = {
+		.flags		= FL_BASE0,
+		.num_ports	= 4,
+		.base_baud	= 1843200,
+		.uart_offset	= 0x200,
+	},
+	[pbn_b0_8_1843200_200] = {
+		.flags		= FL_BASE0,
+		.num_ports	= 8,
+		.base_baud	= 1843200,
+		.uart_offset	= 0x200,
+	},
+
 	[pbn_b0_bt_1_115200] = {
 		.flags		= FL_BASE0|FL_BASE_BARS,
 		.num_ports	= 1,
@@ -1141,6 +1183,12 @@ static struct pciserial_board pci_boards[] __devinitdata = {
 		.base_baud	= 921600,
 		.uart_offset	= 8,
 	},
+	[pbn_b1_2_1250000] = {
+		.flags		= FL_BASE1,
+		.num_ports	= 2,
+		.base_baud	= 1250000,
+		.uart_offset	= 8,
+	},
 
 	[pbn_b1_bt_2_921600] = {
 		.flags		= FL_BASE1|FL_BASE_BARS,
@@ -1801,6 +1849,66 @@ static struct pci_device_id serial_pci_tbl[] = {
 		PCI_SUBVENDOR_ID_CONNECT_TECH,
 		PCI_SUBDEVICE_ID_CONNECT_TECH_BH041101V1, 0, 0,
 		pbn_b1_4_921600 },
+	{	PCI_VENDOR_ID_V3, PCI_DEVICE_ID_V3_V351,
+		PCI_SUBVENDOR_ID_CONNECT_TECH,
+		PCI_SUBDEVICE_ID_CONNECT_TECH_BH2_20MHZ, 0, 0,
+		pbn_b1_2_1250000 },
+	{	PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_16PCI954,
+		PCI_SUBVENDOR_ID_CONNECT_TECH,
+		PCI_SUBDEVICE_ID_CONNECT_TECH_TITAN_2, 0, 0,
+		pbn_b0_2_1843200 },
+	{	PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_16PCI954,
+		PCI_SUBVENDOR_ID_CONNECT_TECH,
+		PCI_SUBDEVICE_ID_CONNECT_TECH_TITAN_4, 0, 0,
+		pbn_b0_4_1843200 },
+	{	PCI_VENDOR_ID_EXAR, PCI_DEVICE_ID_EXAR_XR17C152,
+		PCI_SUBVENDOR_ID_CONNECT_TECH,
+		PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_2_232, 0, 0,
+		pbn_b0_2_1843200_200 },
+	{	PCI_VENDOR_ID_EXAR, PCI_DEVICE_ID_EXAR_XR17C154,
+		PCI_SUBVENDOR_ID_CONNECT_TECH,
+		PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_4_232, 0, 0,
+		pbn_b0_4_1843200_200 },
+	{	PCI_VENDOR_ID_EXAR, PCI_DEVICE_ID_EXAR_XR17C158,
+		PCI_SUBVENDOR_ID_CONNECT_TECH,
+		PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_8_232, 0, 0,
+		pbn_b0_8_1843200_200 },
+	{	PCI_VENDOR_ID_EXAR, PCI_DEVICE_ID_EXAR_XR17C152,
+		PCI_SUBVENDOR_ID_CONNECT_TECH,
+		PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_1_1, 0, 0,
+		pbn_b0_2_1843200_200 },
+	{	PCI_VENDOR_ID_EXAR, PCI_DEVICE_ID_EXAR_XR17C154,
+		PCI_SUBVENDOR_ID_CONNECT_TECH,
+		PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_2_2, 0, 0,
+		pbn_b0_4_1843200_200 },
+	{	PCI_VENDOR_ID_EXAR, PCI_DEVICE_ID_EXAR_XR17C158,
+		PCI_SUBVENDOR_ID_CONNECT_TECH,
+		PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_4_4, 0, 0,
+		pbn_b0_8_1843200_200 },
+	{	PCI_VENDOR_ID_EXAR, PCI_DEVICE_ID_EXAR_XR17C152,
+		PCI_SUBVENDOR_ID_CONNECT_TECH,
+		PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_2, 0, 0,
+		pbn_b0_2_1843200_200 },
+	{	PCI_VENDOR_ID_EXAR, PCI_DEVICE_ID_EXAR_XR17C154,
+		PCI_SUBVENDOR_ID_CONNECT_TECH,
+		PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_4, 0, 0,
+		pbn_b0_4_1843200_200 },
+	{	PCI_VENDOR_ID_EXAR, PCI_DEVICE_ID_EXAR_XR17C158,
+		PCI_SUBVENDOR_ID_CONNECT_TECH,
+		PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_8, 0, 0,
+		pbn_b0_8_1843200_200 },
+	{	PCI_VENDOR_ID_EXAR, PCI_DEVICE_ID_EXAR_XR17C152,
+		PCI_SUBVENDOR_ID_CONNECT_TECH,
+		PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_2_485, 0, 0,
+		pbn_b0_2_1843200_200 },
+	{	PCI_VENDOR_ID_EXAR, PCI_DEVICE_ID_EXAR_XR17C154,
+		PCI_SUBVENDOR_ID_CONNECT_TECH,
+		PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_4_485, 0, 0,
+		pbn_b0_4_1843200_200 },
+	{	PCI_VENDOR_ID_EXAR, PCI_DEVICE_ID_EXAR_XR17C158,
+		PCI_SUBVENDOR_ID_CONNECT_TECH,
+		PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_8_485, 0, 0,
+		pbn_b0_8_1843200_200 },
 
 	{	PCI_VENDOR_ID_SEALEVEL, PCI_DEVICE_ID_SEALEVEL_U530,
 		PCI_ANY_ID, PCI_ANY_ID, 0, 0, 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 4db67b3b05cc..efb5d22f8c09 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1567,6 +1567,23 @@
 #define PCI_SUBDEVICE_ID_CONNECT_TECH_BH8_485_2_6	0x0009
 #define PCI_SUBDEVICE_ID_CONNECT_TECH_BH081101V1	0x000A
 #define PCI_SUBDEVICE_ID_CONNECT_TECH_BH041101V1	0x000B
+#define PCI_SUBDEVICE_ID_CONNECT_TECH_BH2_20MHZ		0x000C
+#define PCI_SUBDEVICE_ID_CONNECT_TECH_BH2_PTM		0x000D
+#define PCI_SUBDEVICE_ID_CONNECT_TECH_NT960PCI		0x0100
+#define PCI_SUBDEVICE_ID_CONNECT_TECH_TITAN_2		0x0201
+#define PCI_SUBDEVICE_ID_CONNECT_TECH_TITAN_4		0x0202
+#define PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_2_232	0x0300
+#define PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_4_232	0x0301
+#define PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_8_232	0x0302
+#define PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_1_1	0x0310
+#define PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_2_2	0x0311
+#define PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_4_4	0x0312
+#define PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_2	0x0320
+#define PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_4	0x0321
+#define PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_8	0x0322
+#define PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_2_485	0x0330
+#define PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_4_485	0x0331
+#define PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_8_485	0x0332
 
 
 #define PCI_VENDOR_ID_NVIDIA_SGS	0x12d2
-- 
cgit v1.2.3-71-gd317


From a00828e9ac62caed7b830d631914d7748817ccd1 Mon Sep 17 00:00:00 2001
From: Pete Zaitcev <zaitcev@redhat.com>
Date: Sat, 22 Oct 2005 20:15:09 -0700
Subject: [PATCH] USB: drivers/usb/storage/libusual

This patch adds a shim driver libusual, which routes devices between
usb-storage and ub according to the common table, based on unusual_devs.h.
The help and example syntax is in Kconfig.

Signed-off-by: Pete Zaitcev <zaitcev@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/block/Kconfig              |   3 +-
 drivers/block/ub.c                 |  23 ++--
 drivers/usb/Makefile               |   1 +
 drivers/usb/storage/Kconfig        |  14 ++
 drivers/usb/storage/Makefile       |   4 +
 drivers/usb/storage/libusual.c     | 266 +++++++++++++++++++++++++++++++++++++
 drivers/usb/storage/protocol.h     |  14 --
 drivers/usb/storage/transport.h    |  31 -----
 drivers/usb/storage/unusual_devs.h |  24 ++++
 drivers/usb/storage/usb.c          | 119 ++++++-----------
 drivers/usb/storage/usb.h          |  31 +----
 include/linux/usb_usual.h          | 123 +++++++++++++++++
 12 files changed, 486 insertions(+), 167 deletions(-)
 create mode 100644 drivers/usb/storage/libusual.c
 create mode 100644 include/linux/usb_usual.h

(limited to 'include/linux')

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 7b1cd93892be..c4b9d2adfc08 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -358,7 +358,8 @@ config BLK_DEV_UB
 	  This driver supports certain USB attached storage devices
 	  such as flash keys.
 
-	  Warning: Enabling this cripples the usb-storage driver.
+	  If you enable this driver, it is recommended to avoid conflicts
+	  with usb-storage by enabling USB_LIBUSUAL.
 
 	  If unsure, say N.
 
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index bfb23d543ff7..06d741d58a68 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -29,6 +29,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/usb.h>
+#include <linux/usb_usual.h>
 #include <linux/blkdev.h>
 #include <linux/devfs_fs_kernel.h>
 #include <linux/timer.h>
@@ -106,16 +107,6 @@
  *                                                                   +--------+
  */
 
-/*
- * Definitions which have to be scattered once we understand the layout better.
- */
-
-/* Transport (despite PR in the name) */
-#define US_PR_BULK	0x50		/* bulk only */
-
-/* Protocol */
-#define US_SC_SCSI	0x06		/* Transparent */
-
 /*
  * This many LUNs per USB device.
  * Every one of them takes a host, see UB_MAX_HOSTS.
@@ -422,13 +413,18 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum);
 
 /*
  */
+#ifdef CONFIG_USB_LIBUSUAL
+
+#define ub_usb_ids  storage_usb_ids
+#else
+
 static struct usb_device_id ub_usb_ids[] = {
-	// { USB_DEVICE_VER(0x0781, 0x0002, 0x0009, 0x0009) },	/* SDDR-31 */
 	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_SCSI, US_PR_BULK) },
 	{ }
 };
 
 MODULE_DEVICE_TABLE(usb, ub_usb_ids);
+#endif /* CONFIG_USB_LIBUSUAL */
 
 /*
  * Find me a way to identify "next free minor" for add_disk(),
@@ -2172,6 +2168,9 @@ static int ub_probe(struct usb_interface *intf,
 	int rc;
 	int i;
 
+	if (usb_usual_check_type(dev_id, USB_US_TYPE_UB))
+		return -ENXIO;
+
 	rc = -ENOMEM;
 	if ((sc = kmalloc(sizeof(struct ub_dev), GFP_KERNEL)) == NULL)
 		goto err_core;
@@ -2479,6 +2478,7 @@ static int __init ub_init(void)
 	if ((rc = usb_register(&ub_driver)) != 0)
 		goto err_register;
 
+	usb_usual_set_present(USB_US_TYPE_UB);
 	return 0;
 
 err_register:
@@ -2494,6 +2494,7 @@ static void __exit ub_exit(void)
 
 	devfs_remove(DEVFS_NAME);
 	unregister_blkdev(UB_MAJOR, DRV_NAME);
+	usb_usual_clear_present(USB_US_TYPE_UB);
 }
 
 module_init(ub_init);
diff --git a/drivers/usb/Makefile b/drivers/usb/Makefile
index a50c2bc506f2..3639c3f8d357 100644
--- a/drivers/usb/Makefile
+++ b/drivers/usb/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_USB_MIDI)		+= class/
 obj-$(CONFIG_USB_PRINTER)	+= class/
 
 obj-$(CONFIG_USB_STORAGE)	+= storage/
+obj-$(CONFIG_USB)		+= storage/
 
 obj-$(CONFIG_USB_AIPTEK)	+= input/
 obj-$(CONFIG_USB_ATI_REMOTE)	+= input/
diff --git a/drivers/usb/storage/Kconfig b/drivers/usb/storage/Kconfig
index c41d64dbb0f0..bdfcb95d9c12 100644
--- a/drivers/usb/storage/Kconfig
+++ b/drivers/usb/storage/Kconfig
@@ -124,3 +124,17 @@ config USB_STORAGE_ONETOUCH
 	  hard drive's as an input device. An action can be associated with
 	  this input in any keybinding software. (e.g. gnome's keyboard short-
 	  cuts)
+
+config USB_LIBUSUAL
+	bool "The shared table of common (or usual) storage devices"
+	depends on USB
+	help
+	  This module contains a table of common (or usual) devices
+	  for usb-storage and ub drivers, and allows to switch binding
+	  of these devices without rebuilding modules.
+
+	  Typical syntax of /etc/modprobe.conf is:
+
+		options libusual bias="ub"
+
+	  If unsure, say N.
diff --git a/drivers/usb/storage/Makefile b/drivers/usb/storage/Makefile
index 44ab8f9978fe..2d416e9028bb 100644
--- a/drivers/usb/storage/Makefile
+++ b/drivers/usb/storage/Makefile
@@ -22,3 +22,7 @@ usb-storage-obj-$(CONFIG_USB_STORAGE_ONETOUCH)	+= onetouch.o
 
 usb-storage-objs :=	scsiglue.o protocol.o transport.o usb.o \
 			initializers.o $(usb-storage-obj-y)
+
+ifneq ($(CONFIG_USB_LIBUSUAL),)
+	obj-$(CONFIG_USB)	+= libusual.o
+endif
diff --git a/drivers/usb/storage/libusual.c b/drivers/usb/storage/libusual.c
new file mode 100644
index 000000000000..61f73d8a2c0f
--- /dev/null
+++ b/drivers/usb/storage/libusual.c
@@ -0,0 +1,266 @@
+/*
+ * libusual
+ *
+ * The libusual contains the table of devices common for ub and usb-storage.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/usb.h>
+#include <linux/usb_usual.h>
+#include <linux/vmalloc.h>
+
+/*
+ */
+#define USU_MOD_FL_THREAD   1	/* Thread is running */
+#define USU_MOD_FL_PRESENT  2	/* The module is loaded */
+
+struct mod_status {
+	unsigned long fls;
+};
+
+static struct mod_status stat[3];
+static DEFINE_SPINLOCK(usu_lock);
+
+/*
+ */
+#define USB_US_DEFAULT_BIAS	USB_US_TYPE_STOR
+
+#define BIAS_NAME_SIZE  (sizeof("usb-storage"))
+static char bias[BIAS_NAME_SIZE];
+static int usb_usual_bias;
+static const char *bias_names[3] = { "none", "usb-storage", "ub" };
+
+static DECLARE_MUTEX_LOCKED(usu_init_notify);
+static DECLARE_COMPLETION(usu_end_notify);
+static atomic_t total_threads = ATOMIC_INIT(0);
+
+static int usu_probe_thread(void *arg);
+static int parse_bias(const char *bias_s);
+
+/*
+ * The table.
+ */
+#define UNUSUAL_DEV(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax, \
+		    vendorName, productName,useProtocol, useTransport, \
+		    initFunction, flags) \
+{ USB_DEVICE_VER(id_vendor, id_product, bcdDeviceMin,bcdDeviceMax), \
+  .driver_info = (flags)|(USB_US_TYPE_STOR<<24) }
+
+#define USUAL_DEV(useProto, useTrans, useType) \
+{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, useProto, useTrans), \
+  .driver_info = ((useType)<<24) }
+
+struct usb_device_id storage_usb_ids [] = {
+#	include "unusual_devs.h"
+	{ } /* Terminating entry */
+};
+
+#undef USUAL_DEV
+#undef UNUSUAL_DEV
+
+MODULE_DEVICE_TABLE(usb, storage_usb_ids);
+EXPORT_SYMBOL_GPL(storage_usb_ids);
+
+/*
+ * @type: the module type as an integer
+ */
+void usb_usual_set_present(int type)
+{
+	struct mod_status *st;
+	unsigned long flags;
+
+	if (type <= 0 || type >= 3)
+		return;
+	st = &stat[type];
+	spin_lock_irqsave(&usu_lock, flags);
+	st->fls |= USU_MOD_FL_PRESENT;
+	spin_unlock_irqrestore(&usu_lock, flags);
+}
+EXPORT_SYMBOL_GPL(usb_usual_set_present);
+
+void usb_usual_clear_present(int type)
+{
+	struct mod_status *st;
+	unsigned long flags;
+
+	if (type <= 0 || type >= 3)
+		return;
+	st = &stat[type];
+	spin_lock_irqsave(&usu_lock, flags);
+	st->fls &= ~USU_MOD_FL_PRESENT;
+	spin_unlock_irqrestore(&usu_lock, flags);
+}
+EXPORT_SYMBOL_GPL(usb_usual_clear_present);
+
+/*
+ * Match the calling driver type against the table.
+ * Returns: 0 if the device matches.
+ */
+int usb_usual_check_type(const struct usb_device_id *id, int caller_type)
+{
+	int id_type = USB_US_TYPE(id->driver_info);
+
+	if (caller_type <= 0 || caller_type >= 3)
+		return -EINVAL;
+
+	/* Drivers grab fixed assignment devices */
+	if (id_type == caller_type)
+		return 0;
+	/* Drivers grab devices biased to them */
+	if (id_type == USB_US_TYPE_NONE && caller_type == usb_usual_bias)
+		return 0;
+	return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(usb_usual_check_type);
+
+/*
+ */
+static int usu_probe(struct usb_interface *intf,
+			 const struct usb_device_id *id)
+{
+	int type;
+	int rc;
+	unsigned long flags;
+
+	type = USB_US_TYPE(id->driver_info);
+	if (type == 0)
+		type = usb_usual_bias;
+
+	spin_lock_irqsave(&usu_lock, flags);
+	if ((stat[type].fls & (USU_MOD_FL_THREAD|USU_MOD_FL_PRESENT)) != 0) {
+		spin_unlock_irqrestore(&usu_lock, flags);
+		return -ENXIO;
+	}
+	stat[type].fls |= USU_MOD_FL_THREAD;
+	spin_unlock_irqrestore(&usu_lock, flags);
+
+	rc = kernel_thread(usu_probe_thread, (void*)type, CLONE_VM);
+	if (rc < 0) {
+		printk(KERN_WARNING "libusual: "
+		    "Unable to start the thread for %s: %d\n",
+		    bias_names[type], rc);
+		spin_lock_irqsave(&usu_lock, flags);
+		stat[type].fls &= ~USU_MOD_FL_THREAD;
+		spin_unlock_irqrestore(&usu_lock, flags);
+		return rc;	/* Not being -ENXIO causes a message printed */
+	}
+	atomic_inc(&total_threads);
+
+	return -ENXIO;
+}
+
+static void usu_disconnect(struct usb_interface *intf)
+{
+	;	/* We should not be here. */
+}
+
+static struct usb_driver usu_driver = {
+	.owner =	THIS_MODULE,
+	.name =		"libusual",
+	.probe =	usu_probe,
+	.disconnect =	usu_disconnect,
+	.id_table =	storage_usb_ids,
+};
+
+/*
+ * A whole new thread for a purpose of request_module seems quite stupid.
+ * The request_module forks once inside again. However, if we attempt
+ * to load a storage module from our own modprobe thread, that module
+ * references our symbols, which cannot be resolved until our module is
+ * initialized. I wish there was a way to wait for the end of initialization.
+ * The module notifier reports MODULE_STATE_COMING only.
+ * So, we wait until module->init ends as the next best thing.
+ */
+static int usu_probe_thread(void *arg)
+{
+	int type = (unsigned long) arg;
+	struct mod_status *st = &stat[type];
+	int rc;
+	unsigned long flags;
+
+	daemonize("libusual_%d", type);	/* "usb-storage" is kinda too long */
+
+	/* A completion does not work here because it's counted. */
+	down(&usu_init_notify);
+	up(&usu_init_notify);
+
+	rc = request_module(bias_names[type]);
+	spin_lock_irqsave(&usu_lock, flags);
+	if (rc == 0 && (st->fls & USU_MOD_FL_PRESENT) == 0) {
+		/*
+		 * This should not happen, but let us keep tabs on it.
+		 */
+		printk(KERN_NOTICE "libusual: "
+		    "modprobe for %s succeeded, but module is not present\n",
+		    bias_names[type]);
+	}
+	st->fls &= ~USU_MOD_FL_THREAD;
+	spin_unlock_irqrestore(&usu_lock, flags);
+
+	complete_and_exit(&usu_end_notify, 0);
+}
+
+/*
+ */
+static int __init usb_usual_init(void)
+{
+	int rc;
+
+	bias[BIAS_NAME_SIZE-1] = 0;
+	usb_usual_bias = parse_bias(bias);
+
+	rc = usb_register(&usu_driver);
+	up(&usu_init_notify);
+	return rc;
+}
+
+static void __exit usb_usual_exit(void)
+{
+	/*
+	 * We do not check for any drivers present, because
+	 * they keep us pinned with symbol references.
+	 */
+
+	usb_deregister(&usu_driver);
+
+	while (atomic_read(&total_threads) > 0) {
+		wait_for_completion(&usu_end_notify);
+		atomic_dec(&total_threads);
+	}
+}
+
+/*
+ * Validate and accept the bias parameter.
+ * Maybe make an sysfs method later. XXX
+ */
+static int parse_bias(const char *bias_s)
+{
+	int i;
+	int bias_n = 0;
+
+	if (bias_s[0] == 0 || bias_s[0] == ' ') {
+		bias_n = USB_US_DEFAULT_BIAS;
+	} else {
+		for (i = 1; i < 3; i++) {
+			if (strcmp(bias_s, bias_names[i]) == 0) {
+				bias_n = i;
+				break;
+			}
+		}
+		if (bias_n == 0) {
+			bias_n = USB_US_DEFAULT_BIAS;
+			printk(KERN_INFO
+			    "libusual: unknown bias \"%s\", using \"%s\"\n",
+			    bias_s, bias_names[bias_n]);
+		}
+	}
+	return bias_n;
+}
+
+module_init(usb_usual_init);
+module_exit(usb_usual_exit);
+
+module_param_string(bias, bias, BIAS_NAME_SIZE,  S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(bias, "Bias to usb-storage or ub");
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/usb/storage/protocol.h b/drivers/usb/storage/protocol.h
index 02bff01ab09c..845bed4b8031 100644
--- a/drivers/usb/storage/protocol.h
+++ b/drivers/usb/storage/protocol.h
@@ -41,20 +41,6 @@
 #ifndef _PROTOCOL_H_
 #define _PROTOCOL_H_
 
-/* Sub Classes */
-
-#define US_SC_RBC	0x01		/* Typically, flash devices */
-#define US_SC_8020	0x02		/* CD-ROM */
-#define US_SC_QIC	0x03		/* QIC-157 Tapes */
-#define US_SC_UFI	0x04		/* Floppy */
-#define US_SC_8070	0x05		/* Removable media */
-#define US_SC_SCSI	0x06		/* Transparent */
-#define US_SC_ISD200    0x07		/* ISD200 ATA */
-#define US_SC_MIN	US_SC_RBC
-#define US_SC_MAX	US_SC_ISD200
-
-#define US_SC_DEVICE	0xff		/* Use device's value */
-
 /* Protocol handling routines */
 extern void usb_stor_ATAPI_command(struct scsi_cmnd*, struct us_data*);
 extern void usb_stor_qic157_command(struct scsi_cmnd*, struct us_data*);
diff --git a/drivers/usb/storage/transport.h b/drivers/usb/storage/transport.h
index 0a362cc781ad..633a715850a4 100644
--- a/drivers/usb/storage/transport.h
+++ b/drivers/usb/storage/transport.h
@@ -41,39 +41,8 @@
 #ifndef _TRANSPORT_H_
 #define _TRANSPORT_H_
 
-#include <linux/config.h>
 #include <linux/blkdev.h>
 
-/* Protocols */
-
-#define US_PR_CBI	0x00		/* Control/Bulk/Interrupt */
-#define US_PR_CB	0x01		/* Control/Bulk w/o interrupt */
-#define US_PR_BULK	0x50		/* bulk only */
-#ifdef CONFIG_USB_STORAGE_USBAT
-#define US_PR_USBAT	0x80		/* SCM-ATAPI bridge */
-#endif
-#ifdef CONFIG_USB_STORAGE_SDDR09
-#define US_PR_EUSB_SDDR09	0x81	/* SCM-SCSI bridge for SDDR-09 */
-#endif
-#ifdef CONFIG_USB_STORAGE_SDDR55
-#define US_PR_SDDR55	0x82		/* SDDR-55 (made up) */
-#endif
-#define US_PR_DPCM_USB  0xf0		/* Combination CB/SDDR09 */
-
-#ifdef CONFIG_USB_STORAGE_FREECOM
-#define US_PR_FREECOM   0xf1		/* Freecom */
-#endif
-
-#ifdef CONFIG_USB_STORAGE_DATAFAB
-#define US_PR_DATAFAB   0xf2		/* Datafab chipsets */
-#endif
-
-#ifdef CONFIG_USB_STORAGE_JUMPSHOT
-#define US_PR_JUMPSHOT  0xf3		/* Lexar Jumpshot */
-#endif
-
-#define US_PR_DEVICE	0xff		/* Use device's value */
-
 /*
  * Bulk only data structures
  */
diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h
index f5f47a34b168..76904ad11241 100644
--- a/drivers/usb/storage/unusual_devs.h
+++ b/drivers/usb/storage/unusual_devs.h
@@ -1134,3 +1134,27 @@ UNUSUAL_DEV(  0x55aa, 0xa103, 0x0000, 0x9999,
 		US_SC_SCSI, US_PR_SDDR55, NULL,
 		US_FL_SINGLE_LUN),
 #endif
+
+/* Control/Bulk transport for all SubClass values */
+USUAL_DEV(US_SC_RBC, US_PR_CB, USB_US_TYPE_STOR),
+USUAL_DEV(US_SC_8020, US_PR_CB, USB_US_TYPE_STOR),
+USUAL_DEV(US_SC_QIC, US_PR_CB, USB_US_TYPE_STOR),
+USUAL_DEV(US_SC_UFI, US_PR_CB, USB_US_TYPE_STOR),
+USUAL_DEV(US_SC_8070, US_PR_CB, USB_US_TYPE_STOR),
+USUAL_DEV(US_SC_SCSI, US_PR_CB, USB_US_TYPE_STOR),
+
+/* Control/Bulk/Interrupt transport for all SubClass values */
+USUAL_DEV(US_SC_RBC, US_PR_CBI, USB_US_TYPE_STOR),
+USUAL_DEV(US_SC_8020, US_PR_CBI, USB_US_TYPE_STOR),
+USUAL_DEV(US_SC_QIC, US_PR_CBI, USB_US_TYPE_STOR),
+USUAL_DEV(US_SC_UFI, US_PR_CBI, USB_US_TYPE_STOR),
+USUAL_DEV(US_SC_8070, US_PR_CBI, USB_US_TYPE_STOR),
+USUAL_DEV(US_SC_SCSI, US_PR_CBI, USB_US_TYPE_STOR),
+
+/* Bulk-only transport for all SubClass values */
+USUAL_DEV(US_SC_RBC, US_PR_BULK, USB_US_TYPE_STOR),
+USUAL_DEV(US_SC_8020, US_PR_BULK, USB_US_TYPE_STOR),
+USUAL_DEV(US_SC_QIC, US_PR_BULK, USB_US_TYPE_STOR),
+USUAL_DEV(US_SC_UFI, US_PR_BULK, USB_US_TYPE_STOR),
+USUAL_DEV(US_SC_8070, US_PR_BULK, USB_US_TYPE_STOR),
+USUAL_DEV(US_SC_SCSI, US_PR_BULK, 0),
diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c
index 3847ebed2aa4..c8375aa62723 100644
--- a/drivers/usb/storage/usb.c
+++ b/drivers/usb/storage/usb.c
@@ -112,49 +112,33 @@ static atomic_t total_threads = ATOMIC_INIT(0);
 static DECLARE_COMPLETION(threads_gone);
 
 
-/* The entries in this table, except for final ones here
- * (USB_MASS_STORAGE_CLASS and the empty entry), correspond,
- * line for line with the entries of us_unsuaul_dev_list[].
+/*
+ * The entries in this table correspond, line for line,
+ * with the entries of us_unusual_dev_list[].
  */
+#ifndef CONFIG_USB_LIBUSUAL
 
 #define UNUSUAL_DEV(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax, \
 		    vendorName, productName,useProtocol, useTransport, \
 		    initFunction, flags) \
-{ USB_DEVICE_VER(id_vendor, id_product, bcdDeviceMin,bcdDeviceMax) }
+{ USB_DEVICE_VER(id_vendor, id_product, bcdDeviceMin,bcdDeviceMax), \
+  .driver_info = (flags)|(USB_US_TYPE_STOR<<24) }
+
+#define USUAL_DEV(useProto, useTrans, useType) \
+{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, useProto, useTrans), \
+  .driver_info = (USB_US_TYPE_STOR<<24) }
 
 static struct usb_device_id storage_usb_ids [] = {
 
 #	include "unusual_devs.h"
 #undef UNUSUAL_DEV
-	/* Control/Bulk transport for all SubClass values */
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_RBC, US_PR_CB) },
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_8020, US_PR_CB) },
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_QIC, US_PR_CB) },
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_UFI, US_PR_CB) },
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_8070, US_PR_CB) },
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_SCSI, US_PR_CB) },
-
-	/* Control/Bulk/Interrupt transport for all SubClass values */
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_RBC, US_PR_CBI) },
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_8020, US_PR_CBI) },
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_QIC, US_PR_CBI) },
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_UFI, US_PR_CBI) },
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_8070, US_PR_CBI) },
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_SCSI, US_PR_CBI) },
-
-	/* Bulk-only transport for all SubClass values */
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_RBC, US_PR_BULK) },
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_8020, US_PR_BULK) },
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_QIC, US_PR_BULK) },
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_UFI, US_PR_BULK) },
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_8070, US_PR_BULK) },
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_SCSI, US_PR_BULK) },
-
+#undef USUAL_DEV
 	/* Terminating entry */
 	{ }
 };
 
 MODULE_DEVICE_TABLE (usb, storage_usb_ids);
+#endif /* CONFIG_USB_LIBUSUAL */
 
 /* This is the list of devices we recognize, along with their flag data */
 
@@ -167,7 +151,6 @@ MODULE_DEVICE_TABLE (usb, storage_usb_ids);
  * are free to use as many characters as you like.
  */
 
-#undef UNUSUAL_DEV
 #define UNUSUAL_DEV(idVendor, idProduct, bcdDeviceMin, bcdDeviceMax, \
 		    vendor_name, product_name, use_protocol, use_transport, \
 		    init_function, Flags) \
@@ -177,53 +160,18 @@ MODULE_DEVICE_TABLE (usb, storage_usb_ids);
 	.useProtocol = use_protocol,	\
 	.useTransport = use_transport,	\
 	.initFunction = init_function,	\
-	.flags = Flags, \
+}
+
+#define USUAL_DEV(use_protocol, use_transport, use_type) \
+{ \
+	.useProtocol = use_protocol,	\
+	.useTransport = use_transport,	\
 }
 
 static struct us_unusual_dev us_unusual_dev_list[] = {
 #	include "unusual_devs.h" 
 #	undef UNUSUAL_DEV
-	/* Control/Bulk transport for all SubClass values */
-	{ .useProtocol = US_SC_RBC,
-	  .useTransport = US_PR_CB},
-	{ .useProtocol = US_SC_8020,
-	  .useTransport = US_PR_CB},
-	{ .useProtocol = US_SC_QIC,
-	  .useTransport = US_PR_CB},
-	{ .useProtocol = US_SC_UFI,
-	  .useTransport = US_PR_CB},
-	{ .useProtocol = US_SC_8070,
-	  .useTransport = US_PR_CB},
-	{ .useProtocol = US_SC_SCSI,
-	  .useTransport = US_PR_CB},
-
-	/* Control/Bulk/Interrupt transport for all SubClass values */
-	{ .useProtocol = US_SC_RBC,
-	  .useTransport = US_PR_CBI},
-	{ .useProtocol = US_SC_8020,
-	  .useTransport = US_PR_CBI},
-	{ .useProtocol = US_SC_QIC,
-	  .useTransport = US_PR_CBI},
-	{ .useProtocol = US_SC_UFI,
-	  .useTransport = US_PR_CBI},
-	{ .useProtocol = US_SC_8070,
-	  .useTransport = US_PR_CBI},
-	{ .useProtocol = US_SC_SCSI,
-	  .useTransport = US_PR_CBI},
-
-	/* Bulk-only transport for all SubClass values */
-	{ .useProtocol = US_SC_RBC,
-	  .useTransport = US_PR_BULK},
-	{ .useProtocol = US_SC_8020,
-	  .useTransport = US_PR_BULK},
-	{ .useProtocol = US_SC_QIC,
-	  .useTransport = US_PR_BULK},
-	{ .useProtocol = US_SC_UFI,
-	  .useTransport = US_PR_BULK},
-	{ .useProtocol = US_SC_8070,
-	  .useTransport = US_PR_BULK},
-	{ .useProtocol = US_SC_SCSI,
-	  .useTransport = US_PR_BULK},
+#	undef USUAL_DEV
 
 	/* Terminating entry */
 	{ NULL }
@@ -484,14 +432,20 @@ static int associate_dev(struct us_data *us, struct usb_interface *intf)
 	return 0;
 }
 
+/* Find an unusual_dev descriptor (always succeeds in the current code) */
+static struct us_unusual_dev *find_unusual(const struct usb_device_id *id)
+{
+	const int id_index = id - storage_usb_ids;
+	return &us_unusual_dev_list[id_index];
+}
+
 /* Get the unusual_devs entries and the string descriptors */
-static void get_device_info(struct us_data *us, int id_index)
+static void get_device_info(struct us_data *us, const struct usb_device_id *id)
 {
 	struct usb_device *dev = us->pusb_dev;
 	struct usb_interface_descriptor *idesc =
 		&us->pusb_intf->cur_altsetting->desc;
-	struct us_unusual_dev *unusual_dev = &us_unusual_dev_list[id_index];
-	struct usb_device_id *id = &storage_usb_ids[id_index];
+	struct us_unusual_dev *unusual_dev = find_unusual(id);
 
 	/* Store the entries */
 	us->unusual_dev = unusual_dev;
@@ -501,7 +455,7 @@ static void get_device_info(struct us_data *us, int id_index)
 	us->protocol = (unusual_dev->useTransport == US_PR_DEVICE) ?
 			idesc->bInterfaceProtocol :
 			unusual_dev->useTransport;
-	us->flags = unusual_dev->flags;
+	us->flags = USB_US_ORIG_FLAGS(id->driver_info);
 
 	/*
 	 * This flag is only needed when we're in high-speed, so let's
@@ -529,7 +483,7 @@ static void get_device_info(struct us_data *us, int id_index)
 		if (unusual_dev->useTransport != US_PR_DEVICE &&
 			us->protocol == idesc->bInterfaceProtocol)
 			msg += 2;
-		if (msg >= 0 && !(unusual_dev->flags & US_FL_NEED_OVERRIDE))
+		if (msg >= 0 && !(us->flags & US_FL_NEED_OVERRIDE))
 			printk(KERN_NOTICE USB_STORAGE "This device "
 				"(%04x,%04x,%04x S %02x P %02x)"
 				" has %s in unusual_devs.h\n"
@@ -921,10 +875,12 @@ static int storage_probe(struct usb_interface *intf,
 {
 	struct Scsi_Host *host;
 	struct us_data *us;
-	const int id_index = id - storage_usb_ids; 
 	int result;
 	struct task_struct *th;
 
+	if (usb_usual_check_type(id, USB_US_TYPE_STOR))
+		return -ENXIO;
+
 	US_DEBUGP("USB Mass Storage device detected\n");
 
 	/*
@@ -957,7 +913,7 @@ static int storage_probe(struct usb_interface *intf,
 	 * of the match from the usb_device_id table, so we can find the
 	 * corresponding entry in the private table.
 	 */
-	get_device_info(us, id_index);
+	get_device_info(us, id);
 
 #ifdef CONFIG_USB_STORAGE_SDDR09
 	if (us->protocol == US_PR_EUSB_SDDR09 ||
@@ -1062,9 +1018,10 @@ static int __init usb_stor_init(void)
 
 	/* register the driver, return usb_register return code if error */
 	retval = usb_register(&usb_storage_driver);
-	if (retval == 0)
+	if (retval == 0) {
 		printk(KERN_INFO "USB Mass Storage support registered.\n");
-
+		usb_usual_set_present(USB_US_TYPE_STOR);
+	}
 	return retval;
 }
 
@@ -1088,6 +1045,8 @@ static void __exit usb_stor_exit(void)
 		wait_for_completion(&threads_gone);
 		atomic_dec(&total_threads);
 	}
+
+	usb_usual_clear_present(USB_US_TYPE_STOR);
 }
 
 module_init(usb_stor_init);
diff --git a/drivers/usb/storage/usb.h b/drivers/usb/storage/usb.h
index 98b09711a739..0cd1eebc4497 100644
--- a/drivers/usb/storage/usb.h
+++ b/drivers/usb/storage/usb.h
@@ -45,6 +45,7 @@
 #define _USB_H_
 
 #include <linux/usb.h>
+#include <linux/usb_usual.h>
 #include <linux/blkdev.h>
 #include <linux/smp_lock.h>
 #include <linux/completion.h>
@@ -63,38 +64,8 @@ struct us_unusual_dev {
 	__u8  useProtocol;
 	__u8  useTransport;
 	int (*initFunction)(struct us_data *);
-	unsigned int flags;
 };
 
-/*
- * Static flag definitions.  We use this roundabout technique so that the
- * proc_info() routine can automatically display a message for each flag.
- */
-#define US_DO_ALL_FLAGS						\
-	US_FLAG(SINGLE_LUN,	0x00000001)			\
-		/* allow access to only LUN 0 */		\
-	US_FLAG(NEED_OVERRIDE,	0x00000002)			\
-		/* unusual_devs entry is necessary */		\
-	US_FLAG(SCM_MULT_TARG,	0x00000004)			\
-		/* supports multiple targets */			\
-	US_FLAG(FIX_INQUIRY,	0x00000008)			\
-		/* INQUIRY response needs faking */		\
-	US_FLAG(FIX_CAPACITY,	0x00000010)			\
-		/* READ CAPACITY response too big */		\
-	US_FLAG(IGNORE_RESIDUE,	0x00000020)			\
-		/* reported residue is wrong */			\
-	US_FLAG(BULK32,		0x00000040)			\
-		/* Uses 32-byte CBW length */			\
-	US_FLAG(NOT_LOCKABLE,	0x00000080)			\
-		/* PREVENT/ALLOW not supported */		\
-	US_FLAG(GO_SLOW,	0x00000100)			\
-		/* Need delay after Command phase */		\
-	US_FLAG(NO_WP_DETECT,	0x00000200)			\
-		/* Don't check for write-protect */		\
-
-#define US_FLAG(name, value)	US_FL_##name = value ,
-enum { US_DO_ALL_FLAGS };
-#undef US_FLAG
 
 /* Dynamic flag definitions: used in set_bit() etc. */
 #define US_FLIDX_URB_ACTIVE	18  /* 0x00040000  current_urb is in use  */
diff --git a/include/linux/usb_usual.h b/include/linux/usb_usual.h
new file mode 100644
index 000000000000..f9c058f33712
--- /dev/null
+++ b/include/linux/usb_usual.h
@@ -0,0 +1,123 @@
+/*
+ * Interface to the libusual.
+ *
+ * Copyright (c) 2005 Pete Zaitcev <zaitcev@redhat.com>
+ * Copyright (c) 1999-2002 Matthew Dharm (mdharm-usb@one-eyed-alien.net)
+ * Copyright (c) 1999 Michael Gee (michael@linuxspecific.com)
+ */
+
+#ifndef __LINUX_USB_USUAL_H
+#define __LINUX_USB_USUAL_H
+
+#include <linux/config.h>
+
+/* We should do this for cleanliness... But other usb_foo.h do not do this. */
+/* #include <linux/usb.h> */
+
+/*
+ * The flags field, which we store in usb_device_id.driver_info.
+ * It is compatible with the old usb-storage flags in lower 24 bits.
+ */
+
+/*
+ * Static flag definitions.  We use this roundabout technique so that the
+ * proc_info() routine can automatically display a message for each flag.
+ */
+#define US_DO_ALL_FLAGS						\
+	US_FLAG(SINGLE_LUN,	0x00000001)			\
+		/* allow access to only LUN 0 */		\
+	US_FLAG(NEED_OVERRIDE,	0x00000002)			\
+		/* unusual_devs entry is necessary */		\
+	US_FLAG(SCM_MULT_TARG,	0x00000004)			\
+		/* supports multiple targets */			\
+	US_FLAG(FIX_INQUIRY,	0x00000008)			\
+		/* INQUIRY response needs faking */		\
+	US_FLAG(FIX_CAPACITY,	0x00000010)			\
+		/* READ CAPACITY response too big */		\
+	US_FLAG(IGNORE_RESIDUE,	0x00000020)			\
+		/* reported residue is wrong */			\
+	US_FLAG(BULK32,		0x00000040)			\
+		/* Uses 32-byte CBW length */			\
+	US_FLAG(NOT_LOCKABLE,	0x00000080)			\
+		/* PREVENT/ALLOW not supported */		\
+	US_FLAG(GO_SLOW,	0x00000100)			\
+		/* Need delay after Command phase */		\
+	US_FLAG(NO_WP_DETECT,	0x00000200)			\
+		/* Don't check for write-protect */		\
+
+#define US_FLAG(name, value)	US_FL_##name = value ,
+enum { US_DO_ALL_FLAGS };
+#undef US_FLAG
+
+/*
+ * The bias field for libusual and friends.
+ */
+#define USB_US_TYPE_NONE   0
+#define USB_US_TYPE_STOR   1		/* usb-storage */
+#define USB_US_TYPE_UB     2		/* ub */
+
+#define USB_US_TYPE(flags) 		(((flags) >> 24) & 0xFF)
+#define USB_US_ORIG_FLAGS(flags)	((flags) & 0x00FFFFFF)
+
+/*
+ * This is probably not the best place to keep these constants, conceptually.
+ * But it's the only header included into all places which need them.
+ */
+
+/* Sub Classes */
+
+#define US_SC_RBC	0x01		/* Typically, flash devices */
+#define US_SC_8020	0x02		/* CD-ROM */
+#define US_SC_QIC	0x03		/* QIC-157 Tapes */
+#define US_SC_UFI	0x04		/* Floppy */
+#define US_SC_8070	0x05		/* Removable media */
+#define US_SC_SCSI	0x06		/* Transparent */
+#define US_SC_ISD200    0x07		/* ISD200 ATA */
+#define US_SC_MIN	US_SC_RBC
+#define US_SC_MAX	US_SC_ISD200
+
+#define US_SC_DEVICE	0xff		/* Use device's value */
+
+/* Protocols */
+
+#define US_PR_CBI	0x00		/* Control/Bulk/Interrupt */
+#define US_PR_CB	0x01		/* Control/Bulk w/o interrupt */
+#define US_PR_BULK	0x50		/* bulk only */
+#ifdef CONFIG_USB_STORAGE_USBAT
+#define US_PR_USBAT	0x80		/* SCM-ATAPI bridge */
+#endif
+#ifdef CONFIG_USB_STORAGE_SDDR09
+#define US_PR_EUSB_SDDR09	0x81	/* SCM-SCSI bridge for SDDR-09 */
+#endif
+#ifdef CONFIG_USB_STORAGE_SDDR55
+#define US_PR_SDDR55	0x82		/* SDDR-55 (made up) */
+#endif
+#define US_PR_DPCM_USB  0xf0		/* Combination CB/SDDR09 */
+#ifdef CONFIG_USB_STORAGE_FREECOM
+#define US_PR_FREECOM   0xf1		/* Freecom */
+#endif
+#ifdef CONFIG_USB_STORAGE_DATAFAB
+#define US_PR_DATAFAB   0xf2		/* Datafab chipsets */
+#endif
+#ifdef CONFIG_USB_STORAGE_JUMPSHOT
+#define US_PR_JUMPSHOT  0xf3		/* Lexar Jumpshot */
+#endif
+
+#define US_PR_DEVICE	0xff		/* Use device's value */
+
+/*
+ */
+#ifdef CONFIG_USB_LIBUSUAL
+
+extern struct usb_device_id storage_usb_ids[];
+extern void usb_usual_set_present(int type);
+extern void usb_usual_clear_present(int type);
+extern int usb_usual_check_type(const struct usb_device_id *, int type);
+#else
+
+#define usb_usual_set_present(t)	do { } while(0)
+#define usb_usual_clear_present(t)	do { } while(0)
+#define usb_usual_check_type(id, t)	(0)
+#endif /* CONFIG_USB_LIBUSUAL */
+
+#endif /* __LINUX_USB_USUAL_H */
-- 
cgit v1.2.3-71-gd317


From 733260ff9c45bd4db60f45d17e8560a4a68dff4d Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Wed, 16 Nov 2005 13:41:28 -0800
Subject: [PATCH] USB: add dynamic id functionality to USB core

Echo the usb vendor and product id to the "new_id" file in the driver's
sysfs directory, and then that driver will be able to bind to a device
with those ids if it is present.

Example:
	echo 0557 2008 > /sys/bus/usb/drivers/foo_driver/new_id
adds the hex values 0557 and 2008 to the device id table for the foo_driver.

Note, usb-serial drivers do not currently work with this capability yet.
usb-storage also might have some oddities.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/driver.c | 218 +++++++++++++++++++++++++++++++++++-----------
 include/linux/usb.h       |   8 ++
 2 files changed, 176 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index 921a21be651d..1c0611045379 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -27,6 +27,15 @@
 #include "hcd.h"
 #include "usb.h"
 
+static int usb_match_one_id(struct usb_interface *interface,
+			    const struct usb_device_id *id);
+
+struct usb_dynid {
+	struct list_head node;
+	struct usb_device_id id;
+};
+
+
 static int generic_probe(struct device *dev)
 {
 	return 0;
@@ -58,6 +67,96 @@ struct device_driver usb_generic_driver = {
  * usb device or a usb interface. */
 int usb_generic_driver_data;
 
+#ifdef CONFIG_HOTPLUG
+
+/*
+ * Adds a new dynamic USBdevice ID to this driver,
+ * and cause the driver to probe for all devices again.
+ */
+static ssize_t store_new_id(struct device_driver *driver,
+			    const char *buf, size_t count)
+{
+	struct usb_driver *usb_drv = to_usb_driver(driver);
+	struct usb_dynid *dynid;
+	u32 idVendor = 0;
+	u32 idProduct = 0;
+	int fields = 0;
+
+	fields = sscanf(buf, "%x %x", &idVendor, &idProduct);
+	if (fields < 2)
+		return -EINVAL;
+
+	dynid = kzalloc(sizeof(*dynid), GFP_KERNEL);
+	if (!dynid)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&dynid->node);
+	dynid->id.idVendor = idVendor;
+	dynid->id.idProduct = idProduct;
+	dynid->id.match_flags = USB_DEVICE_ID_MATCH_DEVICE;
+
+	spin_lock(&usb_drv->dynids.lock);
+	list_add_tail(&usb_drv->dynids.list, &dynid->node);
+	spin_unlock(&usb_drv->dynids.lock);
+
+	if (get_driver(driver)) {
+		driver_attach(driver);
+		put_driver(driver);
+	}
+
+	return count;
+}
+static DRIVER_ATTR(new_id, S_IWUSR, NULL, store_new_id);
+
+static int usb_create_newid_file(struct usb_driver *usb_drv)
+{
+	int error = 0;
+
+	if (usb_drv->probe != NULL)
+		error = sysfs_create_file(&usb_drv->driver.kobj,
+					  &driver_attr_new_id.attr);
+	return error;
+}
+
+static void usb_free_dynids(struct usb_driver *usb_drv)
+{
+	struct usb_dynid *dynid, *n;
+
+	spin_lock(&usb_drv->dynids.lock);
+	list_for_each_entry_safe(dynid, n, &usb_drv->dynids.list, node) {
+		list_del(&dynid->node);
+		kfree(dynid);
+	}
+	spin_unlock(&usb_drv->dynids.lock);
+}
+#else
+static inline int usb_create_newid_file(struct usb_driver *usb_drv)
+{
+	return 0;
+}
+
+static inline void usb_free_dynids(struct usb_driver *usb_drv)
+{
+}
+#endif
+
+static const struct usb_device_id *usb_match_dynamic_id(struct usb_interface *intf,
+							struct usb_driver *drv)
+{
+	struct usb_dynid *dynid;
+
+	spin_lock(&drv->dynids.lock);
+	list_for_each_entry(dynid, &drv->dynids.list, node) {
+		if (usb_match_one_id(intf, &dynid->id)) {
+			spin_unlock(&drv->dynids.lock);
+			return &dynid->id;
+		}
+	}
+	spin_unlock(&drv->dynids.lock);
+	return NULL;
+}
+
+
 /* called from driver core with usb_bus_type.subsys writelock */
 static int usb_probe_interface(struct device *dev)
 {
@@ -75,6 +174,8 @@ static int usb_probe_interface(struct device *dev)
 		return -EHOSTUNREACH;
 
 	id = usb_match_id(intf, driver->id_table);
+	if (!id)
+		id = usb_match_dynamic_id(intf, driver);
 	if (id) {
 		dev_dbg(dev, "%s - got id\n", __FUNCTION__);
 
@@ -120,6 +221,64 @@ static int usb_unbind_interface(struct device *dev)
 	return 0;
 }
 
+/* returns 0 if no match, 1 if match */
+static int usb_match_one_id(struct usb_interface *interface,
+			    const struct usb_device_id *id)
+{
+	struct usb_host_interface *intf;
+	struct usb_device *dev;
+
+	/* proc_connectinfo in devio.c may call us with id == NULL. */
+	if (id == NULL)
+		return 0;
+
+	intf = interface->cur_altsetting;
+	dev = interface_to_usbdev(interface);
+
+	if ((id->match_flags & USB_DEVICE_ID_MATCH_VENDOR) &&
+	    id->idVendor != le16_to_cpu(dev->descriptor.idVendor))
+		return 0;
+
+	if ((id->match_flags & USB_DEVICE_ID_MATCH_PRODUCT) &&
+	    id->idProduct != le16_to_cpu(dev->descriptor.idProduct))
+		return 0;
+
+	/* No need to test id->bcdDevice_lo != 0, since 0 is never
+	   greater than any unsigned number. */
+	if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_LO) &&
+	    (id->bcdDevice_lo > le16_to_cpu(dev->descriptor.bcdDevice)))
+		return 0;
+
+	if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_HI) &&
+	    (id->bcdDevice_hi < le16_to_cpu(dev->descriptor.bcdDevice)))
+		return 0;
+
+	if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_CLASS) &&
+	    (id->bDeviceClass != dev->descriptor.bDeviceClass))
+		return 0;
+
+	if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_SUBCLASS) &&
+	    (id->bDeviceSubClass!= dev->descriptor.bDeviceSubClass))
+		return 0;
+
+	if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_PROTOCOL) &&
+	    (id->bDeviceProtocol != dev->descriptor.bDeviceProtocol))
+		return 0;
+
+	if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_CLASS) &&
+	    (id->bInterfaceClass != intf->desc.bInterfaceClass))
+		return 0;
+
+	if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_SUBCLASS) &&
+	    (id->bInterfaceSubClass != intf->desc.bInterfaceSubClass))
+		return 0;
+
+	if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_PROTOCOL) &&
+	    (id->bInterfaceProtocol != intf->desc.bInterfaceProtocol))
+		return 0;
+
+	return 1;
+}
 /**
  * usb_match_id - find first usb_device_id matching device or interface
  * @interface: the interface of interest
@@ -184,16 +343,10 @@ static int usb_unbind_interface(struct device *dev)
 const struct usb_device_id *usb_match_id(struct usb_interface *interface,
 					 const struct usb_device_id *id)
 {
-	struct usb_host_interface *intf;
-	struct usb_device *dev;
-
 	/* proc_connectinfo in devio.c may call us with id == NULL. */
 	if (id == NULL)
 		return NULL;
 
-	intf = interface->cur_altsetting;
-	dev = interface_to_usbdev(interface);
-
 	/* It is important to check that id->driver_info is nonzero,
 	   since an entry that is all zeroes except for a nonzero
 	   id->driver_info is the way to create an entry that
@@ -201,50 +354,8 @@ const struct usb_device_id *usb_match_id(struct usb_interface *interface,
 	   device and interface. */
 	for (; id->idVendor || id->bDeviceClass || id->bInterfaceClass ||
 	       id->driver_info; id++) {
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_VENDOR) &&
-		    id->idVendor != le16_to_cpu(dev->descriptor.idVendor))
-			continue;
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_PRODUCT) &&
-		    id->idProduct != le16_to_cpu(dev->descriptor.idProduct))
-			continue;
-
-		/* No need to test id->bcdDevice_lo != 0, since 0 is never
-		   greater than any unsigned number. */
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_LO) &&
-		    (id->bcdDevice_lo > le16_to_cpu(dev->descriptor.bcdDevice)))
-			continue;
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_HI) &&
-		    (id->bcdDevice_hi < le16_to_cpu(dev->descriptor.bcdDevice)))
-			continue;
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_CLASS) &&
-		    (id->bDeviceClass != dev->descriptor.bDeviceClass))
-			continue;
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_SUBCLASS) &&
-		    (id->bDeviceSubClass!= dev->descriptor.bDeviceSubClass))
-			continue;
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_PROTOCOL) &&
-		    (id->bDeviceProtocol != dev->descriptor.bDeviceProtocol))
-			continue;
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_CLASS) &&
-		    (id->bInterfaceClass != intf->desc.bInterfaceClass))
-			continue;
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_SUBCLASS) &&
-		    (id->bInterfaceSubClass != intf->desc.bInterfaceSubClass))
-			continue;
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_PROTOCOL) &&
-		    (id->bInterfaceProtocol != intf->desc.bInterfaceProtocol))
-			continue;
-
-		return id;
+		if (usb_match_one_id(interface, id))
+			return id;
 	}
 
 	return NULL;
@@ -268,6 +379,9 @@ int usb_device_match(struct device *dev, struct device_driver *drv)
 	if (id)
 		return 1;
 
+	id = usb_match_dynamic_id(intf, usb_drv);
+	if (id)
+		return 1;
 	return 0;
 }
 
@@ -296,6 +410,8 @@ int usb_register(struct usb_driver *new_driver)
 	new_driver->driver.probe = usb_probe_interface;
 	new_driver->driver.remove = usb_unbind_interface;
 	new_driver->driver.owner = new_driver->owner;
+	spin_lock_init(&new_driver->dynids.lock);
+	INIT_LIST_HEAD(&new_driver->dynids.list);
 
 	usb_lock_all_devices();
 	retval = driver_register(&new_driver->driver);
@@ -305,6 +421,7 @@ int usb_register(struct usb_driver *new_driver)
 		pr_info("%s: registered new driver %s\n",
 			usbcore_name, new_driver->name);
 		usbfs_update_special();
+		usb_create_newid_file(new_driver);
 	} else {
 		printk(KERN_ERR "%s: error %d registering driver %s\n",
 			usbcore_name, retval, new_driver->name);
@@ -330,6 +447,7 @@ void usb_deregister(struct usb_driver *driver)
 	pr_info("%s: deregistering driver %s\n", usbcore_name, driver->name);
 
 	usb_lock_all_devices();
+	usb_free_dynids(driver);
 	driver_unregister(&driver->driver);
 	usb_unlock_all_devices();
 
diff --git a/include/linux/usb.h b/include/linux/usb.h
index d81b050e5955..0dd96ef78c13 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -529,6 +529,11 @@ static inline int usb_make_path (struct usb_device *dev, char *buf,
 
 /* ----------------------------------------------------------------------- */
 
+struct usb_dynids {
+	spinlock_t lock;
+	struct list_head list;
+};
+
 /**
  * struct usb_driver - identifies USB driver to usbcore
  * @owner: Pointer to the module owner of this driver; initialize
@@ -553,6 +558,8 @@ static inline int usb_make_path (struct usb_device *dev, char *buf,
  * @id_table: USB drivers use ID table to support hotplugging.
  *	Export this with MODULE_DEVICE_TABLE(usb,...).  This must be set
  *	or your driver's probe function will never get called.
+ * @dynids: used internally to hold the list of dynamically added device
+ *	ids for this driver.
  * @driver: the driver model core driver structure.
  *
  * USB drivers must provide a name, probe() and disconnect() methods,
@@ -588,6 +595,7 @@ struct usb_driver {
 
 	const struct usb_device_id *id_table;
 
+	struct usb_dynids dynids;
 	struct device_driver driver;
 };
 #define	to_usb_driver(d) container_of(d, struct usb_driver, driver)
-- 
cgit v1.2.3-71-gd317


From ba9dc657af86d05d2971633e57d1f6f94ed60472 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Wed, 16 Nov 2005 13:41:28 -0800
Subject: [PATCH] USB: allow usb drivers to disable dynamic ids

This lets drivers, like the usb-serial ones, disable the ability to add
ids from sysfs.

The usb-serial drivers are "odd" in that they are really usb-serial bus
drivers, not usb bus drivers, so the dynamic id logic will have to go
into the usb-serial bus core for those drivers to get that ability.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/driver.c             | 19 +++++++++++++++++++
 drivers/usb/serial/airprime.c         |  1 +
 drivers/usb/serial/anydata.c          |  1 +
 drivers/usb/serial/belkin_sa.c        |  1 +
 drivers/usb/serial/cp2101.c           |  1 +
 drivers/usb/serial/cyberjack.c        |  1 +
 drivers/usb/serial/cypress_m8.c       |  1 +
 drivers/usb/serial/digi_acceleport.c  |  1 +
 drivers/usb/serial/empeg.c            |  1 +
 drivers/usb/serial/ftdi_sio.c         |  1 +
 drivers/usb/serial/garmin_gps.c       |  1 +
 drivers/usb/serial/generic.c          |  1 +
 drivers/usb/serial/hp4x.c             |  1 +
 drivers/usb/serial/io_edgeport.c      |  1 +
 drivers/usb/serial/io_ti.c            |  1 +
 drivers/usb/serial/ipaq.c             |  1 +
 drivers/usb/serial/ipw.c              |  1 +
 drivers/usb/serial/ir-usb.c           |  1 +
 drivers/usb/serial/keyspan.h          |  1 +
 drivers/usb/serial/keyspan_pda.c      |  1 +
 drivers/usb/serial/kl5kusb105.c       |  1 +
 drivers/usb/serial/kobil_sct.c        |  1 +
 drivers/usb/serial/mct_u232.c         |  1 +
 drivers/usb/serial/omninet.c          |  1 +
 drivers/usb/serial/option.c           |  1 +
 drivers/usb/serial/pl2303.c           |  1 +
 drivers/usb/serial/safe_serial.c      |  1 +
 drivers/usb/serial/ti_usb_3410_5052.c |  1 +
 drivers/usb/serial/usb-serial.c       |  1 +
 drivers/usb/serial/visor.c            |  1 +
 drivers/usb/serial/whiteheat.c        |  1 +
 include/linux/usb.h                   |  3 +++
 32 files changed, 52 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index 1c0611045379..5e65bc258e1b 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -112,12 +112,26 @@ static int usb_create_newid_file(struct usb_driver *usb_drv)
 {
 	int error = 0;
 
+	if (usb_drv->no_dynamic_id)
+		goto exit;
+
 	if (usb_drv->probe != NULL)
 		error = sysfs_create_file(&usb_drv->driver.kobj,
 					  &driver_attr_new_id.attr);
+exit:
 	return error;
 }
 
+static void usb_remove_newid_file(struct usb_driver *usb_drv)
+{
+	if (usb_drv->no_dynamic_id)
+		return;
+
+	if (usb_drv->probe != NULL)
+		sysfs_remove_file(&usb_drv->driver.kobj,
+				  &driver_attr_new_id.attr);
+}
+
 static void usb_free_dynids(struct usb_driver *usb_drv)
 {
 	struct usb_dynid *dynid, *n;
@@ -135,6 +149,10 @@ static inline int usb_create_newid_file(struct usb_driver *usb_drv)
 	return 0;
 }
 
+static void usb_remove_newid_file(struct usb_driver *usb_drv)
+{
+}
+
 static inline void usb_free_dynids(struct usb_driver *usb_drv)
 {
 }
@@ -447,6 +465,7 @@ void usb_deregister(struct usb_driver *driver)
 	pr_info("%s: deregistering driver %s\n", usbcore_name, driver->name);
 
 	usb_lock_all_devices();
+	usb_remove_newid_file(driver);
 	usb_free_dynids(driver);
 	driver_unregister(&driver->driver);
 	usb_unlock_all_devices();
diff --git a/drivers/usb/serial/airprime.c b/drivers/usb/serial/airprime.c
index 1f29d8837327..2ef9945a6c07 100644
--- a/drivers/usb/serial/airprime.c
+++ b/drivers/usb/serial/airprime.c
@@ -28,6 +28,7 @@ static struct usb_driver airprime_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table,
+	.no_dynamic_id = 	1,
 };
 
 static struct usb_serial_driver airprime_device = {
diff --git a/drivers/usb/serial/anydata.c b/drivers/usb/serial/anydata.c
index 18022a74a3dc..7a171e034b59 100644
--- a/drivers/usb/serial/anydata.c
+++ b/drivers/usb/serial/anydata.c
@@ -32,6 +32,7 @@ static struct usb_driver anydata_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table,
+	.no_dynamic_id = 	1,
 };
 
 static int anydata_open(struct usb_serial_port *port, struct file *filp)
diff --git a/drivers/usb/serial/belkin_sa.c b/drivers/usb/serial/belkin_sa.c
index 84bc0ee4f061..69039bd9fc5e 100644
--- a/drivers/usb/serial/belkin_sa.c
+++ b/drivers/usb/serial/belkin_sa.c
@@ -118,6 +118,7 @@ static struct usb_driver belkin_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table_combined,
+	.no_dynamic_id = 	1,
 };
 
 /* All of the device info needed for the serial converters */
diff --git a/drivers/usb/serial/cp2101.c b/drivers/usb/serial/cp2101.c
index c9787001cf2a..813bab37e076 100644
--- a/drivers/usb/serial/cp2101.c
+++ b/drivers/usb/serial/cp2101.c
@@ -72,6 +72,7 @@ static struct usb_driver cp2101_driver = {
 	.probe		= usb_serial_probe,
 	.disconnect	= usb_serial_disconnect,
 	.id_table	= id_table,
+	.no_dynamic_id	= 	1,
 };
 
 static struct usb_serial_driver cp2101_device = {
diff --git a/drivers/usb/serial/cyberjack.c b/drivers/usb/serial/cyberjack.c
index e581e4ae8483..8c10e4004905 100644
--- a/drivers/usb/serial/cyberjack.c
+++ b/drivers/usb/serial/cyberjack.c
@@ -81,6 +81,7 @@ static struct usb_driver cyberjack_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table,
+	.no_dynamic_id = 	1,
 };
 
 static struct usb_serial_driver cyberjack_device = {
diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c
index af9290ed257b..af18355e94cc 100644
--- a/drivers/usb/serial/cypress_m8.c
+++ b/drivers/usb/serial/cypress_m8.c
@@ -112,6 +112,7 @@ static struct usb_driver cypress_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table_combined,
+	.no_dynamic_id = 	1,
 };
 
 struct cypress_private {
diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c
index dc74644a603d..c50cec95f49b 100644
--- a/drivers/usb/serial/digi_acceleport.c
+++ b/drivers/usb/serial/digi_acceleport.c
@@ -498,6 +498,7 @@ static struct usb_driver digi_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table_combined,
+	.no_dynamic_id = 	1,
 };
 
 
diff --git a/drivers/usb/serial/empeg.c b/drivers/usb/serial/empeg.c
index 0b0546dcc7b9..e5e40064caf2 100644
--- a/drivers/usb/serial/empeg.c
+++ b/drivers/usb/serial/empeg.c
@@ -110,6 +110,7 @@ static struct usb_driver empeg_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table,
+	.no_dynamic_id = 	1,
 };
 
 static struct usb_serial_driver empeg_device = {
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index 06e04b442ff1..857fe791d702 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -488,6 +488,7 @@ static struct usb_driver ftdi_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table_combined,
+	.no_dynamic_id = 	1,
 };
 
 static char *ftdi_chip_name[] = {
diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c
index 35820bda7ae1..198a322286f9 100644
--- a/drivers/usb/serial/garmin_gps.c
+++ b/drivers/usb/serial/garmin_gps.c
@@ -227,6 +227,7 @@ static struct usb_driver garmin_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table,
+	.no_dynamic_id = 	1,
 };
 
 
diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c
index 53a47c31cd0e..c00a440dc421 100644
--- a/drivers/usb/serial/generic.c
+++ b/drivers/usb/serial/generic.c
@@ -73,6 +73,7 @@ static struct usb_driver generic_driver = {
 	.probe =	generic_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	generic_serial_ids,
+	.no_dynamic_id = 	1,
 };
 #endif
 
diff --git a/drivers/usb/serial/hp4x.c b/drivers/usb/serial/hp4x.c
index 8eadfb705601..e588c3fe632d 100644
--- a/drivers/usb/serial/hp4x.c
+++ b/drivers/usb/serial/hp4x.c
@@ -42,6 +42,7 @@ static struct usb_driver hp49gp_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table,
+	.no_dynamic_id = 	1,
 };
 
 static struct usb_serial_driver hp49gp_device = {
diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c
index dc4c498bd1ed..276bd425a474 100644
--- a/drivers/usb/serial/io_edgeport.c
+++ b/drivers/usb/serial/io_edgeport.c
@@ -247,6 +247,7 @@ static struct usb_driver io_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table_combined,
+	.no_dynamic_id = 	1,
 };
 
 /* function prototypes for all of our local functions */
diff --git a/drivers/usb/serial/io_ti.c b/drivers/usb/serial/io_ti.c
index 832b6d6734c0..8b2e4c78abcd 100644
--- a/drivers/usb/serial/io_ti.c
+++ b/drivers/usb/serial/io_ti.c
@@ -221,6 +221,7 @@ static struct usb_driver io_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table_combined,
+	.no_dynamic_id = 	1,
 };
 
 
diff --git a/drivers/usb/serial/ipaq.c b/drivers/usb/serial/ipaq.c
index d5d066488100..efb568be7015 100644
--- a/drivers/usb/serial/ipaq.c
+++ b/drivers/usb/serial/ipaq.c
@@ -547,6 +547,7 @@ static struct usb_driver ipaq_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	ipaq_id_table,
+	.no_dynamic_id = 	1,
 };
 
 
diff --git a/drivers/usb/serial/ipw.c b/drivers/usb/serial/ipw.c
index 7744b8148bc5..64e2cda2a84a 100644
--- a/drivers/usb/serial/ipw.c
+++ b/drivers/usb/serial/ipw.c
@@ -157,6 +157,7 @@ static struct usb_driver usb_ipw_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	usb_ipw_ids,
+	.no_dynamic_id = 	1,
 };
 
 static int debug;
diff --git a/drivers/usb/serial/ir-usb.c b/drivers/usb/serial/ir-usb.c
index 19f329e9bdcf..647431c1ccb1 100644
--- a/drivers/usb/serial/ir-usb.c
+++ b/drivers/usb/serial/ir-usb.c
@@ -130,6 +130,7 @@ static struct usb_driver ir_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table,
+	.no_dynamic_id = 	1,
 };
 
 
diff --git a/drivers/usb/serial/keyspan.h b/drivers/usb/serial/keyspan.h
index 5cfc13b5e56f..4e6f626f6062 100644
--- a/drivers/usb/serial/keyspan.h
+++ b/drivers/usb/serial/keyspan.h
@@ -525,6 +525,7 @@ static struct usb_driver keyspan_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	keyspan_ids_combined,
+	.no_dynamic_id = 	1,
 };
 
 /* usb_device_id table for the pre-firmware download keyspan devices */
diff --git a/drivers/usb/serial/keyspan_pda.c b/drivers/usb/serial/keyspan_pda.c
index cd4f48bd83b6..0d1f15268549 100644
--- a/drivers/usb/serial/keyspan_pda.c
+++ b/drivers/usb/serial/keyspan_pda.c
@@ -155,6 +155,7 @@ static struct usb_driver keyspan_pda_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table_combined,
+	.no_dynamic_id = 	1,
 };
 
 static struct usb_device_id id_table_std [] = {
diff --git a/drivers/usb/serial/kl5kusb105.c b/drivers/usb/serial/kl5kusb105.c
index a8951c0fd020..bd68638b7c35 100644
--- a/drivers/usb/serial/kl5kusb105.c
+++ b/drivers/usb/serial/kl5kusb105.c
@@ -121,6 +121,7 @@ static struct usb_driver kl5kusb105d_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table,
+	.no_dynamic_id = 	1,
 };
 
 static struct usb_serial_driver kl5kusb105d_device = {
diff --git a/drivers/usb/serial/kobil_sct.c b/drivers/usb/serial/kobil_sct.c
index 9456dd9dd136..4c853afea385 100644
--- a/drivers/usb/serial/kobil_sct.c
+++ b/drivers/usb/serial/kobil_sct.c
@@ -102,6 +102,7 @@ static struct usb_driver kobil_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table,
+	.no_dynamic_id = 	1,
 };
 
 
diff --git a/drivers/usb/serial/mct_u232.c b/drivers/usb/serial/mct_u232.c
index ca5dbadb9b7e..b0415e7542c4 100644
--- a/drivers/usb/serial/mct_u232.c
+++ b/drivers/usb/serial/mct_u232.c
@@ -130,6 +130,7 @@ static struct usb_driver mct_u232_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table_combined,
+	.no_dynamic_id = 	1,
 };
 
 static struct usb_serial_driver mct_u232_device = {
diff --git a/drivers/usb/serial/omninet.c b/drivers/usb/serial/omninet.c
index 3caf97072ac0..b595befb24cf 100644
--- a/drivers/usb/serial/omninet.c
+++ b/drivers/usb/serial/omninet.c
@@ -85,6 +85,7 @@ static struct usb_driver omninet_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table,
+	.no_dynamic_id = 	1,
 };
 
 
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index 7716000045b7..4ee657eaaa0b 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -100,6 +100,7 @@ static struct usb_driver option_driver = {
 	.probe      = usb_serial_probe,
 	.disconnect = usb_serial_disconnect,
 	.id_table   = option_ids,
+	.no_dynamic_id = 	1,
 };
 
 /* The card has three separate interfaces, wich the serial driver
diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c
index 41a45a5025b2..e302a320444c 100644
--- a/drivers/usb/serial/pl2303.c
+++ b/drivers/usb/serial/pl2303.c
@@ -87,6 +87,7 @@ static struct usb_driver pl2303_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table,
+	.no_dynamic_id = 	1,
 };
 
 #define SET_LINE_REQUEST_TYPE		0x21
diff --git a/drivers/usb/serial/safe_serial.c b/drivers/usb/serial/safe_serial.c
index c22bdc0c4dfd..f8241c152043 100644
--- a/drivers/usb/serial/safe_serial.c
+++ b/drivers/usb/serial/safe_serial.c
@@ -165,6 +165,7 @@ static struct usb_driver safe_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table,
+	.no_dynamic_id = 	1,
 };
 
 static __u16 crc10_table[256] = {
diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c
index 205dbf7201da..17a1f09483bd 100644
--- a/drivers/usb/serial/ti_usb_3410_5052.c
+++ b/drivers/usb/serial/ti_usb_3410_5052.c
@@ -253,6 +253,7 @@ static struct usb_driver ti_usb_driver = {
 	.probe			= usb_serial_probe,
 	.disconnect		= usb_serial_disconnect,
 	.id_table		= ti_id_table_combined,
+	.no_dynamic_id = 	1,
 };
 
 static struct usb_serial_driver ti_1port_device = {
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index 0c4881d18cd5..2ac37b52485a 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -46,6 +46,7 @@ static struct usb_driver usb_serial_driver = {
 	.name =		"usbserial",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
+	.no_dynamic_id = 	1,
 };
 
 /* There is no MODULE_DEVICE_TABLE for usbserial.c.  Instead
diff --git a/drivers/usb/serial/visor.c b/drivers/usb/serial/visor.c
index a473c1c34559..2973f5564c06 100644
--- a/drivers/usb/serial/visor.c
+++ b/drivers/usb/serial/visor.c
@@ -178,6 +178,7 @@ static struct usb_driver visor_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table_combined,
+	.no_dynamic_id = 	1,
 };
 
 /* All of the device info needed for the Handspring Visor, and Palm 4.0 devices */
diff --git a/drivers/usb/serial/whiteheat.c b/drivers/usb/serial/whiteheat.c
index 18c3183be769..19c6386bb692 100644
--- a/drivers/usb/serial/whiteheat.c
+++ b/drivers/usb/serial/whiteheat.c
@@ -132,6 +132,7 @@ static struct usb_driver whiteheat_driver = {
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
 	.id_table =	id_table_combined,
+	.no_dynamic_id = 	1,
 };
 
 /* function prototypes for the Connect Tech WhiteHEAT prerenumeration device */
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 0dd96ef78c13..8d5829936bc4 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -561,6 +561,8 @@ struct usb_dynids {
  * @dynids: used internally to hold the list of dynamically added device
  *	ids for this driver.
  * @driver: the driver model core driver structure.
+ * @no_dynamic_id: if set to 1, the USB core will not allow dynamic ids to be
+ *	added to this driver by preventing the sysfs file from being created.
  *
  * USB drivers must provide a name, probe() and disconnect() methods,
  * and an id_table.  Other driver fields are optional.
@@ -597,6 +599,7 @@ struct usb_driver {
 
 	struct usb_dynids dynids;
 	struct device_driver driver;
+	unsigned int no_dynamic_id:1;
 };
 #define	to_usb_driver(d) container_of(d, struct usb_driver, driver)
 
-- 
cgit v1.2.3-71-gd317


From 2143acc6dc79bdbff812f02a7dc5ab9d4fc81fc8 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 21 Nov 2005 14:53:03 -0800
Subject: [PATCH] USB: make registering a usb driver automatically set the
 module owner

This fixes the driver that forgot to set the module owner up.  Now we
can remove the unneeded pointer from the usb driver structure.  The idea
for how to do this was from Al Viro, who did this for the PCI drivers.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/driver.c | 9 +++++----
 include/linux/usb.h       | 6 +++++-
 2 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index 5e65bc258e1b..bb139f06bcd6 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -404,8 +404,9 @@ int usb_device_match(struct device *dev, struct device_driver *drv)
 }
 
 /**
- * usb_register - register a USB driver
+ * usb_register_driver - register a USB driver
  * @new_driver: USB operations for the driver
+ * @owner: module owner of this driver.
  *
  * Registers a USB driver with the USB core.  The list of unattached
  * interfaces will be rescanned whenever a new driver is added, allowing
@@ -416,7 +417,7 @@ int usb_device_match(struct device *dev, struct device_driver *drv)
  * usb_register_dev() to enable that functionality.  This function no longer
  * takes care of that.
  */
-int usb_register(struct usb_driver *new_driver)
+int usb_register_driver(struct usb_driver *new_driver, struct module *owner)
 {
 	int retval = 0;
 
@@ -427,7 +428,7 @@ int usb_register(struct usb_driver *new_driver)
 	new_driver->driver.bus = &usb_bus_type;
 	new_driver->driver.probe = usb_probe_interface;
 	new_driver->driver.remove = usb_unbind_interface;
-	new_driver->driver.owner = new_driver->owner;
+	new_driver->driver.owner = owner;
 	spin_lock_init(&new_driver->dynids.lock);
 	INIT_LIST_HEAD(&new_driver->dynids.list);
 
@@ -447,7 +448,7 @@ int usb_register(struct usb_driver *new_driver)
 
 	return retval;
 }
-EXPORT_SYMBOL_GPL(usb_register);
+EXPORT_SYMBOL_GPL(usb_register_driver);
 
 /**
  * usb_deregister - unregister a USB driver
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 8d5829936bc4..3d05c63451a8 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -625,7 +625,11 @@ struct usb_class_driver {
  * use these in module_init()/module_exit()
  * and don't forget MODULE_DEVICE_TABLE(usb, ...)
  */
-extern int usb_register(struct usb_driver *);
+int usb_register_driver(struct usb_driver *, struct module *);
+static inline int usb_register(struct usb_driver *driver)
+{
+	return usb_register_driver(driver, THIS_MODULE);
+}
 extern void usb_deregister(struct usb_driver *);
 
 extern int usb_register_dev(struct usb_interface *intf,
-- 
cgit v1.2.3-71-gd317


From 75318d2d7cab77b14c5d3dbd5e69f2680a769e16 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 21 Nov 2005 14:53:03 -0800
Subject: [PATCH] USB: remove .owner field from struct usb_driver

It is no longer needed, so let's remove it, saving a bit of memory.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/block/ub.c                         | 1 -
 drivers/bluetooth/bcm203x.c                | 1 -
 drivers/bluetooth/bfusb.c                  | 1 -
 drivers/bluetooth/bpa10x.c                 | 1 -
 drivers/bluetooth/hci_usb.c                | 1 -
 drivers/char/watchdog/pcwd_usb.c           | 1 -
 drivers/input/joystick/iforce/iforce-usb.c | 1 -
 drivers/isdn/hisax/hfc_usb.c               | 1 -
 drivers/isdn/hisax/st5481_init.c           | 1 -
 drivers/media/dvb/b2c2/flexcop-usb.c       | 1 -
 drivers/media/dvb/cinergyT2/cinergyT2.c    | 1 -
 drivers/media/dvb/dvb-usb/a800.c           | 1 -
 drivers/media/dvb/dvb-usb/cxusb.c          | 1 -
 drivers/media/dvb/dvb-usb/dibusb-mb.c      | 1 -
 drivers/media/dvb/dvb-usb/dibusb-mc.c      | 1 -
 drivers/media/dvb/dvb-usb/digitv.c         | 1 -
 drivers/media/dvb/dvb-usb/dtt200u.c        | 1 -
 drivers/media/dvb/dvb-usb/nova-t-usb2.c    | 1 -
 drivers/media/dvb/dvb-usb/umt-010.c        | 1 -
 drivers/media/dvb/dvb-usb/vp702x.c         | 1 -
 drivers/media/dvb/dvb-usb/vp7045.c         | 1 -
 drivers/media/video/cpia_usb.c             | 1 -
 drivers/media/video/em28xx/em28xx-video.c  | 1 -
 drivers/net/irda/irda-usb.c                | 1 -
 drivers/net/irda/stir4200.c                | 1 -
 drivers/usb/atm/cxacru.c                   | 1 -
 drivers/usb/atm/speedtch.c                 | 1 -
 drivers/usb/atm/ueagle-atm.c               | 1 -
 drivers/usb/atm/xusbatm.c                  | 1 -
 drivers/usb/class/audio.c                  | 1 -
 drivers/usb/class/cdc-acm.c                | 1 -
 drivers/usb/class/usb-midi.c               | 1 -
 drivers/usb/class/usblp.c                  | 1 -
 drivers/usb/core/devio.c                   | 1 -
 drivers/usb/core/hub.c                     | 1 -
 drivers/usb/image/mdc800.c                 | 1 -
 drivers/usb/image/microtek.c               | 1 -
 drivers/usb/input/acecad.c                 | 1 -
 drivers/usb/input/aiptek.c                 | 1 -
 drivers/usb/input/appletouch.c             | 1 -
 drivers/usb/input/ati_remote.c             | 1 -
 drivers/usb/input/hid-core.c               | 1 -
 drivers/usb/input/hiddev.c                 | 1 -
 drivers/usb/input/itmtouch.c               | 1 -
 drivers/usb/input/kbtab.c                  | 1 -
 drivers/usb/input/keyspan_remote.c         | 1 -
 drivers/usb/input/mtouchusb.c              | 1 -
 drivers/usb/input/powermate.c              | 1 -
 drivers/usb/input/touchkitusb.c            | 1 -
 drivers/usb/input/usbkbd.c                 | 1 -
 drivers/usb/input/usbmouse.c               | 1 -
 drivers/usb/input/wacom.c                  | 1 -
 drivers/usb/input/xpad.c                   | 1 -
 drivers/usb/input/yealink.c                | 1 -
 drivers/usb/media/dabusb.c                 | 1 -
 drivers/usb/media/dsbr100.c                | 1 -
 drivers/usb/media/ov511.c                  | 1 -
 drivers/usb/media/pwc/pwc-if.c             | 1 -
 drivers/usb/media/se401.c                  | 1 -
 drivers/usb/media/sn9c102_core.c           | 1 -
 drivers/usb/media/stv680.c                 | 1 -
 drivers/usb/media/vicam.c                  | 1 -
 drivers/usb/media/w9968cf.c                | 1 -
 drivers/usb/misc/auerswald.c               | 1 -
 drivers/usb/misc/cytherm.c                 | 1 -
 drivers/usb/misc/emi26.c                   | 1 -
 drivers/usb/misc/emi62.c                   | 1 -
 drivers/usb/misc/idmouse.c                 | 1 -
 drivers/usb/misc/ldusb.c                   | 1 -
 drivers/usb/misc/legousbtower.c            | 1 -
 drivers/usb/misc/phidgetkit.c              | 1 -
 drivers/usb/misc/phidgetservo.c            | 1 -
 drivers/usb/misc/rio500.c                  | 1 -
 drivers/usb/misc/sisusbvga/sisusb.c        | 1 -
 drivers/usb/misc/usblcd.c                  | 1 -
 drivers/usb/misc/usbled.c                  | 1 -
 drivers/usb/misc/usbtest.c                 | 1 -
 drivers/usb/misc/uss720.c                  | 1 -
 drivers/usb/net/asix.c                     | 1 -
 drivers/usb/net/catc.c                     | 1 -
 drivers/usb/net/cdc_ether.c                | 1 -
 drivers/usb/net/cdc_subset.c               | 1 -
 drivers/usb/net/gl620a.c                   | 1 -
 drivers/usb/net/kaweth.c                   | 1 -
 drivers/usb/net/net1080.c                  | 1 -
 drivers/usb/net/plusb.c                    | 1 -
 drivers/usb/net/rndis_host.c               | 1 -
 drivers/usb/net/rtl8150.c                  | 1 -
 drivers/usb/net/zaurus.c                   | 1 -
 drivers/usb/net/zd1201.c                   | 1 -
 drivers/usb/serial/airprime.c              | 1 -
 drivers/usb/serial/anydata.c               | 1 -
 drivers/usb/serial/belkin_sa.c             | 1 -
 drivers/usb/serial/cp2101.c                | 1 -
 drivers/usb/serial/cyberjack.c             | 1 -
 drivers/usb/serial/digi_acceleport.c       | 1 -
 drivers/usb/serial/empeg.c                 | 1 -
 drivers/usb/serial/garmin_gps.c            | 1 -
 drivers/usb/serial/generic.c               | 1 -
 drivers/usb/serial/hp4x.c                  | 1 -
 drivers/usb/serial/io_edgeport.c           | 1 -
 drivers/usb/serial/io_ti.c                 | 1 -
 drivers/usb/serial/ipaq.c                  | 1 -
 drivers/usb/serial/ipw.c                   | 1 -
 drivers/usb/serial/ir-usb.c                | 1 -
 drivers/usb/serial/keyspan.h               | 1 -
 drivers/usb/serial/keyspan_pda.c           | 1 -
 drivers/usb/serial/kl5kusb105.c            | 1 -
 drivers/usb/serial/kobil_sct.c             | 1 -
 drivers/usb/serial/mct_u232.c              | 1 -
 drivers/usb/serial/omninet.c               | 1 -
 drivers/usb/serial/option.c                | 1 -
 drivers/usb/serial/pl2303.c                | 1 -
 drivers/usb/serial/safe_serial.c           | 1 -
 drivers/usb/serial/ti_usb_3410_5052.c      | 1 -
 drivers/usb/serial/usb-serial.c            | 1 -
 drivers/usb/serial/visor.c                 | 1 -
 drivers/usb/serial/whiteheat.c             | 1 -
 drivers/usb/storage/libusual.c             | 1 -
 drivers/usb/storage/usb.c                  | 1 -
 drivers/usb/usb-skeleton.c                 | 1 -
 drivers/w1/dscore.c                        | 1 -
 include/linux/usb.h                        | 4 ----
 sound/usb/usbaudio.c                       | 1 -
 sound/usb/usx2y/usbusx2y.c                 | 1 -
 125 files changed, 128 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 06d741d58a68..c7a28f5be42f 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -2460,7 +2460,6 @@ static void ub_disconnect(struct usb_interface *intf)
 }
 
 static struct usb_driver ub_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"ub",
 	.probe =	ub_probe,
 	.disconnect =	ub_disconnect,
diff --git a/drivers/bluetooth/bcm203x.c b/drivers/bluetooth/bcm203x.c
index 8e7fb3551775..3e7a067cc087 100644
--- a/drivers/bluetooth/bcm203x.c
+++ b/drivers/bluetooth/bcm203x.c
@@ -275,7 +275,6 @@ static void bcm203x_disconnect(struct usb_interface *intf)
 }
 
 static struct usb_driver bcm203x_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "bcm203x",
 	.probe		= bcm203x_probe,
 	.disconnect	= bcm203x_disconnect,
diff --git a/drivers/bluetooth/bfusb.c b/drivers/bluetooth/bfusb.c
index 067e27893e4a..8947c8837dac 100644
--- a/drivers/bluetooth/bfusb.c
+++ b/drivers/bluetooth/bfusb.c
@@ -768,7 +768,6 @@ static void bfusb_disconnect(struct usb_interface *intf)
 }
 
 static struct usb_driver bfusb_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "bfusb",
 	.probe		= bfusb_probe,
 	.disconnect	= bfusb_disconnect,
diff --git a/drivers/bluetooth/bpa10x.c b/drivers/bluetooth/bpa10x.c
index 394796315adc..9446960ac742 100644
--- a/drivers/bluetooth/bpa10x.c
+++ b/drivers/bluetooth/bpa10x.c
@@ -619,7 +619,6 @@ static void bpa10x_disconnect(struct usb_interface *intf)
 }
 
 static struct usb_driver bpa10x_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "bpa10x",
 	.probe		= bpa10x_probe,
 	.disconnect	= bpa10x_disconnect,
diff --git a/drivers/bluetooth/hci_usb.c b/drivers/bluetooth/hci_usb.c
index 057cb2b6e6d1..92382e823285 100644
--- a/drivers/bluetooth/hci_usb.c
+++ b/drivers/bluetooth/hci_usb.c
@@ -1044,7 +1044,6 @@ static void hci_usb_disconnect(struct usb_interface *intf)
 }
 
 static struct usb_driver hci_usb_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "hci_usb",
 	.probe		= hci_usb_probe,
 	.disconnect	= hci_usb_disconnect,
diff --git a/drivers/char/watchdog/pcwd_usb.c b/drivers/char/watchdog/pcwd_usb.c
index 092e9b133750..1533f56baa42 100644
--- a/drivers/char/watchdog/pcwd_usb.c
+++ b/drivers/char/watchdog/pcwd_usb.c
@@ -151,7 +151,6 @@ static void usb_pcwd_disconnect	(struct usb_interface *interface);
 
 /* usb specific object needed to register this driver with the usb subsystem */
 static struct usb_driver usb_pcwd_driver = {
-	.owner =	THIS_MODULE,
 	.name =		DRIVER_NAME,
 	.probe =	usb_pcwd_probe,
 	.disconnect =	usb_pcwd_disconnect,
diff --git a/drivers/input/joystick/iforce/iforce-usb.c b/drivers/input/joystick/iforce/iforce-usb.c
index 64b4a3080985..bc2fce60f9f8 100644
--- a/drivers/input/joystick/iforce/iforce-usb.c
+++ b/drivers/input/joystick/iforce/iforce-usb.c
@@ -235,7 +235,6 @@ static struct usb_device_id iforce_usb_ids [] = {
 MODULE_DEVICE_TABLE (usb, iforce_usb_ids);
 
 struct usb_driver iforce_usb_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"iforce",
 	.probe =	iforce_usb_probe,
 	.disconnect =	iforce_usb_disconnect,
diff --git a/drivers/isdn/hisax/hfc_usb.c b/drivers/isdn/hisax/hfc_usb.c
index f8457ef48826..ca5b4a3b683e 100644
--- a/drivers/isdn/hisax/hfc_usb.c
+++ b/drivers/isdn/hisax/hfc_usb.c
@@ -1715,7 +1715,6 @@ hfc_usb_disconnect(struct usb_interface
 /* our driver information structure */
 /************************************/
 static struct usb_driver hfc_drv = {
-	.owner = THIS_MODULE,
 	.name  = "hfc_usb",
 	.id_table = hfcusb_idtab,
 	.probe = hfc_usb_probe,
diff --git a/drivers/isdn/hisax/st5481_init.c b/drivers/isdn/hisax/st5481_init.c
index 8e192a3a3490..99cb0f3d59a1 100644
--- a/drivers/isdn/hisax/st5481_init.c
+++ b/drivers/isdn/hisax/st5481_init.c
@@ -180,7 +180,6 @@ static struct usb_device_id st5481_ids[] = {
 MODULE_DEVICE_TABLE (usb, st5481_ids);
 
 static struct usb_driver st5481_usb_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"st5481_usb",
 	.probe =	probe_st5481,
 	.disconnect =	disconnect_st5481,
diff --git a/drivers/media/dvb/b2c2/flexcop-usb.c b/drivers/media/dvb/b2c2/flexcop-usb.c
index 0a78ba3737a5..a6c91db40ad6 100644
--- a/drivers/media/dvb/b2c2/flexcop-usb.c
+++ b/drivers/media/dvb/b2c2/flexcop-usb.c
@@ -544,7 +544,6 @@ static struct usb_device_id flexcop_usb_table [] = {
 
 /* usb specific object needed to register this driver with the usb subsystem */
 static struct usb_driver flexcop_usb_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "b2c2_flexcop_usb",
 	.probe		= flexcop_usb_probe,
 	.disconnect = flexcop_usb_disconnect,
diff --git a/drivers/media/dvb/cinergyT2/cinergyT2.c b/drivers/media/dvb/cinergyT2/cinergyT2.c
index 336fc284fa52..b996fb59b7e4 100644
--- a/drivers/media/dvb/cinergyT2/cinergyT2.c
+++ b/drivers/media/dvb/cinergyT2/cinergyT2.c
@@ -986,7 +986,6 @@ static const struct usb_device_id cinergyt2_table [] __devinitdata = {
 MODULE_DEVICE_TABLE(usb, cinergyt2_table);
 
 static struct usb_driver cinergyt2_driver = {
-	.owner	= THIS_MODULE,
 	.name	= "cinergyT2",
 	.probe	= cinergyt2_probe,
 	.disconnect	= cinergyt2_disconnect,
diff --git a/drivers/media/dvb/dvb-usb/a800.c b/drivers/media/dvb/dvb-usb/a800.c
index 8c7beffb045f..ce44aa6bbb83 100644
--- a/drivers/media/dvb/dvb-usb/a800.c
+++ b/drivers/media/dvb/dvb-usb/a800.c
@@ -144,7 +144,6 @@ static struct dvb_usb_properties a800_properties = {
 };
 
 static struct usb_driver a800_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "dvb_usb_a800",
 	.probe		= a800_probe,
 	.disconnect = dvb_usb_device_exit,
diff --git a/drivers/media/dvb/dvb-usb/cxusb.c b/drivers/media/dvb/dvb-usb/cxusb.c
index 3fe383f4bb4c..d05fab01cccd 100644
--- a/drivers/media/dvb/dvb-usb/cxusb.c
+++ b/drivers/media/dvb/dvb-usb/cxusb.c
@@ -241,7 +241,6 @@ static struct dvb_usb_properties cxusb_properties = {
 };
 
 static struct usb_driver cxusb_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "dvb_usb_cxusb",
 	.probe		= cxusb_probe,
 	.disconnect = dvb_usb_device_exit,
diff --git a/drivers/media/dvb/dvb-usb/dibusb-mb.c b/drivers/media/dvb/dvb-usb/dibusb-mb.c
index aa271a2496d5..52ac3e5adf5d 100644
--- a/drivers/media/dvb/dvb-usb/dibusb-mb.c
+++ b/drivers/media/dvb/dvb-usb/dibusb-mb.c
@@ -373,7 +373,6 @@ static struct dvb_usb_properties artec_t1_usb2_properties = {
 };
 
 static struct usb_driver dibusb_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "dvb_usb_dibusb_mb",
 	.probe		= dibusb_probe,
 	.disconnect = dvb_usb_device_exit,
diff --git a/drivers/media/dvb/dvb-usb/dibusb-mc.c b/drivers/media/dvb/dvb-usb/dibusb-mc.c
index 6a0912eab396..55802fba3c29 100644
--- a/drivers/media/dvb/dvb-usb/dibusb-mc.c
+++ b/drivers/media/dvb/dvb-usb/dibusb-mc.c
@@ -82,7 +82,6 @@ static struct dvb_usb_properties dibusb_mc_properties = {
 };
 
 static struct usb_driver dibusb_mc_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "dvb_usb_dibusb_mc",
 	.probe		= dibusb_mc_probe,
 	.disconnect = dvb_usb_device_exit,
diff --git a/drivers/media/dvb/dvb-usb/digitv.c b/drivers/media/dvb/dvb-usb/digitv.c
index f98e306a5759..450417a9e64b 100644
--- a/drivers/media/dvb/dvb-usb/digitv.c
+++ b/drivers/media/dvb/dvb-usb/digitv.c
@@ -233,7 +233,6 @@ static struct dvb_usb_properties digitv_properties = {
 };
 
 static struct usb_driver digitv_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "dvb_usb_digitv",
 	.probe		= digitv_probe,
 	.disconnect = dvb_usb_device_exit,
diff --git a/drivers/media/dvb/dvb-usb/dtt200u.c b/drivers/media/dvb/dvb-usb/dtt200u.c
index b595476332cd..6e2bac873445 100644
--- a/drivers/media/dvb/dvb-usb/dtt200u.c
+++ b/drivers/media/dvb/dvb-usb/dtt200u.c
@@ -198,7 +198,6 @@ static struct dvb_usb_properties wt220u_properties = {
 
 /* usb specific object needed to register this driver with the usb subsystem */
 static struct usb_driver dtt200u_usb_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "dvb_usb_dtt200u",
 	.probe		= dtt200u_usb_probe,
 	.disconnect = dvb_usb_device_exit,
diff --git a/drivers/media/dvb/dvb-usb/nova-t-usb2.c b/drivers/media/dvb/dvb-usb/nova-t-usb2.c
index 1841a66427bf..fac48fc7a4ac 100644
--- a/drivers/media/dvb/dvb-usb/nova-t-usb2.c
+++ b/drivers/media/dvb/dvb-usb/nova-t-usb2.c
@@ -202,7 +202,6 @@ static struct dvb_usb_properties nova_t_properties = {
 };
 
 static struct usb_driver nova_t_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "dvb_usb_nova_t_usb2",
 	.probe		= nova_t_probe,
 	.disconnect = dvb_usb_device_exit,
diff --git a/drivers/media/dvb/dvb-usb/umt-010.c b/drivers/media/dvb/dvb-usb/umt-010.c
index 6fd67657c269..14f1911c79bb 100644
--- a/drivers/media/dvb/dvb-usb/umt-010.c
+++ b/drivers/media/dvb/dvb-usb/umt-010.c
@@ -128,7 +128,6 @@ static struct dvb_usb_properties umt_properties = {
 };
 
 static struct usb_driver umt_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "dvb_usb_umt_010",
 	.probe		= umt_probe,
 	.disconnect = dvb_usb_device_exit,
diff --git a/drivers/media/dvb/dvb-usb/vp702x.c b/drivers/media/dvb/dvb-usb/vp702x.c
index de13c04e8e64..afa00fdb5ec0 100644
--- a/drivers/media/dvb/dvb-usb/vp702x.c
+++ b/drivers/media/dvb/dvb-usb/vp702x.c
@@ -256,7 +256,6 @@ static struct dvb_usb_properties vp702x_properties = {
 
 /* usb specific object needed to register this driver with the usb subsystem */
 static struct usb_driver vp702x_usb_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "dvb-usb-vp702x",
 	.probe 		= vp702x_usb_probe,
 	.disconnect = dvb_usb_device_exit,
diff --git a/drivers/media/dvb/dvb-usb/vp7045.c b/drivers/media/dvb/dvb-usb/vp7045.c
index 75765e3a569c..3835235b68df 100644
--- a/drivers/media/dvb/dvb-usb/vp7045.c
+++ b/drivers/media/dvb/dvb-usb/vp7045.c
@@ -253,7 +253,6 @@ static struct dvb_usb_properties vp7045_properties = {
 
 /* usb specific object needed to register this driver with the usb subsystem */
 static struct usb_driver vp7045_usb_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "dvb_usb_vp7045",
 	.probe		= vp7045_usb_probe,
 	.disconnect = dvb_usb_device_exit,
diff --git a/drivers/media/video/cpia_usb.c b/drivers/media/video/cpia_usb.c
index 9774e94d1e7d..1439cb752874 100644
--- a/drivers/media/video/cpia_usb.c
+++ b/drivers/media/video/cpia_usb.c
@@ -582,7 +582,6 @@ MODULE_LICENSE("GPL");
 
 
 static struct usb_driver cpia_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "cpia",
 	.probe		= cpia_probe,
 	.disconnect	= cpia_disconnect,
diff --git a/drivers/media/video/em28xx/em28xx-video.c b/drivers/media/video/em28xx/em28xx-video.c
index 06d76879bde2..3a56120397ae 100644
--- a/drivers/media/video/em28xx/em28xx-video.c
+++ b/drivers/media/video/em28xx/em28xx-video.c
@@ -1884,7 +1884,6 @@ static void em28xx_usb_disconnect(struct usb_interface *interface)
 }
 
 static struct usb_driver em28xx_usb_driver = {
-	.owner = THIS_MODULE,
 	.name = "em28xx",
 	.probe = em28xx_usb_probe,
 	.disconnect = em28xx_usb_disconnect,
diff --git a/drivers/net/irda/irda-usb.c b/drivers/net/irda/irda-usb.c
index c22c0517883c..fa176ffb4ad5 100644
--- a/drivers/net/irda/irda-usb.c
+++ b/drivers/net/irda/irda-usb.c
@@ -1539,7 +1539,6 @@ static void irda_usb_disconnect(struct usb_interface *intf)
  * USB device callbacks
  */
 static struct usb_driver irda_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "irda-usb",
 	.probe		= irda_usb_probe,
 	.disconnect	= irda_usb_disconnect,
diff --git a/drivers/net/irda/stir4200.c b/drivers/net/irda/stir4200.c
index 3961a754e920..31867e4b891b 100644
--- a/drivers/net/irda/stir4200.c
+++ b/drivers/net/irda/stir4200.c
@@ -1152,7 +1152,6 @@ static int stir_resume(struct usb_interface *intf)
  * USB device callbacks
  */
 static struct usb_driver irda_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "stir4200",
 	.probe		= stir_probe,
 	.disconnect	= stir_disconnect,
diff --git a/drivers/usb/atm/cxacru.c b/drivers/usb/atm/cxacru.c
index 9d59dc62e6d2..af0a41e7870e 100644
--- a/drivers/usb/atm/cxacru.c
+++ b/drivers/usb/atm/cxacru.c
@@ -853,7 +853,6 @@ static int cxacru_usb_probe(struct usb_interface *intf, const struct usb_device_
 }
 
 static struct usb_driver cxacru_usb_driver = {
-	.owner		= THIS_MODULE,
 	.name		= cxacru_driver_name,
 	.probe		= cxacru_usb_probe,
 	.disconnect	= usbatm_usb_disconnect,
diff --git a/drivers/usb/atm/speedtch.c b/drivers/usb/atm/speedtch.c
index d0cbbb7f0385..b28336148658 100644
--- a/drivers/usb/atm/speedtch.c
+++ b/drivers/usb/atm/speedtch.c
@@ -659,7 +659,6 @@ MODULE_DEVICE_TABLE(usb, speedtch_usb_ids);
 static int speedtch_usb_probe(struct usb_interface *, const struct usb_device_id *);
 
 static struct usb_driver speedtch_usb_driver = {
-	.owner		= THIS_MODULE,
 	.name		= speedtch_driver_name,
 	.probe		= speedtch_usb_probe,
 	.disconnect	= usbatm_usb_disconnect,
diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c
index be08e16df09f..7d2a679989ed 100644
--- a/drivers/usb/atm/ueagle-atm.c
+++ b/drivers/usb/atm/ueagle-atm.c
@@ -1776,7 +1776,6 @@ static const struct usb_device_id uea_ids[] = {
  * USB driver descriptor
  */
 static struct usb_driver uea_driver = {
-	.owner = THIS_MODULE,
 	.name = "ueagle-atm",
 	.id_table = uea_ids,
 	.probe = uea_probe,
diff --git a/drivers/usb/atm/xusbatm.c b/drivers/usb/atm/xusbatm.c
index 7fe7fb484d10..5c76e3aaaa5e 100644
--- a/drivers/usb/atm/xusbatm.c
+++ b/drivers/usb/atm/xusbatm.c
@@ -140,7 +140,6 @@ static int xusbatm_usb_probe(struct usb_interface *intf,
 }
 
 static struct usb_driver xusbatm_usb_driver = {
-	.owner		= THIS_MODULE,
 	.name		= xusbatm_driver_name,
 	.probe		= xusbatm_usb_probe,
 	.disconnect	= usbatm_usb_disconnect,
diff --git a/drivers/usb/class/audio.c b/drivers/usb/class/audio.c
index 50858273f8d3..3ad9ee8b84a9 100644
--- a/drivers/usb/class/audio.c
+++ b/drivers/usb/class/audio.c
@@ -2732,7 +2732,6 @@ static struct usb_device_id usb_audio_ids [] = {
 MODULE_DEVICE_TABLE (usb, usb_audio_ids);
 
 static struct usb_driver usb_audio_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"audio",
 	.probe =	usb_audio_probe,
 	.disconnect =	usb_audio_disconnect,
diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 72936dc15ec9..93de121f52a8 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -1088,7 +1088,6 @@ static struct usb_device_id acm_ids[] = {
 MODULE_DEVICE_TABLE (usb, acm_ids);
 
 static struct usb_driver acm_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"cdc_acm",
 	.probe =	acm_probe,
 	.disconnect =	acm_disconnect,
diff --git a/drivers/usb/class/usb-midi.c b/drivers/usb/class/usb-midi.c
index 5f8af35e7633..f13f004d311f 100644
--- a/drivers/usb/class/usb-midi.c
+++ b/drivers/usb/class/usb-midi.c
@@ -2027,7 +2027,6 @@ static struct usb_device_id id_table[] = {
 };
 
 static struct usb_driver usb_midi_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"midi",
 	.probe =	usb_midi_probe,
 	.disconnect =	usb_midi_disconnect,
diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c
index 357e75335f17..10406b857ac7 100644
--- a/drivers/usb/class/usblp.c
+++ b/drivers/usb/class/usblp.c
@@ -1186,7 +1186,6 @@ static struct usb_device_id usblp_ids [] = {
 MODULE_DEVICE_TABLE (usb, usblp_ids);
 
 static struct usb_driver usblp_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"usblp",
 	.probe =	usblp_probe,
 	.disconnect =	usblp_disconnect,
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index b1d6e9af732d..3a73170e95dd 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -402,7 +402,6 @@ static void driver_disconnect(struct usb_interface *intf)
 }
 
 struct usb_driver usbfs_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"usbfs",
 	.probe =	driver_probe,
 	.disconnect =	driver_disconnect,
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 5faf7edd73cb..40c6c50c6bd9 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -2865,7 +2865,6 @@ static struct usb_device_id hub_id_table [] = {
 MODULE_DEVICE_TABLE (usb, hub_id_table);
 
 static struct usb_driver hub_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"hub",
 	.probe =	hub_probe,
 	.disconnect =	hub_disconnect,
diff --git a/drivers/usb/image/mdc800.c b/drivers/usb/image/mdc800.c
index 1d973bcf56aa..049871145d63 100644
--- a/drivers/usb/image/mdc800.c
+++ b/drivers/usb/image/mdc800.c
@@ -962,7 +962,6 @@ MODULE_DEVICE_TABLE (usb, mdc800_table);
  */
 static struct usb_driver mdc800_usb_driver =
 {
-	.owner =	THIS_MODULE,
 	.name =		"mdc800",
 	.probe =	mdc800_usb_probe,
 	.disconnect =	mdc800_usb_disconnect,
diff --git a/drivers/usb/image/microtek.c b/drivers/usb/image/microtek.c
index 950543aa5ac7..458f2acdeb0a 100644
--- a/drivers/usb/image/microtek.c
+++ b/drivers/usb/image/microtek.c
@@ -160,7 +160,6 @@ static void mts_usb_disconnect(struct usb_interface *intf);
 static struct usb_device_id mts_usb_ids [];
 
 static struct usb_driver mts_usb_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"microtekX6",
 	.probe =	mts_usb_probe,
 	.disconnect =	mts_usb_disconnect,
diff --git a/drivers/usb/input/acecad.c b/drivers/usb/input/acecad.c
index a32558b4048e..df29b8078b54 100644
--- a/drivers/usb/input/acecad.c
+++ b/drivers/usb/input/acecad.c
@@ -261,7 +261,6 @@ static struct usb_device_id usb_acecad_id_table [] = {
 MODULE_DEVICE_TABLE(usb, usb_acecad_id_table);
 
 static struct usb_driver usb_acecad_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"usb_acecad",
 	.probe =	usb_acecad_probe,
 	.disconnect =	usb_acecad_disconnect,
diff --git a/drivers/usb/input/aiptek.c b/drivers/usb/input/aiptek.c
index 0e2505c073db..356284c746a0 100644
--- a/drivers/usb/input/aiptek.c
+++ b/drivers/usb/input/aiptek.c
@@ -2190,7 +2190,6 @@ fail1:	input_free_device(inputdev);
 static void aiptek_disconnect(struct usb_interface *intf);
 
 static struct usb_driver aiptek_driver = {
-	.owner = THIS_MODULE,
 	.name = "aiptek",
 	.probe = aiptek_probe,
 	.disconnect = aiptek_disconnect,
diff --git a/drivers/usb/input/appletouch.c b/drivers/usb/input/appletouch.c
index 15840db092a5..1949b54f41f2 100644
--- a/drivers/usb/input/appletouch.c
+++ b/drivers/usb/input/appletouch.c
@@ -452,7 +452,6 @@ static int atp_resume(struct usb_interface *iface)
 }
 
 static struct usb_driver atp_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "appletouch",
 	.probe		= atp_probe,
 	.disconnect	= atp_disconnect,
diff --git a/drivers/usb/input/ati_remote.c b/drivers/usb/input/ati_remote.c
index 9a2a47db9494..8948e5c3941f 100644
--- a/drivers/usb/input/ati_remote.c
+++ b/drivers/usb/input/ati_remote.c
@@ -295,7 +295,6 @@ static void ati_remote_disconnect	(struct usb_interface *interface);
 
 /* usb specific object to register with the usb subsystem */
 static struct usb_driver ati_remote_driver = {
-	.owner        = THIS_MODULE,
 	.name         = "ati_remote",
 	.probe        = ati_remote_probe,
 	.disconnect   = ati_remote_disconnect,
diff --git a/drivers/usb/input/hid-core.c b/drivers/usb/input/hid-core.c
index a3e44ef1df43..256d7325d4a5 100644
--- a/drivers/usb/input/hid-core.c
+++ b/drivers/usb/input/hid-core.c
@@ -1930,7 +1930,6 @@ static struct usb_device_id hid_usb_ids [] = {
 MODULE_DEVICE_TABLE (usb, hid_usb_ids);
 
 static struct usb_driver hid_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"usbhid",
 	.probe =	hid_probe,
 	.disconnect =	hid_disconnect,
diff --git a/drivers/usb/input/hiddev.c b/drivers/usb/input/hiddev.c
index 440377c7a0da..4dff8473553d 100644
--- a/drivers/usb/input/hiddev.c
+++ b/drivers/usb/input/hiddev.c
@@ -826,7 +826,6 @@ static int hiddev_usbd_probe(struct usb_interface *intf,
 
 
 static /* const */ struct usb_driver hiddev_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"hiddev",
 	.probe =	hiddev_usbd_probe,
 };
diff --git a/drivers/usb/input/itmtouch.c b/drivers/usb/input/itmtouch.c
index 4a50acb39d29..7618ae5c104f 100644
--- a/drivers/usb/input/itmtouch.c
+++ b/drivers/usb/input/itmtouch.c
@@ -250,7 +250,6 @@ static void itmtouch_disconnect(struct usb_interface *intf)
 MODULE_DEVICE_TABLE(usb, itmtouch_ids);
 
 static struct usb_driver itmtouch_driver = {
-	.owner =        THIS_MODULE,
 	.name =         "itmtouch",
 	.probe =        itmtouch_probe,
 	.disconnect =   itmtouch_disconnect,
diff --git a/drivers/usb/input/kbtab.c b/drivers/usb/input/kbtab.c
index fd48e74e78ed..f6d5cead542b 100644
--- a/drivers/usb/input/kbtab.c
+++ b/drivers/usb/input/kbtab.c
@@ -197,7 +197,6 @@ static void kbtab_disconnect(struct usb_interface *intf)
 }
 
 static struct usb_driver kbtab_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"kbtab",
 	.probe =	kbtab_probe,
 	.disconnect =	kbtab_disconnect,
diff --git a/drivers/usb/input/keyspan_remote.c b/drivers/usb/input/keyspan_remote.c
index a32cfe51b77d..5ae5201dbf5a 100644
--- a/drivers/usb/input/keyspan_remote.c
+++ b/drivers/usb/input/keyspan_remote.c
@@ -559,7 +559,6 @@ static void keyspan_disconnect(struct usb_interface *interface)
  */
 static struct usb_driver keyspan_driver =
 {
-	.owner =	THIS_MODULE,
 	.name =		"keyspan_remote",
 	.probe =	keyspan_probe,
 	.disconnect =	keyspan_disconnect,
diff --git a/drivers/usb/input/mtouchusb.c b/drivers/usb/input/mtouchusb.c
index 52cc18cd247d..f018953a5485 100644
--- a/drivers/usb/input/mtouchusb.c
+++ b/drivers/usb/input/mtouchusb.c
@@ -310,7 +310,6 @@ static void mtouchusb_disconnect(struct usb_interface *intf)
 MODULE_DEVICE_TABLE(usb, mtouchusb_devices);
 
 static struct usb_driver mtouchusb_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "mtouchusb",
 	.probe		= mtouchusb_probe,
 	.disconnect	= mtouchusb_disconnect,
diff --git a/drivers/usb/input/powermate.c b/drivers/usb/input/powermate.c
index b7476233ef5d..fdf0f788062c 100644
--- a/drivers/usb/input/powermate.c
+++ b/drivers/usb/input/powermate.c
@@ -441,7 +441,6 @@ static struct usb_device_id powermate_devices [] = {
 MODULE_DEVICE_TABLE (usb, powermate_devices);
 
 static struct usb_driver powermate_driver = {
-	.owner =	THIS_MODULE,
         .name =         "powermate",
         .probe =        powermate_probe,
         .disconnect =   powermate_disconnect,
diff --git a/drivers/usb/input/touchkitusb.c b/drivers/usb/input/touchkitusb.c
index 7420c6b84284..75e7c12e7189 100644
--- a/drivers/usb/input/touchkitusb.c
+++ b/drivers/usb/input/touchkitusb.c
@@ -267,7 +267,6 @@ static void touchkit_disconnect(struct usb_interface *intf)
 MODULE_DEVICE_TABLE(usb, touchkit_devices);
 
 static struct usb_driver touchkit_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "touchkitusb",
 	.probe		= touchkit_probe,
 	.disconnect	= touchkit_disconnect,
diff --git a/drivers/usb/input/usbkbd.c b/drivers/usb/input/usbkbd.c
index 226b6f90a907..2f3edc26cb50 100644
--- a/drivers/usb/input/usbkbd.c
+++ b/drivers/usb/input/usbkbd.c
@@ -345,7 +345,6 @@ static struct usb_device_id usb_kbd_id_table [] = {
 MODULE_DEVICE_TABLE (usb, usb_kbd_id_table);
 
 static struct usb_driver usb_kbd_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"usbkbd",
 	.probe =	usb_kbd_probe,
 	.disconnect =	usb_kbd_disconnect,
diff --git a/drivers/usb/input/usbmouse.c b/drivers/usb/input/usbmouse.c
index 230f6b1b314a..af526135d210 100644
--- a/drivers/usb/input/usbmouse.c
+++ b/drivers/usb/input/usbmouse.c
@@ -226,7 +226,6 @@ static struct usb_device_id usb_mouse_id_table [] = {
 MODULE_DEVICE_TABLE (usb, usb_mouse_id_table);
 
 static struct usb_driver usb_mouse_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "usbmouse",
 	.probe		= usb_mouse_probe,
 	.disconnect	= usb_mouse_disconnect,
diff --git a/drivers/usb/input/wacom.c b/drivers/usb/input/wacom.c
index dc099bbe12bf..48df4cfd5a42 100644
--- a/drivers/usb/input/wacom.c
+++ b/drivers/usb/input/wacom.c
@@ -945,7 +945,6 @@ static void wacom_disconnect(struct usb_interface *intf)
 }
 
 static struct usb_driver wacom_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"wacom",
 	.probe =	wacom_probe,
 	.disconnect =	wacom_disconnect,
diff --git a/drivers/usb/input/xpad.c b/drivers/usb/input/xpad.c
index 43112f040b6d..e421328615fb 100644
--- a/drivers/usb/input/xpad.c
+++ b/drivers/usb/input/xpad.c
@@ -316,7 +316,6 @@ static void xpad_disconnect(struct usb_interface *intf)
 }
 
 static struct usb_driver xpad_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "xpad",
 	.probe		= xpad_probe,
 	.disconnect	= xpad_disconnect,
diff --git a/drivers/usb/input/yealink.c b/drivers/usb/input/yealink.c
index f526aebea502..1bfc105ad4d6 100644
--- a/drivers/usb/input/yealink.c
+++ b/drivers/usb/input/yealink.c
@@ -987,7 +987,6 @@ static int usb_probe(struct usb_interface *intf, const struct usb_device_id *id)
 }
 
 static struct usb_driver yealink_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "yealink",
 	.probe		= usb_probe,
 	.disconnect	= usb_disconnect,
diff --git a/drivers/usb/media/dabusb.c b/drivers/usb/media/dabusb.c
index 27b23c55bbc7..18d8eaf408d5 100644
--- a/drivers/usb/media/dabusb.c
+++ b/drivers/usb/media/dabusb.c
@@ -812,7 +812,6 @@ static struct usb_device_id dabusb_ids [] = {
 MODULE_DEVICE_TABLE (usb, dabusb_ids);
 
 static struct usb_driver dabusb_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"dabusb",
 	.probe =	dabusb_probe,
 	.disconnect =	dabusb_disconnect,
diff --git a/drivers/usb/media/dsbr100.c b/drivers/usb/media/dsbr100.c
index 7503f5b96f59..6a5700e9d428 100644
--- a/drivers/usb/media/dsbr100.c
+++ b/drivers/usb/media/dsbr100.c
@@ -150,7 +150,6 @@ MODULE_DEVICE_TABLE (usb, usb_dsbr100_device_table);
 
 /* USB subsystem interface */
 static struct usb_driver usb_dsbr100_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"dsbr100",
 	.probe =	usb_dsbr100_probe,
 	.disconnect =	usb_dsbr100_disconnect,
diff --git a/drivers/usb/media/ov511.c b/drivers/usb/media/ov511.c
index 036c485d1d1e..8df4f9de5ee5 100644
--- a/drivers/usb/media/ov511.c
+++ b/drivers/usb/media/ov511.c
@@ -6008,7 +6008,6 @@ ov51x_disconnect(struct usb_interface *intf)
 }
 
 static struct usb_driver ov511_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"ov511",
 	.id_table =	device_table,
 	.probe =	ov51x_probe,
diff --git a/drivers/usb/media/pwc/pwc-if.c b/drivers/usb/media/pwc/pwc-if.c
index 5524fd70210b..09ca6128ac20 100644
--- a/drivers/usb/media/pwc/pwc-if.c
+++ b/drivers/usb/media/pwc/pwc-if.c
@@ -111,7 +111,6 @@ static int usb_pwc_probe(struct usb_interface *intf, const struct usb_device_id
 static void usb_pwc_disconnect(struct usb_interface *intf);
 
 static struct usb_driver pwc_driver = {
-	.owner =		THIS_MODULE,
 	.name =			"Philips webcam",	/* name */
 	.id_table =		pwc_device_table,
 	.probe =		usb_pwc_probe,		/* probe() */
diff --git a/drivers/usb/media/se401.c b/drivers/usb/media/se401.c
index f69e443cd1bc..b2ae29af5940 100644
--- a/drivers/usb/media/se401.c
+++ b/drivers/usb/media/se401.c
@@ -1401,7 +1401,6 @@ static void se401_disconnect(struct usb_interface *intf)
 }
 
 static struct usb_driver se401_driver = {
-	.owner		= THIS_MODULE,
         .name		= "se401",
         .id_table	= device_table,
 	.probe		= se401_probe,
diff --git a/drivers/usb/media/sn9c102_core.c b/drivers/usb/media/sn9c102_core.c
index b2e66e3b90aa..08723459da86 100644
--- a/drivers/usb/media/sn9c102_core.c
+++ b/drivers/usb/media/sn9c102_core.c
@@ -2711,7 +2711,6 @@ static void sn9c102_usb_disconnect(struct usb_interface* intf)
 
 
 static struct usb_driver sn9c102_usb_driver = {
-	.owner =      THIS_MODULE,
 	.name =       "sn9c102",
 	.id_table =   sn9c102_id_table,
 	.probe =      sn9c102_usb_probe,
diff --git a/drivers/usb/media/stv680.c b/drivers/usb/media/stv680.c
index 0fd0fa9fec21..774038b352cd 100644
--- a/drivers/usb/media/stv680.c
+++ b/drivers/usb/media/stv680.c
@@ -1477,7 +1477,6 @@ static void stv680_disconnect (struct usb_interface *intf)
 }
 
 static struct usb_driver stv680_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"stv680",
 	.probe =	stv680_probe,
 	.disconnect =	stv680_disconnect,
diff --git a/drivers/usb/media/vicam.c b/drivers/usb/media/vicam.c
index 0bc0b1247a6b..1c73155c8d77 100644
--- a/drivers/usb/media/vicam.c
+++ b/drivers/usb/media/vicam.c
@@ -1257,7 +1257,6 @@ static struct usb_device_id vicam_table[] = {
 MODULE_DEVICE_TABLE(usb, vicam_table);
 
 static struct usb_driver vicam_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "vicam",
 	.probe		= vicam_probe,
 	.disconnect	= vicam_disconnect,
diff --git a/drivers/usb/media/w9968cf.c b/drivers/usb/media/w9968cf.c
index 67612c81cb9f..52b90d50febb 100644
--- a/drivers/usb/media/w9968cf.c
+++ b/drivers/usb/media/w9968cf.c
@@ -3668,7 +3668,6 @@ static void w9968cf_usb_disconnect(struct usb_interface* intf)
 
 
 static struct usb_driver w9968cf_usb_driver = {
-	.owner =      THIS_MODULE,
 	.name =       "w9968cf",
 	.id_table =   winbond_id_table,
 	.probe =      w9968cf_usb_probe,
diff --git a/drivers/usb/misc/auerswald.c b/drivers/usb/misc/auerswald.c
index b293db3c28c3..fad387f21891 100644
--- a/drivers/usb/misc/auerswald.c
+++ b/drivers/usb/misc/auerswald.c
@@ -2103,7 +2103,6 @@ MODULE_DEVICE_TABLE (usb, auerswald_ids);
 
 /* Standard usb driver struct */
 static struct usb_driver auerswald_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"auerswald",
 	.probe =	auerswald_probe,
 	.disconnect =	auerswald_disconnect,
diff --git a/drivers/usb/misc/cytherm.c b/drivers/usb/misc/cytherm.c
index b33044d56a1e..6671317b495f 100644
--- a/drivers/usb/misc/cytherm.c
+++ b/drivers/usb/misc/cytherm.c
@@ -50,7 +50,6 @@ static void cytherm_disconnect(struct usb_interface *interface);
 
 /* usb specific object needed to register this driver with the usb subsystem */
 static struct usb_driver cytherm_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"cytherm",
 	.probe =	cytherm_probe,
 	.disconnect =	cytherm_disconnect,
diff --git a/drivers/usb/misc/emi26.c b/drivers/usb/misc/emi26.c
index c8155209bf4b..3824df33094e 100644
--- a/drivers/usb/misc/emi26.c
+++ b/drivers/usb/misc/emi26.c
@@ -227,7 +227,6 @@ static void emi26_disconnect(struct usb_interface *intf)
 }
 
 static struct usb_driver emi26_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "emi26 - firmware loader",
 	.probe		= emi26_probe,
 	.disconnect	= emi26_disconnect,
diff --git a/drivers/usb/misc/emi62.c b/drivers/usb/misc/emi62.c
index 189986af2ac7..52fea2e08db8 100644
--- a/drivers/usb/misc/emi62.c
+++ b/drivers/usb/misc/emi62.c
@@ -266,7 +266,6 @@ static void emi62_disconnect(struct usb_interface *intf)
 }
 
 static struct usb_driver emi62_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "emi62 - firmware loader",
 	.probe		= emi62_probe,
 	.disconnect	= emi62_disconnect,
diff --git a/drivers/usb/misc/idmouse.c b/drivers/usb/misc/idmouse.c
index 1dc3e0f73014..d8cde1017985 100644
--- a/drivers/usb/misc/idmouse.c
+++ b/drivers/usb/misc/idmouse.c
@@ -114,7 +114,6 @@ static struct usb_class_driver idmouse_class = {
 
 /* usb specific object needed to register this driver with the usb subsystem */
 static struct usb_driver idmouse_driver = {
-	.owner = THIS_MODULE,
 	.name = DRIVER_SHORT,
 	.probe = idmouse_probe,
 	.disconnect = idmouse_disconnect,
diff --git a/drivers/usb/misc/ldusb.c b/drivers/usb/misc/ldusb.c
index 7e93ac96490f..981d8a5fbfd9 100644
--- a/drivers/usb/misc/ldusb.c
+++ b/drivers/usb/misc/ldusb.c
@@ -763,7 +763,6 @@ static void ld_usb_disconnect(struct usb_interface *intf)
 
 /* usb specific object needed to register this driver with the usb subsystem */
 static struct usb_driver ld_usb_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"ldusb",
 	.probe =	ld_usb_probe,
 	.disconnect =	ld_usb_disconnect,
diff --git a/drivers/usb/misc/legousbtower.c b/drivers/usb/misc/legousbtower.c
index 2703e205bc8f..1336745b8f55 100644
--- a/drivers/usb/misc/legousbtower.c
+++ b/drivers/usb/misc/legousbtower.c
@@ -282,7 +282,6 @@ static struct usb_class_driver tower_class = {
 
 /* usb specific object needed to register this driver with the usb subsystem */
 static struct usb_driver tower_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"legousbtower",
 	.probe =	tower_probe,
 	.disconnect =	tower_disconnect,
diff --git a/drivers/usb/misc/phidgetkit.c b/drivers/usb/misc/phidgetkit.c
index 067a81486921..605a3c87e05c 100644
--- a/drivers/usb/misc/phidgetkit.c
+++ b/drivers/usb/misc/phidgetkit.c
@@ -555,7 +555,6 @@ static void interfacekit_disconnect(struct usb_interface *interface)
 }
 
 static struct usb_driver interfacekit_driver = {
-	.owner = THIS_MODULE,
 	.name = "phidgetkit",
 	.probe = interfacekit_probe,
 	.disconnect = interfacekit_disconnect,
diff --git a/drivers/usb/misc/phidgetservo.c b/drivers/usb/misc/phidgetservo.c
index a30d4a6ee824..b3418d2bcc69 100644
--- a/drivers/usb/misc/phidgetservo.c
+++ b/drivers/usb/misc/phidgetservo.c
@@ -306,7 +306,6 @@ servo_disconnect(struct usb_interface *interface)
 }
 
 static struct usb_driver servo_driver = {
-	.owner = THIS_MODULE,
 	.name = "phidgetservo",
 	.probe = servo_probe,
 	.disconnect = servo_disconnect,
diff --git a/drivers/usb/misc/rio500.c b/drivers/usb/misc/rio500.c
index 9590dbac5d9a..b9d66074b80c 100644
--- a/drivers/usb/misc/rio500.c
+++ b/drivers/usb/misc/rio500.c
@@ -522,7 +522,6 @@ static struct usb_device_id rio_table [] = {
 MODULE_DEVICE_TABLE (usb, rio_table);
 
 static struct usb_driver rio_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"rio500",
 	.probe =	probe_rio,
 	.disconnect =	disconnect_rio,
diff --git a/drivers/usb/misc/sisusbvga/sisusb.c b/drivers/usb/misc/sisusbvga/sisusb.c
index 41ef2b606751..44350d49ad0a 100644
--- a/drivers/usb/misc/sisusbvga/sisusb.c
+++ b/drivers/usb/misc/sisusbvga/sisusb.c
@@ -3489,7 +3489,6 @@ static struct usb_device_id sisusb_table [] = {
 MODULE_DEVICE_TABLE (usb, sisusb_table);
 
 static struct usb_driver sisusb_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"sisusb",
 	.probe =	sisusb_probe,
 	.disconnect =	sisusb_disconnect,
diff --git a/drivers/usb/misc/usblcd.c b/drivers/usb/misc/usblcd.c
index 85f3725334b0..cc3dae3f34e0 100644
--- a/drivers/usb/misc/usblcd.c
+++ b/drivers/usb/misc/usblcd.c
@@ -371,7 +371,6 @@ static void lcd_disconnect(struct usb_interface *interface)
 }
 
 static struct usb_driver lcd_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"usblcd",
 	.probe =	lcd_probe,
 	.disconnect =	lcd_disconnect,
diff --git a/drivers/usb/misc/usbled.c b/drivers/usb/misc/usbled.c
index 3c93921cb6b3..877b081a3a6e 100644
--- a/drivers/usb/misc/usbled.c
+++ b/drivers/usb/misc/usbled.c
@@ -148,7 +148,6 @@ static void led_disconnect(struct usb_interface *interface)
 }
 
 static struct usb_driver led_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"usbled",
 	.probe =	led_probe,
 	.disconnect =	led_disconnect,
diff --git a/drivers/usb/misc/usbtest.c b/drivers/usb/misc/usbtest.c
index 605a2afe34ed..84fa1728f052 100644
--- a/drivers/usb/misc/usbtest.c
+++ b/drivers/usb/misc/usbtest.c
@@ -2134,7 +2134,6 @@ static struct usb_device_id id_table [] = {
 MODULE_DEVICE_TABLE (usb, id_table);
 
 static struct usb_driver usbtest_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"usbtest",
 	.id_table =	id_table,
 	.probe =	usbtest_probe,
diff --git a/drivers/usb/misc/uss720.c b/drivers/usb/misc/uss720.c
index 1cabe7ed91f5..4081990b7d1a 100644
--- a/drivers/usb/misc/uss720.c
+++ b/drivers/usb/misc/uss720.c
@@ -780,7 +780,6 @@ MODULE_DEVICE_TABLE (usb, uss720_table);
 
 
 static struct usb_driver uss720_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"uss720",
 	.probe =	uss720_probe,
 	.disconnect =	uss720_disconnect,
diff --git a/drivers/usb/net/asix.c b/drivers/usb/net/asix.c
index 542120ef1fd2..2faf2f2bdcdd 100644
--- a/drivers/usb/net/asix.c
+++ b/drivers/usb/net/asix.c
@@ -918,7 +918,6 @@ static const struct usb_device_id	products [] = {
 MODULE_DEVICE_TABLE(usb, products);
 
 static struct usb_driver asix_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"asix",
 	.id_table =	products,
 	.probe =	usbnet_probe,
diff --git a/drivers/usb/net/catc.c b/drivers/usb/net/catc.c
index 37ef365a2472..be5f5e142dd0 100644
--- a/drivers/usb/net/catc.c
+++ b/drivers/usb/net/catc.c
@@ -934,7 +934,6 @@ static struct usb_device_id catc_id_table [] = {
 MODULE_DEVICE_TABLE(usb, catc_id_table);
 
 static struct usb_driver catc_driver = {
-	.owner =	THIS_MODULE,
 	.name =		driver_name,
 	.probe =	catc_probe,
 	.disconnect =	catc_disconnect,
diff --git a/drivers/usb/net/cdc_ether.c b/drivers/usb/net/cdc_ether.c
index c008c981862b..63f1f3ba8e0b 100644
--- a/drivers/usb/net/cdc_ether.c
+++ b/drivers/usb/net/cdc_ether.c
@@ -476,7 +476,6 @@ static const struct usb_device_id	products [] = {
 MODULE_DEVICE_TABLE(usb, products);
 
 static struct usb_driver cdc_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"cdc_ether",
 	.id_table =	products,
 	.probe =	usbnet_probe,
diff --git a/drivers/usb/net/cdc_subset.c b/drivers/usb/net/cdc_subset.c
index f05cfb83c82d..ec801e8bb1bb 100644
--- a/drivers/usb/net/cdc_subset.c
+++ b/drivers/usb/net/cdc_subset.c
@@ -306,7 +306,6 @@ MODULE_DEVICE_TABLE(usb, products);
 /*-------------------------------------------------------------------------*/
 
 static struct usb_driver cdc_subset_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"cdc_subset",
 	.probe =	usbnet_probe,
 	.suspend =	usbnet_suspend,
diff --git a/drivers/usb/net/gl620a.c b/drivers/usb/net/gl620a.c
index 2455e9a85674..faf1e86be687 100644
--- a/drivers/usb/net/gl620a.c
+++ b/drivers/usb/net/gl620a.c
@@ -377,7 +377,6 @@ static const struct usb_device_id	products [] = {
 MODULE_DEVICE_TABLE(usb, products);
 
 static struct usb_driver gl620a_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"gl620a",
 	.id_table =	products,
 	.probe =	usbnet_probe,
diff --git a/drivers/usb/net/kaweth.c b/drivers/usb/net/kaweth.c
index b5776518020f..def3bb8e2290 100644
--- a/drivers/usb/net/kaweth.c
+++ b/drivers/usb/net/kaweth.c
@@ -175,7 +175,6 @@ MODULE_DEVICE_TABLE (usb, usb_klsi_table);
  *     kaweth_driver
  ****************************************************************/
 static struct usb_driver kaweth_driver = {
-	.owner =	THIS_MODULE,
 	.name =		driver_name,
 	.probe =	kaweth_probe,
 	.disconnect =	kaweth_disconnect,
diff --git a/drivers/usb/net/net1080.c b/drivers/usb/net/net1080.c
index b3799b1a2b0d..78e6a43b1087 100644
--- a/drivers/usb/net/net1080.c
+++ b/drivers/usb/net/net1080.c
@@ -593,7 +593,6 @@ static const struct usb_device_id	products [] = {
 MODULE_DEVICE_TABLE(usb, products);
 
 static struct usb_driver net1080_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"net1080",
 	.id_table =	products,
 	.probe =	usbnet_probe,
diff --git a/drivers/usb/net/plusb.c b/drivers/usb/net/plusb.c
index 89856aa0e3b8..4fe863389cb7 100644
--- a/drivers/usb/net/plusb.c
+++ b/drivers/usb/net/plusb.c
@@ -127,7 +127,6 @@ static const struct usb_device_id	products [] = {
 MODULE_DEVICE_TABLE(usb, products);
 
 static struct usb_driver plusb_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"plusb",
 	.id_table =	products,
 	.probe =	usbnet_probe,
diff --git a/drivers/usb/net/rndis_host.c b/drivers/usb/net/rndis_host.c
index c0ecbab6f6ba..49991ac1bf3b 100644
--- a/drivers/usb/net/rndis_host.c
+++ b/drivers/usb/net/rndis_host.c
@@ -586,7 +586,6 @@ static const struct usb_device_id	products [] = {
 MODULE_DEVICE_TABLE(usb, products);
 
 static struct usb_driver rndis_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"rndis_host",
 	.id_table =	products,
 	.probe =	usbnet_probe,
diff --git a/drivers/usb/net/rtl8150.c b/drivers/usb/net/rtl8150.c
index 787dd3591d6a..8ca52be23976 100644
--- a/drivers/usb/net/rtl8150.c
+++ b/drivers/usb/net/rtl8150.c
@@ -177,7 +177,6 @@ static int rtl8150_probe(struct usb_interface *intf,
 static const char driver_name [] = "rtl8150";
 
 static struct usb_driver rtl8150_driver = {
-	.owner =	THIS_MODULE,
 	.name =		driver_name,
 	.probe =	rtl8150_probe,
 	.disconnect =	rtl8150_disconnect,
diff --git a/drivers/usb/net/zaurus.c b/drivers/usb/net/zaurus.c
index 680d13957af4..9c5ab251370c 100644
--- a/drivers/usb/net/zaurus.c
+++ b/drivers/usb/net/zaurus.c
@@ -357,7 +357,6 @@ static const struct usb_device_id	products [] = {
 MODULE_DEVICE_TABLE(usb, products);
 
 static struct usb_driver zaurus_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"zaurus",
 	.id_table =	products,
 	.probe =	usbnet_probe,
diff --git a/drivers/usb/net/zd1201.c b/drivers/usb/net/zd1201.c
index 2f52261c7cc1..4d6673adc8fc 100644
--- a/drivers/usb/net/zd1201.c
+++ b/drivers/usb/net/zd1201.c
@@ -1923,7 +1923,6 @@ static int zd1201_resume(struct usb_interface *interface)
 #endif
 
 static struct usb_driver zd1201_usb = {
-	.owner = THIS_MODULE,
 	.name = "zd1201",
 	.probe = zd1201_probe,
 	.disconnect = zd1201_disconnect,
diff --git a/drivers/usb/serial/airprime.c b/drivers/usb/serial/airprime.c
index 2ef9945a6c07..dbf1f063098c 100644
--- a/drivers/usb/serial/airprime.c
+++ b/drivers/usb/serial/airprime.c
@@ -23,7 +23,6 @@ static struct usb_device_id id_table [] = {
 MODULE_DEVICE_TABLE(usb, id_table);
 
 static struct usb_driver airprime_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"airprime",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/anydata.c b/drivers/usb/serial/anydata.c
index 7a171e034b59..343f6f228220 100644
--- a/drivers/usb/serial/anydata.c
+++ b/drivers/usb/serial/anydata.c
@@ -27,7 +27,6 @@ static int buffer_size;
 static int debug;
 
 static struct usb_driver anydata_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"anydata",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/belkin_sa.c b/drivers/usb/serial/belkin_sa.c
index 69039bd9fc5e..4144777ea18b 100644
--- a/drivers/usb/serial/belkin_sa.c
+++ b/drivers/usb/serial/belkin_sa.c
@@ -113,7 +113,6 @@ static struct usb_device_id id_table_combined [] = {
 MODULE_DEVICE_TABLE (usb, id_table_combined);
 
 static struct usb_driver belkin_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"belkin",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/cp2101.c b/drivers/usb/serial/cp2101.c
index 813bab37e076..da46b351e188 100644
--- a/drivers/usb/serial/cp2101.c
+++ b/drivers/usb/serial/cp2101.c
@@ -67,7 +67,6 @@ static struct usb_device_id id_table [] = {
 MODULE_DEVICE_TABLE (usb, id_table);
 
 static struct usb_driver cp2101_driver = {
-	.owner		= THIS_MODULE,
 	.name		= "cp2101",
 	.probe		= usb_serial_probe,
 	.disconnect	= usb_serial_disconnect,
diff --git a/drivers/usb/serial/cyberjack.c b/drivers/usb/serial/cyberjack.c
index 8c10e4004905..6d18d4eaba35 100644
--- a/drivers/usb/serial/cyberjack.c
+++ b/drivers/usb/serial/cyberjack.c
@@ -76,7 +76,6 @@ static struct usb_device_id id_table [] = {
 MODULE_DEVICE_TABLE (usb, id_table);
 
 static struct usb_driver cyberjack_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"cyberjack",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c
index c50cec95f49b..8fc414bd5b24 100644
--- a/drivers/usb/serial/digi_acceleport.c
+++ b/drivers/usb/serial/digi_acceleport.c
@@ -493,7 +493,6 @@ static struct usb_device_id id_table_4 [] = {
 MODULE_DEVICE_TABLE (usb, id_table_combined);
 
 static struct usb_driver digi_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"digi_acceleport",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/empeg.c b/drivers/usb/serial/empeg.c
index e5e40064caf2..79a766e9ca23 100644
--- a/drivers/usb/serial/empeg.c
+++ b/drivers/usb/serial/empeg.c
@@ -105,7 +105,6 @@ static struct usb_device_id id_table [] = {
 MODULE_DEVICE_TABLE (usb, id_table);
 
 static struct usb_driver empeg_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"empeg",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c
index 198a322286f9..452efce72714 100644
--- a/drivers/usb/serial/garmin_gps.c
+++ b/drivers/usb/serial/garmin_gps.c
@@ -222,7 +222,6 @@ static struct usb_device_id id_table [] = {
 MODULE_DEVICE_TABLE (usb, id_table);
 
 static struct usb_driver garmin_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"garmin_gps",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c
index c00a440dc421..4ddac620fc0c 100644
--- a/drivers/usb/serial/generic.c
+++ b/drivers/usb/serial/generic.c
@@ -68,7 +68,6 @@ static int generic_probe(struct usb_interface *interface,
 }
 
 static struct usb_driver generic_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"usbserial_generic",
 	.probe =	generic_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/hp4x.c b/drivers/usb/serial/hp4x.c
index e588c3fe632d..e9719da2aca1 100644
--- a/drivers/usb/serial/hp4x.c
+++ b/drivers/usb/serial/hp4x.c
@@ -37,7 +37,6 @@ static struct usb_device_id id_table [] = {
 MODULE_DEVICE_TABLE(usb, id_table);
 
 static struct usb_driver hp49gp_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"hp4X",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c
index 276bd425a474..4e2b599d85a6 100644
--- a/drivers/usb/serial/io_edgeport.c
+++ b/drivers/usb/serial/io_edgeport.c
@@ -242,7 +242,6 @@ static void edge_shutdown		(struct usb_serial *serial);
 #include "io_tables.h"	/* all of the devices that this driver supports */
 
 static struct usb_driver io_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"io_edgeport",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/io_ti.c b/drivers/usb/serial/io_ti.c
index 8b2e4c78abcd..22ad1a5a8f9e 100644
--- a/drivers/usb/serial/io_ti.c
+++ b/drivers/usb/serial/io_ti.c
@@ -216,7 +216,6 @@ static struct usb_device_id id_table_combined [] = {
 MODULE_DEVICE_TABLE (usb, id_table_combined);
 
 static struct usb_driver io_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"io_ti",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/ipaq.c b/drivers/usb/serial/ipaq.c
index efb568be7015..06d07cea0b70 100644
--- a/drivers/usb/serial/ipaq.c
+++ b/drivers/usb/serial/ipaq.c
@@ -542,7 +542,6 @@ static struct usb_device_id ipaq_id_table [] = {
 MODULE_DEVICE_TABLE (usb, ipaq_id_table);
 
 static struct usb_driver ipaq_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"ipaq",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/ipw.c b/drivers/usb/serial/ipw.c
index 64e2cda2a84a..2dd191f5fe76 100644
--- a/drivers/usb/serial/ipw.c
+++ b/drivers/usb/serial/ipw.c
@@ -152,7 +152,6 @@ static struct usb_device_id usb_ipw_ids[] = {
 MODULE_DEVICE_TABLE(usb, usb_ipw_ids);
 
 static struct usb_driver usb_ipw_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"ipwtty",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/ir-usb.c b/drivers/usb/serial/ir-usb.c
index 647431c1ccb1..a59010421444 100644
--- a/drivers/usb/serial/ir-usb.c
+++ b/drivers/usb/serial/ir-usb.c
@@ -125,7 +125,6 @@ static struct usb_device_id id_table [] = {
 MODULE_DEVICE_TABLE (usb, id_table);
 
 static struct usb_driver ir_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"ir-usb",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/keyspan.h b/drivers/usb/serial/keyspan.h
index 4e6f626f6062..7472ed6bf626 100644
--- a/drivers/usb/serial/keyspan.h
+++ b/drivers/usb/serial/keyspan.h
@@ -520,7 +520,6 @@ static struct usb_device_id keyspan_ids_combined[] = {
 MODULE_DEVICE_TABLE(usb, keyspan_ids_combined);
 
 static struct usb_driver keyspan_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"keyspan",                
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/keyspan_pda.c b/drivers/usb/serial/keyspan_pda.c
index 0d1f15268549..b0441c35f98f 100644
--- a/drivers/usb/serial/keyspan_pda.c
+++ b/drivers/usb/serial/keyspan_pda.c
@@ -150,7 +150,6 @@ static struct usb_device_id id_table_combined [] = {
 MODULE_DEVICE_TABLE (usb, id_table_combined);
 
 static struct usb_driver keyspan_pda_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"keyspan_pda",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/kl5kusb105.c b/drivers/usb/serial/kl5kusb105.c
index bd68638b7c35..4e2f7dfb58b2 100644
--- a/drivers/usb/serial/kl5kusb105.c
+++ b/drivers/usb/serial/kl5kusb105.c
@@ -116,7 +116,6 @@ static struct usb_device_id id_table [] = {
 MODULE_DEVICE_TABLE (usb, id_table);
 
 static struct usb_driver kl5kusb105d_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"kl5kusb105d",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/kobil_sct.c b/drivers/usb/serial/kobil_sct.c
index 4c853afea385..d9c21e275130 100644
--- a/drivers/usb/serial/kobil_sct.c
+++ b/drivers/usb/serial/kobil_sct.c
@@ -97,7 +97,6 @@ static struct usb_device_id id_table [] = {
 MODULE_DEVICE_TABLE (usb, id_table);
 
 static struct usb_driver kobil_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"kobil",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/mct_u232.c b/drivers/usb/serial/mct_u232.c
index b0415e7542c4..b6d6cab9c859 100644
--- a/drivers/usb/serial/mct_u232.c
+++ b/drivers/usb/serial/mct_u232.c
@@ -125,7 +125,6 @@ static struct usb_device_id id_table_combined [] = {
 MODULE_DEVICE_TABLE (usb, id_table_combined);
 
 static struct usb_driver mct_u232_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"mct_u232",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/omninet.c b/drivers/usb/serial/omninet.c
index b595befb24cf..762d8ff9a1e4 100644
--- a/drivers/usb/serial/omninet.c
+++ b/drivers/usb/serial/omninet.c
@@ -80,7 +80,6 @@ static struct usb_device_id id_table [] = {
 MODULE_DEVICE_TABLE (usb, id_table);
 
 static struct usb_driver omninet_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"omninet",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index 4ee657eaaa0b..3fd2405304fd 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -95,7 +95,6 @@ static struct usb_device_id option_ids[] = {
 MODULE_DEVICE_TABLE(usb, option_ids);
 
 static struct usb_driver option_driver = {
-	.owner      = THIS_MODULE,
 	.name       = "option",
 	.probe      = usb_serial_probe,
 	.disconnect = usb_serial_disconnect,
diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c
index e302a320444c..c96ba9fc19e0 100644
--- a/drivers/usb/serial/pl2303.c
+++ b/drivers/usb/serial/pl2303.c
@@ -82,7 +82,6 @@ static struct usb_device_id id_table [] = {
 MODULE_DEVICE_TABLE (usb, id_table);
 
 static struct usb_driver pl2303_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"pl2303",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/safe_serial.c b/drivers/usb/serial/safe_serial.c
index f8241c152043..3ea284ce7b8b 100644
--- a/drivers/usb/serial/safe_serial.c
+++ b/drivers/usb/serial/safe_serial.c
@@ -160,7 +160,6 @@ static struct usb_device_id id_table[] = {
 MODULE_DEVICE_TABLE (usb, id_table);
 
 static struct usb_driver safe_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"safe_serial",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c
index 17a1f09483bd..9e53ec75bcfd 100644
--- a/drivers/usb/serial/ti_usb_3410_5052.c
+++ b/drivers/usb/serial/ti_usb_3410_5052.c
@@ -248,7 +248,6 @@ static struct usb_device_id ti_id_table_combined[] = {
 };
 
 static struct usb_driver ti_usb_driver = {
-	.owner			= THIS_MODULE,
 	.name			= "ti_usb_3410_5052",
 	.probe			= usb_serial_probe,
 	.disconnect		= usb_serial_disconnect,
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index 2ac37b52485a..12aaf18ff9ea 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -42,7 +42,6 @@
 
 /* Driver structure we register with the USB core */
 static struct usb_driver usb_serial_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"usbserial",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/visor.c b/drivers/usb/serial/visor.c
index 2973f5564c06..49b1fbe61f25 100644
--- a/drivers/usb/serial/visor.c
+++ b/drivers/usb/serial/visor.c
@@ -173,7 +173,6 @@ static struct usb_device_id id_table_combined [] = {
 MODULE_DEVICE_TABLE (usb, id_table_combined);
 
 static struct usb_driver visor_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"visor",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/serial/whiteheat.c b/drivers/usb/serial/whiteheat.c
index 19c6386bb692..a7c3c4734d83 100644
--- a/drivers/usb/serial/whiteheat.c
+++ b/drivers/usb/serial/whiteheat.c
@@ -127,7 +127,6 @@ static struct usb_device_id id_table_combined [] = {
 MODULE_DEVICE_TABLE (usb, id_table_combined);
 
 static struct usb_driver whiteheat_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"whiteheat",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
diff --git a/drivers/usb/storage/libusual.c b/drivers/usb/storage/libusual.c
index 2680c69a2417..b28151d1b609 100644
--- a/drivers/usb/storage/libusual.c
+++ b/drivers/usb/storage/libusual.c
@@ -153,7 +153,6 @@ static void usu_disconnect(struct usb_interface *intf)
 }
 
 static struct usb_driver usu_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"libusual",
 	.probe =	usu_probe,
 	.disconnect =	usu_disconnect,
diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c
index c8375aa62723..484ed297bed0 100644
--- a/drivers/usb/storage/usb.c
+++ b/drivers/usb/storage/usb.c
@@ -1000,7 +1000,6 @@ static void storage_disconnect(struct usb_interface *intf)
  ***********************************************************************/
 
 static struct usb_driver usb_storage_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"usb-storage",
 	.probe =	storage_probe,
 	.disconnect =	storage_disconnect,
diff --git a/drivers/usb/usb-skeleton.c b/drivers/usb/usb-skeleton.c
index 6c3a53f8f26c..60c458ebaa2d 100644
--- a/drivers/usb/usb-skeleton.c
+++ b/drivers/usb/usb-skeleton.c
@@ -330,7 +330,6 @@ static void skel_disconnect(struct usb_interface *interface)
 }
 
 static struct usb_driver skel_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"skeleton",
 	.probe =	skel_probe,
 	.disconnect =	skel_disconnect,
diff --git a/drivers/w1/dscore.c b/drivers/w1/dscore.c
index 15fb250451e5..b9146306df49 100644
--- a/drivers/w1/dscore.c
+++ b/drivers/w1/dscore.c
@@ -52,7 +52,6 @@ static int ds_send_control_cmd(struct ds_device *, u16, u16);
 
 
 static struct usb_driver ds_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"DS9490R",
 	.probe =	ds_probe,
 	.disconnect =	ds_disconnect,
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 3d05c63451a8..2714814ab66c 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -536,8 +536,6 @@ struct usb_dynids {
 
 /**
  * struct usb_driver - identifies USB driver to usbcore
- * @owner: Pointer to the module owner of this driver; initialize
- *	it using THIS_MODULE.
  * @name: The driver name should be unique among USB drivers,
  *	and should normally be the same as the module name.
  * @probe: Called to see if the driver is willing to manage a particular
@@ -580,8 +578,6 @@ struct usb_dynids {
  * them as necessary, and blocking until the unlinks complete).
  */
 struct usb_driver {
-	struct module *owner;
-
 	const char *name;
 
 	int (*probe) (struct usb_interface *intf,
diff --git a/sound/usb/usbaudio.c b/sound/usb/usbaudio.c
index 99dae024b640..22f8bb612bff 100644
--- a/sound/usb/usbaudio.c
+++ b/sound/usb/usbaudio.c
@@ -1996,7 +1996,6 @@ static struct usb_device_id usb_audio_ids [] = {
 MODULE_DEVICE_TABLE (usb, usb_audio_ids);
 
 static struct usb_driver usb_audio_driver = {
-	.owner =	THIS_MODULE,
 	.name =		"snd-usb-audio",
 	.probe =	usb_audio_probe,
 	.disconnect =	usb_audio_disconnect,
diff --git a/sound/usb/usx2y/usbusx2y.c b/sound/usb/usx2y/usbusx2y.c
index cf77313c609d..a3967f72ab4e 100644
--- a/sound/usb/usx2y/usbusx2y.c
+++ b/sound/usb/usx2y/usbusx2y.c
@@ -409,7 +409,6 @@ static void snd_usX2Y_disconnect(struct usb_interface *intf)
 
 MODULE_DEVICE_TABLE(usb, snd_usX2Y_usb_id_table);
 static struct usb_driver snd_usX2Y_usb_driver = {
- 	.owner =	THIS_MODULE,
 	.name =		"snd-usb-usx2y",
 	.probe =	snd_usX2Y_probe,
 	.disconnect =	snd_usX2Y_disconnect,
-- 
cgit v1.2.3-71-gd317


From 9ad3d6ccf5eee285e233dbaf186369b8d477a666 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Thu, 17 Nov 2005 17:10:32 -0500
Subject: [PATCH] USB: Remove USB private semaphore

This patch (as605) removes the private udev->serialize semaphore,
relying instead on the locking provided by the embedded struct device's
semaphore.  The changes are confined to the core, except that the
usb_trylock_device routine now uses the return convention of
down_trylock rather than down_read_trylock (they return opposite values
for no good reason).

A couple of other associated changes are included as well:

	Now that we aren't concerned about HCDs that avoid using the
	hcd glue layer, usb_disconnect no longer needs to acquire the
	usb_bus_lock -- that can be done by usb_remove_hcd where it
	belongs.

	Devices aren't locked over the same scope of code in
	usb_new_device and hub_port_connect_change as they used to be.
	This shouldn't cause any trouble.

Along with the preceding driver core patch, this needs a lot of testing.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/devices.c  |   4 +-
 drivers/usb/core/devio.c    |   2 -
 drivers/usb/core/driver.c   |   4 --
 drivers/usb/core/hcd.c      |   5 +-
 drivers/usb/core/hub.c      |  48 +++++++------------
 drivers/usb/core/usb.c      | 114 ++++----------------------------------------
 drivers/usb/core/usb.h      |   3 --
 drivers/usb/host/ohci-hub.c |   2 +-
 include/linux/usb.h         |   9 ++--
 9 files changed, 37 insertions(+), 154 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/devices.c b/drivers/usb/core/devices.c
index 83e815d3cd52..55bc563a3256 100644
--- a/drivers/usb/core/devices.c
+++ b/drivers/usb/core/devices.c
@@ -545,10 +545,10 @@ static ssize_t usb_device_dump(char __user **buffer, size_t *nbytes, loff_t *ski
 		struct usb_device *childdev = usbdev->children[chix];
 
 		if (childdev) {
-			down(&childdev->serialize);
+			usb_lock_device(childdev);
 			ret = usb_device_dump(buffer, nbytes, skip_bytes, file_offset, childdev,
 					bus, level + 1, chix, ++cnt);
-			up(&childdev->serialize);
+			usb_unlock_device(childdev);
 			if (ret == -EFAULT)
 				return total_written;
 			total_written += ret;
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index 3a73170e95dd..2b68998fe4b3 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -1349,9 +1349,7 @@ static int proc_ioctl(struct dev_state *ps, struct usbdevfs_ioctl *ctl)
 	/* let kernel drivers try to (re)bind to the interface */
 	case USBDEVFS_CONNECT:
 		usb_unlock_device(ps->dev);
-		usb_lock_all_devices();
 		bus_rescan_devices(intf->dev.bus);
-		usb_unlock_all_devices();
 		usb_lock_device(ps->dev);
 		break;
 
diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index bb139f06bcd6..076462c8ba2a 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -432,9 +432,7 @@ int usb_register_driver(struct usb_driver *new_driver, struct module *owner)
 	spin_lock_init(&new_driver->dynids.lock);
 	INIT_LIST_HEAD(&new_driver->dynids.list);
 
-	usb_lock_all_devices();
 	retval = driver_register(&new_driver->driver);
-	usb_unlock_all_devices();
 
 	if (!retval) {
 		pr_info("%s: registered new driver %s\n",
@@ -465,11 +463,9 @@ void usb_deregister(struct usb_driver *driver)
 {
 	pr_info("%s: deregistering driver %s\n", usbcore_name, driver->name);
 
-	usb_lock_all_devices();
 	usb_remove_newid_file(driver);
 	usb_free_dynids(driver);
 	driver_unregister(&driver->driver);
-	usb_unlock_all_devices();
 
 	usbfs_update_special();
 }
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index da24c31ee00d..d16a0e8a7d72 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -857,9 +857,7 @@ static int register_root_hub (struct usb_device *usb_dev,
 		return (retval < 0) ? retval : -EMSGSIZE;
 	}
 
-	usb_lock_device (usb_dev);
 	retval = usb_new_device (usb_dev);
-	usb_unlock_device (usb_dev);
 	if (retval) {
 		usb_dev->bus->root_hub = NULL;
 		dev_err (parent_dev, "can't register root hub for %s, %d\n",
@@ -1891,7 +1889,10 @@ void usb_remove_hcd(struct usb_hcd *hcd)
 	spin_lock_irq (&hcd_root_hub_lock);
 	hcd->rh_registered = 0;
 	spin_unlock_irq (&hcd_root_hub_lock);
+
+	down(&usb_bus_list_lock);
 	usb_disconnect(&hcd->self.root_hub);
+	up(&usb_bus_list_lock);
 
 	hcd->poll_rh = 0;
 	del_timer_sync(&hcd->rh_timer);
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 40c6c50c6bd9..dd3bcfb2bcb6 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -32,7 +32,7 @@
 #include "hub.h"
 
 /* Protect struct usb_device->state and ->children members
- * Note: Both are also protected by ->serialize, except that ->state can
+ * Note: Both are also protected by ->dev.sem, except that ->state can
  * change to USB_STATE_NOTATTACHED even when the semaphore isn't held. */
 static DEFINE_SPINLOCK(device_state_lock);
 
@@ -975,8 +975,8 @@ static int locktree(struct usb_device *udev)
 			/* when everyone grabs locks top->bottom,
 			 * non-overlapping work may be concurrent
 			 */
-			down(&udev->serialize);
-			up(&hdev->serialize);
+			usb_lock_device(udev);
+			usb_unlock_device(hdev);
 			return t + 1;
 		}
 	}
@@ -1132,16 +1132,10 @@ void usb_disconnect(struct usb_device **pdev)
 	 * this quiesces everyting except pending urbs.
 	 */
 	usb_set_device_state(udev, USB_STATE_NOTATTACHED);
-
-	/* lock the bus list on behalf of HCDs unregistering their root hubs */
-	if (!udev->parent) {
-		down(&usb_bus_list_lock);
-		usb_lock_device(udev);
-	} else
-		down(&udev->serialize);
-
 	dev_info (&udev->dev, "USB disconnect, address %d\n", udev->devnum);
 
+	usb_lock_device(udev);
+
 	/* Free up all the children before we remove this device */
 	for (i = 0; i < USB_MAXCHILDREN; i++) {
 		if (udev->children[i])
@@ -1169,11 +1163,7 @@ void usb_disconnect(struct usb_device **pdev)
 	*pdev = NULL;
 	spin_unlock_irq(&device_state_lock);
 
-	if (!udev->parent) {
-		usb_unlock_device(udev);
-		up(&usb_bus_list_lock);
-	} else
-		up(&udev->serialize);
+	usb_unlock_device(udev);
 
 	device_unregister(&udev->dev);
 }
@@ -1243,8 +1233,8 @@ static inline void show_string(struct usb_device *udev, char *id, char *string)
  *
  * This is called with devices which have been enumerated, but not yet
  * configured.  The device descriptor is available, but not descriptors
- * for any device configuration.  The caller must have locked udev and
- * either the parent hub (if udev is a normal device) or else the
+ * for any device configuration.  The caller must have locked either
+ * the parent hub (if udev is a normal device) or else the
  * usb_bus_list_lock (if udev is a root hub).  The parent's pointer to
  * udev has already been installed, but udev is not yet visible through
  * sysfs or other filesystem code.
@@ -1254,8 +1244,7 @@ static inline void show_string(struct usb_device *udev, char *id, char *string)
  *
  * This call is synchronous, and may not be used in an interrupt context.
  *
- * Only the hub driver should ever call this; root hub registration
- * uses it indirectly.
+ * Only the hub driver or root-hub registrar should ever call this.
  */
 int usb_new_device(struct usb_device *udev)
 {
@@ -1364,6 +1353,8 @@ int usb_new_device(struct usb_device *udev)
 	}
 	usb_create_sysfs_dev_files (udev);
 
+	usb_lock_device(udev);
+
 	/* choose and set the configuration. that registers the interfaces
 	 * with the driver core, and lets usb device drivers bind to them.
 	 */
@@ -1385,6 +1376,8 @@ int usb_new_device(struct usb_device *udev)
 	/* USB device state == configured ... usable */
 	usb_notify_add_device(udev);
 
+	usb_unlock_device(udev);
+
 	return 0;
 
 fail:
@@ -1872,11 +1865,8 @@ int usb_resume_device(struct usb_device *udev)
 	usb_unlock_device(udev);
 
 	/* rebind drivers that had no suspend() */
-	if (status == 0) {
-		usb_lock_all_devices();
+	if (status == 0)
 		bus_rescan_devices(&usb_bus_type);
-		usb_unlock_all_devices();
-	}
 	return status;
 }
 
@@ -1889,14 +1879,14 @@ static int remote_wakeup(struct usb_device *udev)
 	/* don't repeat RESUME sequence if this device
 	 * was already woken up by some other task
 	 */
-	down(&udev->serialize);
+	usb_lock_device(udev);
 	if (udev->state == USB_STATE_SUSPENDED) {
 		dev_dbg(&udev->dev, "RESUME (wakeup)\n");
 		/* TRSMRCY = 10 msec */
 		msleep(10);
 		status = finish_device_resume(udev);
 	}
-	up(&udev->serialize);
+	usb_unlock_device(udev);
 #endif
 	return status;
 }
@@ -1997,7 +1987,7 @@ static int hub_resume(struct usb_interface *intf)
 
 		if (!udev || status < 0)
 			continue;
-		down (&udev->serialize);
+		usb_lock_device(udev);
 		if (portstat & USB_PORT_STAT_SUSPEND)
 			status = hub_port_resume(hub, port1, udev);
 		else {
@@ -2008,7 +1998,7 @@ static int hub_resume(struct usb_interface *intf)
 				hub_port_logical_disconnect(hub, port1);
 			}
 		}
-		up(&udev->serialize);
+		usb_unlock_device(udev);
 	}
 	}
 #endif
@@ -2573,7 +2563,6 @@ static void hub_port_connect_change(struct usb_hub *hub, int port1,
 		 * udev becomes globally accessible, although presumably
 		 * no one will look at it until hdev is unlocked.
 		 */
-		down (&udev->serialize);
 		status = 0;
 
 		/* We mustn't add new devices if the parent hub has
@@ -2597,7 +2586,6 @@ static void hub_port_connect_change(struct usb_hub *hub, int port1,
 			}
 		}
 
-		up (&udev->serialize);
 		if (status)
 			goto loop_disable;
 
diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index 294e9f127477..fcfda21be499 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -32,7 +32,6 @@
 #include <linux/spinlock.h>
 #include <linux/errno.h>
 #include <linux/smp_lock.h>
-#include <linux/rwsem.h>
 #include <linux/usb.h>
 
 #include <asm/io.h>
@@ -49,8 +48,6 @@ const char *usbcore_name = "usbcore";
 static int nousb;	/* Disable USB when built into kernel image */
 			/* Not honored on modular build */
 
-static DECLARE_RWSEM(usb_all_devices_rwsem);
-
 
 /**
  * usb_ifnum_to_if - get the interface object with a given interface number
@@ -446,8 +443,6 @@ usb_alloc_dev(struct usb_device *parent, struct usb_bus *bus, unsigned port1)
 	dev->parent = parent;
 	INIT_LIST_HEAD(&dev->filelist);
 
-	init_MUTEX(&dev->serialize);
-
 	return dev;
 }
 
@@ -520,75 +515,20 @@ void usb_put_intf(struct usb_interface *intf)
 
 /*			USB device locking
  *
- * Although locking USB devices should be straightforward, it is
- * complicated by the way the driver-model core works.  When a new USB
- * driver is registered or unregistered, the core will automatically
- * probe or disconnect all matching interfaces on all USB devices while
- * holding the USB subsystem writelock.  There's no good way for us to
- * tell which devices will be used or to lock them beforehand; our only
- * option is to effectively lock all the USB devices.
- *
- * We do that by using a private rw-semaphore, usb_all_devices_rwsem.
- * When locking an individual device you must first acquire the rwsem's
- * readlock.  When a driver is registered or unregistered the writelock
- * must be held.  These actions are encapsulated in the subroutines
- * below, so all a driver needs to do is call usb_lock_device() and
- * usb_unlock_device().
+ * USB devices and interfaces are locked using the semaphore in their
+ * embedded struct device.  The hub driver guarantees that whenever a
+ * device is connected or disconnected, drivers are called with the
+ * USB device locked as well as their particular interface.
  *
  * Complications arise when several devices are to be locked at the same
  * time.  Only hub-aware drivers that are part of usbcore ever have to
- * do this; nobody else needs to worry about it.  The problem is that
- * usb_lock_device() must not be called to lock a second device since it
- * would acquire the rwsem's readlock reentrantly, leading to deadlock if
- * another thread was waiting for the writelock.  The solution is simple:
- *
- *	When locking more than one device, call usb_lock_device()
- *	to lock the first one.  Lock the others by calling
- *	down(&udev->serialize) directly.
- *
- *	When unlocking multiple devices, use up(&udev->serialize)
- *	to unlock all but the last one.  Unlock the last one by
- *	calling usb_unlock_device().
+ * do this; nobody else needs to worry about it.  The rule for locking
+ * is simple:
  *
  *	When locking both a device and its parent, always lock the
  *	the parent first.
  */
 
-/**
- * usb_lock_device - acquire the lock for a usb device structure
- * @udev: device that's being locked
- *
- * Use this routine when you don't hold any other device locks;
- * to acquire nested inner locks call down(&udev->serialize) directly.
- * This is necessary for proper interaction with usb_lock_all_devices().
- */
-void usb_lock_device(struct usb_device *udev)
-{
-	down_read(&usb_all_devices_rwsem);
-	down(&udev->serialize);
-}
-
-/**
- * usb_trylock_device - attempt to acquire the lock for a usb device structure
- * @udev: device that's being locked
- *
- * Don't use this routine if you already hold a device lock;
- * use down_trylock(&udev->serialize) instead.
- * This is necessary for proper interaction with usb_lock_all_devices().
- *
- * Returns 1 if successful, 0 if contention.
- */
-int usb_trylock_device(struct usb_device *udev)
-{
-	if (!down_read_trylock(&usb_all_devices_rwsem))
-		return 0;
-	if (down_trylock(&udev->serialize)) {
-		up_read(&usb_all_devices_rwsem);
-		return 0;
-	}
-	return 1;
-}
-
 /**
  * usb_lock_device_for_reset - cautiously acquire the lock for a
  *	usb device structure
@@ -627,7 +567,7 @@ int usb_lock_device_for_reset(struct usb_device *udev,
 		}
 	}
 
-	while (!usb_trylock_device(udev)) {
+	while (usb_trylock_device(udev) != 0) {
 
 		/* If we can't acquire the lock after waiting one second,
 		 * we're probably deadlocked */
@@ -645,39 +585,6 @@ int usb_lock_device_for_reset(struct usb_device *udev,
 	return 1;
 }
 
-/**
- * usb_unlock_device - release the lock for a usb device structure
- * @udev: device that's being unlocked
- *
- * Use this routine when releasing the only device lock you hold;
- * to release inner nested locks call up(&udev->serialize) directly.
- * This is necessary for proper interaction with usb_lock_all_devices().
- */
-void usb_unlock_device(struct usb_device *udev)
-{
-	up(&udev->serialize);
-	up_read(&usb_all_devices_rwsem);
-}
-
-/**
- * usb_lock_all_devices - acquire the lock for all usb device structures
- *
- * This is necessary when registering a new driver or probing a bus,
- * since the driver-model core may try to use any usb_device.
- */
-void usb_lock_all_devices(void)
-{
-	down_write(&usb_all_devices_rwsem);
-}
-
-/**
- * usb_unlock_all_devices - release the lock for all usb device structures
- */
-void usb_unlock_all_devices(void)
-{
-	up_write(&usb_all_devices_rwsem);
-}
-
 
 static struct usb_device *match_device(struct usb_device *dev,
 				       u16 vendor_id, u16 product_id)
@@ -700,10 +607,10 @@ static struct usb_device *match_device(struct usb_device *dev,
 	/* look through all of the children of this device */
 	for (child = 0; child < dev->maxchild; ++child) {
 		if (dev->children[child]) {
-			down(&dev->children[child]->serialize);
+			usb_lock_device(dev->children[child]);
 			ret_dev = match_device(dev->children[child],
 					       vendor_id, product_id);
-			up(&dev->children[child]->serialize);
+			usb_unlock_device(dev->children[child]);
 			if (ret_dev)
 				goto exit;
 		}
@@ -1300,10 +1207,7 @@ EXPORT_SYMBOL(usb_put_dev);
 EXPORT_SYMBOL(usb_get_dev);
 EXPORT_SYMBOL(usb_hub_tt_clear_buffer);
 
-EXPORT_SYMBOL(usb_lock_device);
-EXPORT_SYMBOL(usb_trylock_device);
 EXPORT_SYMBOL(usb_lock_device_for_reset);
-EXPORT_SYMBOL(usb_unlock_device);
 
 EXPORT_SYMBOL(usb_driver_claim_interface);
 EXPORT_SYMBOL(usb_driver_release_interface);
diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h
index 98e85fb4d3b7..4647e1ebc68d 100644
--- a/drivers/usb/core/usb.h
+++ b/drivers/usb/core/usb.h
@@ -16,9 +16,6 @@ extern int usb_get_device_descriptor(struct usb_device *dev,
 extern char *usb_cache_string(struct usb_device *udev, int index);
 extern int usb_set_configuration(struct usb_device *dev, int configuration);
 
-extern void usb_lock_all_devices(void);
-extern void usb_unlock_all_devices(void);
-
 extern void usb_kick_khubd(struct usb_device *dev);
 extern void usb_suspend_root_hub(struct usb_device *hdev);
 extern void usb_resume_root_hub(struct usb_device *dev);
diff --git a/drivers/usb/host/ohci-hub.c b/drivers/usb/host/ohci-hub.c
index 72e3b12a1926..4b2226d77b34 100644
--- a/drivers/usb/host/ohci-hub.c
+++ b/drivers/usb/host/ohci-hub.c
@@ -372,7 +372,7 @@ done:
 					& ohci->hc_control)
 				== OHCI_USB_OPER
 			&& time_after (jiffies, ohci->next_statechange)
-			&& usb_trylock_device (hcd->self.root_hub)
+			&& usb_trylock_device (hcd->self.root_hub) == 0
 			) {
 		ohci_vdbg (ohci, "autosuspend\n");
 		(void) ohci_bus_suspend (hcd);
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 2714814ab66c..46dc0421d19e 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -329,8 +329,6 @@ struct usb_device {
 	struct usb_tt	*tt; 		/* low/full speed dev, highspeed hub */
 	int		ttport;		/* device port on that tt hub */
 
-	struct semaphore serialize;
-
 	unsigned int toggle[2];		/* one bit for each endpoint
 					 * ([0] = IN, [1] = OUT) */
 
@@ -377,11 +375,12 @@ struct usb_device {
 extern struct usb_device *usb_get_dev(struct usb_device *dev);
 extern void usb_put_dev(struct usb_device *dev);
 
-extern void usb_lock_device(struct usb_device *udev);
-extern int usb_trylock_device(struct usb_device *udev);
+/* USB device locking */
+#define usb_lock_device(udev)		down(&(udev)->dev.sem)
+#define usb_unlock_device(udev)		up(&(udev)->dev.sem)
+#define usb_trylock_device(udev)	down_trylock(&(udev)->dev.sem)
 extern int usb_lock_device_for_reset(struct usb_device *udev,
 		struct usb_interface *iface);
-extern void usb_unlock_device(struct usb_device *udev);
 
 /* USB port reset for device reinitialization */
 extern int usb_reset_device(struct usb_device *dev);
-- 
cgit v1.2.3-71-gd317


From 55c527187c9d78f840b284d596a0b298bc1493af Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Wed, 23 Nov 2005 12:03:12 -0500
Subject: [PATCH] USB: Consider power budget when choosing configuration

This patch (as609) changes the way we keep track of power budgeting for
USB hubs and devices, and it updates the choose_configuration routine to
take this information into account.  (This is something we should have
been doing all along.)  A new field in struct usb_device holds the amount
of bus current available from the upstream port, and the usb_hub structure
keeps track of the current available for each downstream port.

Two new rules for configuration selection are added:

	Don't select a self-powered configuration when only bus power
	is available.

	Don't select a configuration requiring more bus power than is
	available.

However the first rule is #if-ed out, because I found that the internal
hub in my HP USB keyboard claims that its only configuration is
self-powered.  The rule would prevent the configuration from being chosen,
leaving the hub & keyboard unconfigured.  Since similar descriptor errors
may turn out to be fairly common, it seemed wise not to include a rule
that would break automatic configuration unnecessarily for such devices.

The second rule may also trigger unnecessarily, although this should be
less common.  More likely it will annoy people by sometimes failing to
accept configurations that should never have been chosen in the first
place.

The patch also changes usbcore's reaction when no configuration is
suitable.  Instead of raising an error and rejecting the device, now
the core will simply leave the device unconfigured.  People can always
work around such problems by installing configurations manually through
sysfs.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/hcd.c     |   5 +-
 drivers/usb/core/hub.c     | 229 ++++++++++++++++++++++++++++++---------------
 drivers/usb/core/hub.h     |   3 +-
 drivers/usb/core/message.c |   6 ++
 include/linux/usb.h        |   2 +
 5 files changed, 164 insertions(+), 81 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index d16a0e8a7d72..0018bbc4de34 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -1825,8 +1825,6 @@ int usb_add_hcd(struct usb_hcd *hcd,
 		retval = -ENOMEM;
 		goto err_allocate_root_hub;
 	}
-	rhdev->speed = (hcd->driver->flags & HCD_USB2) ? USB_SPEED_HIGH :
-			USB_SPEED_FULL;
 
 	/* Although in principle hcd->driver->start() might need to use rhdev,
 	 * none of the current drivers do.
@@ -1844,6 +1842,9 @@ int usb_add_hcd(struct usb_hcd *hcd,
 		dev_dbg(hcd->self.controller, "supports USB remote wakeup\n");
 	hcd->remote_wakeup = hcd->can_wakeup;
 
+	rhdev->speed = (hcd->driver->flags & HCD_USB2) ? USB_SPEED_HIGH :
+			USB_SPEED_FULL;
+	rhdev->bus_mA = min(500u, hcd->power_budget);
 	if ((retval = register_root_hub(rhdev, hcd)) != 0)
 		goto err_register_root_hub;
 
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 895ac829b9cf..b311005ff1a6 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -702,26 +702,40 @@ static int hub_configure(struct usb_hub *hub,
 	 * and battery-powered root hubs (may provide just 8 mA).
 	 */
 	ret = usb_get_status(hdev, USB_RECIP_DEVICE, 0, &hubstatus);
-	if (ret < 0) {
+	if (ret < 2) {
 		message = "can't get hub status";
 		goto fail;
 	}
 	le16_to_cpus(&hubstatus);
 	if (hdev == hdev->bus->root_hub) {
-		struct usb_hcd *hcd =
-				container_of(hdev->bus, struct usb_hcd, self);
-
-		hub->power_budget = min(500u, hcd->power_budget) / 2;
+		if (hdev->bus_mA == 0 || hdev->bus_mA >= 500)
+			hub->mA_per_port = 500;
+		else {
+			hub->mA_per_port = hdev->bus_mA;
+			hub->limited_power = 1;
+		}
 	} else if ((hubstatus & (1 << USB_DEVICE_SELF_POWERED)) == 0) {
 		dev_dbg(hub_dev, "hub controller current requirement: %dmA\n",
 			hub->descriptor->bHubContrCurrent);
-		hub->power_budget = (501 - hub->descriptor->bHubContrCurrent)
-					/ 2;
+		hub->limited_power = 1;
+		if (hdev->maxchild > 0) {
+			int remaining = hdev->bus_mA -
+					hub->descriptor->bHubContrCurrent;
+
+			if (remaining < hdev->maxchild * 100)
+				dev_warn(hub_dev,
+					"insufficient power available "
+					"to use all downstream ports\n");
+			hub->mA_per_port = 100;		/* 7.2.1.1 */
+		}
+	} else {	/* Self-powered external hub */
+		/* FIXME: What about battery-powered external hubs that
+		 * provide less current per port? */
+		hub->mA_per_port = 500;
 	}
-	if (hub->power_budget)
-		dev_dbg(hub_dev, "%dmA bus power budget for children\n",
-			hub->power_budget * 2);
-
+	if (hub->mA_per_port < 500)
+		dev_dbg(hub_dev, "%umA bus power budget for each child\n",
+				hub->mA_per_port);
 
 	ret = hub_hub_status(hub, &hubstatus, &hubchange);
 	if (ret < 0) {
@@ -1136,45 +1150,107 @@ void usb_disconnect(struct usb_device **pdev)
 	device_unregister(&udev->dev);
 }
 
+static inline const char *plural(int n)
+{
+	return (n == 1 ? "" : "s");
+}
+
 static int choose_configuration(struct usb_device *udev)
 {
-	int c, i;
+	int i;
+	u16 devstatus;
+	int bus_powered;
+	int num_configs;
+	struct usb_host_config *c, *best;
+
+	/* If this fails, assume the device is bus-powered */
+	devstatus = 0;
+	usb_get_status(udev, USB_RECIP_DEVICE, 0, &devstatus);
+	le16_to_cpus(&devstatus);
+	bus_powered = ((devstatus & (1 << USB_DEVICE_SELF_POWERED)) == 0);
+	dev_dbg(&udev->dev, "device is %s-powered\n",
+			bus_powered ? "bus" : "self");
+
+	best = NULL;
+	c = udev->config;
+	num_configs = udev->descriptor.bNumConfigurations;
+	for (i = 0; i < num_configs; (i++, c++)) {
+		struct usb_interface_descriptor	*desc =
+				&c->intf_cache[0]->altsetting->desc;
+
+		/*
+		 * HP's USB bus-powered keyboard has only one configuration
+		 * and it claims to be self-powered; other devices may have
+		 * similar errors in their descriptors.  If the next test
+		 * were allowed to execute, such configurations would always
+		 * be rejected and the devices would not work as expected.
+		 */
+#if 0
+		/* Rule out self-powered configs for a bus-powered device */
+		if (bus_powered && (c->desc.bmAttributes &
+					USB_CONFIG_ATT_SELFPOWER))
+			continue;
+#endif
 
-	/* NOTE: this should interact with hub power budgeting */
+		/*
+		 * The next test may not be as effective as it should be.
+		 * Some hubs have errors in their descriptor, claiming
+		 * to be self-powered when they are really bus-powered.
+		 * We will overestimate the amount of current such hubs
+		 * make available for each port.
+		 *
+		 * This is a fairly benign sort of failure.  It won't
+		 * cause us to reject configurations that we should have
+		 * accepted.
+		 */
 
-	c = udev->config[0].desc.bConfigurationValue;
-	if (udev->descriptor.bNumConfigurations != 1) {
-		for (i = 0; i < udev->descriptor.bNumConfigurations; i++) {
-			struct usb_interface_descriptor	*desc;
+		/* Rule out configs that draw too much bus current */
+		if (c->desc.bMaxPower * 2 > udev->bus_mA)
+			continue;
 
-			/* heuristic:  Linux is more likely to have class
-			 * drivers, so avoid vendor-specific interfaces.
-			 */
-			desc = &udev->config[i].intf_cache[0]
-					->altsetting->desc;
-			if (desc->bInterfaceClass == USB_CLASS_VENDOR_SPEC)
-				continue;
-			/* COMM/2/all is CDC ACM, except 0xff is MSFT RNDIS.
-			 * MSFT needs this to be the first config; never use
-			 * it as the default unless Linux has host-side RNDIS.
-			 * A second config would ideally be CDC-Ethernet, but
-			 * may instead be the "vendor specific" CDC subset
-			 * long used by ARM Linux for sa1100 or pxa255.
-			 */
-			if (desc->bInterfaceClass == USB_CLASS_COMM
-					&& desc->bInterfaceSubClass == 2
-					&& desc->bInterfaceProtocol == 0xff) {
-				c = udev->config[1].desc.bConfigurationValue;
-				continue;
-			}
-			c = udev->config[i].desc.bConfigurationValue;
+		/* If the first config's first interface is COMM/2/0xff
+		 * (MSFT RNDIS), rule it out unless Linux has host-side
+		 * RNDIS support. */
+		if (i == 0 && desc->bInterfaceClass == USB_CLASS_COMM
+				&& desc->bInterfaceSubClass == 2
+				&& desc->bInterfaceProtocol == 0xff) {
+#ifndef CONFIG_USB_NET_RNDIS
+			continue;
+#else
+			best = c;
+#endif
+		}
+
+		/* From the remaining configs, choose the first one whose
+		 * first interface is for a non-vendor-specific class.
+		 * Reason: Linux is more likely to have a class driver
+		 * than a vendor-specific driver. */
+		else if (udev->descriptor.bDeviceClass !=
+						USB_CLASS_VENDOR_SPEC &&
+				desc->bInterfaceClass !=
+						USB_CLASS_VENDOR_SPEC) {
+			best = c;
 			break;
 		}
+
+		/* If all the remaining configs are vendor-specific,
+		 * choose the first one. */
+		else if (!best)
+			best = c;
+	}
+
+	if (best) {
+		i = best->desc.bConfigurationValue;
 		dev_info(&udev->dev,
-			"configuration #%d chosen from %d choices\n",
-			c, udev->descriptor.bNumConfigurations);
+			"configuration #%d chosen from %d choice%s\n",
+			i, num_configs, plural(num_configs));
+	} else {
+		i = -1;
+		dev_warn(&udev->dev,
+			"no configuration chosen from %d choice%s\n",
+			num_configs, plural(num_configs));
 	}
-	return c;
+	return i;
 }
 
 #ifdef DEBUG
@@ -1327,17 +1403,13 @@ int usb_new_device(struct usb_device *udev)
 	 * with the driver core, and lets usb device drivers bind to them.
 	 */
 	c = choose_configuration(udev);
-	if (c < 0)
-		dev_warn(&udev->dev,
-				"can't choose an initial configuration\n");
-	else {
+	if (c >= 0) {
 		err = usb_set_configuration(udev, c);
 		if (err) {
 			dev_err(&udev->dev, "can't set config #%d, error %d\n",
 					c, err);
-			usb_remove_sysfs_dev_files(udev);
-			device_del(&udev->dev);
-			goto fail;
+			/* This need not be fatal.  The user can try to
+			 * set other configurations. */
 		}
 	}
 
@@ -1702,7 +1774,7 @@ static int finish_device_resume(struct usb_device *udev)
 	 * and device drivers will know about any resume quirks.
 	 */
 	status = usb_get_status(udev, USB_RECIP_DEVICE, 0, &devstatus);
-	if (status < 0)
+	if (status < 2)
 		dev_dbg(&udev->dev,
 			"gone after usb resume? status %d\n",
 			status);
@@ -1711,7 +1783,7 @@ static int finish_device_resume(struct usb_device *udev)
 		int		(*resume)(struct device *);
 
 		le16_to_cpus(&devstatus);
-		if (devstatus & (1 << USB_DEVICE_REMOTE_WAKEUP)
+		if ((devstatus & (1 << USB_DEVICE_REMOTE_WAKEUP))
 				&& udev->parent) {
 			status = usb_control_msg(udev,
 					usb_sndctrlpipe(udev, 0),
@@ -2374,39 +2446,36 @@ hub_power_remaining (struct usb_hub *hub)
 {
 	struct usb_device *hdev = hub->hdev;
 	int remaining;
-	unsigned i;
+	int port1;
 
-	remaining = hub->power_budget;
-	if (!remaining)		/* self-powered */
+	if (!hub->limited_power)
 		return 0;
 
-	for (i = 0; i < hdev->maxchild; i++) {
-		struct usb_device	*udev = hdev->children[i];
-		int			delta, ceiling;
+	remaining = hdev->bus_mA - hub->descriptor->bHubContrCurrent;
+	for (port1 = 1; port1 <= hdev->maxchild; ++port1) {
+		struct usb_device	*udev = hdev->children[port1 - 1];
+		int			delta;
 
 		if (!udev)
 			continue;
 
-		/* 100mA per-port ceiling, or 8mA for OTG ports */
-		if (i != (udev->bus->otg_port - 1) || hdev->parent)
-			ceiling = 50;
-		else
-			ceiling = 4;
-
+		/* Unconfigured devices may not use more than 100mA,
+		 * or 8mA for OTG ports */
 		if (udev->actconfig)
-			delta = udev->actconfig->desc.bMaxPower;
+			delta = udev->actconfig->desc.bMaxPower * 2;
+		else if (port1 != udev->bus->otg_port || hdev->parent)
+			delta = 100;
 		else
-			delta = ceiling;
-		// dev_dbg(&udev->dev, "budgeted %dmA\n", 2 * delta);
-		if (delta > ceiling)
-			dev_warn(&udev->dev, "%dmA over %dmA budget!\n",
-				2 * (delta - ceiling), 2 * ceiling);
+			delta = 8;
+		if (delta > hub->mA_per_port)
+			dev_warn(&udev->dev, "%dmA is over %umA budget "
+					"for port %d!\n",
+					delta, hub->mA_per_port, port1);
 		remaining -= delta;
 	}
 	if (remaining < 0) {
-		dev_warn(hub->intfdev,
-			"%dmA over power budget!\n",
-			-2 * remaining);
+		dev_warn(hub->intfdev, "%dmA over power budget!\n",
+			- remaining);
 		remaining = 0;
 	}
 	return remaining;
@@ -2501,7 +2570,8 @@ static void hub_port_connect_change(struct usb_hub *hub, int port1,
 
 		usb_set_device_state(udev, USB_STATE_POWERED);
 		udev->speed = USB_SPEED_UNKNOWN;
- 
+ 		udev->bus_mA = hub->mA_per_port;
+
 		/* set the address */
 		choose_address(udev);
 		if (udev->devnum <= 0) {
@@ -2521,16 +2591,16 @@ static void hub_port_connect_change(struct usb_hub *hub, int port1,
 		 * on the parent.
 		 */
 		if (udev->descriptor.bDeviceClass == USB_CLASS_HUB
-				&& hub->power_budget) {
+				&& udev->bus_mA <= 100) {
 			u16	devstat;
 
 			status = usb_get_status(udev, USB_RECIP_DEVICE, 0,
 					&devstat);
-			if (status < 0) {
+			if (status < 2) {
 				dev_dbg(&udev->dev, "get status %d ?\n", status);
 				goto loop_disable;
 			}
-			cpu_to_le16s(&devstat);
+			le16_to_cpus(&devstat);
 			if ((devstat & (1 << USB_DEVICE_SELF_POWERED)) == 0) {
 				dev_err(&udev->dev,
 					"can't connect bus-powered hub "
@@ -2583,9 +2653,7 @@ static void hub_port_connect_change(struct usb_hub *hub, int port1,
 
 		status = hub_power_remaining(hub);
 		if (status)
-			dev_dbg(hub_dev,
-				"%dmA power budget left\n",
-				2 * status);
+			dev_dbg(hub_dev, "%dmA power budget left\n", status);
 
 		return;
 
@@ -2797,6 +2865,11 @@ static void hub_events(void)
 			if (hubchange & HUB_CHANGE_LOCAL_POWER) {
 				dev_dbg (hub_dev, "power change\n");
 				clear_hub_feature(hdev, C_HUB_LOCAL_POWER);
+				if (hubstatus & HUB_STATUS_LOCAL_POWER)
+					/* FIXME: Is this always true? */
+					hub->limited_power = 0;
+				else
+					hub->limited_power = 1;
 			}
 			if (hubchange & HUB_CHANGE_OVERCURRENT) {
 				dev_dbg (hub_dev, "overcurrent change\n");
diff --git a/drivers/usb/core/hub.h b/drivers/usb/core/hub.h
index bf23f8978024..29d5f45a8456 100644
--- a/drivers/usb/core/hub.h
+++ b/drivers/usb/core/hub.h
@@ -220,8 +220,9 @@ struct usb_hub {
 	struct usb_hub_descriptor *descriptor;	/* class descriptor */
 	struct usb_tt		tt;		/* Transaction Translator */
 
-	u8			power_budget;	/* in 2mA units; or zero */
+	unsigned		mA_per_port;	/* current for each child */
 
+	unsigned		limited_power:1;
 	unsigned		quiescing:1;
 	unsigned		activating:1;
 	unsigned		resume_root_hub:1;
diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index fe74f99ca5f4..99ab774d4fdb 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -1387,6 +1387,12 @@ free_interfaces:
 	if (dev->state != USB_STATE_ADDRESS)
 		usb_disable_device (dev, 1);	// Skip ep0
 
+	n = dev->bus_mA - cp->desc.bMaxPower * 2;
+	if (n < 0)
+		dev_warn(&dev->dev, "new config #%d exceeds power "
+				"limit by %dmA\n",
+				configuration, -n);
+
 	if ((ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0),
 			USB_REQ_SET_CONFIGURATION, 0, configuration, 0,
 			NULL, 0, USB_CTRL_SET_TIMEOUT)) < 0)
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 46dc0421d19e..27575e678a7c 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -347,6 +347,8 @@ struct usb_device {
 
 	char **rawdescriptors;		/* Raw descriptors for each config */
 
+	unsigned short bus_mA;		/* Current available from the bus */
+
 	int have_langid;		/* whether string_langid is valid */
 	int string_langid;		/* language ID for strings */
 
-- 
cgit v1.2.3-71-gd317


From 12c3da346eb81b6a281031f62eda3bca993dff5a Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Wed, 23 Nov 2005 12:09:52 -0500
Subject: [PATCH] USB: Store port number in usb_device

This patch (as610) adds a field to struct usb_device to store the device's
port number.  This allows us to remove several loops in the hub driver
(searching for a particular device among all the entries in the parent's
array of children).

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/hub.c | 79 ++++++++++++--------------------------------------
 drivers/usb/core/usb.c |  1 +
 include/linux/usb.h    |  1 +
 3 files changed, 20 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index b311005ff1a6..a523c8f20b5d 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -946,24 +946,21 @@ static int locktree(struct usb_device *udev)
 	t = locktree(hdev);
 	if (t < 0)
 		return t;
-	for (t = 0; t < hdev->maxchild; t++) {
-		if (hdev->children[t] == udev) {
-			/* everything is fail-fast once disconnect
-			 * processing starts
-			 */
-			if (udev->state == USB_STATE_NOTATTACHED)
-				break;
 
-			/* when everyone grabs locks top->bottom,
-			 * non-overlapping work may be concurrent
-			 */
-			usb_lock_device(udev);
-			usb_unlock_device(hdev);
-			return t + 1;
-		}
+	/* everything is fail-fast once disconnect
+	 * processing starts
+	 */
+	if (udev->state == USB_STATE_NOTATTACHED) {
+		usb_unlock_device(hdev);
+		return -ENODEV;
 	}
+
+	/* when everyone grabs locks top->bottom,
+	 * non-overlapping work may be concurrent
+	 */
+	usb_lock_device(udev);
 	usb_unlock_device(hdev);
-	return -ENODEV;
+	return udev->portnum;
 }
 
 static void recursively_mark_NOTATTACHED(struct usb_device *udev)
@@ -1335,15 +1332,9 @@ int usb_new_device(struct usb_device *udev)
 					le16_to_cpu(udev->config[0].desc.wTotalLength),
 					USB_DT_OTG, (void **) &desc) == 0) {
 			if (desc->bmAttributes & USB_OTG_HNP) {
-				unsigned		port1;
+				unsigned		port1 = udev->portnum;
 				struct usb_device	*root = udev->parent;
 				
-				for (port1 = 1; port1 <= root->maxchild;
-						port1++) {
-					if (root->children[port1-1] == udev)
-						break;
-				}
-
 				dev_info(&udev->dev,
 					"Dual-Role OTG device on %sHNP port\n",
 					(port1 == bus->otg_port)
@@ -1720,22 +1711,9 @@ static int __usb_suspend_device (struct usb_device *udev, int port1)
 int usb_suspend_device(struct usb_device *udev)
 {
 #ifdef	CONFIG_USB_SUSPEND
-	int	port1;
-
 	if (udev->state == USB_STATE_NOTATTACHED)
 		return -ENODEV;
-	if (!udev->parent)
-		port1 = 0;
-	else {
-		for (port1 = udev->parent->maxchild; port1 > 0; --port1) {
-			if (udev->parent->children[port1-1] == udev)
-				break;
-		}
-		if (port1 == 0)
-			return -ENODEV;
-	}
-
-	return __usb_suspend_device(udev, port1);
+	return __usb_suspend_device(udev, udev->portnum);
 #else
 	/* NOTE:  udev->state unchanged, it's not lying ... */
 	udev->dev.power.power_state = PMSG_SUSPEND;
@@ -1893,20 +1871,10 @@ hub_port_resume(struct usb_hub *hub, int port1, struct usb_device *udev)
  */
 int usb_resume_device(struct usb_device *udev)
 {
-	int	port1, status;
+	int	status;
 
 	if (udev->state == USB_STATE_NOTATTACHED)
 		return -ENODEV;
-	if (!udev->parent)
-		port1 = 0;
-	else {
-		for (port1 = udev->parent->maxchild; port1 > 0; --port1) {
-			if (udev->parent->children[port1-1] == udev)
-				break;
-		}
-		if (port1 == 0)
-			return -ENODEV;
-	}
 
 #ifdef	CONFIG_USB_SUSPEND
 	/* selective resume of one downstream hub-to-device port */
@@ -1915,7 +1883,7 @@ int usb_resume_device(struct usb_device *udev)
 			// NOTE swsusp may bork us, device state being wrong...
 			// NOTE this fails if parent is also suspended...
 			status = hub_port_resume(hdev_to_hub(udev->parent),
-					port1, udev);
+					udev->portnum, udev);
 		} else
 			status = 0;
 	} else
@@ -3029,7 +2997,8 @@ int usb_reset_device(struct usb_device *udev)
 	struct usb_hub			*parent_hub;
 	struct usb_device_descriptor	descriptor = udev->descriptor;
 	struct usb_hub			*hub = NULL;
-	int 				i, ret = 0, port1 = -1;
+	int 				i, ret = 0;
+	int				port1 = udev->portnum;
 
 	if (udev->state == USB_STATE_NOTATTACHED ||
 			udev->state == USB_STATE_SUSPENDED) {
@@ -3043,18 +3012,6 @@ int usb_reset_device(struct usb_device *udev)
 		dev_dbg(&udev->dev, "%s for root hub!\n", __FUNCTION__);
 		return -EISDIR;
 	}
-
-	for (i = 0; i < parent_hdev->maxchild; i++)
-		if (parent_hdev->children[i] == udev) {
-			port1 = i + 1;
-			break;
-		}
-
-	if (port1 < 0) {
-		/* If this ever happens, it's very bad */
-		dev_err(&udev->dev, "Can't locate device's port!\n");
-		return -ENOENT;
-	}
 	parent_hub = hdev_to_hub(parent_hdev);
 
 	/* If we're resetting an active hub, take some special actions */
diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index fcfda21be499..39e6b61b898a 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -439,6 +439,7 @@ usb_alloc_dev(struct usb_device *parent, struct usb_bus *bus, unsigned port1)
 		/* hub driver sets up TT records */
 	}
 
+	dev->portnum = port1;
 	dev->bus = bus;
 	dev->parent = parent;
 	INIT_LIST_HEAD(&dev->filelist);
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 27575e678a7c..e59d1bd52d4f 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -348,6 +348,7 @@ struct usb_device {
 	char **rawdescriptors;		/* Raw descriptors for each config */
 
 	unsigned short bus_mA;		/* Current available from the bus */
+	u8 portnum;			/* Parent port number (origin 1) */
 
 	int have_langid;		/* whether string_langid is valid */
 	int string_langid;		/* language ID for strings */
-- 
cgit v1.2.3-71-gd317


From e80b0fade09ef1ee67b0898d480d4c588f124d5f Mon Sep 17 00:00:00 2001
From: Matthew Dharm <mdharm-usb@one-eyed-alien.net>
Date: Sun, 4 Dec 2005 22:02:44 -0800
Subject: [PATCH] USB Storage: add alauda support

This patch adds another usb-storage subdriver, which supports two fairly
old dual-XD/SmartMedia reader-writers (USB1.1 devices).

This driver was written by Daniel Drake <dsd@gentoo.org> -- he notes
that he wrote this driver without specs, however a vendor-supplied GPL
driver for the previous generation of products ("sma03") did prove to be
quite useful, as did the sddr09 driver which also has to deal with
low-level physical block layout on SmartMedia.

The original patch has been reformed by me, as it clashed with the
libusual patches.

We really need to consolidate some of this common SmartMedia code, and
get together with the MTD guys to share it with them as well.

Signed-off-by: Daniel Drake <dsd@gentoo.org>
Signed-off-by: Matthew Dharm <mdharm-usb@one-eyed-alien.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/storage/Kconfig        |    9 +
 drivers/usb/storage/Makefile       |    1 +
 drivers/usb/storage/alauda.c       | 1119 ++++++++++++++++++++++++++++++++++++
 drivers/usb/storage/alauda.h       |  100 ++++
 drivers/usb/storage/unusual_devs.h |   14 +
 drivers/usb/storage/usb.c          |   12 +
 include/linux/usb_usual.h          |    3 +
 7 files changed, 1258 insertions(+)
 create mode 100644 drivers/usb/storage/alauda.c
 create mode 100644 drivers/usb/storage/alauda.h

(limited to 'include/linux')

diff --git a/drivers/usb/storage/Kconfig b/drivers/usb/storage/Kconfig
index bdfcb95d9c12..92be101feba7 100644
--- a/drivers/usb/storage/Kconfig
+++ b/drivers/usb/storage/Kconfig
@@ -112,6 +112,15 @@ config USB_STORAGE_JUMPSHOT
 	  Say Y here to include additional code to support the Lexar Jumpshot
 	  USB CompactFlash reader.
 
+config USB_STORAGE_ALAUDA
+	bool "Olympus MAUSB-10/Fuji DPC-R1 support (EXPERIMENTAL)"
+	depends on USB_STORAGE && EXPERIMENTAL
+	help
+	  Say Y here to include additional code to support the Olympus MAUSB-10
+	  and Fujifilm DPC-R1 USB Card reader/writer devices.
+
+	  These devices are based on the Alauda chip and support support both
+	  XD and SmartMedia cards.
 
 config USB_STORAGE_ONETOUCH
 	bool "Support OneTouch Button on Maxtor Hard Drives (EXPERIMENTAL)"
diff --git a/drivers/usb/storage/Makefile b/drivers/usb/storage/Makefile
index 2d416e9028bb..8cbba22508a4 100644
--- a/drivers/usb/storage/Makefile
+++ b/drivers/usb/storage/Makefile
@@ -18,6 +18,7 @@ usb-storage-obj-$(CONFIG_USB_STORAGE_DPCM)	+= dpcm.o
 usb-storage-obj-$(CONFIG_USB_STORAGE_ISD200)	+= isd200.o
 usb-storage-obj-$(CONFIG_USB_STORAGE_DATAFAB)	+= datafab.o
 usb-storage-obj-$(CONFIG_USB_STORAGE_JUMPSHOT)	+= jumpshot.o
+usb-storage-obj-$(CONFIG_USB_STORAGE_ALAUDA)	+= alauda.o
 usb-storage-obj-$(CONFIG_USB_STORAGE_ONETOUCH)	+= onetouch.o
 
 usb-storage-objs :=	scsiglue.o protocol.o transport.o usb.o \
diff --git a/drivers/usb/storage/alauda.c b/drivers/usb/storage/alauda.c
new file mode 100644
index 000000000000..4d3cbb12b713
--- /dev/null
+++ b/drivers/usb/storage/alauda.c
@@ -0,0 +1,1119 @@
+/*
+ * Driver for Alauda-based card readers
+ *
+ * Current development and maintenance by:
+ *   (c) 2005 Daniel Drake <dsd@gentoo.org>
+ *
+ * The 'Alauda' is a chip manufacturered by RATOC for OEM use.
+ *
+ * Alauda implements a vendor-specific command set to access two media reader
+ * ports (XD, SmartMedia). This driver converts SCSI commands to the commands
+ * which are accepted by these devices.
+ *
+ * The driver was developed through reverse-engineering, with the help of the
+ * sddr09 driver which has many similarities, and with some help from the
+ * (very old) vendor-supplied GPL sma03 driver.
+ *
+ * For protocol info, see http://alauda.sourceforge.net
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <scsi/scsi.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+
+#include "usb.h"
+#include "transport.h"
+#include "protocol.h"
+#include "debug.h"
+#include "alauda.h"
+
+#define short_pack(lsb,msb) ( ((u16)(lsb)) | ( ((u16)(msb))<<8 ) )
+#define LSB_of(s) ((s)&0xFF)
+#define MSB_of(s) ((s)>>8)
+
+#define MEDIA_PORT(us) us->srb->device->lun
+#define MEDIA_INFO(us) ((struct alauda_info *)us->extra)->port[MEDIA_PORT(us)]
+
+#define PBA_LO(pba) ((pba & 0xF) << 5)
+#define PBA_HI(pba) (pba >> 3)
+#define PBA_ZONE(pba) (pba >> 11)
+
+/*
+ * Media handling
+ */
+
+struct alauda_card_info {
+	unsigned char id;		/* id byte */
+	unsigned char chipshift;	/* 1<<cs bytes total capacity */
+	unsigned char pageshift;	/* 1<<ps bytes in a page */
+	unsigned char blockshift;	/* 1<<bs pages per block */
+	unsigned char zoneshift;	/* 1<<zs blocks per zone */
+};
+
+static struct alauda_card_info alauda_card_ids[] = {
+	/* NAND flash */
+	{ 0x6e, 20, 8, 4, 8},	/* 1 MB */
+	{ 0xe8, 20, 8, 4, 8},	/* 1 MB */
+	{ 0xec, 20, 8, 4, 8},	/* 1 MB */
+	{ 0x64, 21, 8, 4, 9}, 	/* 2 MB */
+	{ 0xea, 21, 8, 4, 9},	/* 2 MB */
+	{ 0x6b, 22, 9, 4, 9},	/* 4 MB */
+	{ 0xe3, 22, 9, 4, 9},	/* 4 MB */
+	{ 0xe5, 22, 9, 4, 9},	/* 4 MB */
+	{ 0xe6, 23, 9, 4, 10},	/* 8 MB */
+	{ 0x73, 24, 9, 5, 10},	/* 16 MB */
+	{ 0x75, 25, 9, 5, 10},	/* 32 MB */
+	{ 0x76, 26, 9, 5, 10},	/* 64 MB */
+	{ 0x79, 27, 9, 5, 10},	/* 128 MB */
+	{ 0x71, 28, 9, 5, 10},	/* 256 MB */
+
+	/* MASK ROM */
+	{ 0x5d, 21, 9, 4, 8},	/* 2 MB */
+	{ 0xd5, 22, 9, 4, 9},	/* 4 MB */
+	{ 0xd6, 23, 9, 4, 10},	/* 8 MB */
+	{ 0x57, 24, 9, 4, 11},	/* 16 MB */
+	{ 0x58, 25, 9, 4, 12},	/* 32 MB */
+	{ 0,}
+};
+
+static struct alauda_card_info *alauda_card_find_id(unsigned char id) {
+	int i;
+
+	for (i = 0; alauda_card_ids[i].id != 0; i++)
+		if (alauda_card_ids[i].id == id)
+			return &(alauda_card_ids[i]);
+	return NULL;
+}
+
+/*
+ * ECC computation.
+ */
+
+static unsigned char parity[256];
+static unsigned char ecc2[256];
+
+static void nand_init_ecc(void) {
+	int i, j, a;
+
+	parity[0] = 0;
+	for (i = 1; i < 256; i++)
+		parity[i] = (parity[i&(i-1)] ^ 1);
+
+	for (i = 0; i < 256; i++) {
+		a = 0;
+		for (j = 0; j < 8; j++) {
+			if (i & (1<<j)) {
+				if ((j & 1) == 0)
+					a ^= 0x04;
+				if ((j & 2) == 0)
+					a ^= 0x10;
+				if ((j & 4) == 0)
+					a ^= 0x40;
+			}
+		}
+		ecc2[i] = ~(a ^ (a<<1) ^ (parity[i] ? 0xa8 : 0));
+	}
+}
+
+/* compute 3-byte ecc on 256 bytes */
+static void nand_compute_ecc(unsigned char *data, unsigned char *ecc) {
+	int i, j, a;
+	unsigned char par, bit, bits[8];
+
+	par = 0;
+	for (j = 0; j < 8; j++)
+		bits[j] = 0;
+
+	/* collect 16 checksum bits */
+	for (i = 0; i < 256; i++) {
+		par ^= data[i];
+		bit = parity[data[i]];
+		for (j = 0; j < 8; j++)
+			if ((i & (1<<j)) == 0)
+				bits[j] ^= bit;
+	}
+
+	/* put 4+4+4 = 12 bits in the ecc */
+	a = (bits[3] << 6) + (bits[2] << 4) + (bits[1] << 2) + bits[0];
+	ecc[0] = ~(a ^ (a<<1) ^ (parity[par] ? 0xaa : 0));
+
+	a = (bits[7] << 6) + (bits[6] << 4) + (bits[5] << 2) + bits[4];
+	ecc[1] = ~(a ^ (a<<1) ^ (parity[par] ? 0xaa : 0));
+
+	ecc[2] = ecc2[par];
+}
+
+static int nand_compare_ecc(unsigned char *data, unsigned char *ecc) {
+	return (data[0] == ecc[0] && data[1] == ecc[1] && data[2] == ecc[2]);
+}
+
+static void nand_store_ecc(unsigned char *data, unsigned char *ecc) {
+	memcpy(data, ecc, 3);
+}
+
+/*
+ * Alauda driver
+ */
+
+/*
+ * Forget our PBA <---> LBA mappings for a particular port
+ */
+static void alauda_free_maps (struct alauda_media_info *media_info)
+{
+	unsigned int shift = media_info->zoneshift
+		+ media_info->blockshift + media_info->pageshift;
+	unsigned int num_zones = media_info->capacity >> shift;
+	unsigned int i;
+
+	if (media_info->lba_to_pba != NULL)
+		for (i = 0; i < num_zones; i++) {
+			kfree(media_info->lba_to_pba[i]);
+			media_info->lba_to_pba[i] = NULL;
+		}
+
+	if (media_info->pba_to_lba != NULL)
+		for (i = 0; i < num_zones; i++) {
+			kfree(media_info->pba_to_lba[i]);
+			media_info->pba_to_lba[i] = NULL;
+		}
+}
+
+/*
+ * Returns 2 bytes of status data
+ * The first byte describes media status, and second byte describes door status
+ */
+static int alauda_get_media_status(struct us_data *us, unsigned char *data)
+{
+	int rc;
+	unsigned char command;
+
+	if (MEDIA_PORT(us) == ALAUDA_PORT_XD)
+		command = ALAUDA_GET_XD_MEDIA_STATUS;
+	else
+		command = ALAUDA_GET_SM_MEDIA_STATUS;
+
+	rc = usb_stor_ctrl_transfer(us, us->recv_ctrl_pipe,
+		command, 0xc0, 0, 1, data, 2);
+
+	US_DEBUGP("alauda_get_media_status: Media status %02X %02X\n",
+		data[0], data[1]);
+
+	return rc;
+}
+
+/*
+ * Clears the "media was changed" bit so that we know when it changes again
+ * in the future.
+ */
+static int alauda_ack_media(struct us_data *us)
+{
+	unsigned char command;
+
+	if (MEDIA_PORT(us) == ALAUDA_PORT_XD)
+		command = ALAUDA_ACK_XD_MEDIA_CHANGE;
+	else
+		command = ALAUDA_ACK_SM_MEDIA_CHANGE;
+
+	return usb_stor_ctrl_transfer(us, us->send_ctrl_pipe,
+		command, 0x40, 0, 1, NULL, 0);
+}
+
+/*
+ * Retrieves a 4-byte media signature, which indicates manufacturer, capacity,
+ * and some other details.
+ */
+static int alauda_get_media_signature(struct us_data *us, unsigned char *data)
+{
+	unsigned char command;
+
+	if (MEDIA_PORT(us) == ALAUDA_PORT_XD)
+		command = ALAUDA_GET_XD_MEDIA_SIG;
+	else
+		command = ALAUDA_GET_SM_MEDIA_SIG;
+
+	return usb_stor_ctrl_transfer(us, us->recv_ctrl_pipe,
+		command, 0xc0, 0, 0, data, 4);
+}
+
+/*
+ * Resets the media status (but not the whole device?)
+ */
+static int alauda_reset_media(struct us_data *us)
+{
+	unsigned char *command = us->iobuf;
+
+	memset(command, 0, 9);
+	command[0] = ALAUDA_BULK_CMD;
+	command[1] = ALAUDA_BULK_RESET_MEDIA;
+	command[8] = MEDIA_PORT(us);
+
+	return usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe,
+		command, 9, NULL);
+}
+
+/*
+ * Examines the media and deduces capacity, etc.
+ */
+static int alauda_init_media(struct us_data *us)
+{
+	unsigned char *data = us->iobuf;
+	int ready = 0;
+	struct alauda_card_info *media_info;
+	unsigned int num_zones;
+
+	while (ready == 0) {
+		msleep(20);
+
+		if (alauda_get_media_status(us, data) != USB_STOR_XFER_GOOD)
+			return USB_STOR_TRANSPORT_ERROR;
+
+		if (data[0] & 0x10)
+			ready = 1;
+	}
+
+	US_DEBUGP("alauda_init_media: We are ready for action!\n");
+
+	if (alauda_ack_media(us) != USB_STOR_XFER_GOOD)
+		return USB_STOR_TRANSPORT_ERROR;
+
+	msleep(10);
+
+	if (alauda_get_media_status(us, data) != USB_STOR_XFER_GOOD)
+		return USB_STOR_TRANSPORT_ERROR;
+
+	if (data[0] != 0x14) {
+		US_DEBUGP("alauda_init_media: Media not ready after ack\n");
+		return USB_STOR_TRANSPORT_ERROR;
+	}
+
+	if (alauda_get_media_signature(us, data) != USB_STOR_XFER_GOOD)
+		return USB_STOR_TRANSPORT_ERROR;
+
+	US_DEBUGP("alauda_init_media: Media signature: %02X %02X %02X %02X\n",
+		data[0], data[1], data[2], data[3]);
+	media_info = alauda_card_find_id(data[1]);
+	if (media_info == NULL) {
+		printk("alauda_init_media: Unrecognised media signature: "
+			"%02X %02X %02X %02X\n",
+			data[0], data[1], data[2], data[3]);
+		return USB_STOR_TRANSPORT_ERROR;
+	}
+
+	MEDIA_INFO(us).capacity = 1 << media_info->chipshift;
+	US_DEBUGP("Found media with capacity: %ldMB\n",
+		MEDIA_INFO(us).capacity >> 20);
+
+	MEDIA_INFO(us).pageshift = media_info->pageshift;
+	MEDIA_INFO(us).blockshift = media_info->blockshift;
+	MEDIA_INFO(us).zoneshift = media_info->zoneshift;
+
+	MEDIA_INFO(us).pagesize = 1 << media_info->pageshift;
+	MEDIA_INFO(us).blocksize = 1 << media_info->blockshift;
+	MEDIA_INFO(us).zonesize = 1 << media_info->zoneshift;
+
+	MEDIA_INFO(us).uzonesize = ((1 << media_info->zoneshift) / 128) * 125;
+	MEDIA_INFO(us).blockmask = MEDIA_INFO(us).blocksize - 1;
+
+	num_zones = MEDIA_INFO(us).capacity >> (MEDIA_INFO(us).zoneshift
+		+ MEDIA_INFO(us).blockshift + MEDIA_INFO(us).pageshift);
+	MEDIA_INFO(us).pba_to_lba = kcalloc(num_zones, sizeof(u16*), GFP_NOIO);
+	MEDIA_INFO(us).lba_to_pba = kcalloc(num_zones, sizeof(u16*), GFP_NOIO);
+
+	if (alauda_reset_media(us) != USB_STOR_XFER_GOOD)
+		return USB_STOR_TRANSPORT_ERROR;
+
+	return USB_STOR_TRANSPORT_GOOD;
+}
+
+/*
+ * Examines the media status and does the right thing when the media has gone,
+ * appeared, or changed.
+ */
+static int alauda_check_media(struct us_data *us)
+{
+	struct alauda_info *info = (struct alauda_info *) us->extra;
+	unsigned char status[2];
+	int rc;
+
+	rc = alauda_get_media_status(us, status);
+
+	/* Check for no media or door open */
+	if ((status[0] & 0x80) || ((status[0] & 0x1F) == 0x10)
+		|| ((status[1] & 0x01) == 0)) {
+		US_DEBUGP("alauda_check_media: No media, or door open\n");
+		alauda_free_maps(&MEDIA_INFO(us));
+		info->sense_key = 0x02;
+		info->sense_asc = 0x3A;
+		info->sense_ascq = 0x00;
+		return USB_STOR_TRANSPORT_FAILED;
+	}
+
+	/* Check for media change */
+	if (status[0] & 0x08) {
+		US_DEBUGP("alauda_check_media: Media change detected\n");
+		alauda_free_maps(&MEDIA_INFO(us));
+		alauda_init_media(us);
+
+		info->sense_key = UNIT_ATTENTION;
+		info->sense_asc = 0x28;
+		info->sense_ascq = 0x00;
+		return USB_STOR_TRANSPORT_FAILED;
+	}
+
+	return USB_STOR_TRANSPORT_GOOD;
+}
+
+/*
+ * Checks the status from the 2nd status register
+ * Returns 3 bytes of status data, only the first is known
+ */
+static int alauda_check_status2(struct us_data *us)
+{
+	int rc;
+	unsigned char command[] = {
+		ALAUDA_BULK_CMD, ALAUDA_BULK_GET_STATUS2,
+		0, 0, 0, 0, 3, 0, MEDIA_PORT(us)
+	};
+	unsigned char data[3];
+
+	rc = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe,
+		command, 9, NULL);
+	if (rc != USB_STOR_XFER_GOOD)
+		return rc;
+
+	rc = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe,
+		data, 3, NULL);
+	if (rc != USB_STOR_XFER_GOOD)
+		return rc;
+
+	US_DEBUGP("alauda_check_status2: %02X %02X %02X\n", data[0], data[1], data[2]);
+	if (data[0] & ALAUDA_STATUS_ERROR)
+		return USB_STOR_XFER_ERROR;
+
+	return USB_STOR_XFER_GOOD;
+}
+
+/*
+ * Gets the redundancy data for the first page of a PBA
+ * Returns 16 bytes.
+ */
+static int alauda_get_redu_data(struct us_data *us, u16 pba, unsigned char *data)
+{
+	int rc;
+	unsigned char command[] = {
+		ALAUDA_BULK_CMD, ALAUDA_BULK_GET_REDU_DATA,
+		PBA_HI(pba), PBA_ZONE(pba), 0, PBA_LO(pba), 0, 0, MEDIA_PORT(us)
+	};
+
+	rc = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe,
+		command, 9, NULL);
+	if (rc != USB_STOR_XFER_GOOD)
+		return rc;
+
+	return usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe,
+		data, 16, NULL);
+}
+
+/*
+ * Finds the first unused PBA in a zone
+ * Returns the absolute PBA of an unused PBA, or 0 if none found.
+ */
+static u16 alauda_find_unused_pba(struct alauda_media_info *info,
+	unsigned int zone)
+{
+	u16 *pba_to_lba = info->pba_to_lba[zone];
+	unsigned int i;
+
+	for (i = 0; i < info->zonesize; i++)
+		if (pba_to_lba[i] == UNDEF)
+			return (zone << info->zoneshift) + i;
+
+	return 0;
+}
+
+/*
+ * Reads the redundancy data for all PBA's in a zone
+ * Produces lba <--> pba mappings
+ */
+static int alauda_read_map(struct us_data *us, unsigned int zone)
+{
+	unsigned char *data = us->iobuf;
+	int result;
+	int i, j;
+	unsigned int zonesize = MEDIA_INFO(us).zonesize;
+	unsigned int uzonesize = MEDIA_INFO(us).uzonesize;
+	unsigned int lba_offset, lba_real, blocknum;
+	unsigned int zone_base_lba = zone * uzonesize;
+	unsigned int zone_base_pba = zone * zonesize;
+	u16 *lba_to_pba = kcalloc(zonesize, sizeof(u16), GFP_NOIO);
+	u16 *pba_to_lba = kcalloc(zonesize, sizeof(u16), GFP_NOIO);
+	if (lba_to_pba == NULL || pba_to_lba == NULL) {
+		result = USB_STOR_TRANSPORT_ERROR;
+		goto error;
+	}
+
+	US_DEBUGP("alauda_read_map: Mapping blocks for zone %d\n", zone);
+
+	/* 1024 PBA's per zone */
+	for (i = 0; i < zonesize; i++)
+		lba_to_pba[i] = pba_to_lba[i] = UNDEF;
+
+	for (i = 0; i < zonesize; i++) {
+		blocknum = zone_base_pba + i;
+
+		result = alauda_get_redu_data(us, blocknum, data);
+		if (result != USB_STOR_XFER_GOOD) {
+			result = USB_STOR_TRANSPORT_ERROR;
+			goto error;
+		}
+
+		/* special PBAs have control field 0^16 */
+		for (j = 0; j < 16; j++)
+			if (data[j] != 0)
+				goto nonz;
+		pba_to_lba[i] = UNUSABLE;
+		US_DEBUGP("alauda_read_map: PBA %d has no logical mapping\n", blocknum);
+		continue;
+
+	nonz:
+		/* unwritten PBAs have control field FF^16 */
+		for (j = 0; j < 16; j++)
+			if (data[j] != 0xff)
+				goto nonff;
+		continue;
+
+	nonff:
+		/* normal PBAs start with six FFs */
+		if (j < 6) {
+			US_DEBUGP("alauda_read_map: PBA %d has no logical mapping: "
+			       "reserved area = %02X%02X%02X%02X "
+			       "data status %02X block status %02X\n",
+			       blocknum, data[0], data[1], data[2], data[3],
+			       data[4], data[5]);
+			pba_to_lba[i] = UNUSABLE;
+			continue;
+		}
+
+		if ((data[6] >> 4) != 0x01) {
+			US_DEBUGP("alauda_read_map: PBA %d has invalid address "
+			       "field %02X%02X/%02X%02X\n",
+			       blocknum, data[6], data[7], data[11], data[12]);
+			pba_to_lba[i] = UNUSABLE;
+			continue;
+		}
+
+		/* check even parity */
+		if (parity[data[6] ^ data[7]]) {
+			printk("alauda_read_map: Bad parity in LBA for block %d"
+			       " (%02X %02X)\n", i, data[6], data[7]);
+			pba_to_lba[i] = UNUSABLE;
+			continue;
+		}
+
+		lba_offset = short_pack(data[7], data[6]);
+		lba_offset = (lba_offset & 0x07FF) >> 1;
+		lba_real = lba_offset + zone_base_lba;
+
+		/*
+		 * Every 1024 physical blocks ("zone"), the LBA numbers
+		 * go back to zero, but are within a higher block of LBA's.
+		 * Also, there is a maximum of 1000 LBA's per zone.
+		 * In other words, in PBA 1024-2047 you will find LBA 0-999
+		 * which are really LBA 1000-1999. This allows for 24 bad
+		 * or special physical blocks per zone.
+		 */
+
+		if (lba_offset >= uzonesize) {
+			printk("alauda_read_map: Bad low LBA %d for block %d\n",
+			       lba_real, blocknum);
+			continue;
+		}
+
+		if (lba_to_pba[lba_offset] != UNDEF) {
+			printk("alauda_read_map: LBA %d seen for PBA %d and %d\n",
+			       lba_real, lba_to_pba[lba_offset], blocknum);
+			continue;
+		}
+
+		pba_to_lba[i] = lba_real;
+		lba_to_pba[lba_offset] = blocknum;
+		continue;
+	}
+
+	MEDIA_INFO(us).lba_to_pba[zone] = lba_to_pba;
+	MEDIA_INFO(us).pba_to_lba[zone] = pba_to_lba;
+	result = 0;
+	goto out;
+
+error:
+	kfree(lba_to_pba);
+	kfree(pba_to_lba);
+out:
+	return result;
+}
+
+/*
+ * Checks to see whether we have already mapped a certain zone
+ * If we haven't, the map is generated
+ */
+static void alauda_ensure_map_for_zone(struct us_data *us, unsigned int zone)
+{
+	if (MEDIA_INFO(us).lba_to_pba[zone] == NULL
+		|| MEDIA_INFO(us).pba_to_lba[zone] == NULL)
+		alauda_read_map(us, zone);
+}
+
+/*
+ * Erases an entire block
+ */
+static int alauda_erase_block(struct us_data *us, u16 pba)
+{
+	int rc;
+	unsigned char command[] = {
+		ALAUDA_BULK_CMD, ALAUDA_BULK_ERASE_BLOCK, PBA_HI(pba),
+		PBA_ZONE(pba), 0, PBA_LO(pba), 0x02, 0, MEDIA_PORT(us)
+	};
+	unsigned char buf[2];
+
+	US_DEBUGP("alauda_erase_block: Erasing PBA %d\n", pba);
+
+	rc = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe,
+		command, 9, NULL);
+	if (rc != USB_STOR_XFER_GOOD)
+		return rc;
+
+	rc = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe,
+		buf, 2, NULL);
+	if (rc != USB_STOR_XFER_GOOD)
+		return rc;
+
+	US_DEBUGP("alauda_erase_block: Erase result: %02X %02X\n",
+		buf[0], buf[1]);
+	return rc;
+}
+
+/*
+ * Reads data from a certain offset page inside a PBA, including interleaved
+ * redundancy data. Returns (pagesize+64)*pages bytes in data.
+ */
+static int alauda_read_block_raw(struct us_data *us, u16 pba,
+		unsigned int page, unsigned int pages, unsigned char *data)
+{
+	int rc;
+	unsigned char command[] = {
+		ALAUDA_BULK_CMD, ALAUDA_BULK_READ_BLOCK, PBA_HI(pba),
+		PBA_ZONE(pba), 0, PBA_LO(pba) + page, pages, 0, MEDIA_PORT(us)
+	};
+
+	US_DEBUGP("alauda_read_block: pba %d page %d count %d\n",
+		pba, page, pages);
+
+	rc = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe,
+		command, 9, NULL);
+	if (rc != USB_STOR_XFER_GOOD)
+		return rc;
+
+	return usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe,
+		data, (MEDIA_INFO(us).pagesize + 64) * pages, NULL);
+}
+
+/*
+ * Reads data from a certain offset page inside a PBA, excluding redundancy
+ * data. Returns pagesize*pages bytes in data. Note that data must be big enough
+ * to hold (pagesize+64)*pages bytes of data, but you can ignore those 'extra'
+ * trailing bytes outside this function.
+ */
+static int alauda_read_block(struct us_data *us, u16 pba,
+		unsigned int page, unsigned int pages, unsigned char *data)
+{
+	int i, rc;
+	unsigned int pagesize = MEDIA_INFO(us).pagesize;
+
+	rc = alauda_read_block_raw(us, pba, page, pages, data);
+	if (rc != USB_STOR_XFER_GOOD)
+		return rc;
+
+	/* Cut out the redundancy data */
+	for (i = 0; i < pages; i++) {
+		int dest_offset = i * pagesize;
+		int src_offset = i * (pagesize + 64);
+		memmove(data + dest_offset, data + src_offset, pagesize);
+	}
+
+	return rc;
+}
+
+/*
+ * Writes an entire block of data and checks status after write.
+ * Redundancy data must be already included in data. Data should be
+ * (pagesize+64)*blocksize bytes in length.
+ */
+static int alauda_write_block(struct us_data *us, u16 pba, unsigned char *data)
+{
+	int rc;
+	struct alauda_info *info = (struct alauda_info *) us->extra;
+	unsigned char command[] = {
+		ALAUDA_BULK_CMD, ALAUDA_BULK_WRITE_BLOCK, PBA_HI(pba),
+		PBA_ZONE(pba), 0, PBA_LO(pba), 32, 0, MEDIA_PORT(us)
+	};
+
+	US_DEBUGP("alauda_write_block: pba %d\n", pba);
+
+	rc = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe,
+		command, 9, NULL);
+	if (rc != USB_STOR_XFER_GOOD)
+		return rc;
+
+	rc = usb_stor_bulk_transfer_buf(us, info->wr_ep, data,
+		(MEDIA_INFO(us).pagesize + 64) * MEDIA_INFO(us).blocksize,
+		NULL);
+	if (rc != USB_STOR_XFER_GOOD)
+		return rc;
+
+	return alauda_check_status2(us);
+}
+
+/*
+ * Write some data to a specific LBA.
+ */
+static int alauda_write_lba(struct us_data *us, u16 lba,
+		 unsigned int page, unsigned int pages,
+		 unsigned char *ptr, unsigned char *blockbuffer)
+{
+	u16 pba, lbap, new_pba;
+	unsigned char *bptr, *cptr, *xptr;
+	unsigned char ecc[3];
+	int i, result;
+	unsigned int uzonesize = MEDIA_INFO(us).uzonesize;
+	unsigned int zonesize = MEDIA_INFO(us).zonesize;
+	unsigned int pagesize = MEDIA_INFO(us).pagesize;
+	unsigned int blocksize = MEDIA_INFO(us).blocksize;
+	unsigned int lba_offset = lba % uzonesize;
+	unsigned int new_pba_offset;
+	unsigned int zone = lba / uzonesize;
+
+	alauda_ensure_map_for_zone(us, zone);
+
+	pba = MEDIA_INFO(us).lba_to_pba[zone][lba_offset];
+	if (pba == 1) {
+		/* Maybe it is impossible to write to PBA 1.
+		   Fake success, but don't do anything. */
+		printk("alauda_write_lba: avoid writing to pba 1\n");
+		return USB_STOR_TRANSPORT_GOOD;
+	}
+
+	new_pba = alauda_find_unused_pba(&MEDIA_INFO(us), zone);
+	if (!new_pba) {
+		printk("alauda_write_lba: Out of unused blocks\n");
+		return USB_STOR_TRANSPORT_ERROR;
+	}
+
+	/* read old contents */
+	if (pba != UNDEF) {
+		result = alauda_read_block_raw(us, pba, 0,
+			blocksize, blockbuffer);
+		if (result != USB_STOR_XFER_GOOD)
+			return result;
+	} else {
+		memset(blockbuffer, 0, blocksize * (pagesize + 64));
+	}
+
+	lbap = (lba_offset << 1) | 0x1000;
+	if (parity[MSB_of(lbap) ^ LSB_of(lbap)])
+		lbap ^= 1;
+
+	/* check old contents and fill lba */
+	for (i = 0; i < blocksize; i++) {
+		bptr = blockbuffer + (i * (pagesize + 64));
+		cptr = bptr + pagesize;
+		nand_compute_ecc(bptr, ecc);
+		if (!nand_compare_ecc(cptr+13, ecc)) {
+			US_DEBUGP("Warning: bad ecc in page %d- of pba %d\n",
+				  i, pba);
+			nand_store_ecc(cptr+13, ecc);
+		}
+		nand_compute_ecc(bptr + (pagesize / 2), ecc);
+		if (!nand_compare_ecc(cptr+8, ecc)) {
+			US_DEBUGP("Warning: bad ecc in page %d+ of pba %d\n",
+				  i, pba);
+			nand_store_ecc(cptr+8, ecc);
+		}
+		cptr[6] = cptr[11] = MSB_of(lbap);
+		cptr[7] = cptr[12] = LSB_of(lbap);
+	}
+
+	/* copy in new stuff and compute ECC */
+	xptr = ptr;
+	for (i = page; i < page+pages; i++) {
+		bptr = blockbuffer + (i * (pagesize + 64));
+		cptr = bptr + pagesize;
+		memcpy(bptr, xptr, pagesize);
+		xptr += pagesize;
+		nand_compute_ecc(bptr, ecc);
+		nand_store_ecc(cptr+13, ecc);
+		nand_compute_ecc(bptr + (pagesize / 2), ecc);
+		nand_store_ecc(cptr+8, ecc);
+	}
+
+	result = alauda_write_block(us, new_pba, blockbuffer);
+	if (result != USB_STOR_XFER_GOOD)
+		return result;
+
+	new_pba_offset = new_pba - (zone * zonesize);
+	MEDIA_INFO(us).pba_to_lba[zone][new_pba_offset] = lba;
+	MEDIA_INFO(us).lba_to_pba[zone][lba_offset] = new_pba;
+	US_DEBUGP("alauda_write_lba: Remapped LBA %d to PBA %d\n",
+		lba, new_pba);
+
+	if (pba != UNDEF) {
+		unsigned int pba_offset = pba - (zone * zonesize);
+		result = alauda_erase_block(us, pba);
+		if (result != USB_STOR_XFER_GOOD)
+			return result;
+		MEDIA_INFO(us).pba_to_lba[zone][pba_offset] = UNDEF;
+	}
+
+	return USB_STOR_TRANSPORT_GOOD;
+}
+
+/*
+ * Read data from a specific sector address
+ */
+static int alauda_read_data(struct us_data *us, unsigned long address,
+		unsigned int sectors)
+{
+	unsigned char *buffer;
+	u16 lba, max_lba;
+	unsigned int page, len, index, offset;
+	unsigned int blockshift = MEDIA_INFO(us).blockshift;
+	unsigned int pageshift = MEDIA_INFO(us).pageshift;
+	unsigned int blocksize = MEDIA_INFO(us).blocksize;
+	unsigned int pagesize = MEDIA_INFO(us).pagesize;
+	unsigned int uzonesize = MEDIA_INFO(us).uzonesize;
+	int result;
+
+	/*
+	 * Since we only read in one block at a time, we have to create
+	 * a bounce buffer and move the data a piece at a time between the
+	 * bounce buffer and the actual transfer buffer.
+	 * We make this buffer big enough to hold temporary redundancy data,
+	 * which we use when reading the data blocks.
+	 */
+
+	len = min(sectors, blocksize) * (pagesize + 64);
+	buffer = kmalloc(len, GFP_NOIO);
+	if (buffer == NULL) {
+		printk("alauda_read_data: Out of memory\n");
+		return USB_STOR_TRANSPORT_ERROR;
+	}
+
+	/* Figure out the initial LBA and page */
+	lba = address >> blockshift;
+	page = (address & MEDIA_INFO(us).blockmask);
+	max_lba = MEDIA_INFO(us).capacity >> (blockshift + pageshift);
+
+	result = USB_STOR_TRANSPORT_GOOD;
+	index = offset = 0;
+
+	while (sectors > 0) {
+		unsigned int zone = lba / uzonesize; /* integer division */
+		unsigned int lba_offset = lba - (zone * uzonesize);
+		unsigned int pages;
+		u16 pba;
+		alauda_ensure_map_for_zone(us, zone);
+
+		/* Not overflowing capacity? */
+		if (lba >= max_lba) {
+			US_DEBUGP("Error: Requested lba %u exceeds "
+				  "maximum %u\n", lba, max_lba);
+			result = USB_STOR_TRANSPORT_ERROR;
+			break;
+		}
+
+		/* Find number of pages we can read in this block */
+		pages = min(sectors, blocksize - page);
+		len = pages << pageshift;
+
+		/* Find where this lba lives on disk */
+		pba = MEDIA_INFO(us).lba_to_pba[zone][lba_offset];
+
+		if (pba == UNDEF) {	/* this lba was never written */
+			US_DEBUGP("Read %d zero pages (LBA %d) page %d\n",
+				  pages, lba, page);
+
+			/* This is not really an error. It just means
+			   that the block has never been written.
+			   Instead of returning USB_STOR_TRANSPORT_ERROR
+			   it is better to return all zero data. */
+
+			memset(buffer, 0, len);
+		} else {
+			US_DEBUGP("Read %d pages, from PBA %d"
+				  " (LBA %d) page %d\n",
+				  pages, pba, lba, page);
+
+			result = alauda_read_block(us, pba, page, pages, buffer);
+			if (result != USB_STOR_TRANSPORT_GOOD)
+				break;
+		}
+
+		/* Store the data in the transfer buffer */
+		usb_stor_access_xfer_buf(buffer, len, us->srb,
+				&index, &offset, TO_XFER_BUF);
+
+		page = 0;
+		lba++;
+		sectors -= pages;
+	}
+
+	kfree(buffer);
+	return result;
+}
+
+/*
+ * Write data to a specific sector address
+ */
+static int alauda_write_data(struct us_data *us, unsigned long address,
+		unsigned int sectors)
+{
+	unsigned char *buffer, *blockbuffer;
+	unsigned int page, len, index, offset;
+	unsigned int blockshift = MEDIA_INFO(us).blockshift;
+	unsigned int pageshift = MEDIA_INFO(us).pageshift;
+	unsigned int blocksize = MEDIA_INFO(us).blocksize;
+	unsigned int pagesize = MEDIA_INFO(us).pagesize;
+	u16 lba, max_lba;
+	int result;
+
+	/*
+	 * Since we don't write the user data directly to the device,
+	 * we have to create a bounce buffer and move the data a piece
+	 * at a time between the bounce buffer and the actual transfer buffer.
+	 */
+
+	len = min(sectors, blocksize) * pagesize;
+	buffer = kmalloc(len, GFP_NOIO);
+	if (buffer == NULL) {
+		printk("alauda_write_data: Out of memory\n");
+		return USB_STOR_TRANSPORT_ERROR;
+	}
+
+	/*
+	 * We also need a temporary block buffer, where we read in the old data,
+	 * overwrite parts with the new data, and manipulate the redundancy data
+	 */
+	blockbuffer = kmalloc((pagesize + 64) * blocksize, GFP_NOIO);
+	if (blockbuffer == NULL) {
+		printk("alauda_write_data: Out of memory\n");
+		kfree(buffer);
+		return USB_STOR_TRANSPORT_ERROR;
+	}
+
+	/* Figure out the initial LBA and page */
+	lba = address >> blockshift;
+	page = (address & MEDIA_INFO(us).blockmask);
+	max_lba = MEDIA_INFO(us).capacity >> (pageshift + blockshift);
+
+	result = USB_STOR_TRANSPORT_GOOD;
+	index = offset = 0;
+
+	while (sectors > 0) {
+		/* Write as many sectors as possible in this block */
+		unsigned int pages = min(sectors, blocksize - page);
+		len = pages << pageshift;
+
+		/* Not overflowing capacity? */
+		if (lba >= max_lba) {
+			US_DEBUGP("alauda_write_data: Requested lba %u exceeds "
+				  "maximum %u\n", lba, max_lba);
+			result = USB_STOR_TRANSPORT_ERROR;
+			break;
+		}
+
+		/* Get the data from the transfer buffer */
+		usb_stor_access_xfer_buf(buffer, len, us->srb,
+				&index, &offset, FROM_XFER_BUF);
+
+		result = alauda_write_lba(us, lba, page, pages, buffer,
+			blockbuffer);
+		if (result != USB_STOR_TRANSPORT_GOOD)
+			break;
+
+		page = 0;
+		lba++;
+		sectors -= pages;
+	}
+
+	kfree(buffer);
+	kfree(blockbuffer);
+	return result;
+}
+
+/*
+ * Our interface with the rest of the world
+ */
+
+static void alauda_info_destructor(void *extra)
+{
+	struct alauda_info *info = (struct alauda_info *) extra;
+	int port;
+
+	if (!info)
+		return;
+
+	for (port = 0; port < 2; port++) {
+		struct alauda_media_info *media_info = &info->port[port];
+
+		alauda_free_maps(media_info);
+		kfree(media_info->lba_to_pba);
+		kfree(media_info->pba_to_lba);
+	}
+}
+
+/*
+ * Initialize alauda_info struct and find the data-write endpoint
+ */
+int init_alauda(struct us_data *us)
+{
+	struct alauda_info *info;
+	struct usb_host_interface *altsetting = us->pusb_intf->cur_altsetting;
+	nand_init_ecc();
+
+	us->extra = kzalloc(sizeof(struct alauda_info), GFP_NOIO);
+	if (!us->extra) {
+		US_DEBUGP("init_alauda: Gah! Can't allocate storage for"
+			"alauda info struct!\n");
+		return USB_STOR_TRANSPORT_ERROR;
+	}
+	info = (struct alauda_info *) us->extra;
+	us->extra_destructor = alauda_info_destructor;
+
+	info->wr_ep = usb_sndbulkpipe(us->pusb_dev,
+		altsetting->endpoint[0].desc.bEndpointAddress
+		& USB_ENDPOINT_NUMBER_MASK);
+
+	return USB_STOR_TRANSPORT_GOOD;
+}
+
+int alauda_transport(struct scsi_cmnd *srb, struct us_data *us)
+{
+	int rc;
+	struct alauda_info *info = (struct alauda_info *) us->extra;
+	unsigned char *ptr = us->iobuf;
+	static unsigned char inquiry_response[36] = {
+		0x00, 0x80, 0x00, 0x01, 0x1F, 0x00, 0x00, 0x00
+	};
+
+	if (srb->cmnd[0] == INQUIRY) {
+		US_DEBUGP("alauda_transport: INQUIRY. "
+			"Returning bogus response.\n");
+		memcpy(ptr, inquiry_response, sizeof(inquiry_response));
+		fill_inquiry_response(us, ptr, 36);
+		return USB_STOR_TRANSPORT_GOOD;
+	}
+
+	if (srb->cmnd[0] == TEST_UNIT_READY) {
+		US_DEBUGP("alauda_transport: TEST_UNIT_READY.\n");
+		return alauda_check_media(us);
+	}
+
+	if (srb->cmnd[0] == READ_CAPACITY) {
+		unsigned int num_zones;
+		unsigned long capacity;
+
+		rc = alauda_check_media(us);
+		if (rc != USB_STOR_TRANSPORT_GOOD)
+			return rc;
+
+		num_zones = MEDIA_INFO(us).capacity >> (MEDIA_INFO(us).zoneshift
+			+ MEDIA_INFO(us).blockshift + MEDIA_INFO(us).pageshift);
+
+		capacity = num_zones * MEDIA_INFO(us).uzonesize
+			* MEDIA_INFO(us).blocksize;
+
+		/* Report capacity and page size */
+		((__be32 *) ptr)[0] = cpu_to_be32(capacity - 1);
+		((__be32 *) ptr)[1] = cpu_to_be32(512);
+
+		usb_stor_set_xfer_buf(ptr, 8, srb);
+		return USB_STOR_TRANSPORT_GOOD;
+	}
+
+	if (srb->cmnd[0] == READ_10) {
+		unsigned int page, pages;
+
+		rc = alauda_check_media(us);
+		if (rc != USB_STOR_TRANSPORT_GOOD)
+			return rc;
+
+		page = short_pack(srb->cmnd[3], srb->cmnd[2]);
+		page <<= 16;
+		page |= short_pack(srb->cmnd[5], srb->cmnd[4]);
+		pages = short_pack(srb->cmnd[8], srb->cmnd[7]);
+
+		US_DEBUGP("alauda_transport: READ_10: page %d pagect %d\n",
+			  page, pages);
+
+		return alauda_read_data(us, page, pages);
+	}
+
+	if (srb->cmnd[0] == WRITE_10) {
+		unsigned int page, pages;
+
+		rc = alauda_check_media(us);
+		if (rc != USB_STOR_TRANSPORT_GOOD)
+			return rc;
+
+		page = short_pack(srb->cmnd[3], srb->cmnd[2]);
+		page <<= 16;
+		page |= short_pack(srb->cmnd[5], srb->cmnd[4]);
+		pages = short_pack(srb->cmnd[8], srb->cmnd[7]);
+
+		US_DEBUGP("alauda_transport: WRITE_10: page %d pagect %d\n",
+			  page, pages);
+
+		return alauda_write_data(us, page, pages);
+	}
+
+	if (srb->cmnd[0] == REQUEST_SENSE) {
+		US_DEBUGP("alauda_transport: REQUEST_SENSE.\n");
+
+		memset(ptr, 0, 18);
+		ptr[0] = 0xF0;
+		ptr[2] = info->sense_key;
+		ptr[7] = 11;
+		ptr[12] = info->sense_asc;
+		ptr[13] = info->sense_ascq;
+		usb_stor_set_xfer_buf(ptr, 18, srb);
+
+		return USB_STOR_TRANSPORT_GOOD;
+	}
+
+	if (srb->cmnd[0] == ALLOW_MEDIUM_REMOVAL) {
+		/* sure.  whatever.  not like we can stop the user from popping
+		   the media out of the device (no locking doors, etc) */
+		return USB_STOR_TRANSPORT_GOOD;
+	}
+
+	US_DEBUGP("alauda_transport: Gah! Unknown command: %d (0x%x)\n",
+		srb->cmnd[0], srb->cmnd[0]);
+	info->sense_key = 0x05;
+	info->sense_asc = 0x20;
+	info->sense_ascq = 0x00;
+	return USB_STOR_TRANSPORT_FAILED;
+}
+
diff --git a/drivers/usb/storage/alauda.h b/drivers/usb/storage/alauda.h
new file mode 100644
index 000000000000..a700f87d0803
--- /dev/null
+++ b/drivers/usb/storage/alauda.h
@@ -0,0 +1,100 @@
+/*
+ * Driver for Alauda-based card readers
+ *
+ * Current development and maintenance by:
+ *    (c) 2005 Daniel Drake <dsd@gentoo.org>
+ *
+ * See alauda.c for more explanation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _USB_ALAUDA_H
+#define _USB_ALAUDA_H
+
+/*
+ * Status bytes
+ */
+#define ALAUDA_STATUS_ERROR		0x01
+#define ALAUDA_STATUS_READY		0x40
+
+/*
+ * Control opcodes (for request field)
+ */
+#define ALAUDA_GET_XD_MEDIA_STATUS	0x08
+#define ALAUDA_GET_SM_MEDIA_STATUS	0x98
+#define ALAUDA_ACK_XD_MEDIA_CHANGE	0x0a
+#define ALAUDA_ACK_SM_MEDIA_CHANGE	0x9a
+#define ALAUDA_GET_XD_MEDIA_SIG		0x86
+#define ALAUDA_GET_SM_MEDIA_SIG		0x96
+
+/*
+ * Bulk command identity (byte 0)
+ */
+#define ALAUDA_BULK_CMD			0x40
+
+/*
+ * Bulk opcodes (byte 1)
+ */
+#define ALAUDA_BULK_GET_REDU_DATA	0x85
+#define ALAUDA_BULK_READ_BLOCK		0x94
+#define ALAUDA_BULK_ERASE_BLOCK		0xa3
+#define ALAUDA_BULK_WRITE_BLOCK		0xb4
+#define ALAUDA_BULK_GET_STATUS2		0xb7
+#define ALAUDA_BULK_RESET_MEDIA		0xe0
+
+/*
+ * Port to operate on (byte 8)
+ */
+#define ALAUDA_PORT_XD			0x00
+#define ALAUDA_PORT_SM			0x01
+
+/*
+ * LBA and PBA are unsigned ints. Special values.
+ */
+#define UNDEF    0xffff
+#define SPARE    0xfffe
+#define UNUSABLE 0xfffd
+
+int init_alauda(struct us_data *us);
+int alauda_transport(struct scsi_cmnd *srb, struct us_data *us);
+
+struct alauda_media_info {
+	unsigned long capacity;		/* total media size in bytes */
+	unsigned int pagesize;		/* page size in bytes */
+	unsigned int blocksize;		/* number of pages per block */
+	unsigned int uzonesize;		/* number of usable blocks per zone */
+	unsigned int zonesize;		/* number of blocks per zone */
+	unsigned int blockmask;		/* mask to get page from address */
+
+	unsigned char pageshift;
+	unsigned char blockshift;
+	unsigned char zoneshift;
+
+	u16 **lba_to_pba;		/* logical to physical block map */
+	u16 **pba_to_lba;		/* physical to logical block map */
+};
+
+struct alauda_info {
+	struct alauda_media_info port[2];
+	int wr_ep;			/* endpoint to write data out of */
+
+	unsigned char sense_key;
+	unsigned long sense_asc;	/* additional sense code */
+	unsigned long sense_ascq;	/* additional sense code qualifier */
+};
+
+#endif
+
diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h
index be3c06d17533..7a865dd04683 100644
--- a/drivers/usb/storage/unusual_devs.h
+++ b/drivers/usb/storage/unusual_devs.h
@@ -535,6 +535,13 @@ UNUSUAL_DEV(  0x057b, 0x0022, 0x0000, 0x9999,
 		"Silicon Media R/W",
 		US_SC_DEVICE, US_PR_DEVICE, NULL, 0),
 
+#ifdef CONFIG_USB_STORAGE_ALAUDA
+UNUSUAL_DEV(  0x0584, 0x0008, 0x0102, 0x0102,
+		"Fujifilm",
+		"DPC-R1 (Alauda)",
+ 		US_SC_SCSI, US_PR_ALAUDA, init_alauda, 0 ),
+#endif
+
 /* Fabrizio Fellini <fello@libero.it> */
 UNUSUAL_DEV(  0x0595, 0x4343, 0x0000, 0x2210,
 		"Fujifilm",
@@ -784,6 +791,13 @@ UNUSUAL_DEV(  0x07af, 0x0006, 0x0100, 0x0100,
  		US_SC_SCSI, US_PR_DPCM_USB, NULL, 0 ),
 #endif
 
+#ifdef CONFIG_USB_STORAGE_ALAUDA
+UNUSUAL_DEV(  0x07b4, 0x010a, 0x0102, 0x0102,
+		"Olympus",
+		"MAUSB-10 (Alauda)",
+ 		US_SC_SCSI, US_PR_ALAUDA, init_alauda, 0 ),
+#endif
+
 #ifdef CONFIG_USB_STORAGE_DATAFAB
 UNUSUAL_DEV(  0x07c4, 0xa000, 0x0000, 0x0015,
 		"Datafab",
diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c
index 85c8c17b3c0c..dbcf23980ff1 100644
--- a/drivers/usb/storage/usb.c
+++ b/drivers/usb/storage/usb.c
@@ -94,6 +94,9 @@
 #ifdef CONFIG_USB_STORAGE_ONETOUCH
 #include "onetouch.h"
 #endif
+#ifdef CONFIG_USB_STORAGE_ALAUDA
+#include "alauda.h"
+#endif
 
 /* Some informational data */
 MODULE_AUTHOR("Matthew Dharm <mdharm-usb@one-eyed-alien.net>");
@@ -644,6 +647,15 @@ static int get_protocol(struct us_data *us)
 		break;
 #endif
 
+#ifdef CONFIG_USB_STORAGE_ALAUDA
+	case US_PR_ALAUDA:
+		us->transport_name  = "Alauda Control/Bulk";
+		us->transport = alauda_transport;
+		us->transport_reset = usb_stor_Bulk_reset;
+		us->max_lun = 1;
+		break;
+#endif
+
 	default:
 		return -EIO;
 	}
diff --git a/include/linux/usb_usual.h b/include/linux/usb_usual.h
index f9c058f33712..b2d08984a9f7 100644
--- a/include/linux/usb_usual.h
+++ b/include/linux/usb_usual.h
@@ -102,6 +102,9 @@ enum { US_DO_ALL_FLAGS };
 #ifdef CONFIG_USB_STORAGE_JUMPSHOT
 #define US_PR_JUMPSHOT  0xf3		/* Lexar Jumpshot */
 #endif
+#ifdef CONFIG_USB_STORAGE_ALAUDA
+#define US_PR_ALAUDA    0xf4		/* Alauda chipsets */
+#endif
 
 #define US_PR_DEVICE	0xff		/* Use device's value */
 
-- 
cgit v1.2.3-71-gd317


From 0296b2281352e4794e174b393c37f131502e09f0 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@suse.de>
Date: Fri, 11 Nov 2005 05:33:52 +0100
Subject: [PATCH] remove CONFIG_KOBJECT_UEVENT option

It makes zero sense to have hotplug, but not the netlink
events enabled today. Remove this option and merge the
kobject_uevent.h header into the kobject.h header file.

Signed-off-by: Kay Sievers <kay.sievers@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 MAINTAINERS                    |  6 -----
 drivers/input/input.c          |  1 -
 drivers/s390/crypto/z90main.c  |  1 -
 include/linux/kobject.h        | 35 +++++++++++++++++++++++++-
 include/linux/kobject_uevent.h | 57 ------------------------------------------
 init/Kconfig                   | 19 --------------
 kernel/sysctl.c                |  4 +--
 lib/kobject_uevent.c           | 24 ++++--------------
 8 files changed, 40 insertions(+), 107 deletions(-)
 delete mode 100644 include/linux/kobject_uevent.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 6af683025ae0..b49a4ad3b872 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1476,12 +1476,6 @@ W:	http://nfs.sourceforge.net/
 W:	http://www.cse.unsw.edu.au/~neilb/patches/linux-devel/
 S:	Maintained
 
-KERNEL EVENT LAYER (KOBJECT_UEVENT)
-P:	Robert Love
-M:	rml@novell.com
-L:	linux-kernel@vger.kernel.org
-S:	Maintained
-
 KEXEC
 P:	Eric Biederman
 P:	Randy Dunlap
diff --git a/drivers/input/input.c b/drivers/input/input.c
index bdd2a7fc268d..43b49ccd7dad 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -18,7 +18,6 @@
 #include <linux/random.h>
 #include <linux/major.h>
 #include <linux/proc_fs.h>
-#include <linux/kobject_uevent.h>
 #include <linux/interrupt.h>
 #include <linux/poll.h>
 #include <linux/device.h>
diff --git a/drivers/s390/crypto/z90main.c b/drivers/s390/crypto/z90main.c
index 4010f2bb85af..790fcbb74b43 100644
--- a/drivers/s390/crypto/z90main.c
+++ b/drivers/s390/crypto/z90main.c
@@ -34,7 +34,6 @@
 #include <linux/miscdevice.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
-#include <linux/kobject_uevent.h>
 #include <linux/proc_fs.h>
 #include <linux/syscalls.h>
 #include "z90crypt.h"
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index 7f7403aa4a41..baf5251d9f63 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -23,15 +23,31 @@
 #include <linux/spinlock.h>
 #include <linux/rwsem.h>
 #include <linux/kref.h>
-#include <linux/kobject_uevent.h>
 #include <linux/kernel.h>
 #include <asm/atomic.h>
 
 #define KOBJ_NAME_LEN	20
 
+#define HOTPLUG_PATH_LEN	256
+
+/* path to the userspace helper executed on an event */
+extern char hotplug_path[];
+
 /* counter to tag the hotplug event, read only except for the kobject core */
 extern u64 hotplug_seqnum;
 
+/* the actions here must match the proper string in lib/kobject_uevent.c */
+typedef int __bitwise kobject_action_t;
+enum kobject_action {
+	KOBJ_ADD	= (__force kobject_action_t) 0x01,	/* add event, for hotplug */
+	KOBJ_REMOVE	= (__force kobject_action_t) 0x02,	/* remove event, for hotplug */
+	KOBJ_CHANGE	= (__force kobject_action_t) 0x03,	/* a sysfs attribute file has changed */
+	KOBJ_MOUNT	= (__force kobject_action_t) 0x04,	/* mount event for block devices */
+	KOBJ_UMOUNT	= (__force kobject_action_t) 0x05,	/* umount event for block devices */
+	KOBJ_OFFLINE	= (__force kobject_action_t) 0x06,	/* offline event for hotplug devices */
+	KOBJ_ONLINE	= (__force kobject_action_t) 0x07,	/* online event for hotplug devices */
+};
+
 struct kobject {
 	const char		* k_name;
 	char			name[KOBJ_NAME_LEN];
@@ -243,16 +259,33 @@ extern void subsys_remove_file(struct subsystem * , struct subsys_attribute *);
 
 #ifdef CONFIG_HOTPLUG
 void kobject_hotplug(struct kobject *kobj, enum kobject_action action);
+
 int add_hotplug_env_var(char **envp, int num_envp, int *cur_index,
 			char *buffer, int buffer_size, int *cur_len,
 			const char *format, ...)
 	__attribute__((format (printf, 7, 8)));
+
+int kobject_uevent(struct kobject *kobj,
+		   enum kobject_action action,
+		   struct attribute *attr);
+int kobject_uevent_atomic(struct kobject *kobj,
+			  enum kobject_action action,
+			  struct attribute *attr);
+
 #else
 static inline void kobject_hotplug(struct kobject *kobj, enum kobject_action action) { }
 static inline int add_hotplug_env_var(char **envp, int num_envp, int *cur_index, 
 				      char *buffer, int buffer_size, int *cur_len, 
 				      const char *format, ...)
 { return 0; }
+int kobject_uevent(struct kobject *kobj,
+		   enum kobject_action action,
+		   struct attribute *attr)
+{ return 0; }
+int kobject_uevent_atomic(struct kobject *kobj,
+			  enum kobject_action action,
+			  struct attribute *attr)
+{ return 0; }
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/include/linux/kobject_uevent.h b/include/linux/kobject_uevent.h
deleted file mode 100644
index aa664fe7e561..000000000000
--- a/include/linux/kobject_uevent.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * kobject_uevent.h - list of kobject user events that can be generated
- *
- * Copyright (C) 2004 IBM Corp.
- * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
- *
- * This file is released under the GPLv2.
- *
- */
-
-#ifndef _KOBJECT_EVENT_H_
-#define _KOBJECT_EVENT_H_
-
-#define HOTPLUG_PATH_LEN	256
-
-/* path to the hotplug userspace helper executed on an event */
-extern char hotplug_path[];
-
-/*
- * If you add an action here, you must also add the proper string to the
- * lib/kobject_uevent.c file.
- */
-typedef int __bitwise kobject_action_t;
-enum kobject_action {
-	KOBJ_ADD	= (__force kobject_action_t) 0x01,	/* add event, for hotplug */
-	KOBJ_REMOVE	= (__force kobject_action_t) 0x02,	/* remove event, for hotplug */
-	KOBJ_CHANGE	= (__force kobject_action_t) 0x03,	/* a sysfs attribute file has changed */
-	KOBJ_MOUNT	= (__force kobject_action_t) 0x04,	/* mount event for block devices */
-	KOBJ_UMOUNT	= (__force kobject_action_t) 0x05,	/* umount event for block devices */
-	KOBJ_OFFLINE	= (__force kobject_action_t) 0x06,	/* offline event for hotplug devices */
-	KOBJ_ONLINE	= (__force kobject_action_t) 0x07,	/* online event for hotplug devices */
-};
-
-
-#ifdef CONFIG_KOBJECT_UEVENT
-int kobject_uevent(struct kobject *kobj,
-		   enum kobject_action action,
-		   struct attribute *attr);
-int kobject_uevent_atomic(struct kobject *kobj,
-			  enum kobject_action action,
-			  struct attribute *attr);
-#else
-static inline int kobject_uevent(struct kobject *kobj,
-				 enum kobject_action action,
-				 struct attribute *attr)
-{
-	return 0;
-}
-static inline int kobject_uevent_atomic(struct kobject *kobj,
-				        enum kobject_action action,
-					struct attribute *attr)
-{
-	return 0;
-}
-#endif
-
-#endif
diff --git a/init/Kconfig b/init/Kconfig
index 9fc0759fa942..0de8b7765ae4 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -205,25 +205,6 @@ config HOTPLUG
 	  modules require HOTPLUG functionality, but a module built
 	  outside the kernel tree does. Such modules require Y here.
 
-config KOBJECT_UEVENT
-	bool "Kernel Userspace Events" if EMBEDDED
-	depends on NET
-	default y
-	help
-	  This option enables the kernel userspace event layer, which is a
-	  simple mechanism for kernel-to-user communication over a netlink
-	  socket.
-	  The goal of the kernel userspace events layer is to provide a simple
-	  and efficient events system, that notifies userspace about kobject
-	  state changes. This will enable applications to just listen for
-	  events instead of polling system devices and files.
-	  Hotplug events (kobject addition and removal) are also available on
-	  the netlink socket in addition to the execution of /sbin/hotplug if
-	  CONFIG_HOTPLUG is enabled.
-
-	  Say Y, unless you are building a system requiring minimal memory
-	  consumption.
-
 config IKCONFIG
 	bool "Kernel .config support"
 	---help---
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b53115b882e1..6a51e25d4466 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -31,6 +31,7 @@
 #include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/kobject.h>
 #include <linux/net.h>
 #include <linux/sysrq.h>
 #include <linux/highuid.h>
@@ -83,9 +84,6 @@ static int ngroups_max = NGROUPS_MAX;
 #ifdef CONFIG_KMOD
 extern char modprobe_path[];
 #endif
-#ifdef CONFIG_HOTPLUG
-extern char hotplug_path[];
-#endif
 #ifdef CONFIG_CHR_DEV_SG
 extern int sg_big_buff;
 #endif
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 3ab375411e38..1f90eea7eebc 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -19,14 +19,17 @@
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/string.h>
-#include <linux/kobject_uevent.h>
 #include <linux/kobject.h>
 #include <net/sock.h>
 
 #define BUFFER_SIZE	1024	/* buffer for the hotplug env */
 #define NUM_ENVP	32	/* number of env pointers */
 
-#if defined(CONFIG_KOBJECT_UEVENT) || defined(CONFIG_HOTPLUG)
+#if defined(CONFIG_HOTPLUG)
+char hotplug_path[HOTPLUG_PATH_LEN] = "/sbin/hotplug";
+u64 hotplug_seqnum;
+static DEFINE_SPINLOCK(sequence_lock);
+
 static char *action_to_string(enum kobject_action action)
 {
 	switch (action) {
@@ -48,9 +51,7 @@ static char *action_to_string(enum kobject_action action)
 		return NULL;
 	}
 }
-#endif
 
-#ifdef CONFIG_KOBJECT_UEVENT
 static struct sock *uevent_sock;
 
 /**
@@ -168,21 +169,6 @@ static int __init kobject_uevent_init(void)
 
 postcore_initcall(kobject_uevent_init);
 
-#else
-static inline int send_uevent(const char *signal, const char *obj,
-			      char **envp, int gfp_mask)
-{
-	return 0;
-}
-
-#endif /* CONFIG_KOBJECT_UEVENT */
-
-
-#ifdef CONFIG_HOTPLUG
-char hotplug_path[HOTPLUG_PATH_LEN] = "/sbin/hotplug";
-u64 hotplug_seqnum;
-static DEFINE_SPINLOCK(sequence_lock);
-
 /**
  * kobject_hotplug - notify userspace by executing /sbin/hotplug
  *
-- 
cgit v1.2.3-71-gd317


From 033b96fd30db52a710d97b06f87d16fc59fee0f1 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@suse.de>
Date: Fri, 11 Nov 2005 06:09:55 +0100
Subject: [PATCH] remove mount/umount uevents from superblock handling

The names of these events have been confusing from the beginning
on, as they have been more like claim/release events. We needed these
events for noticing HAL if storage devices have been mounted.

Thanks to Al, we have the proper solution now and can poll()
/proc/mounts instead to get notfied about mount tree changes.

Signed-off-by: Kay Sievers <kay.sievers@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/super.c              | 15 +--------------
 include/linux/kobject.h |  6 ++----
 lib/kobject_uevent.c    |  4 ----
 3 files changed, 3 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index 6689dded3c84..5a347a4f673a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -665,16 +665,6 @@ static int test_bdev_super(struct super_block *s, void *data)
 	return (void *)s->s_bdev == data;
 }
 
-static void bdev_uevent(struct block_device *bdev, enum kobject_action action)
-{
-	if (bdev->bd_disk) {
-		if (bdev->bd_part)
-			kobject_uevent(&bdev->bd_part->kobj, action, NULL);
-		else
-			kobject_uevent(&bdev->bd_disk->kobj, action, NULL);
-	}
-}
-
 struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
 	int (*fill_super)(struct super_block *, void *, int))
@@ -717,10 +707,8 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 			up_write(&s->s_umount);
 			deactivate_super(s);
 			s = ERR_PTR(error);
-		} else {
+		} else
 			s->s_flags |= MS_ACTIVE;
-			bdev_uevent(bdev, KOBJ_MOUNT);
-		}
 	}
 
 	return s;
@@ -736,7 +724,6 @@ void kill_block_super(struct super_block *sb)
 {
 	struct block_device *bdev = sb->s_bdev;
 
-	bdev_uevent(bdev, KOBJ_UMOUNT);
 	generic_shutdown_super(sb);
 	sync_blockdev(bdev);
 	close_bdev_excl(bdev);
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index baf5251d9f63..e6926b327538 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -42,10 +42,8 @@ enum kobject_action {
 	KOBJ_ADD	= (__force kobject_action_t) 0x01,	/* add event, for hotplug */
 	KOBJ_REMOVE	= (__force kobject_action_t) 0x02,	/* remove event, for hotplug */
 	KOBJ_CHANGE	= (__force kobject_action_t) 0x03,	/* a sysfs attribute file has changed */
-	KOBJ_MOUNT	= (__force kobject_action_t) 0x04,	/* mount event for block devices */
-	KOBJ_UMOUNT	= (__force kobject_action_t) 0x05,	/* umount event for block devices */
-	KOBJ_OFFLINE	= (__force kobject_action_t) 0x06,	/* offline event for hotplug devices */
-	KOBJ_ONLINE	= (__force kobject_action_t) 0x07,	/* online event for hotplug devices */
+	KOBJ_OFFLINE	= (__force kobject_action_t) 0x04,	/* offline event for hotplug devices */
+	KOBJ_ONLINE	= (__force kobject_action_t) 0x05,	/* online event for hotplug devices */
 };
 
 struct kobject {
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 1f90eea7eebc..845bf67d94ca 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -39,10 +39,6 @@ static char *action_to_string(enum kobject_action action)
 		return "remove";
 	case KOBJ_CHANGE:
 		return "change";
-	case KOBJ_MOUNT:
-		return "mount";
-	case KOBJ_UMOUNT:
-		return "umount";
 	case KOBJ_OFFLINE:
 		return "offline";
 	case KOBJ_ONLINE:
-- 
cgit v1.2.3-71-gd317


From 5f123fbd80f4f788554636f02bf73e40f914e0d6 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@suse.de>
Date: Fri, 11 Nov 2005 14:43:07 +0100
Subject: [PATCH] merge kobject_uevent and kobject_hotplug

The distinction between hotplug and uevent does not make sense these
days, netlink events are the default.

udev depends entirely on netlink uevents. Only during early boot and
in initramfs, /sbin/hotplug is needed. So merge the two functions and
provide only one interface without all the options.

The netlink layer got a nice generic interface with named slots
recently, which is probably a better facility to plug events for
subsystem specific events.
Also the new poll() interface to /proc/mounts is a nicer way to
notify about changes than sending events through the core.
The uevents should only be used for driver core related requests to
userspace now.

Signed-off-by: Kay Sievers <kay.sievers@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/scsi/ipr.c      |   2 +-
 include/linux/kobject.h |  27 ++---
 lib/kobject_uevent.c    | 279 +++++++++++++++++-------------------------------
 3 files changed, 102 insertions(+), 206 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index fa2cb3582cfa..bf44a409ba0d 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -2132,7 +2132,7 @@ restart:
 	}
 
 	spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags);
-	kobject_uevent(&ioa_cfg->host->shost_classdev.kobj, KOBJ_CHANGE, NULL);
+	kobject_hotplug(&ioa_cfg->host->shost_classdev.kobj, KOBJ_CHANGE);
 	LEAVE;
 }
 
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index e6926b327538..5b08248fba72 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -39,11 +39,11 @@ extern u64 hotplug_seqnum;
 /* the actions here must match the proper string in lib/kobject_uevent.c */
 typedef int __bitwise kobject_action_t;
 enum kobject_action {
-	KOBJ_ADD	= (__force kobject_action_t) 0x01,	/* add event, for hotplug */
-	KOBJ_REMOVE	= (__force kobject_action_t) 0x02,	/* remove event, for hotplug */
-	KOBJ_CHANGE	= (__force kobject_action_t) 0x03,	/* a sysfs attribute file has changed */
-	KOBJ_OFFLINE	= (__force kobject_action_t) 0x04,	/* offline event for hotplug devices */
-	KOBJ_ONLINE	= (__force kobject_action_t) 0x05,	/* online event for hotplug devices */
+	KOBJ_ADD	= (__force kobject_action_t) 0x01,	/* exclusive to core */
+	KOBJ_REMOVE	= (__force kobject_action_t) 0x02,	/* exclusive to core */
+	KOBJ_CHANGE	= (__force kobject_action_t) 0x03,	/* device state change */
+	KOBJ_OFFLINE	= (__force kobject_action_t) 0x04,	/* device offline */
+	KOBJ_ONLINE	= (__force kobject_action_t) 0x05,	/* device online */
 };
 
 struct kobject {
@@ -262,28 +262,13 @@ int add_hotplug_env_var(char **envp, int num_envp, int *cur_index,
 			char *buffer, int buffer_size, int *cur_len,
 			const char *format, ...)
 	__attribute__((format (printf, 7, 8)));
-
-int kobject_uevent(struct kobject *kobj,
-		   enum kobject_action action,
-		   struct attribute *attr);
-int kobject_uevent_atomic(struct kobject *kobj,
-			  enum kobject_action action,
-			  struct attribute *attr);
-
 #else
 static inline void kobject_hotplug(struct kobject *kobj, enum kobject_action action) { }
+
 static inline int add_hotplug_env_var(char **envp, int num_envp, int *cur_index, 
 				      char *buffer, int buffer_size, int *cur_len, 
 				      const char *format, ...)
 { return 0; }
-int kobject_uevent(struct kobject *kobj,
-		   enum kobject_action action,
-		   struct attribute *attr)
-{ return 0; }
-int kobject_uevent_atomic(struct kobject *kobj,
-			  enum kobject_action action,
-			  struct attribute *attr)
-{ return 0; }
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 845bf67d94ca..dd061da3aba9 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -29,6 +29,7 @@
 char hotplug_path[HOTPLUG_PATH_LEN] = "/sbin/hotplug";
 u64 hotplug_seqnum;
 static DEFINE_SPINLOCK(sequence_lock);
+static struct sock *uevent_sock;
 
 static char *action_to_string(enum kobject_action action)
 {
@@ -48,123 +49,6 @@ static char *action_to_string(enum kobject_action action)
 	}
 }
 
-static struct sock *uevent_sock;
-
-/**
- * send_uevent - notify userspace by sending event through netlink socket
- *
- * @signal: signal name
- * @obj: object path (kobject)
- * @envp: possible hotplug environment to pass with the message
- * @gfp_mask:
- */
-static int send_uevent(const char *signal, const char *obj,
-		       char **envp, gfp_t gfp_mask)
-{
-	struct sk_buff *skb;
-	char *pos;
-	int len;
-
-	if (!uevent_sock)
-		return -EIO;
-
-	len = strlen(signal) + 1;
-	len += strlen(obj) + 1;
-
-	/* allocate buffer with the maximum possible message size */
-	skb = alloc_skb(len + BUFFER_SIZE, gfp_mask);
-	if (!skb)
-		return -ENOMEM;
-
-	pos = skb_put(skb, len);
-	sprintf(pos, "%s@%s", signal, obj);
-
-	/* copy the environment key by key to our continuous buffer */
-	if (envp) {
-		int i;
-
-		for (i = 2; envp[i]; i++) {
-			len = strlen(envp[i]) + 1;
-			pos = skb_put(skb, len);
-			strcpy(pos, envp[i]);
-		}
-	}
-
-	NETLINK_CB(skb).dst_group = 1;
-	return netlink_broadcast(uevent_sock, skb, 0, 1, gfp_mask);
-}
-
-static int do_kobject_uevent(struct kobject *kobj, enum kobject_action action, 
-			     struct attribute *attr, gfp_t gfp_mask)
-{
-	char *path;
-	char *attrpath;
-	char *signal;
-	int len;
-	int rc = -ENOMEM;
-
-	path = kobject_get_path(kobj, gfp_mask);
-	if (!path)
-		return -ENOMEM;
-
-	signal = action_to_string(action);
-	if (!signal)
-		return -EINVAL;
-
-	if (attr) {
-		len = strlen(path);
-		len += strlen(attr->name) + 2;
-		attrpath = kmalloc(len, gfp_mask);
-		if (!attrpath)
-			goto exit;
-		sprintf(attrpath, "%s/%s", path, attr->name);
-		rc = send_uevent(signal, attrpath, NULL, gfp_mask);
-		kfree(attrpath);
-	} else
-		rc = send_uevent(signal, path, NULL, gfp_mask);
-
-exit:
-	kfree(path);
-	return rc;
-}
-
-/**
- * kobject_uevent - notify userspace by sending event through netlink socket
- * 
- * @signal: signal name
- * @kobj: struct kobject that the event is happening to
- * @attr: optional struct attribute the event belongs to
- */
-int kobject_uevent(struct kobject *kobj, enum kobject_action action,
-		   struct attribute *attr)
-{
-	return do_kobject_uevent(kobj, action, attr, GFP_KERNEL);
-}
-EXPORT_SYMBOL_GPL(kobject_uevent);
-
-int kobject_uevent_atomic(struct kobject *kobj, enum kobject_action action,
-			  struct attribute *attr)
-{
-	return do_kobject_uevent(kobj, action, attr, GFP_ATOMIC);
-}
-EXPORT_SYMBOL_GPL(kobject_uevent_atomic);
-
-static int __init kobject_uevent_init(void)
-{
-	uevent_sock = netlink_kernel_create(NETLINK_KOBJECT_UEVENT, 1, NULL,
-					    THIS_MODULE);
-
-	if (!uevent_sock) {
-		printk(KERN_ERR
-		       "kobject_uevent: unable to create netlink socket!\n");
-		return -ENODEV;
-	}
-
-	return 0;
-}
-
-postcore_initcall(kobject_uevent_init);
-
 /**
  * kobject_hotplug - notify userspace by executing /sbin/hotplug
  *
@@ -173,95 +57,84 @@ postcore_initcall(kobject_uevent_init);
  */
 void kobject_hotplug(struct kobject *kobj, enum kobject_action action)
 {
-	char *argv [3];
-	char **envp = NULL;
-	char *buffer = NULL;
-	char *seq_buff;
+	char **envp;
+	char *buffer;
 	char *scratch;
+	const char *action_string;
+	const char *devpath = NULL;
+	const char *subsystem;
+	struct kobject *top_kobj;
+	struct kset *kset;
+	struct kset_hotplug_ops *hotplug_ops;
+	u64 seq;
+	char *seq_buff;
 	int i = 0;
 	int retval;
-	char *kobj_path = NULL;
-	const char *name = NULL;
-	char *action_string;
-	u64 seq;
-	struct kobject *top_kobj = kobj;
-	struct kset *kset;
-	static struct kset_hotplug_ops null_hotplug_ops;
-	struct kset_hotplug_ops *hotplug_ops = &null_hotplug_ops;
 
-	/* If this kobj does not belong to a kset,
-	   try to find a parent that does. */
+	pr_debug("%s\n", __FUNCTION__);
+
+	action_string = action_to_string(action);
+	if (!action_string)
+		return;
+
+	/* search the kset we belong to */
+	top_kobj = kobj;
 	if (!top_kobj->kset && top_kobj->parent) {
 		do {
 			top_kobj = top_kobj->parent;
 		} while (!top_kobj->kset && top_kobj->parent);
 	}
-
-	if (top_kobj->kset)
-		kset = top_kobj->kset;
-	else
+	if (!top_kobj->kset)
 		return;
 
-	if (kset->hotplug_ops)
-		hotplug_ops = kset->hotplug_ops;
+	kset = top_kobj->kset;
+	hotplug_ops = kset->hotplug_ops;
 
-	/* If the kset has a filter operation, call it.
-	   Skip the event, if the filter returns zero. */
-	if (hotplug_ops->filter) {
+	/*  skip the event, if the filter returns zero. */
+	if (hotplug_ops && hotplug_ops->filter)
 		if (!hotplug_ops->filter(kset, kobj))
 			return;
-	}
-
-	pr_debug ("%s\n", __FUNCTION__);
-
-	action_string = action_to_string(action);
-	if (!action_string)
-		return;
 
-	envp = kmalloc(NUM_ENVP * sizeof (char *), GFP_KERNEL);
+	/* environment index */
+	envp = kzalloc(NUM_ENVP * sizeof (char *), GFP_KERNEL);
 	if (!envp)
 		return;
-	memset (envp, 0x00, NUM_ENVP * sizeof (char *));
 
+	/* environment values */
 	buffer = kmalloc(BUFFER_SIZE, GFP_KERNEL);
 	if (!buffer)
 		goto exit;
 
-	if (hotplug_ops->name)
-		name = hotplug_ops->name(kset, kobj);
-	if (name == NULL)
-		name = kobject_name(&kset->kobj);
+	/* complete object path */
+	devpath = kobject_get_path(kobj, GFP_KERNEL);
+	if (!devpath)
+		goto exit;
 
-	argv [0] = hotplug_path;
-	argv [1] = (char *)name; /* won't be changed but 'const' has to go */
-	argv [2] = NULL;
+	/* originating subsystem */
+	if (hotplug_ops && hotplug_ops->name)
+		subsystem = hotplug_ops->name(kset, kobj);
+	else
+		subsystem = kobject_name(&kset->kobj);
 
-	/* minimal command environment */
-	envp [i++] = "HOME=/";
-	envp [i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+	/* event environemnt for helper process only */
+	envp[i++] = "HOME=/";
+	envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
 
+	/* default keys */
 	scratch = buffer;
-
 	envp [i++] = scratch;
 	scratch += sprintf(scratch, "ACTION=%s", action_string) + 1;
-
-	kobj_path = kobject_get_path(kobj, GFP_KERNEL);
-	if (!kobj_path)
-		goto exit;
-
 	envp [i++] = scratch;
-	scratch += sprintf (scratch, "DEVPATH=%s", kobj_path) + 1;
-
+	scratch += sprintf (scratch, "DEVPATH=%s", devpath) + 1;
 	envp [i++] = scratch;
-	scratch += sprintf(scratch, "SUBSYSTEM=%s", name) + 1;
+	scratch += sprintf(scratch, "SUBSYSTEM=%s", subsystem) + 1;
 
-	/* reserve space for the sequence,
-	 * put the real one in after the hotplug call */
+	/* just reserve the space, overwrite it after kset call has returned */
 	envp[i++] = seq_buff = scratch;
 	scratch += strlen("SEQNUM=18446744073709551616") + 1;
 
-	if (hotplug_ops->hotplug) {
-		/* have the kset specific function add its stuff */
+	/* let the kset specific function add its stuff */
+	if (hotplug_ops && hotplug_ops->hotplug) {
 		retval = hotplug_ops->hotplug (kset, kobj,
 				  &envp[i], NUM_ENVP - i, scratch,
 				  BUFFER_SIZE - (scratch - buffer));
@@ -272,27 +145,49 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action)
 		}
 	}
 
+	/* we will send an event, request a new sequence number */
 	spin_lock(&sequence_lock);
 	seq = ++hotplug_seqnum;
 	spin_unlock(&sequence_lock);
 	sprintf(seq_buff, "SEQNUM=%llu", (unsigned long long)seq);
 
-	pr_debug ("%s: %s %s seq=%llu %s %s %s %s %s\n",
-		  __FUNCTION__, argv[0], argv[1], (unsigned long long)seq,
-		  envp[0], envp[1], envp[2], envp[3], envp[4]);
-
-	send_uevent(action_string, kobj_path, envp, GFP_KERNEL);
+	/* send netlink message */
+	if (uevent_sock) {
+		struct sk_buff *skb;
+		size_t len;
+
+		/* allocate message with the maximum possible size */
+		len = strlen(action_string) + strlen(devpath) + 2;
+		skb = alloc_skb(len + BUFFER_SIZE, GFP_KERNEL);
+		if (skb) {
+			/* add header */
+			scratch = skb_put(skb, len);
+			sprintf(scratch, "%s@%s", action_string, devpath);
+
+			/* copy keys to our continuous event payload buffer */
+			for (i = 2; envp[i]; i++) {
+				len = strlen(envp[i]) + 1;
+				scratch = skb_put(skb, len);
+				strcpy(scratch, envp[i]);
+			}
+
+			NETLINK_CB(skb).dst_group = 1;
+			netlink_broadcast(uevent_sock, skb, 0, 1, GFP_KERNEL);
+		}
+	}
 
-	if (!hotplug_path[0])
-		goto exit;
+	/* call uevent_helper, usually only enabled during early boot */
+	if (hotplug_path[0]) {
+		char *argv [3];
 
-	retval = call_usermodehelper (argv[0], argv, envp, 0);
-	if (retval)
-		pr_debug ("%s - call_usermodehelper returned %d\n",
-			  __FUNCTION__, retval);
+		argv [0] = hotplug_path;
+		argv [1] = (char *)subsystem;
+		argv [2] = NULL;
+		call_usermodehelper (argv[0], argv, envp, 0);
+	}
 
 exit:
-	kfree(kobj_path);
+	kfree(devpath);
 	kfree(buffer);
 	kfree(envp);
 	return;
@@ -350,4 +245,20 @@ int add_hotplug_env_var(char **envp, int num_envp, int *cur_index,
 }
 EXPORT_SYMBOL(add_hotplug_env_var);
 
+static int __init kobject_uevent_init(void)
+{
+	uevent_sock = netlink_kernel_create(NETLINK_KOBJECT_UEVENT, 1, NULL,
+					    THIS_MODULE);
+
+	if (!uevent_sock) {
+		printk(KERN_ERR
+		       "kobject_uevent: unable to create netlink socket!\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+postcore_initcall(kobject_uevent_init);
+
 #endif /* CONFIG_HOTPLUG */
-- 
cgit v1.2.3-71-gd317


From 312c004d36ce6c739512bac83b452f4c20ab1f62 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@suse.de>
Date: Wed, 16 Nov 2005 09:00:00 +0100
Subject: [PATCH] driver core: replace "hotplug" by "uevent"

Leave the overloaded "hotplug" word to susbsystems which are handling
real devices. The driver core does not "plug" anything, it just exports
the state to userspace and generates events.

Signed-off-by: Kay Sievers <kay.sievers@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/powerpc/eeh-pci-error-recovery.txt | 31 ++++-----
 arch/powerpc/kernel/vio.c                        |  2 +-
 block/genhd.c                                    | 48 ++++++-------
 drivers/acpi/container.c                         |  8 +--
 drivers/acpi/processor_core.c                    |  8 +--
 drivers/acpi/scan.c                              | 14 ++--
 drivers/base/Kconfig                             |  4 +-
 drivers/base/class.c                             | 66 +++++++++---------
 drivers/base/core.c                              | 42 ++++++------
 drivers/base/cpu.c                               |  4 +-
 drivers/base/firmware_class.c                    | 45 ++++++-------
 drivers/base/memory.c                            | 12 ++--
 drivers/ieee1394/nodemgr.c                       | 20 +++---
 drivers/infiniband/core/sysfs.c                  | 16 ++---
 drivers/input/input.c                            | 14 ++--
 drivers/input/serio/serio.c                      | 22 +++---
 drivers/macintosh/macio_asic.c                   |  4 +-
 drivers/mmc/mmc_sysfs.c                          |  4 +-
 drivers/pci/hotplug.c                            | 44 ++++++------
 drivers/pci/pci-driver.c                         |  6 +-
 drivers/pci/pci.h                                |  4 +-
 drivers/pcmcia/cs.c                              | 10 +--
 drivers/pcmcia/ds.c                              | 50 +++++++-------
 drivers/scsi/ipr.c                               |  2 +-
 drivers/usb/core/usb.c                           | 86 +++++++++++-------------
 drivers/usb/host/hc_crisv10.c                    |  2 +-
 drivers/w1/w1.c                                  | 14 ++--
 fs/partitions/check.c                            |  6 +-
 include/linux/device.h                           | 14 ++--
 include/linux/firmware.h                         |  2 +-
 include/linux/kobject.h                          | 40 ++++++-----
 include/linux/sysctl.h                           |  2 +-
 include/linux/usb.h                              |  2 +-
 kernel/ksysfs.c                                  | 14 ++--
 kernel/sysctl.c                                  |  4 +-
 lib/kobject.c                                    |  4 +-
 lib/kobject_uevent.c                             | 64 +++++++++---------
 net/bluetooth/hci_sysfs.c                        |  4 +-
 net/bridge/br_sysfs_if.c                         |  4 +-
 net/core/net-sysfs.c                             |  8 +--
 40 files changed, 372 insertions(+), 378 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/powerpc/eeh-pci-error-recovery.txt b/Documentation/powerpc/eeh-pci-error-recovery.txt
index e75d7474322c..67a11a36270c 100644
--- a/Documentation/powerpc/eeh-pci-error-recovery.txt
+++ b/Documentation/powerpc/eeh-pci-error-recovery.txt
@@ -115,7 +115,7 @@ Current PPC64 Linux EEH Implementation
 At this time, a generic EEH recovery mechanism has been implemented,
 so that individual device drivers do not need to be modified to support
 EEH recovery.  This generic mechanism piggy-backs on the PCI hotplug
-infrastructure,  and percolates events up through the hotplug/udev
+infrastructure,  and percolates events up through the userspace/udev
 infrastructure.  Followiing is a detailed description of how this is
 accomplished.
 
@@ -172,7 +172,7 @@ A handler for the EEH notifier_block events is implemented in
 drivers/pci/hotplug/pSeries_pci.c, called handle_eeh_events().
 It saves the device BAR's and then calls rpaphp_unconfig_pci_adapter().
 This last call causes the device driver for the card to be stopped,
-which causes hotplug events to go out to user space. This triggers
+which causes uevents to go out to user space. This triggers
 user-space scripts that might issue commands such as "ifdown eth0"
 for ethernet cards, and so on.  This handler then sleeps for 5 seconds,
 hoping to give the user-space scripts enough time to complete.
@@ -258,29 +258,30 @@ rpa_php_unconfig_pci_adapter() {             // in rpaphp_pci.c
     calls
     pci_destroy_dev (struct pci_dev *) {
       calls
-      device_unregister (&dev->dev) {      // in /drivers/base/core.c
+      device_unregister (&dev->dev) {        // in /drivers/base/core.c
         calls
-        device_del(struct device * dev) {  // in /drivers/base/core.c
+        device_del(struct device * dev) {    // in /drivers/base/core.c
           calls
-          kobject_del() {                  //in /libs/kobject.c
+          kobject_del() {                    //in /libs/kobject.c
             calls
-            kobject_hotplug() {            // in /libs/kobject.c
+            kobject_uevent() {               // in /libs/kobject.c
               calls
-              kset_hotplug() {             // in /lib/kobject.c
+              kset_uevent() {                // in /lib/kobject.c
                 calls
-                kset->hotplug_ops->hotplug() which is really just
+                kset->uevent_ops->uevent()   // which is really just
                 a call to
-                dev_hotplug() {           // in /drivers/base/core.c
+                dev_uevent() {               // in /drivers/base/core.c
                   calls
-                  dev->bus->hotplug() which is really just a call to
-                  pci_hotplug () {      // in drivers/pci/hotplug.c
+                  dev->bus->uevent() which is really just a call to
+                  pci_uevent () {            // in drivers/pci/hotplug.c
                     which prints device name, etc....
                  }
                }
-               then kset_hotplug() calls
-                call_usermodehelper () with
-                   argv[0]=hotplug_path[] which is "/sbin/hotplug"
-             --> event to userspace,
+               then kobject_uevent() sends a netlink uevent to userspace
+               --> userspace uevent
+               (during early boot, nobody listens to netlink events and
+               kobject_uevent() executes uevent_helper[], which runs the
+               event process /sbin/hotplug)
            }
          }
          kobject_del() then calls sysfs_remove_dir(), which would
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index 71a6addf9f7f..13c41495fe06 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -293,6 +293,6 @@ static int vio_hotplug(struct device *dev, char **envp, int num_envp,
 
 struct bus_type vio_bus_type = {
 	.name = "vio",
-	.hotplug = vio_hotplug,
+	.uevent = vio_hotplug,
 	.match = vio_bus_match,
 };
diff --git a/block/genhd.c b/block/genhd.c
index f04609d553b8..f1ed83f3f083 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -358,7 +358,7 @@ static struct sysfs_ops disk_sysfs_ops = {
 static ssize_t disk_uevent_store(struct gendisk * disk,
 				 const char *buf, size_t count)
 {
-	kobject_hotplug(&disk->kobj, KOBJ_ADD);
+	kobject_uevent(&disk->kobj, KOBJ_ADD);
 	return count;
 }
 static ssize_t disk_dev_read(struct gendisk * disk, char *page)
@@ -455,14 +455,14 @@ static struct kobj_type ktype_block = {
 
 extern struct kobj_type ktype_part;
 
-static int block_hotplug_filter(struct kset *kset, struct kobject *kobj)
+static int block_uevent_filter(struct kset *kset, struct kobject *kobj)
 {
 	struct kobj_type *ktype = get_ktype(kobj);
 
 	return ((ktype == &ktype_block) || (ktype == &ktype_part));
 }
 
-static int block_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
+static int block_uevent(struct kset *kset, struct kobject *kobj, char **envp,
 			 int num_envp, char *buffer, int buffer_size)
 {
 	struct kobj_type *ktype = get_ktype(kobj);
@@ -474,40 +474,40 @@ static int block_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
 
 	if (ktype == &ktype_block) {
 		disk = container_of(kobj, struct gendisk, kobj);
-		add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size,
-				    &length, "MINOR=%u", disk->first_minor);
+		add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
+			       &length, "MINOR=%u", disk->first_minor);
 	} else if (ktype == &ktype_part) {
 		disk = container_of(kobj->parent, struct gendisk, kobj);
 		part = container_of(kobj, struct hd_struct, kobj);
-		add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size,
-				    &length, "MINOR=%u",
-				    disk->first_minor + part->partno);
+		add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
+			       &length, "MINOR=%u",
+			       disk->first_minor + part->partno);
 	} else
 		return 0;
 
-	add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, &length,
-			    "MAJOR=%u", disk->major);
+	add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
+		       "MAJOR=%u", disk->major);
 
 	/* add physical device, backing this device  */
 	physdev = disk->driverfs_dev;
 	if (physdev) {
 		char *path = kobject_get_path(&physdev->kobj, GFP_KERNEL);
 
-		add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size,
-				    &length, "PHYSDEVPATH=%s", path);
+		add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
+			       &length, "PHYSDEVPATH=%s", path);
 		kfree(path);
 
 		if (physdev->bus)
-			add_hotplug_env_var(envp, num_envp, &i,
-					    buffer, buffer_size, &length,
-					    "PHYSDEVBUS=%s",
-					    physdev->bus->name);
+			add_uevent_var(envp, num_envp, &i,
+				       buffer, buffer_size, &length,
+				       "PHYSDEVBUS=%s",
+				       physdev->bus->name);
 
 		if (physdev->driver)
-			add_hotplug_env_var(envp, num_envp, &i,
-					    buffer, buffer_size, &length,
-					    "PHYSDEVDRIVER=%s",
-					    physdev->driver->name);
+			add_uevent_var(envp, num_envp, &i,
+				       buffer, buffer_size, &length,
+				       "PHYSDEVDRIVER=%s",
+				       physdev->driver->name);
 	}
 
 	/* terminate, set to next free slot, shrink available space */
@@ -520,13 +520,13 @@ static int block_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
 	return 0;
 }
 
-static struct kset_hotplug_ops block_hotplug_ops = {
-	.filter		= block_hotplug_filter,
-	.hotplug	= block_hotplug,
+static struct kset_uevent_ops block_uevent_ops = {
+	.filter		= block_uevent_filter,
+	.uevent		= block_uevent,
 };
 
 /* declare block_subsys. */
-static decl_subsys(block, &ktype_block, &block_hotplug_ops);
+static decl_subsys(block, &ktype_block, &block_uevent_ops);
 
 
 /*
diff --git a/drivers/acpi/container.c b/drivers/acpi/container.c
index 27ec12c1fab0..b69a8cad82b7 100644
--- a/drivers/acpi/container.c
+++ b/drivers/acpi/container.c
@@ -172,21 +172,21 @@ static void container_notify_cb(acpi_handle handle, u32 type, void *context)
 			if (ACPI_FAILURE(status) || !device) {
 				result = container_device_add(&device, handle);
 				if (!result)
-					kobject_hotplug(&device->kobj,
-							KOBJ_ONLINE);
+					kobject_uevent(&device->kobj,
+						       KOBJ_ONLINE);
 				else
 					printk("Failed to add container\n");
 			}
 		} else {
 			if (ACPI_SUCCESS(status)) {
 				/* device exist and this is a remove request */
-				kobject_hotplug(&device->kobj, KOBJ_OFFLINE);
+				kobject_uevent(&device->kobj, KOBJ_OFFLINE);
 			}
 		}
 		break;
 	case ACPI_NOTIFY_EJECT_REQUEST:
 		if (!acpi_bus_get_device(handle, &device) && device) {
-			kobject_hotplug(&device->kobj, KOBJ_OFFLINE);
+			kobject_uevent(&device->kobj, KOBJ_OFFLINE);
 		}
 		break;
 	default:
diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index 0c561c571f29..1278aca96fe3 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -748,7 +748,7 @@ int acpi_processor_device_add(acpi_handle handle, struct acpi_device **device)
 		return_VALUE(-ENODEV);
 
 	if ((pr->id >= 0) && (pr->id < NR_CPUS)) {
-		kobject_hotplug(&(*device)->kobj, KOBJ_ONLINE);
+		kobject_uevent(&(*device)->kobj, KOBJ_ONLINE);
 	}
 	return_VALUE(0);
 }
@@ -788,13 +788,13 @@ acpi_processor_hotplug_notify(acpi_handle handle, u32 event, void *data)
 		}
 
 		if (pr->id >= 0 && (pr->id < NR_CPUS)) {
-			kobject_hotplug(&device->kobj, KOBJ_OFFLINE);
+			kobject_uevent(&device->kobj, KOBJ_OFFLINE);
 			break;
 		}
 
 		result = acpi_processor_start(device);
 		if ((!result) && ((pr->id >= 0) && (pr->id < NR_CPUS))) {
-			kobject_hotplug(&device->kobj, KOBJ_ONLINE);
+			kobject_uevent(&device->kobj, KOBJ_ONLINE);
 		} else {
 			ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
 					  "Device [%s] failed to start\n",
@@ -818,7 +818,7 @@ acpi_processor_hotplug_notify(acpi_handle handle, u32 event, void *data)
 		}
 
 		if ((pr->id < NR_CPUS) && (cpu_present(pr->id)))
-			kobject_hotplug(&device->kobj, KOBJ_OFFLINE);
+			kobject_uevent(&device->kobj, KOBJ_OFFLINE);
 		break;
 	default:
 		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 31218e1d2a18..0745d20afb8c 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -78,7 +78,7 @@ static struct kobj_type ktype_acpi_ns = {
 	.release = acpi_device_release,
 };
 
-static int namespace_hotplug(struct kset *kset, struct kobject *kobj,
+static int namespace_uevent(struct kset *kset, struct kobject *kobj,
 			     char **envp, int num_envp, char *buffer,
 			     int buffer_size)
 {
@@ -89,8 +89,8 @@ static int namespace_hotplug(struct kset *kset, struct kobject *kobj,
 	if (!dev->driver)
 		return 0;
 
-	if (add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, &len,
-				"PHYSDEVDRIVER=%s", dev->driver->name))
+	if (add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &len,
+			   "PHYSDEVDRIVER=%s", dev->driver->name))
 		return -ENOMEM;
 
 	envp[i] = NULL;
@@ -98,8 +98,8 @@ static int namespace_hotplug(struct kset *kset, struct kobject *kobj,
 	return 0;
 }
 
-static struct kset_hotplug_ops namespace_hotplug_ops = {
-	.hotplug = &namespace_hotplug,
+static struct kset_uevent_ops namespace_uevent_ops = {
+	.uevent = &namespace_uevent,
 };
 
 static struct kset acpi_namespace_kset = {
@@ -108,7 +108,7 @@ static struct kset acpi_namespace_kset = {
 		 },
 	.subsys = &acpi_subsys,
 	.ktype = &ktype_acpi_ns,
-	.hotplug_ops = &namespace_hotplug_ops,
+	.uevent_ops = &namespace_uevent_ops,
 };
 
 static void acpi_device_register(struct acpi_device *device,
@@ -347,7 +347,7 @@ static int acpi_bus_get_wakeup_device_flags(struct acpi_device *device)
 }
 
 /* --------------------------------------------------------------------------
-		ACPI hotplug sysfs device file support
+		ACPI sysfs device file support
    -------------------------------------------------------------------------- */
 static ssize_t acpi_eject_store(struct acpi_device *device,
 				const char *buf, size_t count);
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 934149c1512b..f0eff3dac58d 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -19,11 +19,11 @@ config PREVENT_FIRMWARE_BUILD
 	  If unsure say Y here.
 
 config FW_LOADER
-	tristate "Hotplug firmware loading support"
+	tristate "Userspace firmware loading support"
 	select HOTPLUG
 	---help---
 	  This option is provided for the case where no in-kernel-tree modules
-	  require hotplug firmware loading support, but a module built outside
+	  require userspace firmware loading support, but a module built outside
 	  the kernel tree does.
 
 config DEBUG_DRIVER
diff --git a/drivers/base/class.c b/drivers/base/class.c
index db65fd0babe9..df7fdabd0730 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -178,7 +178,7 @@ static void class_device_create_release(struct class_device *class_dev)
 }
 
 /* needed to allow these devices to have parent class devices */
-static int class_device_create_hotplug(struct class_device *class_dev,
+static int class_device_create_uevent(struct class_device *class_dev,
 				       char **envp, int num_envp,
 				       char *buffer, int buffer_size)
 {
@@ -331,7 +331,7 @@ static struct kobj_type ktype_class_device = {
 	.release	= class_dev_release,
 };
 
-static int class_hotplug_filter(struct kset *kset, struct kobject *kobj)
+static int class_uevent_filter(struct kset *kset, struct kobject *kobj)
 {
 	struct kobj_type *ktype = get_ktype(kobj);
 
@@ -343,14 +343,14 @@ static int class_hotplug_filter(struct kset *kset, struct kobject *kobj)
 	return 0;
 }
 
-static const char *class_hotplug_name(struct kset *kset, struct kobject *kobj)
+static const char *class_uevent_name(struct kset *kset, struct kobject *kobj)
 {
 	struct class_device *class_dev = to_class_dev(kobj);
 
 	return class_dev->class->name;
 }
 
-static int class_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
+static int class_uevent(struct kset *kset, struct kobject *kobj, char **envp,
 			 int num_envp, char *buffer, int buffer_size)
 {
 	struct class_device *class_dev = to_class_dev(kobj);
@@ -365,29 +365,29 @@ static int class_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
 		struct device *dev = class_dev->dev;
 		char *path = kobject_get_path(&dev->kobj, GFP_KERNEL);
 
-		add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size,
-				    &length, "PHYSDEVPATH=%s", path);
+		add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
+			       &length, "PHYSDEVPATH=%s", path);
 		kfree(path);
 
 		if (dev->bus)
-			add_hotplug_env_var(envp, num_envp, &i,
-					    buffer, buffer_size, &length,
-					    "PHYSDEVBUS=%s", dev->bus->name);
+			add_uevent_var(envp, num_envp, &i,
+				       buffer, buffer_size, &length,
+				       "PHYSDEVBUS=%s", dev->bus->name);
 
 		if (dev->driver)
-			add_hotplug_env_var(envp, num_envp, &i,
-					    buffer, buffer_size, &length,
-					    "PHYSDEVDRIVER=%s", dev->driver->name);
+			add_uevent_var(envp, num_envp, &i,
+				       buffer, buffer_size, &length,
+				       "PHYSDEVDRIVER=%s", dev->driver->name);
 	}
 
 	if (MAJOR(class_dev->devt)) {
-		add_hotplug_env_var(envp, num_envp, &i,
-				    buffer, buffer_size, &length,
-				    "MAJOR=%u", MAJOR(class_dev->devt));
+		add_uevent_var(envp, num_envp, &i,
+			       buffer, buffer_size, &length,
+			       "MAJOR=%u", MAJOR(class_dev->devt));
 
-		add_hotplug_env_var(envp, num_envp, &i,
-				    buffer, buffer_size, &length,
-				    "MINOR=%u", MINOR(class_dev->devt));
+		add_uevent_var(envp, num_envp, &i,
+			       buffer, buffer_size, &length,
+			       "MINOR=%u", MINOR(class_dev->devt));
 	}
 
 	/* terminate, set to next free slot, shrink available space */
@@ -397,30 +397,30 @@ static int class_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
 	buffer = &buffer[length];
 	buffer_size -= length;
 
-	if (class_dev->hotplug) {
+	if (class_dev->uevent) {
 		/* have the class device specific function add its stuff */
-		retval = class_dev->hotplug(class_dev, envp, num_envp,
+		retval = class_dev->uevent(class_dev, envp, num_envp,
 					    buffer, buffer_size);
 		if (retval)
-			pr_debug("class_dev->hotplug() returned %d\n", retval);
-	} else if (class_dev->class->hotplug) {
+			pr_debug("class_dev->uevent() returned %d\n", retval);
+	} else if (class_dev->class->uevent) {
 		/* have the class specific function add its stuff */
-		retval = class_dev->class->hotplug(class_dev, envp, num_envp,
+		retval = class_dev->class->uevent(class_dev, envp, num_envp,
 						   buffer, buffer_size);
 		if (retval)
-			pr_debug("class->hotplug() returned %d\n", retval);
+			pr_debug("class->uevent() returned %d\n", retval);
 	}
 
 	return retval;
 }
 
-static struct kset_hotplug_ops class_hotplug_ops = {
-	.filter =	class_hotplug_filter,
-	.name =		class_hotplug_name,
-	.hotplug =	class_hotplug,
+static struct kset_uevent_ops class_uevent_ops = {
+	.filter =	class_uevent_filter,
+	.name =		class_uevent_name,
+	.uevent =	class_uevent,
 };
 
-static decl_subsys(class_obj, &ktype_class_device, &class_hotplug_ops);
+static decl_subsys(class_obj, &ktype_class_device, &class_uevent_ops);
 
 
 static int class_device_add_attrs(struct class_device * cd)
@@ -464,7 +464,7 @@ static ssize_t show_dev(struct class_device *class_dev, char *buf)
 static ssize_t store_uevent(struct class_device *class_dev,
 			    const char *buf, size_t count)
 {
-	kobject_hotplug(&class_dev->kobj, KOBJ_ADD);
+	kobject_uevent(&class_dev->kobj, KOBJ_ADD);
 	return count;
 }
 
@@ -559,7 +559,7 @@ int class_device_add(struct class_device *class_dev)
 				  class_name);
 	}
 
-	kobject_hotplug(&class_dev->kobj, KOBJ_ADD);
+	kobject_uevent(&class_dev->kobj, KOBJ_ADD);
 
 	/* notify any interfaces this device is now here */
 	if (parent_class) {
@@ -632,7 +632,7 @@ struct class_device *class_device_create(struct class *cls,
 	class_dev->class = cls;
 	class_dev->parent = parent;
 	class_dev->release = class_device_create_release;
-	class_dev->hotplug = class_device_create_hotplug;
+	class_dev->uevent = class_device_create_uevent;
 
 	va_start(args, fmt);
 	vsnprintf(class_dev->class_id, BUS_ID_SIZE, fmt, args);
@@ -674,7 +674,7 @@ void class_device_del(struct class_device *class_dev)
 		class_device_remove_file(class_dev, class_dev->devt_attr);
 	class_device_remove_attrs(class_dev);
 
-	kobject_hotplug(&class_dev->kobj, KOBJ_REMOVE);
+	kobject_uevent(&class_dev->kobj, KOBJ_REMOVE);
 	kobject_del(&class_dev->kobj);
 
 	class_device_put(parent_device);
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 8615b42b517a..fd8059920dbf 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -90,7 +90,7 @@ static struct kobj_type ktype_device = {
 };
 
 
-static int dev_hotplug_filter(struct kset *kset, struct kobject *kobj)
+static int dev_uevent_filter(struct kset *kset, struct kobject *kobj)
 {
 	struct kobj_type *ktype = get_ktype(kobj);
 
@@ -102,14 +102,14 @@ static int dev_hotplug_filter(struct kset *kset, struct kobject *kobj)
 	return 0;
 }
 
-static const char *dev_hotplug_name(struct kset *kset, struct kobject *kobj)
+static const char *dev_uevent_name(struct kset *kset, struct kobject *kobj)
 {
 	struct device *dev = to_dev(kobj);
 
 	return dev->bus->name;
 }
 
-static int dev_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
+static int dev_uevent(struct kset *kset, struct kobject *kobj, char **envp,
 			int num_envp, char *buffer, int buffer_size)
 {
 	struct device *dev = to_dev(kobj);
@@ -119,15 +119,15 @@ static int dev_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
 
 	/* add bus name of physical device */
 	if (dev->bus)
-		add_hotplug_env_var(envp, num_envp, &i,
-				    buffer, buffer_size, &length,
-				    "PHYSDEVBUS=%s", dev->bus->name);
+		add_uevent_var(envp, num_envp, &i,
+			       buffer, buffer_size, &length,
+			       "PHYSDEVBUS=%s", dev->bus->name);
 
 	/* add driver name of physical device */
 	if (dev->driver)
-		add_hotplug_env_var(envp, num_envp, &i,
-				    buffer, buffer_size, &length,
-				    "PHYSDEVDRIVER=%s", dev->driver->name);
+		add_uevent_var(envp, num_envp, &i,
+			       buffer, buffer_size, &length,
+			       "PHYSDEVDRIVER=%s", dev->driver->name);
 
 	/* terminate, set to next free slot, shrink available space */
 	envp[i] = NULL;
@@ -136,11 +136,11 @@ static int dev_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
 	buffer = &buffer[length];
 	buffer_size -= length;
 
-	if (dev->bus && dev->bus->hotplug) {
+	if (dev->bus && dev->bus->uevent) {
 		/* have the bus specific function add its stuff */
-		retval = dev->bus->hotplug (dev, envp, num_envp, buffer, buffer_size);
+		retval = dev->bus->uevent(dev, envp, num_envp, buffer, buffer_size);
 			if (retval) {
-			pr_debug ("%s - hotplug() returned %d\n",
+			pr_debug ("%s - uevent() returned %d\n",
 				  __FUNCTION__, retval);
 		}
 	}
@@ -148,16 +148,16 @@ static int dev_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
 	return retval;
 }
 
-static struct kset_hotplug_ops device_hotplug_ops = {
-	.filter =	dev_hotplug_filter,
-	.name =		dev_hotplug_name,
-	.hotplug =	dev_hotplug,
+static struct kset_uevent_ops device_uevent_ops = {
+	.filter =	dev_uevent_filter,
+	.name =		dev_uevent_name,
+	.uevent =	dev_uevent,
 };
 
 static ssize_t store_uevent(struct device *dev, struct device_attribute *attr,
 			    const char *buf, size_t count)
 {
-	kobject_hotplug(&dev->kobj, KOBJ_ADD);
+	kobject_uevent(&dev->kobj, KOBJ_ADD);
 	return count;
 }
 
@@ -165,7 +165,7 @@ static ssize_t store_uevent(struct device *dev, struct device_attribute *attr,
  *	device_subsys - structure to be registered with kobject core.
  */
 
-decl_subsys(devices, &ktype_device, &device_hotplug_ops);
+decl_subsys(devices, &ktype_device, &device_uevent_ops);
 
 
 /**
@@ -274,7 +274,7 @@ int device_add(struct device *dev)
 	dev->uevent_attr.store = store_uevent;
 	device_create_file(dev, &dev->uevent_attr);
 
-	kobject_hotplug(&dev->kobj, KOBJ_ADD);
+	kobject_uevent(&dev->kobj, KOBJ_ADD);
 	if ((error = device_pm_add(dev)))
 		goto PMError;
 	if ((error = bus_add_device(dev)))
@@ -291,7 +291,7 @@ int device_add(struct device *dev)
  BusError:
 	device_pm_remove(dev);
  PMError:
-	kobject_hotplug(&dev->kobj, KOBJ_REMOVE);
+	kobject_uevent(&dev->kobj, KOBJ_REMOVE);
 	kobject_del(&dev->kobj);
  Error:
 	if (parent)
@@ -374,7 +374,7 @@ void device_del(struct device * dev)
 		platform_notify_remove(dev);
 	bus_remove_device(dev);
 	device_pm_remove(dev);
-	kobject_hotplug(&dev->kobj, KOBJ_REMOVE);
+	kobject_uevent(&dev->kobj, KOBJ_REMOVE);
 	kobject_del(&dev->kobj);
 	if (parent)
 		put_device(parent);
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index a95844790f7b..281d26784d25 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -41,14 +41,14 @@ static ssize_t store_online(struct sys_device *dev, const char *buf,
 	case '0':
 		ret = cpu_down(cpu->sysdev.id);
 		if (!ret)
-			kobject_hotplug(&dev->kobj, KOBJ_OFFLINE);
+			kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
 		break;
 	case '1':
 		ret = smp_prepare_cpu(cpu->sysdev.id);
 		if (!ret)
 			ret = cpu_up(cpu->sysdev.id);
 		if (!ret)
-			kobject_hotplug(&dev->kobj, KOBJ_ONLINE);
+			kobject_uevent(&dev->kobj, KOBJ_ONLINE);
 		break;
 	default:
 		ret = -EINVAL;
diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index 59dacb6552c0..5b3d5e9ddcb6 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -85,17 +85,17 @@ firmware_timeout_store(struct class *class, const char *buf, size_t count)
 static CLASS_ATTR(timeout, 0644, firmware_timeout_show, firmware_timeout_store);
 
 static void  fw_class_dev_release(struct class_device *class_dev);
-int firmware_class_hotplug(struct class_device *dev, char **envp,
+int firmware_class_uevent(struct class_device *dev, char **envp,
 			   int num_envp, char *buffer, int buffer_size);
 
 static struct class firmware_class = {
 	.name		= "firmware",
-	.hotplug	= firmware_class_hotplug,
+	.uevent	= firmware_class_uevent,
 	.release	= fw_class_dev_release,
 };
 
 int
-firmware_class_hotplug(struct class_device *class_dev, char **envp,
+firmware_class_uevent(struct class_device *class_dev, char **envp,
 		       int num_envp, char *buffer, int buffer_size)
 {
 	struct firmware_priv *fw_priv = class_get_devdata(class_dev);
@@ -104,13 +104,12 @@ firmware_class_hotplug(struct class_device *class_dev, char **envp,
 	if (!test_bit(FW_STATUS_READY, &fw_priv->status))
 		return -ENODEV;
 
-	if (add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, &len,
-			"FIRMWARE=%s", fw_priv->fw_id))
+	if (add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &len,
+			   "FIRMWARE=%s", fw_priv->fw_id))
 		return -ENOMEM;
-	if (add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, &len,
-			"TIMEOUT=%i", loading_timeout))
+	if (add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &len,
+			   "TIMEOUT=%i", loading_timeout))
 		return -ENOMEM;
-
 	envp[i] = NULL;
 
 	return 0;
@@ -352,7 +351,7 @@ error_kfree:
 
 static int
 fw_setup_class_device(struct firmware *fw, struct class_device **class_dev_p,
-		      const char *fw_name, struct device *device, int hotplug)
+		      const char *fw_name, struct device *device, int uevent)
 {
 	struct class_device *class_dev;
 	struct firmware_priv *fw_priv;
@@ -384,7 +383,7 @@ fw_setup_class_device(struct firmware *fw, struct class_device **class_dev_p,
 		goto error_unreg;
 	}
 
-	if (hotplug)
+	if (uevent)
                 set_bit(FW_STATUS_READY, &fw_priv->status);
         else
                 set_bit(FW_STATUS_READY_NOHOTPLUG, &fw_priv->status);
@@ -399,7 +398,7 @@ out:
 
 static int
 _request_firmware(const struct firmware **firmware_p, const char *name,
-		 struct device *device, int hotplug)
+		 struct device *device, int uevent)
 {
 	struct class_device *class_dev;
 	struct firmware_priv *fw_priv;
@@ -418,19 +417,19 @@ _request_firmware(const struct firmware **firmware_p, const char *name,
 	}
 
 	retval = fw_setup_class_device(firmware, &class_dev, name, device,
-		hotplug);
+				       uevent);
 	if (retval)
 		goto error_kfree_fw;
 
 	fw_priv = class_get_devdata(class_dev);
 
-	if (hotplug) {
+	if (uevent) {
 		if (loading_timeout > 0) {
 			fw_priv->timeout.expires = jiffies + loading_timeout * HZ;
 			add_timer(&fw_priv->timeout);
 		}
 
-		kobject_hotplug(&class_dev->kobj, KOBJ_ADD);
+		kobject_uevent(&class_dev->kobj, KOBJ_ADD);
 		wait_for_completion(&fw_priv->completion);
 		set_bit(FW_STATUS_DONE, &fw_priv->status);
 		del_timer_sync(&fw_priv->timeout);
@@ -456,7 +455,7 @@ out:
 }
 
 /**
- * request_firmware: - request firmware to hotplug and wait for it
+ * request_firmware: - send firmware request and wait for it
  * @firmware_p: pointer to firmware image
  * @name: name of firmware file
  * @device: device for which firmware is being loaded
@@ -466,7 +465,7 @@ out:
  *
  *      Should be called from user context where sleeping is allowed.
  *
- *      @name will be used as $FIRMWARE in the hotplug environment and
+ *      @name will be used as $FIRMWARE in the uevent environment and
  *      should be distinctive enough not to be confused with any other
  *      firmware image for this or any other device.
  **/
@@ -474,8 +473,8 @@ int
 request_firmware(const struct firmware **firmware_p, const char *name,
                  struct device *device)
 {
-        int hotplug = 1;
-        return _request_firmware(firmware_p, name, device, hotplug);
+        int uevent = 1;
+        return _request_firmware(firmware_p, name, device, uevent);
 }
 
 /**
@@ -518,7 +517,7 @@ struct firmware_work {
 	struct device *device;
 	void *context;
 	void (*cont)(const struct firmware *fw, void *context);
-	int hotplug;
+	int uevent;
 };
 
 static int
@@ -533,7 +532,7 @@ request_firmware_work_func(void *arg)
 	}
 	daemonize("%s/%s", "firmware", fw_work->name);
 	ret = _request_firmware(&fw, fw_work->name, fw_work->device,
-		fw_work->hotplug);
+		fw_work->uevent);
 	if (ret < 0)
 		fw_work->cont(NULL, fw_work->context);
 	else {
@@ -548,7 +547,7 @@ request_firmware_work_func(void *arg)
 /**
  * request_firmware_nowait: asynchronous version of request_firmware
  * @module: module requesting the firmware
- * @hotplug: invokes hotplug event to copy the firmware image if this flag
+ * @uevent: sends uevent to copy the firmware image if this flag
  *	is non-zero else the firmware copy must be done manually.
  * @name: name of firmware file
  * @device: device for which firmware is being loaded
@@ -562,7 +561,7 @@ request_firmware_work_func(void *arg)
  **/
 int
 request_firmware_nowait(
-	struct module *module, int hotplug,
+	struct module *module, int uevent,
 	const char *name, struct device *device, void *context,
 	void (*cont)(const struct firmware *fw, void *context))
 {
@@ -583,7 +582,7 @@ request_firmware_nowait(
 		.device = device,
 		.context = context,
 		.cont = cont,
-		.hotplug = hotplug,
+		.uevent = uevent,
 	};
 
 	ret = kernel_thread(request_firmware_work_func, fw_work,
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index bc3ca6a656b2..7e1d077874df 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -29,12 +29,12 @@ static struct sysdev_class memory_sysdev_class = {
 	set_kset_name(MEMORY_CLASS_NAME),
 };
 
-static char *memory_hotplug_name(struct kset *kset, struct kobject *kobj)
+static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj)
 {
 	return MEMORY_CLASS_NAME;
 }
 
-static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
+static int memory_uevent(struct kset *kset, struct kobject *kobj, char **envp,
 			int num_envp, char *buffer, int buffer_size)
 {
 	int retval = 0;
@@ -42,9 +42,9 @@ static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
 	return retval;
 }
 
-static struct kset_hotplug_ops memory_hotplug_ops = {
-	.name		= memory_hotplug_name,
-	.hotplug	= memory_hotplug,
+static struct kset_uevent_ops memory_uevent_ops = {
+	.name		= memory_uevent_name,
+	.uevent		= memory_uevent,
 };
 
 static struct notifier_block *memory_chain;
@@ -431,7 +431,7 @@ int __init memory_dev_init(void)
 	unsigned int i;
 	int ret;
 
-	memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops;
+	memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops;
 	ret = sysdev_class_register(&memory_sysdev_class);
 
 	/*
diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c
index 0ea37b1bccb2..f2453668acf5 100644
--- a/drivers/ieee1394/nodemgr.c
+++ b/drivers/ieee1394/nodemgr.c
@@ -121,8 +121,8 @@ struct host_info {
 };
 
 static int nodemgr_bus_match(struct device * dev, struct device_driver * drv);
-static int nodemgr_hotplug(struct class_device *cdev, char **envp, int num_envp,
-			   char *buffer, int buffer_size);
+static int nodemgr_uevent(struct class_device *cdev, char **envp, int num_envp,
+			  char *buffer, int buffer_size);
 static void nodemgr_resume_ne(struct node_entry *ne);
 static void nodemgr_remove_ne(struct node_entry *ne);
 static struct node_entry *find_entry_by_guid(u64 guid);
@@ -162,7 +162,7 @@ static void ud_cls_release(struct class_device *class_dev)
 static struct class nodemgr_ud_class = {
 	.name		= "ieee1394",
 	.release	= ud_cls_release,
-	.hotplug	= nodemgr_hotplug,
+	.uevent		= nodemgr_uevent,
 };
 
 static struct hpsb_highlevel nodemgr_highlevel;
@@ -966,7 +966,7 @@ static struct unit_directory *nodemgr_process_unit_directory
 				if (ud_child == NULL)
 					break;
 				
-				/* inherit unspecified values so hotplug picks it up */
+				/* inherit unspecified values, the driver core picks it up */
 				if ((ud->flags & UNIT_DIRECTORY_MODEL_ID) &&
 				    !(ud_child->flags & UNIT_DIRECTORY_MODEL_ID))
 				{
@@ -1062,8 +1062,8 @@ static void nodemgr_process_root_directory(struct host_info *hi, struct node_ent
 
 #ifdef CONFIG_HOTPLUG
 
-static int nodemgr_hotplug(struct class_device *cdev, char **envp, int num_envp,
-			   char *buffer, int buffer_size)
+static int nodemgr_uevent(struct class_device *cdev, char **envp, int num_envp,
+			  char *buffer, int buffer_size)
 {
 	struct unit_directory *ud;
 	int i = 0;
@@ -1112,8 +1112,8 @@ do {								\
 
 #else
 
-static int nodemgr_hotplug(struct class_device *cdev, char **envp, int num_envp,
-			   char *buffer, int buffer_size)
+static int nodemgr_uevent(struct class_device *cdev, char **envp, int num_envp,
+			  char *buffer, int buffer_size)
 {
 	return -ENODEV;
 }
@@ -1618,8 +1618,8 @@ static int nodemgr_host_thread(void *__hi)
 
 		/* Scan our nodes to get the bus options and create node
 		 * entries. This does not do the sysfs stuff, since that
-		 * would trigger hotplug callbacks and such, which is a
-		 * bad idea at this point. */
+		 * would trigger uevents and such, which is a bad idea at
+		 * this point. */
 		nodemgr_node_scan(hi, generation);
 
 		/* This actually does the full probe, with sysfs
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 08648b1a387e..1f1743c5c9a3 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -434,24 +434,24 @@ static void ib_device_release(struct class_device *cdev)
 	kfree(dev);
 }
 
-static int ib_device_hotplug(struct class_device *cdev, char **envp,
-			     int num_envp, char *buf, int size)
+static int ib_device_uevent(struct class_device *cdev, char **envp,
+			    int num_envp, char *buf, int size)
 {
 	struct ib_device *dev = container_of(cdev, struct ib_device, class_dev);
 	int i = 0, len = 0;
 
-	if (add_hotplug_env_var(envp, num_envp, &i, buf, size, &len,
-				"NAME=%s", dev->name))
+	if (add_uevent_var(envp, num_envp, &i, buf, size, &len,
+			   "NAME=%s", dev->name))
 		return -ENOMEM;
 
 	/*
-	 * It might be nice to pass the node GUID to hotplug, but
+	 * It might be nice to pass the node GUID with the event, but
 	 * right now the only way to get it is to query the device
 	 * provider, and this can crash during device removal because
 	 * we are will be running after driver removal has started.
 	 * We could add a node_guid field to struct ib_device, or we
-	 * could just let the hotplug script read the node GUID from
-	 * sysfs when devices are added.
+	 * could just let userspace read the node GUID from sysfs when
+	 * devices are added.
 	 */
 
 	envp[i] = NULL;
@@ -653,7 +653,7 @@ static struct class_device_attribute *ib_class_attributes[] = {
 static struct class ib_class = {
 	.name    = "infiniband",
 	.release = ib_device_release,
-	.hotplug = ib_device_hotplug,
+	.uevent = ib_device_uevent,
 };
 
 int ib_device_register_sysfs(struct ib_device *device)
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 43b49ccd7dad..2d37b394e384 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -610,10 +610,10 @@ static void input_dev_release(struct class_device *class_dev)
 }
 
 /*
- * Input hotplugging interface - loading event handlers based on
+ * Input uevent interface - loading event handlers based on
  * device bitfields.
  */
-static int input_add_hotplug_bm_var(char **envp, int num_envp, int *cur_index,
+static int input_add_uevent_bm_var(char **envp, int num_envp, int *cur_index,
 				    char *buffer, int buffer_size, int *cur_len,
 				    const char *name, unsigned long *bitmap, int max)
 {
@@ -638,7 +638,7 @@ static int input_add_hotplug_bm_var(char **envp, int num_envp, int *cur_index,
 
 #define INPUT_ADD_HOTPLUG_VAR(fmt, val...)				\
 	do {								\
-		int err = add_hotplug_env_var(envp, num_envp, &i,	\
+		int err = add_uevent_var(envp, num_envp, &i,	\
 					buffer, buffer_size, &len,	\
 					fmt, val);			\
 		if (err)						\
@@ -647,15 +647,15 @@ static int input_add_hotplug_bm_var(char **envp, int num_envp, int *cur_index,
 
 #define INPUT_ADD_HOTPLUG_BM_VAR(name, bm, max)				\
 	do {								\
-		int err = input_add_hotplug_bm_var(envp, num_envp, &i,	\
+		int err = input_add_uevent_bm_var(envp, num_envp, &i,	\
 					buffer, buffer_size, &len,	\
 					name, bm, max);			\
 		if (err)						\
 			return err;					\
 	} while (0)
 
-static int input_dev_hotplug(struct class_device *cdev, char **envp,
-			     int num_envp, char *buffer, int buffer_size)
+static int input_dev_uevent(struct class_device *cdev, char **envp,
+			    int num_envp, char *buffer, int buffer_size)
 {
 	struct input_dev *dev = to_input_dev(cdev);
 	int i = 0;
@@ -697,7 +697,7 @@ static int input_dev_hotplug(struct class_device *cdev, char **envp,
 struct class input_class = {
 	.name			= "input",
 	.release		= input_dev_release,
-	.hotplug		= input_dev_hotplug,
+	.uevent			= input_dev_uevent,
 };
 
 struct input_dev *input_allocate_device(void)
diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c
index fbb69ef6a77b..8e530cc970e1 100644
--- a/drivers/input/serio/serio.c
+++ b/drivers/input/serio/serio.c
@@ -800,16 +800,16 @@ static int serio_bus_match(struct device *dev, struct device_driver *drv)
 
 #ifdef CONFIG_HOTPLUG
 
-#define SERIO_ADD_HOTPLUG_VAR(fmt, val...)				\
+#define SERIO_ADD_UEVENT_VAR(fmt, val...)				\
 	do {								\
-		int err = add_hotplug_env_var(envp, num_envp, &i,	\
+		int err = add_uevent_var(envp, num_envp, &i,	\
 					buffer, buffer_size, &len,	\
 					fmt, val);			\
 		if (err)						\
 			return err;					\
 	} while (0)
 
-static int serio_hotplug(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
+static int serio_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
 {
 	struct serio *serio;
 	int i = 0;
@@ -820,21 +820,21 @@ static int serio_hotplug(struct device *dev, char **envp, int num_envp, char *bu
 
 	serio = to_serio_port(dev);
 
-	SERIO_ADD_HOTPLUG_VAR("SERIO_TYPE=%02x", serio->id.type);
-	SERIO_ADD_HOTPLUG_VAR("SERIO_PROTO=%02x", serio->id.proto);
-	SERIO_ADD_HOTPLUG_VAR("SERIO_ID=%02x", serio->id.id);
-	SERIO_ADD_HOTPLUG_VAR("SERIO_EXTRA=%02x", serio->id.extra);
-	SERIO_ADD_HOTPLUG_VAR("MODALIAS=serio:ty%02Xpr%02Xid%02Xex%02X",
+	SERIO_ADD_UEVENT_VAR("SERIO_TYPE=%02x", serio->id.type);
+	SERIO_ADD_UEVENT_VAR("SERIO_PROTO=%02x", serio->id.proto);
+	SERIO_ADD_UEVENT_VAR("SERIO_ID=%02x", serio->id.id);
+	SERIO_ADD_UEVENT_VAR("SERIO_EXTRA=%02x", serio->id.extra);
+	SERIO_ADD_UEVENT_VAR("MODALIAS=serio:ty%02Xpr%02Xid%02Xex%02X",
 				serio->id.type, serio->id.proto, serio->id.id, serio->id.extra);
 	envp[i] = NULL;
 
 	return 0;
 }
-#undef SERIO_ADD_HOTPLUG_VAR
+#undef SERIO_ADD_UEVENT_VAR
 
 #else
 
-static int serio_hotplug(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
+static int serio_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
 {
 	return -ENODEV;
 }
@@ -908,7 +908,7 @@ static int __init serio_init(void)
 	serio_bus.dev_attrs = serio_device_attrs;
 	serio_bus.drv_attrs = serio_driver_attrs;
 	serio_bus.match = serio_bus_match;
-	serio_bus.hotplug = serio_hotplug;
+	serio_bus.uevent = serio_uevent;
 	serio_bus.resume = serio_resume;
 	bus_register(&serio_bus);
 
diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c
index c34c96d18907..228e1852a836 100644
--- a/drivers/macintosh/macio_asic.c
+++ b/drivers/macintosh/macio_asic.c
@@ -128,7 +128,7 @@ static int macio_device_resume(struct device * dev)
 	return 0;
 }
 
-static int macio_hotplug (struct device *dev, char **envp, int num_envp,
+static int macio_uevent(struct device *dev, char **envp, int num_envp,
                           char *buffer, int buffer_size)
 {
 	struct macio_dev * macio_dev;
@@ -203,7 +203,7 @@ extern struct device_attribute macio_dev_attrs[];
 struct bus_type macio_bus_type = {
        .name	= "macio",
        .match	= macio_bus_match,
-       .hotplug = macio_hotplug,
+       .uevent = macio_uevent,
        .suspend	= macio_device_suspend,
        .resume	= macio_device_resume,
        .dev_attrs = macio_dev_attrs,
diff --git a/drivers/mmc/mmc_sysfs.c b/drivers/mmc/mmc_sysfs.c
index 3f4a66ca9555..ec701667abfc 100644
--- a/drivers/mmc/mmc_sysfs.c
+++ b/drivers/mmc/mmc_sysfs.c
@@ -80,7 +80,7 @@ static int mmc_bus_match(struct device *dev, struct device_driver *drv)
 }
 
 static int
-mmc_bus_hotplug(struct device *dev, char **envp, int num_envp, char *buf,
+mmc_bus_uevent(struct device *dev, char **envp, int num_envp, char *buf,
 		int buf_size)
 {
 	struct mmc_card *card = dev_to_mmc_card(dev);
@@ -140,7 +140,7 @@ static struct bus_type mmc_bus_type = {
 	.name		= "mmc",
 	.dev_attrs	= mmc_dev_attrs,
 	.match		= mmc_bus_match,
-	.hotplug	= mmc_bus_hotplug,
+	.uevent		= mmc_bus_uevent,
 	.suspend	= mmc_bus_suspend,
 	.resume		= mmc_bus_resume,
 };
diff --git a/drivers/pci/hotplug.c b/drivers/pci/hotplug.c
index e1743be31909..1c97e7dd130b 100644
--- a/drivers/pci/hotplug.c
+++ b/drivers/pci/hotplug.c
@@ -3,8 +3,8 @@
 #include <linux/module.h>
 #include "pci.h"
 
-int pci_hotplug (struct device *dev, char **envp, int num_envp,
-		 char *buffer, int buffer_size)
+int pci_uevent(struct device *dev, char **envp, int num_envp,
+	       char *buffer, int buffer_size)
 {
 	struct pci_dev *pdev;
 	int i = 0;
@@ -17,34 +17,34 @@ int pci_hotplug (struct device *dev, char **envp, int num_envp,
 	if (!pdev)
 		return -ENODEV;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"PCI_CLASS=%04X", pdev->class))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "PCI_CLASS=%04X", pdev->class))
 		return -ENOMEM;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"PCI_ID=%04X:%04X", pdev->vendor, pdev->device))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "PCI_ID=%04X:%04X", pdev->vendor, pdev->device))
 		return -ENOMEM;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"PCI_SUBSYS_ID=%04X:%04X", pdev->subsystem_vendor,
-				pdev->subsystem_device))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "PCI_SUBSYS_ID=%04X:%04X", pdev->subsystem_vendor,
+			   pdev->subsystem_device))
 		return -ENOMEM;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"PCI_SLOT_NAME=%s", pci_name(pdev)))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "PCI_SLOT_NAME=%s", pci_name(pdev)))
 		return -ENOMEM;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"MODALIAS=pci:v%08Xd%08Xsv%08Xsd%08Xbc%02Xsc%02Xi%02x",
-				pdev->vendor, pdev->device,
-				pdev->subsystem_vendor, pdev->subsystem_device,
-				(u8)(pdev->class >> 16), (u8)(pdev->class >> 8),
-				(u8)(pdev->class)))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "MODALIAS=pci:v%08Xd%08Xsv%08Xsd%08Xbc%02Xsc%02Xi%02x",
+			   pdev->vendor, pdev->device,
+			   pdev->subsystem_vendor, pdev->subsystem_device,
+			   (u8)(pdev->class >> 16), (u8)(pdev->class >> 8),
+			   (u8)(pdev->class)))
 		return -ENOMEM;
 
 	envp[i] = NULL;
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index a9046d4b8af3..7146b69b812c 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -502,8 +502,8 @@ void pci_dev_put(struct pci_dev *dev)
 }
 
 #ifndef CONFIG_HOTPLUG
-int pci_hotplug (struct device *dev, char **envp, int num_envp,
-		 char *buffer, int buffer_size)
+int pci_uevent(struct device *dev, char **envp, int num_envp,
+	       char *buffer, int buffer_size)
 {
 	return -ENODEV;
 }
@@ -512,7 +512,7 @@ int pci_hotplug (struct device *dev, char **envp, int num_envp,
 struct bus_type pci_bus_type = {
 	.name		= "pci",
 	.match		= pci_bus_match,
-	.hotplug	= pci_hotplug,
+	.uevent		= pci_uevent,
 	.suspend	= pci_device_suspend,
 	.resume		= pci_device_resume,
 	.dev_attrs	= pci_dev_attrs,
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 6527b36c9a61..294849d24590 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -1,7 +1,7 @@
 /* Functions internal to the PCI core code */
 
-extern int pci_hotplug (struct device *dev, char **envp, int num_envp,
-			 char *buffer, int buffer_size);
+extern int pci_uevent(struct device *dev, char **envp, int num_envp,
+		      char *buffer, int buffer_size);
 extern int pci_create_sysfs_dev_files(struct pci_dev *pdev);
 extern void pci_remove_sysfs_dev_files(struct pci_dev *pdev);
 extern void pci_cleanup_rom(struct pci_dev *dev);
diff --git a/drivers/pcmcia/cs.c b/drivers/pcmcia/cs.c
index a30aa74304a2..7cf09084ef61 100644
--- a/drivers/pcmcia/cs.c
+++ b/drivers/pcmcia/cs.c
@@ -901,14 +901,14 @@ int pcmcia_insert_card(struct pcmcia_socket *skt)
 EXPORT_SYMBOL(pcmcia_insert_card);
 
 
-static int pcmcia_socket_hotplug(struct class_device *dev, char **envp,
-				int num_envp, char *buffer, int buffer_size)
+static int pcmcia_socket_uevent(struct class_device *dev, char **envp,
+			        int num_envp, char *buffer, int buffer_size)
 {
 	struct pcmcia_socket *s = container_of(dev, struct pcmcia_socket, dev);
 	int i = 0, length = 0;
 
-	if (add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size,
-				&length, "SOCKET_NO=%u", s->sock))
+	if (add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
+			   &length, "SOCKET_NO=%u", s->sock))
 		return -ENOMEM;
 
 	envp[i] = NULL;
@@ -927,7 +927,7 @@ static void pcmcia_release_socket_class(struct class *data)
 
 struct class pcmcia_socket_class = {
 	.name = "pcmcia_socket",
-        .hotplug = pcmcia_socket_hotplug,
+	.uevent = pcmcia_socket_uevent,
 	.release = pcmcia_release_socket,
 	.class_release = pcmcia_release_socket_class,
 };
diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c
index 7f8219f3fd9e..6fb76399547e 100644
--- a/drivers/pcmcia/ds.c
+++ b/drivers/pcmcia/ds.c
@@ -779,8 +779,8 @@ static int pcmcia_bus_match(struct device * dev, struct device_driver * drv) {
 
 #ifdef CONFIG_HOTPLUG
 
-static int pcmcia_bus_hotplug(struct device *dev, char **envp, int num_envp,
-			      char *buffer, int buffer_size)
+static int pcmcia_bus_uevent(struct device *dev, char **envp, int num_envp,
+			     char *buffer, int buffer_size)
 {
 	struct pcmcia_device *p_dev;
 	int i, length = 0;
@@ -800,31 +800,31 @@ static int pcmcia_bus_hotplug(struct device *dev, char **envp, int num_envp,
 
 	i = 0;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"SOCKET_NO=%u",
-				p_dev->socket->sock))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "SOCKET_NO=%u",
+			   p_dev->socket->sock))
 		return -ENOMEM;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"DEVICE_NO=%02X",
-				p_dev->device_no))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "DEVICE_NO=%02X",
+			   p_dev->device_no))
 		return -ENOMEM;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"MODALIAS=pcmcia:m%04Xc%04Xf%02Xfn%02Xpfn%02X"
-				"pa%08Xpb%08Xpc%08Xpd%08X",
-				p_dev->has_manf_id ? p_dev->manf_id : 0,
-				p_dev->has_card_id ? p_dev->card_id : 0,
-				p_dev->has_func_id ? p_dev->func_id : 0,
-				p_dev->func,
-				p_dev->device_no,
-				hash[0],
-				hash[1],
-				hash[2],
-				hash[3]))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "MODALIAS=pcmcia:m%04Xc%04Xf%02Xfn%02Xpfn%02X"
+			   "pa%08Xpb%08Xpc%08Xpd%08X",
+			   p_dev->has_manf_id ? p_dev->manf_id : 0,
+			   p_dev->has_card_id ? p_dev->card_id : 0,
+			   p_dev->has_func_id ? p_dev->func_id : 0,
+			   p_dev->func,
+			   p_dev->device_no,
+			   hash[0],
+			   hash[1],
+			   hash[2],
+			   hash[3]))
 		return -ENOMEM;
 
 	envp[i] = NULL;
@@ -834,7 +834,7 @@ static int pcmcia_bus_hotplug(struct device *dev, char **envp, int num_envp,
 
 #else
 
-static int pcmcia_bus_hotplug(struct device *dev, char **envp, int num_envp,
+static int pcmcia_bus_uevent(struct device *dev, char **envp, int num_envp,
 			      char *buffer, int buffer_size)
 {
 	return -ENODEV;
@@ -1223,7 +1223,7 @@ static struct class_interface pcmcia_bus_interface = {
 
 struct bus_type pcmcia_bus_type = {
 	.name = "pcmcia",
-	.hotplug = pcmcia_bus_hotplug,
+	.uevent = pcmcia_bus_uevent,
 	.match = pcmcia_bus_match,
 	.dev_attrs = pcmcia_dev_attrs,
 };
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index bf44a409ba0d..07ddf9a38758 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -2132,7 +2132,7 @@ restart:
 	}
 
 	spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags);
-	kobject_hotplug(&ioa_cfg->host->shost_classdev.kobj, KOBJ_CHANGE);
+	kobject_uevent(&ioa_cfg->host->shost_classdev.kobj, KOBJ_CHANGE);
 	LEAVE;
 }
 
diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index e80ef9467825..af2f0941baac 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -363,8 +363,7 @@ void usb_driver_release_interface(struct usb_driver *driver,
  * Most USB device drivers will use this indirectly, through the usb core,
  * but some layered driver frameworks use it directly.
  * These device tables are exported with MODULE_DEVICE_TABLE, through
- * modutils and "modules.usbmap", to support the driver loading
- * functionality of USB hotplugging.
+ * modutils, to support the driver loading functionality of USB hotplugging.
  *
  * What Matches:
  *
@@ -545,10 +544,7 @@ static int usb_device_match (struct device *dev, struct device_driver *drv)
 #ifdef	CONFIG_HOTPLUG
 
 /*
- * USB hotplugging invokes what /proc/sys/kernel/hotplug says
- * (normally /sbin/hotplug) when USB devices get added or removed.
- *
- * This invokes a user mode policy agent, typically helping to load driver
+ * This sends an uevent to userspace, typically helping to load driver
  * or other modules, configure the device, and more.  Drivers can provide
  * a MODULE_DEVICE_TABLE to help with module loading subtasks.
  *
@@ -557,8 +553,8 @@ static int usb_device_match (struct device *dev, struct device_driver *drv)
  * delays in event delivery.  Use sysfs (and DEVPATH) to make sure the
  * device (and this configuration!) are still present.
  */
-static int usb_hotplug (struct device *dev, char **envp, int num_envp,
-			char *buffer, int buffer_size)
+static int usb_uevent(struct device *dev, char **envp, int num_envp,
+		      char *buffer, int buffer_size)
 {
 	struct usb_interface *intf;
 	struct usb_device *usb_dev;
@@ -570,7 +566,7 @@ static int usb_hotplug (struct device *dev, char **envp, int num_envp,
 		return -ENODEV;
 
 	/* driver is often null here; dev_dbg() would oops */
-	pr_debug ("usb %s: hotplug\n", dev->bus_id);
+	pr_debug ("usb %s: uevent\n", dev->bus_id);
 
 	/* Must check driver_data here, as on remove driver is always NULL */
 	if ((dev->driver == &usb_generic_driver) || 
@@ -597,51 +593,51 @@ static int usb_hotplug (struct device *dev, char **envp, int num_envp,
 	 *
 	 * FIXME reduce hardwired intelligence here
 	 */
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"DEVICE=/proc/bus/usb/%03d/%03d",
-				usb_dev->bus->busnum, usb_dev->devnum))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "DEVICE=/proc/bus/usb/%03d/%03d",
+			   usb_dev->bus->busnum, usb_dev->devnum))
 		return -ENOMEM;
 #endif
 
 	/* per-device configurations are common */
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"PRODUCT=%x/%x/%x",
-				le16_to_cpu(usb_dev->descriptor.idVendor),
-				le16_to_cpu(usb_dev->descriptor.idProduct),
-				le16_to_cpu(usb_dev->descriptor.bcdDevice)))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "PRODUCT=%x/%x/%x",
+			   le16_to_cpu(usb_dev->descriptor.idVendor),
+			   le16_to_cpu(usb_dev->descriptor.idProduct),
+			   le16_to_cpu(usb_dev->descriptor.bcdDevice)))
 		return -ENOMEM;
 
 	/* class-based driver binding models */
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"TYPE=%d/%d/%d",
-				usb_dev->descriptor.bDeviceClass,
-				usb_dev->descriptor.bDeviceSubClass,
-				usb_dev->descriptor.bDeviceProtocol))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "TYPE=%d/%d/%d",
+			   usb_dev->descriptor.bDeviceClass,
+			   usb_dev->descriptor.bDeviceSubClass,
+			   usb_dev->descriptor.bDeviceProtocol))
 		return -ENOMEM;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"INTERFACE=%d/%d/%d",
-				alt->desc.bInterfaceClass,
-				alt->desc.bInterfaceSubClass,
-				alt->desc.bInterfaceProtocol))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "INTERFACE=%d/%d/%d",
+			   alt->desc.bInterfaceClass,
+			   alt->desc.bInterfaceSubClass,
+			   alt->desc.bInterfaceProtocol))
 		return -ENOMEM;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"MODALIAS=usb:v%04Xp%04Xd%04Xdc%02Xdsc%02Xdp%02Xic%02Xisc%02Xip%02X",
-				le16_to_cpu(usb_dev->descriptor.idVendor),
-				le16_to_cpu(usb_dev->descriptor.idProduct),
-				le16_to_cpu(usb_dev->descriptor.bcdDevice),
-				usb_dev->descriptor.bDeviceClass,
-				usb_dev->descriptor.bDeviceSubClass,
-				usb_dev->descriptor.bDeviceProtocol,
-				alt->desc.bInterfaceClass,
-				alt->desc.bInterfaceSubClass,
-				alt->desc.bInterfaceProtocol))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "MODALIAS=usb:v%04Xp%04Xd%04Xdc%02Xdsc%02Xdp%02Xic%02Xisc%02Xip%02X",
+			   le16_to_cpu(usb_dev->descriptor.idVendor),
+			   le16_to_cpu(usb_dev->descriptor.idProduct),
+			   le16_to_cpu(usb_dev->descriptor.bcdDevice),
+			   usb_dev->descriptor.bDeviceClass,
+			   usb_dev->descriptor.bDeviceSubClass,
+			   usb_dev->descriptor.bDeviceProtocol,
+			   alt->desc.bInterfaceClass,
+			   alt->desc.bInterfaceSubClass,
+			   alt->desc.bInterfaceProtocol))
 		return -ENOMEM;
 
 	envp[i] = NULL;
@@ -651,7 +647,7 @@ static int usb_hotplug (struct device *dev, char **envp, int num_envp,
 
 #else
 
-static int usb_hotplug (struct device *dev, char **envp,
+static int usb_uevent(struct device *dev, char **envp,
 			int num_envp, char *buffer, int buffer_size)
 {
 	return -ENODEV;
@@ -1491,7 +1487,7 @@ static int usb_generic_resume(struct device *dev)
 struct bus_type usb_bus_type = {
 	.name =		"usb",
 	.match =	usb_device_match,
-	.hotplug =	usb_hotplug,
+	.uevent =	usb_uevent,
 	.suspend =	usb_generic_suspend,
 	.resume =	usb_generic_resume,
 };
diff --git a/drivers/usb/host/hc_crisv10.c b/drivers/usb/host/hc_crisv10.c
index 0eaabeb37ac3..641268d7e6f3 100644
--- a/drivers/usb/host/hc_crisv10.c
+++ b/drivers/usb/host/hc_crisv10.c
@@ -4397,7 +4397,7 @@ static int __init etrax_usb_hc_init(void)
         device_initialize(&fake_device);
         kobject_set_name(&fake_device.kobj, "etrax_usb");
         kobject_add(&fake_device.kobj);
-        kobject_hotplug(&fake_device.kobj, KOBJ_ADD);
+	kobject_uevent(&fake_device.kobj, KOBJ_ADD);
         hc->bus->controller = &fake_device;
 	usb_register_bus(hc->bus);
 
diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c
index 14016b1cd948..024206c4a0e4 100644
--- a/drivers/w1/w1.c
+++ b/drivers/w1/w1.c
@@ -142,12 +142,12 @@ static struct bin_attribute w1_slave_attr_bin_id = {
 /* Default family */
 static struct w1_family w1_default_family;
 
-static int w1_hotplug(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size);
+static int w1_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size);
 
 static struct bus_type w1_bus_type = {
 	.name = "w1",
 	.match = w1_master_match,
-	.hotplug = w1_hotplug,
+	.uevent = w1_uevent,
 };
 
 struct device_driver w1_master_driver = {
@@ -361,7 +361,7 @@ void w1_destroy_master_attributes(struct w1_master *master)
 }
 
 #ifdef CONFIG_HOTPLUG
-static int w1_hotplug(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
+static int w1_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
 {
 	struct w1_master *md = NULL;
 	struct w1_slave *sl = NULL;
@@ -377,7 +377,7 @@ static int w1_hotplug(struct device *dev, char **envp, int num_envp, char *buffe
 		event_owner = "slave";
 		name = sl->name;
 	} else {
-		dev_dbg(dev, "Unknown hotplug event.\n");
+		dev_dbg(dev, "Unknown event.\n");
 		return -EINVAL;
 	}
 
@@ -386,18 +386,18 @@ static int w1_hotplug(struct device *dev, char **envp, int num_envp, char *buffe
 	if (dev->driver != &w1_slave_driver || !sl)
 		return 0;
 
-	err = add_hotplug_env_var(envp, num_envp, &cur_index, buffer, buffer_size, &cur_len, "W1_FID=%02X", sl->reg_num.family);
+	err = add_uevent_var(envp, num_envp, &cur_index, buffer, buffer_size, &cur_len, "W1_FID=%02X", sl->reg_num.family);
 	if (err)
 		return err;
 
-	err = add_hotplug_env_var(envp, num_envp, &cur_index, buffer, buffer_size, &cur_len, "W1_SLAVE_ID=%024LX", (u64)sl->reg_num.id);
+	err = add_uevent_var(envp, num_envp, &cur_index, buffer, buffer_size, &cur_len, "W1_SLAVE_ID=%024LX", (u64)sl->reg_num.id);
 	if (err)
 		return err;
 
 	return 0;
 };
 #else
-static int w1_hotplug(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
+static int w1_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
 {
 	return 0;
 }
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 8dc1822a7022..7187a57d51e8 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -226,7 +226,7 @@ static struct sysfs_ops part_sysfs_ops = {
 static ssize_t part_uevent_store(struct hd_struct * p,
 				 const char *page, size_t count)
 {
-	kobject_hotplug(&p->kobj, KOBJ_ADD);
+	kobject_uevent(&p->kobj, KOBJ_ADD);
 	return count;
 }
 static ssize_t part_dev_read(struct hd_struct * p, char *page)
@@ -360,7 +360,7 @@ void register_disk(struct gendisk *disk)
 	if ((err = kobject_add(&disk->kobj)))
 		return;
 	disk_sysfs_symlinks(disk);
-	kobject_hotplug(&disk->kobj, KOBJ_ADD);
+	kobject_uevent(&disk->kobj, KOBJ_ADD);
 
 	/* No minors to use for partitions */
 	if (disk->minors == 1) {
@@ -465,6 +465,6 @@ void del_gendisk(struct gendisk *disk)
 		sysfs_remove_link(&disk->driverfs_dev->kobj, "block");
 		put_device(disk->driverfs_dev);
 	}
-	kobject_hotplug(&disk->kobj, KOBJ_REMOVE);
+	kobject_uevent(&disk->kobj, KOBJ_REMOVE);
 	kobject_del(&disk->kobj);
 }
diff --git a/include/linux/device.h b/include/linux/device.h
index 17cbc6db67b4..0cdee78e5ce1 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -47,8 +47,8 @@ struct bus_type {
 	struct driver_attribute	* drv_attrs;
 
 	int		(*match)(struct device * dev, struct device_driver * drv);
-	int		(*hotplug) (struct device *dev, char **envp, 
-				    int num_envp, char *buffer, int buffer_size);
+	int		(*uevent)(struct device *dev, char **envp,
+				  int num_envp, char *buffer, int buffer_size);
 	int		(*suspend)(struct device * dev, pm_message_t state);
 	int		(*resume)(struct device * dev);
 };
@@ -151,7 +151,7 @@ struct class {
 	struct class_attribute		* class_attrs;
 	struct class_device_attribute	* class_dev_attrs;
 
-	int	(*hotplug)(struct class_device *dev, char **envp, 
+	int	(*uevent)(struct class_device *dev, char **envp,
 			   int num_envp, char *buffer, int buffer_size);
 
 	void	(*release)(struct class_device *dev);
@@ -209,9 +209,9 @@ extern int class_device_create_file(struct class_device *,
  * set, this will be called instead of the class specific release function.
  * Only use this if you want to override the default release function, like
  * when you are nesting class_device structures.
- * @hotplug: pointer to a hotplug function for this struct class_device.  If
- * set, this will be called instead of the class specific hotplug function.
- * Only use this if you want to override the default hotplug function, like
+ * @uevent: pointer to a uevent function for this struct class_device.  If
+ * set, this will be called instead of the class specific uevent function.
+ * Only use this if you want to override the default uevent function, like
  * when you are nesting class_device structures.
  */
 struct class_device {
@@ -227,7 +227,7 @@ struct class_device {
 	struct class_device	*parent;	/* parent of this child device, if there is one */
 
 	void	(*release)(struct class_device *dev);
-	int	(*hotplug)(struct class_device *dev, char **envp,
+	int	(*uevent)(struct class_device *dev, char **envp,
 			   int num_envp, char *buffer, int buffer_size);
 	char	class_id[BUS_ID_SIZE];	/* unique to this class */
 };
diff --git a/include/linux/firmware.h b/include/linux/firmware.h
index 2063c0839d4f..2d716080be4a 100644
--- a/include/linux/firmware.h
+++ b/include/linux/firmware.h
@@ -14,7 +14,7 @@ struct device;
 int request_firmware(const struct firmware **fw, const char *name,
 		     struct device *device);
 int request_firmware_nowait(
-	struct module *module, int hotplug,
+	struct module *module, int uevent,
 	const char *name, struct device *device, void *context,
 	void (*cont)(const struct firmware *fw, void *context));
 
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index 5b08248fba72..8eb21f2f25e1 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -26,15 +26,14 @@
 #include <linux/kernel.h>
 #include <asm/atomic.h>
 
-#define KOBJ_NAME_LEN	20
-
-#define HOTPLUG_PATH_LEN	256
+#define KOBJ_NAME_LEN			20
+#define UEVENT_HELPER_PATH_LEN		256
 
 /* path to the userspace helper executed on an event */
-extern char hotplug_path[];
+extern char uevent_helper[];
 
-/* counter to tag the hotplug event, read only except for the kobject core */
-extern u64 hotplug_seqnum;
+/* counter to tag the uevent, read only except for the kobject core */
+extern u64 uevent_seqnum;
 
 /* the actions here must match the proper string in lib/kobject_uevent.c */
 typedef int __bitwise kobject_action_t;
@@ -101,15 +100,14 @@ struct kobj_type {
  *	of object; multiple ksets can belong to one subsystem. All 
  *	ksets of a subsystem share the subsystem's lock.
  *
- *      Each kset can support hotplugging; if it does, it will be given
- *      the opportunity to filter out specific kobjects from being
- *      reported, as well as to add its own "data" elements to the
- *      environment being passed to the hotplug helper.
+ *	Each kset can support specific event variables; it can
+ *	supress the event generation or add subsystem specific
+ *	variables carried with the event.
  */
-struct kset_hotplug_ops {
+struct kset_uevent_ops {
 	int (*filter)(struct kset *kset, struct kobject *kobj);
 	const char *(*name)(struct kset *kset, struct kobject *kobj);
-	int (*hotplug)(struct kset *kset, struct kobject *kobj, char **envp,
+	int (*uevent)(struct kset *kset, struct kobject *kobj, char **envp,
 			int num_envp, char *buffer, int buffer_size);
 };
 
@@ -119,7 +117,7 @@ struct kset {
 	struct list_head	list;
 	spinlock_t		list_lock;
 	struct kobject		kobj;
-	struct kset_hotplug_ops	* hotplug_ops;
+	struct kset_uevent_ops	* uevent_ops;
 };
 
 
@@ -167,20 +165,20 @@ struct subsystem {
 	struct rw_semaphore	rwsem;
 };
 
-#define decl_subsys(_name,_type,_hotplug_ops) \
+#define decl_subsys(_name,_type,_uevent_ops) \
 struct subsystem _name##_subsys = { \
 	.kset = { \
 		.kobj = { .name = __stringify(_name) }, \
 		.ktype = _type, \
-		.hotplug_ops =_hotplug_ops, \
+		.uevent_ops =_uevent_ops, \
 	} \
 }
-#define decl_subsys_name(_varname,_name,_type,_hotplug_ops) \
+#define decl_subsys_name(_varname,_name,_type,_uevent_ops) \
 struct subsystem _varname##_subsys = { \
 	.kset = { \
 		.kobj = { .name = __stringify(_name) }, \
 		.ktype = _type, \
-		.hotplug_ops =_hotplug_ops, \
+		.uevent_ops =_uevent_ops, \
 	} \
 }
 
@@ -256,16 +254,16 @@ extern int subsys_create_file(struct subsystem * , struct subsys_attribute *);
 extern void subsys_remove_file(struct subsystem * , struct subsys_attribute *);
 
 #ifdef CONFIG_HOTPLUG
-void kobject_hotplug(struct kobject *kobj, enum kobject_action action);
+void kobject_uevent(struct kobject *kobj, enum kobject_action action);
 
-int add_hotplug_env_var(char **envp, int num_envp, int *cur_index,
+int add_uevent_var(char **envp, int num_envp, int *cur_index,
 			char *buffer, int buffer_size, int *cur_len,
 			const char *format, ...)
 	__attribute__((format (printf, 7, 8)));
 #else
-static inline void kobject_hotplug(struct kobject *kobj, enum kobject_action action) { }
+static inline void kobject_uevent(struct kobject *kobj, enum kobject_action action) { }
 
-static inline int add_hotplug_env_var(char **envp, int num_envp, int *cur_index, 
+static inline int add_uevent_var(char **envp, int num_envp, int *cur_index,
 				      char *buffer, int buffer_size, int *cur_len, 
 				      const char *format, ...)
 { return 0; }
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 4be34ef8c2f7..501564264518 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -124,7 +124,7 @@ enum
 	KERN_OVERFLOWUID=46,	/* int: overflow UID */
 	KERN_OVERFLOWGID=47,	/* int: overflow GID */
 	KERN_SHMPATH=48,	/* string: path to shm fs */
-	KERN_HOTPLUG=49,	/* string: path to hotplug policy agent */
+	KERN_HOTPLUG=49,	/* string: path to uevent helper (deprecated) */
 	KERN_IEEE_EMULATION_WARNINGS=50, /* int: unimplemented ieee instructions */
 	KERN_S390_USER_DEBUG_LOGGING=51,  /* int: dumps of user faults */
 	KERN_CORE_USES_PID=52,		/* int: use core or core.%pid */
diff --git a/include/linux/usb.h b/include/linux/usb.h
index d81b050e5955..7a20997e8071 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -225,7 +225,7 @@ struct usb_interface_cache {
  * Device drivers should not attempt to activate configurations.  The choice
  * of which configuration to install is a policy decision based on such
  * considerations as available power, functionality provided, and the user's
- * desires (expressed through hotplug scripts).  However, drivers can call
+ * desires (expressed through userspace tools).  However, drivers can call
  * usb_reset_configuration() to reinitialize the current configuration and
  * all its interfaces.
  */
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index e975a76a9d5b..bfb4a7a54e22 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -26,23 +26,23 @@ static struct subsys_attribute _name##_attr = \
 /* current uevent sequence number */
 static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page)
 {
-	return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum);
+	return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum);
 }
 KERNEL_ATTR_RO(uevent_seqnum);
 
 /* uevent helper program, used during early boo */
 static ssize_t uevent_helper_show(struct subsystem *subsys, char *page)
 {
-	return sprintf(page, "%s\n", hotplug_path);
+	return sprintf(page, "%s\n", uevent_helper);
 }
 static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count)
 {
-	if (count+1 > HOTPLUG_PATH_LEN)
+	if (count+1 > UEVENT_HELPER_PATH_LEN)
 		return -ENOENT;
-	memcpy(hotplug_path, page, count);
-	hotplug_path[count] = '\0';
-	if (count && hotplug_path[count-1] == '\n')
-		hotplug_path[count-1] = '\0';
+	memcpy(uevent_helper, page, count);
+	uevent_helper[count] = '\0';
+	if (count && uevent_helper[count-1] == '\n')
+		uevent_helper[count-1] = '\0';
 	return count;
 }
 KERNEL_ATTR_RW(uevent_helper);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6a51e25d4466..345f4a1d533f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -395,8 +395,8 @@ static ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_HOTPLUG,
 		.procname	= "hotplug",
-		.data		= &hotplug_path,
-		.maxlen		= HOTPLUG_PATH_LEN,
+		.data		= &uevent_helper,
+		.maxlen		= UEVENT_HELPER_PATH_LEN,
 		.mode		= 0644,
 		.proc_handler	= &proc_dostring,
 		.strategy	= &sysctl_string,
diff --git a/lib/kobject.c b/lib/kobject.c
index a181abed89f6..7a0e6809490d 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -207,7 +207,7 @@ int kobject_register(struct kobject * kobj)
 			       kobject_name(kobj),error);
 			dump_stack();
 		} else
-			kobject_hotplug(kobj, KOBJ_ADD);
+			kobject_uevent(kobj, KOBJ_ADD);
 	} else
 		error = -EINVAL;
 	return error;
@@ -312,7 +312,7 @@ void kobject_del(struct kobject * kobj)
 void kobject_unregister(struct kobject * kobj)
 {
 	pr_debug("kobject %s: unregistering\n",kobject_name(kobj));
-	kobject_hotplug(kobj, KOBJ_REMOVE);
+	kobject_uevent(kobj, KOBJ_REMOVE);
 	kobject_del(kobj);
 	kobject_put(kobj);
 }
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index dd061da3aba9..01479e5c6d18 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -22,12 +22,12 @@
 #include <linux/kobject.h>
 #include <net/sock.h>
 
-#define BUFFER_SIZE	1024	/* buffer for the hotplug env */
+#define BUFFER_SIZE	1024	/* buffer for the variables */
 #define NUM_ENVP	32	/* number of env pointers */
 
 #if defined(CONFIG_HOTPLUG)
-char hotplug_path[HOTPLUG_PATH_LEN] = "/sbin/hotplug";
-u64 hotplug_seqnum;
+char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug";
+u64 uevent_seqnum;
 static DEFINE_SPINLOCK(sequence_lock);
 static struct sock *uevent_sock;
 
@@ -50,12 +50,12 @@ static char *action_to_string(enum kobject_action action)
 }
 
 /**
- * kobject_hotplug - notify userspace by executing /sbin/hotplug
+ * kobject_uevent - notify userspace by ending an uevent
  *
- * @action: action that is happening (usually "ADD" or "REMOVE")
+ * @action: action that is happening (usually KOBJ_ADD and KOBJ_REMOVE)
  * @kobj: struct kobject that the action is happening to
  */
-void kobject_hotplug(struct kobject *kobj, enum kobject_action action)
+void kobject_uevent(struct kobject *kobj, enum kobject_action action)
 {
 	char **envp;
 	char *buffer;
@@ -65,7 +65,7 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action)
 	const char *subsystem;
 	struct kobject *top_kobj;
 	struct kset *kset;
-	struct kset_hotplug_ops *hotplug_ops;
+	struct kset_uevent_ops *uevent_ops;
 	u64 seq;
 	char *seq_buff;
 	int i = 0;
@@ -88,11 +88,11 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action)
 		return;
 
 	kset = top_kobj->kset;
-	hotplug_ops = kset->hotplug_ops;
+	uevent_ops = kset->uevent_ops;
 
 	/*  skip the event, if the filter returns zero. */
-	if (hotplug_ops && hotplug_ops->filter)
-		if (!hotplug_ops->filter(kset, kobj))
+	if (uevent_ops && uevent_ops->filter)
+		if (!uevent_ops->filter(kset, kobj))
 			return;
 
 	/* environment index */
@@ -111,8 +111,8 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action)
 		goto exit;
 
 	/* originating subsystem */
-	if (hotplug_ops && hotplug_ops->name)
-		subsystem = hotplug_ops->name(kset, kobj);
+	if (uevent_ops && uevent_ops->name)
+		subsystem = uevent_ops->name(kset, kobj);
 	else
 		subsystem = kobject_name(&kset->kobj);
 
@@ -134,12 +134,12 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action)
 	scratch += strlen("SEQNUM=18446744073709551616") + 1;
 
 	/* let the kset specific function add its stuff */
-	if (hotplug_ops && hotplug_ops->hotplug) {
-		retval = hotplug_ops->hotplug (kset, kobj,
+	if (uevent_ops && uevent_ops->uevent) {
+		retval = uevent_ops->uevent(kset, kobj,
 				  &envp[i], NUM_ENVP - i, scratch,
 				  BUFFER_SIZE - (scratch - buffer));
 		if (retval) {
-			pr_debug ("%s - hotplug() returned %d\n",
+			pr_debug ("%s - uevent() returned %d\n",
 				  __FUNCTION__, retval);
 			goto exit;
 		}
@@ -147,7 +147,7 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action)
 
 	/* we will send an event, request a new sequence number */
 	spin_lock(&sequence_lock);
-	seq = ++hotplug_seqnum;
+	seq = ++uevent_seqnum;
 	spin_unlock(&sequence_lock);
 	sprintf(seq_buff, "SEQNUM=%llu", (unsigned long long)seq);
 
@@ -177,10 +177,10 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action)
 	}
 
 	/* call uevent_helper, usually only enabled during early boot */
-	if (hotplug_path[0]) {
+	if (uevent_helper[0]) {
 		char *argv [3];
 
-		argv [0] = hotplug_path;
+		argv [0] = uevent_helper;
 		argv [1] = (char *)subsystem;
 		argv [2] = NULL;
 		call_usermodehelper (argv[0], argv, envp, 0);
@@ -192,39 +192,39 @@ exit:
 	kfree(envp);
 	return;
 }
-EXPORT_SYMBOL(kobject_hotplug);
+EXPORT_SYMBOL_GPL(kobject_uevent);
 
 /**
- * add_hotplug_env_var - helper for creating hotplug environment variables
+ * add_uevent_var - helper for creating event variables
  * @envp: Pointer to table of environment variables, as passed into
- * hotplug() method.
+ * uevent() method.
  * @num_envp: Number of environment variable slots available, as
- * passed into hotplug() method.
+ * passed into uevent() method.
  * @cur_index: Pointer to current index into @envp.  It should be
- * initialized to 0 before the first call to add_hotplug_env_var(),
+ * initialized to 0 before the first call to add_uevent_var(),
  * and will be incremented on success.
  * @buffer: Pointer to buffer for environment variables, as passed
- * into hotplug() method.
- * @buffer_size: Length of @buffer, as passed into hotplug() method.
+ * into uevent() method.
+ * @buffer_size: Length of @buffer, as passed into uevent() method.
  * @cur_len: Pointer to current length of space used in @buffer.
  * Should be initialized to 0 before the first call to
- * add_hotplug_env_var(), and will be incremented on success.
+ * add_uevent_var(), and will be incremented on success.
  * @format: Format for creating environment variable (of the form
  * "XXX=%x") for snprintf().
  *
  * Returns 0 if environment variable was added successfully or -ENOMEM
  * if no space was available.
  */
-int add_hotplug_env_var(char **envp, int num_envp, int *cur_index,
-			char *buffer, int buffer_size, int *cur_len,
-			const char *format, ...)
+int add_uevent_var(char **envp, int num_envp, int *cur_index,
+		   char *buffer, int buffer_size, int *cur_len,
+		   const char *format, ...)
 {
 	va_list args;
 
 	/*
 	 * We check against num_envp - 1 to make sure there is at
-	 * least one slot left after we return, since the hotplug
-	 * method needs to set the last slot to NULL.
+	 * least one slot left after we return, since kobject_uevent()
+	 * needs to set the last slot to NULL.
 	 */
 	if (*cur_index >= num_envp - 1)
 		return -ENOMEM;
@@ -243,7 +243,7 @@ int add_hotplug_env_var(char **envp, int num_envp, int *cur_index,
 	(*cur_index)++;
 	return 0;
 }
-EXPORT_SYMBOL(add_hotplug_env_var);
+EXPORT_SYMBOL_GPL(add_uevent_var);
 
 static int __init kobject_uevent_init(void)
 {
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index bd7568ac87fc..0ed38740388c 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -78,7 +78,7 @@ static struct class_device_attribute *bt_attrs[] = {
 };
 
 #ifdef CONFIG_HOTPLUG
-static int bt_hotplug(struct class_device *cdev, char **envp, int num_envp, char *buf, int size)
+static int bt_uevent(struct class_device *cdev, char **envp, int num_envp, char *buf, int size)
 {
 	struct hci_dev *hdev = class_get_devdata(cdev);
 	int n, i = 0;
@@ -107,7 +107,7 @@ struct class bt_class = {
 	.name		= "bluetooth",
 	.release	= bt_release,
 #ifdef CONFIG_HOTPLUG
-	.hotplug	= bt_hotplug,
+	.uevent		= bt_uevent,
 #endif
 };
 
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index f6a19d53eaeb..2ebdc23bbe26 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -248,7 +248,7 @@ int br_sysfs_addif(struct net_bridge_port *p)
 	if (err)
 		goto out2;
 
-	kobject_hotplug(&p->kobj, KOBJ_ADD);
+	kobject_uevent(&p->kobj, KOBJ_ADD);
 	return 0;
  out2:
 	kobject_del(&p->kobj);
@@ -260,7 +260,7 @@ void br_sysfs_removeif(struct net_bridge_port *p)
 {
 	pr_debug("br_sysfs_removeif\n");
 	sysfs_remove_link(&p->br->ifobj, p->dev->name);
-	kobject_hotplug(&p->kobj, KOBJ_REMOVE);
+	kobject_uevent(&p->kobj, KOBJ_REMOVE);
 	kobject_del(&p->kobj);
 }
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e2137f3e489d..198655dd9a77 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -369,14 +369,14 @@ static struct attribute_group wireless_group = {
 #endif
 
 #ifdef CONFIG_HOTPLUG
-static int netdev_hotplug(struct class_device *cd, char **envp,
-			  int num_envp, char *buf, int size)
+static int netdev_uevent(struct class_device *cd, char **envp,
+			 int num_envp, char *buf, int size)
 {
 	struct net_device *dev = to_net_dev(cd);
 	int i = 0;
 	int n;
 
-	/* pass interface in env to hotplug. */
+	/* pass interface to uevent. */
 	envp[i++] = buf;
 	n = snprintf(buf, size, "INTERFACE=%s", dev->name) + 1;
 	buf += n;
@@ -408,7 +408,7 @@ static struct class net_class = {
 	.name = "net",
 	.release = netdev_release,
 #ifdef CONFIG_HOTPLUG
-	.hotplug = netdev_hotplug,
+	.uevent = netdev_uevent,
 #endif
 };
 
-- 
cgit v1.2.3-71-gd317


From f743ca5e10f4145e0b3e6d11b9b46171e16af7ce Mon Sep 17 00:00:00 2001
From: "akpm@osdl.org" <akpm@osdl.org>
Date: Tue, 22 Nov 2005 23:36:13 -0800
Subject: [PATCH] kobject_uevent CONFIG_NET=n fix

lib/lib.a(kobject_uevent.o)(.text+0x25f): In function `kobject_uevent':
: undefined reference to `__alloc_skb'
lib/lib.a(kobject_uevent.o)(.text+0x2a1): In function `kobject_uevent':
: undefined reference to `skb_over_panic'
lib/lib.a(kobject_uevent.o)(.text+0x31d): In function `kobject_uevent':
: undefined reference to `skb_over_panic'
lib/lib.a(kobject_uevent.o)(.text+0x356): In function `kobject_uevent':
: undefined reference to `netlink_broadcast'
lib/lib.a(kobject_uevent.o)(.init.text+0x9): In function `kobject_uevent_init':
: undefined reference to `netlink_kernel_create'
make: *** [.tmp_vmlinux1] Error 1

Netlink is unconditionally enabled if CONFIG_NET, so that's OK.

kobject_uevent.o is compiled even if !CONFIG_HOTPLUG, which is lazy.

Let's compound the sin.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/kobject.h | 2 +-
 kernel/ksysfs.c         | 3 +++
 lib/kobject_uevent.c    | 4 +---
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index 8eb21f2f25e1..2a8d8da70961 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -253,7 +253,7 @@ struct subsys_attribute {
 extern int subsys_create_file(struct subsystem * , struct subsys_attribute *);
 extern void subsys_remove_file(struct subsystem * , struct subsys_attribute *);
 
-#ifdef CONFIG_HOTPLUG
+#if defined(CONFIG_HOTPLUG) & defined(CONFIG_NET)
 void kobject_uevent(struct kobject *kobj, enum kobject_action action);
 
 int add_uevent_var(char **envp, int num_envp, int *cur_index,
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index bfb4a7a54e22..99af8b05eeaa 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -15,6 +15,9 @@
 #include <linux/module.h>
 #include <linux/init.h>
 
+u64 uevent_seqnum;
+char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug";
+
 #define KERNEL_ATTR_RO(_name) \
 static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
 
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 01479e5c6d18..f56e27ae9d52 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -25,9 +25,7 @@
 #define BUFFER_SIZE	1024	/* buffer for the variables */
 #define NUM_ENVP	32	/* number of env pointers */
 
-#if defined(CONFIG_HOTPLUG)
-char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug";
-u64 uevent_seqnum;
+#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
 static DEFINE_SPINLOCK(sequence_lock);
 static struct sock *uevent_sock;
 
-- 
cgit v1.2.3-71-gd317


From 1d8f430c15b3a345db990e285742c67c2f52f9a6 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 7 Dec 2005 21:40:34 +0100
Subject: [PATCH] Input: add modalias support

Here's the patch for modalias support for input classes.  It uses
comma-separated numbers, and doesn't describe all the potential keys (no
module currently cares, and that would make the strings huge).  The
changes to input.h are to move the definitions needed by file2alias
outside __KERNEL__.  I chose not to move those definitions to
mod_devicetable.h, because there are so many that it might break compile
of something else in the kernel.

The rest is fairly straightforward.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
CC: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/input/input.c    | 39 ++++++++++++++++++++++++
 include/linux/input.h    | 79 +++++++++++++++++++++++++-----------------------
 scripts/mod/file2alias.c | 62 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 141 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/input.c b/drivers/input/input.c
index 2d37b394e384..ef5824c8846b 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -528,10 +528,49 @@ INPUT_DEV_STRING_ATTR_SHOW(name);
 INPUT_DEV_STRING_ATTR_SHOW(phys);
 INPUT_DEV_STRING_ATTR_SHOW(uniq);
 
+static int print_modalias_bits(char *buf, char prefix, unsigned long *arr,
+			       unsigned int min, unsigned int max)
+{
+	int len, i;
+
+	len = sprintf(buf, "%c", prefix);
+	for (i = min; i < max; i++)
+		if (arr[LONG(i)] & BIT(i))
+			len += sprintf(buf+len, "%X,", i);
+	return len;
+}
+
+static ssize_t input_dev_show_modalias(struct class_device *dev, char *buf)
+{
+	struct input_dev *id = to_input_dev(dev);
+	ssize_t len = 0;
+
+	len += sprintf(buf+len, "input:b%04Xv%04Xp%04Xe%04X-",
+		       id->id.bustype,
+		       id->id.vendor,
+		       id->id.product,
+		       id->id.version);
+
+	len += print_modalias_bits(buf+len, 'e', id->evbit, 0, EV_MAX);
+	len += print_modalias_bits(buf+len, 'k', id->keybit,
+				   KEY_MIN_INTERESTING, KEY_MAX);
+	len += print_modalias_bits(buf+len, 'r', id->relbit, 0, REL_MAX);
+	len += print_modalias_bits(buf+len, 'a', id->absbit, 0, ABS_MAX);
+	len += print_modalias_bits(buf+len, 'm', id->mscbit, 0, MSC_MAX);
+	len += print_modalias_bits(buf+len, 'l', id->ledbit, 0, LED_MAX);
+	len += print_modalias_bits(buf+len, 's', id->sndbit, 0, SND_MAX);
+	len += print_modalias_bits(buf+len, 'f', id->ffbit, 0, FF_MAX);
+	len += print_modalias_bits(buf+len, 'w', id->swbit, 0, SW_MAX);
+	len += sprintf(buf+len, "\n");
+	return len;
+}
+static CLASS_DEVICE_ATTR(modalias, S_IRUGO, input_dev_show_modalias, NULL);
+
 static struct attribute *input_dev_attrs[] = {
 	&class_device_attr_name.attr,
 	&class_device_attr_phys.attr,
 	&class_device_attr_uniq.attr,
+	&class_device_attr_modalias.attr,
 	NULL
 };
 
diff --git a/include/linux/input.h b/include/linux/input.h
index 3c5823368ddb..bef08551a33b 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -18,6 +18,7 @@
 #include <sys/ioctl.h>
 #include <asm/types.h>
 #endif
+#include <linux/mod_devicetable.h>
 
 /*
  * The event structure itself
@@ -511,6 +512,8 @@ struct input_absinfo {
 #define KEY_FN_S		0x1e3
 #define KEY_FN_B		0x1e4
 
+/* We avoid low common keys in module aliases so they don't get huge. */
+#define KEY_MIN_INTERESTING	KEY_MUTE
 #define KEY_MAX			0x1ff
 
 /*
@@ -793,6 +796,44 @@ struct ff_effect {
 
 #define FF_MAX		0x7f
 
+struct input_device_id {
+
+	kernel_ulong_t flags;
+
+	struct input_id id;
+
+	kernel_ulong_t evbit[EV_MAX/BITS_PER_LONG+1];
+	kernel_ulong_t keybit[KEY_MAX/BITS_PER_LONG+1];
+	kernel_ulong_t relbit[REL_MAX/BITS_PER_LONG+1];
+	kernel_ulong_t absbit[ABS_MAX/BITS_PER_LONG+1];
+	kernel_ulong_t mscbit[MSC_MAX/BITS_PER_LONG+1];
+	kernel_ulong_t ledbit[LED_MAX/BITS_PER_LONG+1];
+	kernel_ulong_t sndbit[SND_MAX/BITS_PER_LONG+1];
+	kernel_ulong_t ffbit[FF_MAX/BITS_PER_LONG+1];
+	kernel_ulong_t swbit[SW_MAX/BITS_PER_LONG+1];
+
+	kernel_ulong_t driver_info;
+};
+
+/*
+ * Structure for hotplug & device<->driver matching.
+ */
+
+#define INPUT_DEVICE_ID_MATCH_BUS	1
+#define INPUT_DEVICE_ID_MATCH_VENDOR	2
+#define INPUT_DEVICE_ID_MATCH_PRODUCT	4
+#define INPUT_DEVICE_ID_MATCH_VERSION	8
+
+#define INPUT_DEVICE_ID_MATCH_EVBIT	0x010
+#define INPUT_DEVICE_ID_MATCH_KEYBIT	0x020
+#define INPUT_DEVICE_ID_MATCH_RELBIT	0x040
+#define INPUT_DEVICE_ID_MATCH_ABSBIT	0x080
+#define INPUT_DEVICE_ID_MATCH_MSCIT	0x100
+#define INPUT_DEVICE_ID_MATCH_LEDBIT	0x200
+#define INPUT_DEVICE_ID_MATCH_SNDBIT	0x400
+#define INPUT_DEVICE_ID_MATCH_FFBIT	0x800
+#define INPUT_DEVICE_ID_MATCH_SWBIT	0x1000
+
 #ifdef __KERNEL__
 
 /*
@@ -901,49 +942,11 @@ struct input_dev {
 };
 #define to_input_dev(d) container_of(d, struct input_dev, cdev)
 
-/*
- * Structure for hotplug & device<->driver matching.
- */
-
-#define INPUT_DEVICE_ID_MATCH_BUS	1
-#define INPUT_DEVICE_ID_MATCH_VENDOR	2
-#define INPUT_DEVICE_ID_MATCH_PRODUCT	4
-#define INPUT_DEVICE_ID_MATCH_VERSION	8
-
-#define INPUT_DEVICE_ID_MATCH_EVBIT	0x010
-#define INPUT_DEVICE_ID_MATCH_KEYBIT	0x020
-#define INPUT_DEVICE_ID_MATCH_RELBIT	0x040
-#define INPUT_DEVICE_ID_MATCH_ABSBIT	0x080
-#define INPUT_DEVICE_ID_MATCH_MSCIT	0x100
-#define INPUT_DEVICE_ID_MATCH_LEDBIT	0x200
-#define INPUT_DEVICE_ID_MATCH_SNDBIT	0x400
-#define INPUT_DEVICE_ID_MATCH_FFBIT	0x800
-#define INPUT_DEVICE_ID_MATCH_SWBIT	0x1000
-
 #define INPUT_DEVICE_ID_MATCH_DEVICE\
 	(INPUT_DEVICE_ID_MATCH_BUS | INPUT_DEVICE_ID_MATCH_VENDOR | INPUT_DEVICE_ID_MATCH_PRODUCT)
 #define INPUT_DEVICE_ID_MATCH_DEVICE_AND_VERSION\
 	(INPUT_DEVICE_ID_MATCH_DEVICE | INPUT_DEVICE_ID_MATCH_VERSION)
 
-struct input_device_id {
-
-	unsigned long flags;
-
-	struct input_id id;
-
-	unsigned long evbit[NBITS(EV_MAX)];
-	unsigned long keybit[NBITS(KEY_MAX)];
-	unsigned long relbit[NBITS(REL_MAX)];
-	unsigned long absbit[NBITS(ABS_MAX)];
-	unsigned long mscbit[NBITS(MSC_MAX)];
-	unsigned long ledbit[NBITS(LED_MAX)];
-	unsigned long sndbit[NBITS(SND_MAX)];
-	unsigned long ffbit[NBITS(FF_MAX)];
-	unsigned long swbit[NBITS(SW_MAX)];
-
-	unsigned long driver_info;
-};
-
 struct input_handle;
 
 struct input_handler {
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index e3d144a3f10b..e0eedffe565b 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -16,8 +16,10 @@
  * use either stdint.h or inttypes.h for the rest. */
 #if KERNEL_ELFCLASS == ELFCLASS32
 typedef Elf32_Addr	kernel_ulong_t;
+#define BITS_PER_LONG 32
 #else
 typedef Elf64_Addr	kernel_ulong_t;
+#define BITS_PER_LONG 64
 #endif
 #ifdef __sun__
 #include <inttypes.h>
@@ -35,6 +37,7 @@ typedef unsigned char	__u8;
  * even potentially has different endianness and word sizes, since 
  * we handle those differences explicitly below */
 #include "../../include/linux/mod_devicetable.h"
+#include "../../include/linux/input.h"
 
 #define ADD(str, sep, cond, field)                              \
 do {                                                            \
@@ -366,6 +369,61 @@ static int do_i2c_entry(const char *filename, struct i2c_device_id *i2c, char *a
 	return 1;
 }
 
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+static void do_input(char *alias,
+		     kernel_ulong_t *arr, unsigned int min, unsigned int max)
+{
+	unsigned int i;
+	for (i = min; i < max; i++) {
+		if (arr[i/BITS_PER_LONG] & (1 << (i%BITS_PER_LONG)))
+			sprintf(alias+strlen(alias), "%X,*", i);
+	}
+}
+
+/* input:b0v0p0e0-eXkXrXaXmXlXsXfXwX where X is comma-separated %02X. */
+static int do_input_entry(const char *filename, struct input_device_id *id,
+			  char *alias)
+{
+	sprintf(alias, "input:");
+
+	ADD(alias, "b", id->flags&INPUT_DEVICE_ID_MATCH_BUS, id->id.bustype);
+	ADD(alias, "v", id->flags&INPUT_DEVICE_ID_MATCH_VENDOR, id->id.vendor);
+	ADD(alias, "p", id->flags&INPUT_DEVICE_ID_MATCH_PRODUCT,
+	    id->id.product);
+	ADD(alias, "e", id->flags&INPUT_DEVICE_ID_MATCH_VERSION,
+	    id->id.version);
+
+	sprintf(alias + strlen(alias), "-e*");
+	if (id->flags&INPUT_DEVICE_ID_MATCH_EVBIT)
+		do_input(alias, id->evbit, 0, EV_MAX);
+	sprintf(alias + strlen(alias), "k*");
+	if (id->flags&INPUT_DEVICE_ID_MATCH_KEYBIT)
+		do_input(alias, id->keybit, KEY_MIN_INTERESTING, KEY_MAX);
+	sprintf(alias + strlen(alias), "r*");
+	if (id->flags&INPUT_DEVICE_ID_MATCH_RELBIT)
+		do_input(alias, id->relbit, 0, REL_MAX);
+	sprintf(alias + strlen(alias), "a*");
+	if (id->flags&INPUT_DEVICE_ID_MATCH_ABSBIT)
+		do_input(alias, id->absbit, 0, ABS_MAX);
+	sprintf(alias + strlen(alias), "m*");
+	if (id->flags&INPUT_DEVICE_ID_MATCH_MSCIT)
+		do_input(alias, id->mscbit, 0, MSC_MAX);
+	sprintf(alias + strlen(alias), "l*");
+	if (id->flags&INPUT_DEVICE_ID_MATCH_LEDBIT)
+		do_input(alias, id->ledbit, 0, LED_MAX);
+	sprintf(alias + strlen(alias), "s*");
+	if (id->flags&INPUT_DEVICE_ID_MATCH_SNDBIT)
+		do_input(alias, id->sndbit, 0, SND_MAX);
+	sprintf(alias + strlen(alias), "f*");
+	if (id->flags&INPUT_DEVICE_ID_MATCH_FFBIT)
+		do_input(alias, id->ffbit, 0, SND_MAX);
+	sprintf(alias + strlen(alias), "w*");
+	if (id->flags&INPUT_DEVICE_ID_MATCH_SWBIT)
+		do_input(alias, id->swbit, 0, SW_MAX);
+	return 1;
+}
+
 /* Ignore any prefix, eg. v850 prepends _ */
 static inline int sym_is(const char *symbol, const char *name)
 {
@@ -453,7 +511,9 @@ void handle_moddevtable(struct module *mod, struct elf_info *info,
 	else if (sym_is(symname, "__mod_i2c_device_table"))
 		do_table(symval, sym->st_size, sizeof(struct i2c_device_id),
 			 do_i2c_entry, mod);
-
+	else if (sym_is(symname, "__mod_input_device_table"))
+		do_table(symval, sym->st_size, sizeof(struct input_device_id),
+			 do_input_entry, mod);
 }
 
 /* Now add out buffered information to the generated C source */
-- 
cgit v1.2.3-71-gd317


From e39b84337b8aed3044683a57741a19e5002225b9 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 10 Dec 2005 22:48:20 +1100
Subject: [PATCH] Input: fix add modalias support build error

Fix build when scripts/mod/file2alias.c includes linux/input.h, which
tries to include /usr/include/linux/mod_devicetable.h:

 In file included from scripts/mod/file2alias.c:40:
 include/linux/input.h:21:35: linux/mod_devicetable.h: No such file or directory
 make[2]: *** [scripts/mod/file2alias.o] Error 1

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/input.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/input.h b/include/linux/input.h
index bef08551a33b..6d4cc3c110d6 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -13,12 +13,12 @@
 #include <linux/time.h>
 #include <linux/list.h>
 #include <linux/device.h>
+#include <linux/mod_devicetable.h>
 #else
 #include <sys/time.h>
 #include <sys/ioctl.h>
 #include <asm/types.h>
 #endif
-#include <linux/mod_devicetable.h>
 
 /*
  * The event structure itself
-- 
cgit v1.2.3-71-gd317


From 93ce3061be212f6280e7ccafa9a7f698a95c6d75 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor_core@ameritech.net>
Date: Sat, 10 Dec 2005 01:36:27 -0500
Subject: [PATCH] Driver Core: Add platform_device_del()

Driver core: add platform_device_del function

Having platform_device_del90 allows more straightforward error
handling code in drivers registering platform devices.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/platform.c         | 45 +++++++++++++++++++++++++++--------------
 include/linux/platform_device.h |  1 +
 2 files changed, 31 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 1091af1cbb58..95ecfc490d54 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -168,7 +168,7 @@ struct platform_device *platform_device_alloc(const char *name, unsigned int id)
 		pa->pdev.dev.release = platform_device_release;
 	}
 
-	return pa ? &pa->pdev : NULL;	
+	return pa ? &pa->pdev : NULL;
 }
 EXPORT_SYMBOL_GPL(platform_device_alloc);
 
@@ -282,24 +282,13 @@ int platform_device_add(struct platform_device *pdev)
 EXPORT_SYMBOL_GPL(platform_device_add);
 
 /**
- *	platform_device_register - add a platform-level device
- *	@pdev:	platform device we're adding
- *
- */
-int platform_device_register(struct platform_device * pdev)
-{
-	device_initialize(&pdev->dev);
-	return platform_device_add(pdev);
-}
-
-/**
- *	platform_device_unregister - remove a platform-level device
+ *	platform_device_del - remove a platform-level device
  *	@pdev:	platform device we're removing
  *
  *	Note that this function will also release all memory- and port-based
  *	resources owned by the device (@dev->resource).
  */
-void platform_device_unregister(struct platform_device * pdev)
+void platform_device_del(struct platform_device *pdev)
 {
 	int i;
 
@@ -310,9 +299,35 @@ void platform_device_unregister(struct platform_device * pdev)
 				release_resource(r);
 		}
 
-		device_unregister(&pdev->dev);
+		device_del(&pdev->dev);
 	}
 }
+EXPORT_SYMBOL_GPL(platform_device_del);
+
+/**
+ *	platform_device_register - add a platform-level device
+ *	@pdev:	platform device we're adding
+ *
+ */
+int platform_device_register(struct platform_device * pdev)
+{
+	device_initialize(&pdev->dev);
+	return platform_device_add(pdev);
+}
+
+/**
+ *	platform_device_unregister - unregister a platform-level device
+ *	@pdev:	platform device we're unregistering
+ *
+ *	Unregistration is done in 2 steps. Fisrt we release all resources
+ *	and remove it from the sybsystem, then we drop reference count by
+ *	calling platform_device_put().
+ */
+void platform_device_unregister(struct platform_device * pdev)
+{
+	platform_device_del(pdev);
+	platform_device_put(pdev);
+}
 
 /**
  *	platform_device_register_simple
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 17e336f40b47..782090c68932 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -41,6 +41,7 @@ extern struct platform_device *platform_device_alloc(const char *name, unsigned
 extern int platform_device_add_resources(struct platform_device *pdev, struct resource *res, unsigned int num);
 extern int platform_device_add_data(struct platform_device *pdev, void *data, size_t size);
 extern int platform_device_add(struct platform_device *pdev);
+extern void platform_device_del(struct platform_device *pdev);
 extern void platform_device_put(struct platform_device *pdev);
 
 struct platform_driver {
-- 
cgit v1.2.3-71-gd317


From c1d10adb4a521de5760112853f42aaeefcec96eb Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 5 Jan 2006 12:19:05 -0800
Subject: [NETFILTER]: Add ctnetlink port for nf_conntrack

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nfnetlink_conntrack.h  |    3 +
 include/net/netfilter/nf_conntrack.h           |   31 +
 include/net/netfilter/nf_conntrack_helper.h    |    2 +
 include/net/netfilter/nf_conntrack_l3proto.h   |   15 +-
 include/net/netfilter/nf_conntrack_protocol.h  |   26 +-
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |   47 +
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |   97 +-
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |   47 +
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |   77 +-
 net/netfilter/Kconfig                          |    7 +
 net/netfilter/Makefile                         |    3 +
 net/netfilter/nf_conntrack_core.c              |  232 +++-
 net/netfilter/nf_conntrack_netlink.c           | 1642 ++++++++++++++++++++++++
 net/netfilter/nf_conntrack_proto_tcp.c         |   71 +
 net/netfilter/nf_conntrack_proto_udp.c         |   10 +
 net/netfilter/nf_conntrack_standalone.c        |   42 +-
 16 files changed, 2289 insertions(+), 63 deletions(-)
 create mode 100644 net/netfilter/nf_conntrack_netlink.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h
index 116fcaced909..b8e9a5b6fb1e 100644
--- a/include/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -64,6 +64,9 @@ enum ctattr_l4proto {
 	CTA_PROTO_ICMP_ID,
 	CTA_PROTO_ICMP_TYPE,
 	CTA_PROTO_ICMP_CODE,
+	CTA_PROTO_ICMPV6_ID,
+	CTA_PROTO_ICMPV6_TYPE,
+	CTA_PROTO_ICMPV6_CODE,
 	__CTA_PROTO_MAX
 };
 #define CTA_PROTO_MAX (__CTA_PROTO_MAX - 1)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index cc4825610795..64b82b74a650 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -94,6 +94,9 @@ struct nf_conn
 	/* Current number of expected connections */
 	unsigned int expecting;
 
+	/* Unique ID that identifies this conntrack*/
+	unsigned int id;
+
 	/* Helper. if any */
 	struct nf_conntrack_helper *helper;
 
@@ -140,6 +143,9 @@ struct nf_conntrack_expect
 	/* Usage count. */
 	atomic_t use;
 
+	/* Unique ID */
+	unsigned int id;
+
 	/* Flags */
 	unsigned int flags;
 
@@ -190,6 +196,31 @@ static inline void nf_ct_put(struct nf_conn *ct)
 	nf_conntrack_put(&ct->ct_general);
 }
 
+extern struct nf_conntrack_tuple_hash *
+__nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
+		    const struct nf_conn *ignored_conntrack);
+
+extern void nf_conntrack_hash_insert(struct nf_conn *ct);
+
+extern struct nf_conntrack_expect *
+__nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple);
+
+extern struct nf_conntrack_expect *
+nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple);
+
+extern void nf_ct_unlink_expect(struct nf_conntrack_expect *exp);
+
+extern void nf_ct_remove_expectations(struct nf_conn *ct);
+
+extern void nf_conntrack_flush(void);
+
+extern struct nf_conntrack_helper *
+nf_ct_helper_find_get( const struct nf_conntrack_tuple *tuple);
+extern void nf_ct_helper_put(struct nf_conntrack_helper *helper);
+
+extern struct nf_conntrack_helper *
+__nf_conntrack_helper_find_byname(const char *name);
+
 /* call to create an explicit dependency on nf_conntrack. */
 extern void need_nf_conntrack(void);
 
diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index 5a66b2a3a623..86ec8174ad02 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -33,6 +33,8 @@ struct nf_conntrack_helper
 		    unsigned int protoff,
 		    struct nf_conn *ct,
 		    enum ip_conntrack_info conntrackinfo);
+
+	int (*to_nfattr)(struct sk_buff *skb, const struct nf_conn *ct);
 };
 
 extern int nf_conntrack_helper_register(struct nf_conntrack_helper *);
diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
index 01663e5b33df..67856eb93b43 100644
--- a/include/net/netfilter/nf_conntrack_l3proto.h
+++ b/include/net/netfilter/nf_conntrack_l3proto.h
@@ -14,6 +14,8 @@
 #include <linux/seq_file.h>
 #include <net/netfilter/nf_conntrack.h>
 
+struct nfattr;
+
 struct nf_conntrack_l3proto
 {
 	/* Next pointer. */
@@ -70,6 +72,12 @@ struct nf_conntrack_l3proto
 
 	u_int32_t (*get_features)(const struct nf_conntrack_tuple *tuple);
 
+	int (*tuple_to_nfattr)(struct sk_buff *skb,
+			       const struct nf_conntrack_tuple *t);
+
+	int (*nfattr_to_tuple)(struct nfattr *tb[],
+			       struct nf_conntrack_tuple *t);
+
 	/* Module (if any) which this is connected to. */
 	struct module *me;
 };
@@ -81,11 +89,16 @@ extern int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto);
 extern void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto);
 
 static inline struct nf_conntrack_l3proto *
-nf_ct_find_l3proto(u_int16_t l3proto)
+__nf_ct_l3proto_find(u_int16_t l3proto)
 {
 	return nf_ct_l3protos[l3proto];
 }
 
+extern struct nf_conntrack_l3proto *
+nf_ct_l3proto_find_get(u_int16_t l3proto);
+
+extern void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p);
+
 /* Existing built-in protocols */
 extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4;
 extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6;
diff --git a/include/net/netfilter/nf_conntrack_protocol.h b/include/net/netfilter/nf_conntrack_protocol.h
index b3afda35397a..1f33737fcea5 100644
--- a/include/net/netfilter/nf_conntrack_protocol.h
+++ b/include/net/netfilter/nf_conntrack_protocol.h
@@ -12,6 +12,7 @@
 #include <net/netfilter/nf_conntrack.h>
 
 struct seq_file;
+struct nfattr;
 
 struct nf_conntrack_protocol
 {
@@ -66,6 +67,18 @@ struct nf_conntrack_protocol
 		     enum ip_conntrack_info *ctinfo,
 		     int pf, unsigned int hooknum);
 
+	/* convert protoinfo to nfnetink attributes */
+	int (*to_nfattr)(struct sk_buff *skb, struct nfattr *nfa,
+			 const struct nf_conn *ct);
+
+	/* convert nfnetlink attributes to protoinfo */
+	int (*from_nfattr)(struct nfattr *tb[], struct nf_conn *ct);
+
+	int (*tuple_to_nfattr)(struct sk_buff *skb,
+			       const struct nf_conntrack_tuple *t);
+	int (*nfattr_to_tuple)(struct nfattr *tb[],
+			       struct nf_conntrack_tuple *t);
+
 	/* Module (if any) which this is connected to. */
 	struct module *me;
 };
@@ -80,12 +93,23 @@ extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
 extern struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
 
 extern struct nf_conntrack_protocol *
-nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol);
+__nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol);
+
+extern struct nf_conntrack_protocol *
+nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol);
+
+extern void nf_ct_proto_put(struct nf_conntrack_protocol *p);
 
 /* Protocol registration. */
 extern int nf_conntrack_protocol_register(struct nf_conntrack_protocol *proto);
 extern void nf_conntrack_protocol_unregister(struct nf_conntrack_protocol *proto);
 
+/* Generic netlink helpers */
+extern int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb,
+				      const struct nf_conntrack_tuple *tuple);
+extern int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[],
+				      struct nf_conntrack_tuple *t);
+
 /* Log invalid packets */
 extern unsigned int nf_ct_log_invalid;
 
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 8202c1c0afad..385867efd481 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -392,6 +392,48 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 	return -ENOENT;
 }
 
+#if defined(CONFIG_NF_CT_NETLINK) || \
+    defined(CONFIG_NF_CT_NETLINK_MODULE)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int ipv4_tuple_to_nfattr(struct sk_buff *skb,
+				const struct nf_conntrack_tuple *tuple)
+{
+	NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t),
+		&tuple->src.u3.ip);
+	NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t),
+		&tuple->dst.u3.ip);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static const size_t cta_min_ip[CTA_IP_MAX] = {
+	[CTA_IP_V4_SRC-1]       = sizeof(u_int32_t),
+	[CTA_IP_V4_DST-1]       = sizeof(u_int32_t),
+};
+
+static int ipv4_nfattr_to_tuple(struct nfattr *tb[],
+				struct nf_conntrack_tuple *t)
+{
+	if (!tb[CTA_IP_V4_SRC-1] || !tb[CTA_IP_V4_DST-1])
+		return -EINVAL;
+
+	if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip))
+		return -EINVAL;
+
+	t->src.u3.ip =
+		*(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_SRC-1]);
+	t->dst.u3.ip =
+		*(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_DST-1]);
+
+	return 0;
+}
+#endif
+
 static struct nf_sockopt_ops so_getorigdst = {
 	.pf		= PF_INET,
 	.get_optmin	= SO_ORIGINAL_DST,
@@ -408,6 +450,11 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = {
 	.print_conntrack = ipv4_print_conntrack,
 	.prepare	 = ipv4_prepare,
 	.get_features	 = ipv4_get_features,
+#if defined(CONFIG_NF_CT_NETLINK) || \
+    defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.tuple_to_nfattr = ipv4_tuple_to_nfattr,
+	.nfattr_to_tuple = ipv4_nfattr_to_tuple,
+#endif
 	.me		 = THIS_MODULE,
 };
 
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 7ddb5c08f7b8..52dc175be39a 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -50,20 +50,21 @@ static int icmp_pkt_to_tuple(const struct sk_buff *skb,
 	return 1;
 }
 
+/* Add 1; spaces filled with 0. */
+static const u_int8_t invmap[] = {
+	[ICMP_ECHO] = ICMP_ECHOREPLY + 1,
+	[ICMP_ECHOREPLY] = ICMP_ECHO + 1,
+	[ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
+	[ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
+	[ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
+	[ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
+	[ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
+	[ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1
+};
+
 static int icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
 			     const struct nf_conntrack_tuple *orig)
 {
-	/* Add 1; spaces filled with 0. */
-	static u_int8_t invmap[]
-		= { [ICMP_ECHO] = ICMP_ECHOREPLY + 1,
-		    [ICMP_ECHOREPLY] = ICMP_ECHO + 1,
-		    [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
-		    [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
-		    [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
-		    [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
-		    [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
-		    [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1};
-
 	if (orig->dst.u.icmp.type >= sizeof(invmap)
 	    || !invmap[orig->dst.u.icmp.type])
 		return 0;
@@ -120,11 +121,12 @@ static int icmp_packet(struct nf_conn *ct,
 static int icmp_new(struct nf_conn *conntrack,
 		    const struct sk_buff *skb, unsigned int dataoff)
 {
-	static u_int8_t valid_new[]
-		= { [ICMP_ECHO] = 1,
-		    [ICMP_TIMESTAMP] = 1,
-		    [ICMP_INFO_REQUEST] = 1,
-		    [ICMP_ADDRESS] = 1 };
+	static const u_int8_t valid_new[] = {
+		[ICMP_ECHO] = 1,
+		[ICMP_TIMESTAMP] = 1,
+		[ICMP_INFO_REQUEST] = 1,
+		[ICMP_ADDRESS] = 1
+	};
 
 	if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
 	    || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
@@ -168,7 +170,7 @@ icmp_error_message(struct sk_buff *skb,
 		return -NF_ACCEPT;
 	}
 
-	innerproto = nf_ct_find_proto(PF_INET, inside->ip.protocol);
+	innerproto = __nf_ct_proto_find(PF_INET, inside->ip.protocol);
 	dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp);
 	/* Are they talking about one of our connections? */
 	if (!nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl*4, PF_INET,
@@ -281,6 +283,60 @@ checksum_skipped:
 	return icmp_error_message(skb, ctinfo, hooknum);
 }
 
+#if defined(CONFIG_NF_CT_NETLINK) || \
+    defined(CONFIG_NF_CT_NETLINK_MODULE)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int icmp_tuple_to_nfattr(struct sk_buff *skb,
+				const struct nf_conntrack_tuple *t)
+{
+	NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t),
+		&t->src.u.icmp.id);
+	NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t),
+		&t->dst.u.icmp.type);
+	NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t),
+		&t->dst.u.icmp.code);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static const size_t cta_min_proto[CTA_PROTO_MAX] = {
+	[CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t),
+	[CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t),
+	[CTA_PROTO_ICMP_ID-1]   = sizeof(u_int16_t)
+};
+
+static int icmp_nfattr_to_tuple(struct nfattr *tb[],
+				struct nf_conntrack_tuple *tuple)
+{
+	if (!tb[CTA_PROTO_ICMP_TYPE-1]
+	    || !tb[CTA_PROTO_ICMP_CODE-1]
+	    || !tb[CTA_PROTO_ICMP_ID-1])
+		return -EINVAL;
+
+	if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
+		return -EINVAL;
+
+	tuple->dst.u.icmp.type = 
+			*(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]);
+	tuple->dst.u.icmp.code =
+			*(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]);
+	tuple->src.u.icmp.id =
+			*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]);
+
+	if (tuple->dst.u.icmp.type >= sizeof(invmap)
+	    || !invmap[tuple->dst.u.icmp.type])
+		return -EINVAL;
+
+	return 0;
+}
+#endif
+
 struct nf_conntrack_protocol nf_conntrack_protocol_icmp =
 {
 	.list			= { NULL, NULL },
@@ -295,7 +351,12 @@ struct nf_conntrack_protocol nf_conntrack_protocol_icmp =
 	.new			= icmp_new,
 	.error			= icmp_error,
 	.destroy		= NULL,
-	.me			= NULL
+	.me			= NULL,
+#if defined(CONFIG_NF_CT_NETLINK) || \
+    defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.tuple_to_nfattr	= icmp_tuple_to_nfattr,
+	.nfattr_to_tuple	= icmp_nfattr_to_tuple,
+#endif
 };
 
 EXPORT_SYMBOL(nf_conntrack_protocol_icmp);
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 753a3ae8502b..704fbbe74874 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -401,6 +401,48 @@ static ctl_table nf_ct_net_table[] = {
 };
 #endif
 
+#if defined(CONFIG_NF_CT_NETLINK) || \
+    defined(CONFIG_NF_CT_NETLINK_MODULE)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int ipv6_tuple_to_nfattr(struct sk_buff *skb,
+				const struct nf_conntrack_tuple *tuple)
+{
+	NFA_PUT(skb, CTA_IP_V6_SRC, sizeof(u_int32_t) * 4,
+		&tuple->src.u3.ip6);
+	NFA_PUT(skb, CTA_IP_V6_DST, sizeof(u_int32_t) * 4,
+		&tuple->dst.u3.ip6);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static const size_t cta_min_ip[CTA_IP_MAX] = {
+	[CTA_IP_V6_SRC-1]       = sizeof(u_int32_t)*4,
+	[CTA_IP_V6_DST-1]       = sizeof(u_int32_t)*4,
+};
+
+static int ipv6_nfattr_to_tuple(struct nfattr *tb[],
+				struct nf_conntrack_tuple *t)
+{
+	if (!tb[CTA_IP_V6_SRC-1] || !tb[CTA_IP_V6_DST-1])
+		return -EINVAL;
+
+	if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip))
+		return -EINVAL;
+
+	memcpy(&t->src.u3.ip6, NFA_DATA(tb[CTA_IP_V6_SRC-1]), 
+	       sizeof(u_int32_t) * 4);
+	memcpy(&t->dst.u3.ip6, NFA_DATA(tb[CTA_IP_V6_DST-1]),
+	       sizeof(u_int32_t) * 4);
+
+	return 0;
+}
+#endif
+
 struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = {
 	.l3proto		= PF_INET6,
 	.name			= "ipv6",
@@ -409,6 +451,11 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = {
 	.print_tuple		= ipv6_print_tuple,
 	.print_conntrack	= ipv6_print_conntrack,
 	.prepare		= ipv6_prepare,
+#if defined(CONFIG_NF_CT_NETLINK) || \
+    defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.tuple_to_nfattr	= ipv6_tuple_to_nfattr,
+	.nfattr_to_tuple	= ipv6_nfattr_to_tuple,
+#endif
 	.get_features		= ipv6_get_features,
 	.me			= THIS_MODULE,
 };
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index a7e03cfacd06..09945c333055 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -57,17 +57,17 @@ static int icmpv6_pkt_to_tuple(const struct sk_buff *skb,
 	return 1;
 }
 
+/* Add 1; spaces filled with 0. */
+static u_int8_t invmap[] = {
+	[ICMPV6_ECHO_REQUEST - 128]	= ICMPV6_ECHO_REPLY + 1,
+	[ICMPV6_ECHO_REPLY - 128]	= ICMPV6_ECHO_REQUEST + 1,
+	[ICMPV6_NI_QUERY - 128]		= ICMPV6_NI_QUERY + 1,
+	[ICMPV6_NI_REPLY - 128]		= ICMPV6_NI_REPLY +1
+};
+
 static int icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple,
 			       const struct nf_conntrack_tuple *orig)
 {
-	/* Add 1; spaces filled with 0. */
-	static u_int8_t invmap[] = {
-		[ICMPV6_ECHO_REQUEST - 128]	= ICMPV6_ECHO_REPLY + 1,
-		[ICMPV6_ECHO_REPLY - 128]	= ICMPV6_ECHO_REQUEST + 1,
-		[ICMPV6_NI_QUERY - 128]		= ICMPV6_NI_QUERY + 1,
-		[ICMPV6_NI_REPLY - 128]		= ICMPV6_NI_REPLY +1
-	};
-
 	int type = orig->dst.u.icmp.type - 128;
 	if (type < 0 || type >= sizeof(invmap) || !invmap[type])
 		return 0;
@@ -185,7 +185,7 @@ icmpv6_error_message(struct sk_buff *skb,
 		return -NF_ACCEPT;
 	}
 
-	inproto = nf_ct_find_proto(PF_INET6, inprotonum);
+	inproto = __nf_ct_proto_find(PF_INET6, inprotonum);
 
 	/* Are they talking about one of our connections? */
 	if (!nf_ct_get_tuple(skb, inip6off, inprotoff, PF_INET6, inprotonum,
@@ -255,6 +255,60 @@ skipped:
 	return icmpv6_error_message(skb, dataoff, ctinfo, hooknum);
 }
 
+#if defined(CONFIG_NF_CT_NETLINK) || \
+    defined(CONFIG_NF_CT_NETLINK_MODULE)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+static int icmpv6_tuple_to_nfattr(struct sk_buff *skb,
+				  const struct nf_conntrack_tuple *t)
+{
+	NFA_PUT(skb, CTA_PROTO_ICMPV6_ID, sizeof(u_int16_t),
+		&t->src.u.icmp.id);
+	NFA_PUT(skb, CTA_PROTO_ICMPV6_TYPE, sizeof(u_int8_t),
+		&t->dst.u.icmp.type);
+	NFA_PUT(skb, CTA_PROTO_ICMPV6_CODE, sizeof(u_int8_t),
+		&t->dst.u.icmp.code);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static const size_t cta_min_proto[CTA_PROTO_MAX] = {
+	[CTA_PROTO_ICMPV6_TYPE-1] = sizeof(u_int8_t),
+	[CTA_PROTO_ICMPV6_CODE-1] = sizeof(u_int8_t),
+	[CTA_PROTO_ICMPV6_ID-1]   = sizeof(u_int16_t)
+};
+
+static int icmpv6_nfattr_to_tuple(struct nfattr *tb[],
+				struct nf_conntrack_tuple *tuple)
+{
+	if (!tb[CTA_PROTO_ICMPV6_TYPE-1]
+	    || !tb[CTA_PROTO_ICMPV6_CODE-1]
+	    || !tb[CTA_PROTO_ICMPV6_ID-1])
+		return -EINVAL;
+
+	if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
+		return -EINVAL;
+
+	tuple->dst.u.icmp.type =
+			*(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMPV6_TYPE-1]);
+	tuple->dst.u.icmp.code =
+			*(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMPV6_CODE-1]);
+	tuple->src.u.icmp.id =
+			*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMPV6_ID-1]);
+
+	if (tuple->dst.u.icmp.type < 128
+	    || tuple->dst.u.icmp.type - 128 >= sizeof(invmap)
+	    || !invmap[tuple->dst.u.icmp.type - 128])
+		return -EINVAL;
+
+	return 0;
+}
+#endif
+
 struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6 =
 {
 	.l3proto		= PF_INET6,
@@ -267,6 +321,11 @@ struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6 =
 	.packet			= icmpv6_packet,
 	.new			= icmpv6_new,
 	.error			= icmpv6_error,
+#if defined(CONFIG_NF_CT_NETLINK) || \
+    defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.tuple_to_nfattr	= icmpv6_tuple_to_nfattr,
+	.nfattr_to_tuple	= icmpv6_nfattr_to_tuple,
+#endif
 };
 
 EXPORT_SYMBOL(nf_conntrack_protocol_icmpv6);
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 794c41d19b28..7d55f9cbd853 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -95,4 +95,11 @@ config NF_CONNTRACK_FTP
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NF_CT_NETLINK
+	tristate 'Connection tracking netlink interface (EXPERIMENTAL)'
+	depends on EXPERIMENTAL && NF_CONNTRACK && NETFILTER_NETLINK
+	depends on NF_CONNTRACK!=y || NETFILTER_NETLINK!=m
+	help
+	  This option enables support for a netlink-based userspace interface
+
 endmenu
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 55f019ad2c08..cb2183145c37 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -13,3 +13,6 @@ obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o
 
 # SCTP protocol connection tracking
 obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
+
+# netlink interface for nf_conntrack
+obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0c5b01d732d8..62bb509f05d4 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -82,6 +82,8 @@ unsigned int nf_ct_log_invalid;
 static LIST_HEAD(unconfirmed);
 static int nf_conntrack_vmalloc;
 
+static unsigned int nf_conntrack_next_id = 1;
+static unsigned int nf_conntrack_expect_next_id = 1;
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 struct notifier_block *nf_conntrack_chain;
 struct notifier_block *nf_conntrack_expect_chain;
@@ -184,7 +186,7 @@ DECLARE_MUTEX(nf_ct_cache_mutex);
 
 extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
 struct nf_conntrack_protocol *
-nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol)
+__nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol)
 {
 	if (unlikely(nf_ct_protos[l3proto] == NULL))
 		return &nf_conntrack_generic_protocol;
@@ -192,6 +194,50 @@ nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol)
 	return nf_ct_protos[l3proto][protocol];
 }
 
+/* this is guaranteed to always return a valid protocol helper, since
+ * it falls back to generic_protocol */
+struct nf_conntrack_protocol *
+nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol)
+{
+	struct nf_conntrack_protocol *p;
+
+	preempt_disable();
+	p = __nf_ct_proto_find(l3proto, protocol);
+	if (p) {
+		if (!try_module_get(p->me))
+			p = &nf_conntrack_generic_protocol;
+	}
+	preempt_enable();
+	
+	return p;
+}
+
+void nf_ct_proto_put(struct nf_conntrack_protocol *p)
+{
+	module_put(p->me);
+}
+
+struct nf_conntrack_l3proto *
+nf_ct_l3proto_find_get(u_int16_t l3proto)
+{
+	struct nf_conntrack_l3proto *p;
+
+	preempt_disable();
+	p = __nf_ct_l3proto_find(l3proto);
+	if (p) {
+		if (!try_module_get(p->me))
+			p = &nf_conntrack_generic_l3proto;
+	}
+	preempt_enable();
+
+	return p;
+}
+
+void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p)
+{
+	module_put(p->me);
+}
+
 static int nf_conntrack_hash_rnd_initted;
 static unsigned int nf_conntrack_hash_rnd;
 
@@ -384,7 +430,7 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
 }
 
 /* nf_conntrack_expect helper functions */
-static void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
+void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
 {
 	ASSERT_WRITE_LOCK(&nf_conntrack_lock);
 	NF_CT_ASSERT(!timer_pending(&exp->timeout));
@@ -404,6 +450,33 @@ static void expectation_timed_out(unsigned long ul_expect)
 	nf_conntrack_expect_put(exp);
 }
 
+struct nf_conntrack_expect *
+__nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
+{
+	struct nf_conntrack_expect *i;
+	
+	list_for_each_entry(i, &nf_conntrack_expect_list, list) {
+		if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
+			atomic_inc(&i->use);
+			return i;
+		}
+	}
+	return NULL;
+}
+
+/* Just find a expectation corresponding to a tuple. */
+struct nf_conntrack_expect *
+nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
+{
+	struct nf_conntrack_expect *i;
+	
+	read_lock_bh(&nf_conntrack_lock);
+	i = __nf_conntrack_expect_find(tuple);
+	read_unlock_bh(&nf_conntrack_lock);
+
+	return i;
+}
+
 /* If an expectation for this connection is found, it gets delete from
  * global list then returned. */
 static struct nf_conntrack_expect *
@@ -432,7 +505,7 @@ find_expectation(const struct nf_conntrack_tuple *tuple)
 }
 
 /* delete all expectations for this conntrack */
-static void remove_expectations(struct nf_conn *ct)
+void nf_ct_remove_expectations(struct nf_conn *ct)
 {
 	struct nf_conntrack_expect *i, *tmp;
 
@@ -462,7 +535,7 @@ clean_from_lists(struct nf_conn *ct)
 	LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
 
 	/* Destroy all pending expectations */
-	remove_expectations(ct);
+	nf_ct_remove_expectations(ct);
 }
 
 static void
@@ -482,12 +555,11 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	/* To make sure we don't get any weird locking issues here:
 	 * destroy_conntrack() MUST NOT be called with a write lock
 	 * to nf_conntrack_lock!!! -HW */
-	l3proto = nf_ct_find_l3proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
+	l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
 	if (l3proto && l3proto->destroy)
 		l3proto->destroy(ct);
 
-	proto = nf_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num,
-				 ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
+	proto = __nf_ct_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
 	if (proto && proto->destroy)
 		proto->destroy(ct);
 
@@ -499,7 +571,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	 * except TFTP can create an expectation on the first packet,
 	 * before connection is in the list, so we need to clean here,
 	 * too. */
-	remove_expectations(ct);
+	nf_ct_remove_expectations(ct);
 
 	/* We overload first tuple to link into unconfirmed list. */
 	if (!nf_ct_is_confirmed(ct)) {
@@ -540,7 +612,7 @@ conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
 		&& nf_ct_tuple_equal(tuple, &i->tuple);
 }
 
-static struct nf_conntrack_tuple_hash *
+struct nf_conntrack_tuple_hash *
 __nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
 		    const struct nf_conn *ignored_conntrack)
 {
@@ -575,6 +647,29 @@ nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
 	return h;
 }
 
+static void __nf_conntrack_hash_insert(struct nf_conn *ct,
+				       unsigned int hash,
+				       unsigned int repl_hash) 
+{
+	ct->id = ++nf_conntrack_next_id;
+	list_prepend(&nf_conntrack_hash[hash],
+		     &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+	list_prepend(&nf_conntrack_hash[repl_hash],
+		     &ct->tuplehash[IP_CT_DIR_REPLY].list);
+}
+
+void nf_conntrack_hash_insert(struct nf_conn *ct)
+{
+	unsigned int hash, repl_hash;
+
+	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+	write_lock_bh(&nf_conntrack_lock);
+	__nf_conntrack_hash_insert(ct, hash, repl_hash);
+	write_unlock_bh(&nf_conntrack_lock);
+}
+
 /* Confirm a connection given skb; places it in hash table */
 int
 __nf_conntrack_confirm(struct sk_buff **pskb)
@@ -621,10 +716,7 @@ __nf_conntrack_confirm(struct sk_buff **pskb)
 		/* Remove from unconfirmed list */
 		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
 
-		list_prepend(&nf_conntrack_hash[hash],
-			     &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
-		list_prepend(&nf_conntrack_hash[repl_hash],
-			     &ct->tuplehash[IP_CT_DIR_REPLY]);
+		__nf_conntrack_hash_insert(ct, hash, repl_hash);
 		/* Timer relative to confirmation time, not original
 		   setting time, otherwise we'd get timer wrap in
 		   weird delay cases. */
@@ -708,13 +800,41 @@ static inline int helper_cmp(const struct nf_conntrack_helper *i,
 }
 
 static struct nf_conntrack_helper *
-nf_ct_find_helper(const struct nf_conntrack_tuple *tuple)
+__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
 {
 	return LIST_FIND(&helpers, helper_cmp,
 			 struct nf_conntrack_helper *,
 			 tuple);
 }
 
+struct nf_conntrack_helper *
+nf_ct_helper_find_get( const struct nf_conntrack_tuple *tuple)
+{
+	struct nf_conntrack_helper *helper;
+
+	/* need nf_conntrack_lock to assure that helper exists until
+	 * try_module_get() is called */
+	read_lock_bh(&nf_conntrack_lock);
+
+	helper = __nf_ct_helper_find(tuple);
+	if (helper) {
+		/* need to increase module usage count to assure helper will
+		 * not go away while the caller is e.g. busy putting a
+		 * conntrack in the hash that uses the helper */
+		if (!try_module_get(helper->me))
+			helper = NULL;
+	}
+
+	read_unlock_bh(&nf_conntrack_lock);
+
+	return helper;
+}
+
+void nf_ct_helper_put(struct nf_conntrack_helper *helper)
+{
+	module_put(helper->me);
+}
+
 static struct nf_conn *
 __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
 		     const struct nf_conntrack_tuple *repl,
@@ -744,7 +864,7 @@ __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
 	/*  find features needed by this conntrack. */
 	features = l3proto->get_features(orig);
 	read_lock_bh(&nf_conntrack_lock);
-	if (nf_ct_find_helper(repl) != NULL)
+	if (__nf_ct_helper_find(repl) != NULL)
 		features |= NF_CT_F_HELP;
 	read_unlock_bh(&nf_conntrack_lock);
 
@@ -794,7 +914,7 @@ struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
 {
 	struct nf_conntrack_l3proto *l3proto;
 
-	l3proto = nf_ct_find_l3proto(orig->src.l3num);
+	l3proto = __nf_ct_l3proto_find(orig->src.l3num);
 	return __nf_conntrack_alloc(orig, repl, l3proto);
 }
 
@@ -853,7 +973,7 @@ init_conntrack(const struct nf_conntrack_tuple *tuple,
 		nf_conntrack_get(&conntrack->master->ct_general);
 		NF_CT_STAT_INC(expect_new);
 	} else {
-		conntrack->helper = nf_ct_find_helper(&repl_tuple);
+		conntrack->helper = __nf_ct_helper_find(&repl_tuple);
 
 		NF_CT_STAT_INC(new);
         }
@@ -947,13 +1067,13 @@ nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
 		return NF_ACCEPT;
 	}
 
-	l3proto = nf_ct_find_l3proto((u_int16_t)pf);
+	l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
 	if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
 		DEBUGP("not prepared to track yet or error occured\n");
 		return -ret;
 	}
 
-	proto = nf_ct_find_proto((u_int16_t)pf, protonum);
+	proto = __nf_ct_proto_find((u_int16_t)pf, protonum);
 
 	/* It may be an special packet, error, unclean...
 	 * inverse of the return code tells to the netfilter
@@ -1002,9 +1122,9 @@ int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
 			 const struct nf_conntrack_tuple *orig)
 {
 	return nf_ct_invert_tuple(inverse, orig,
-				  nf_ct_find_l3proto(orig->src.l3num),
-				  nf_ct_find_proto(orig->src.l3num,
-						   orig->dst.protonum));
+				  __nf_ct_l3proto_find(orig->src.l3num),
+				  __nf_ct_proto_find(orig->src.l3num,
+						     orig->dst.protonum));
 }
 
 /* Would two expected things clash? */
@@ -1096,6 +1216,7 @@ static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
 	exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
 	add_timer(&exp->timeout);
 
+	exp->id = ++nf_conntrack_expect_next_id;
 	atomic_inc(&exp->use);
 	NF_CT_STAT_INC(expect_create);
 }
@@ -1176,7 +1297,7 @@ void nf_conntrack_alter_reply(struct nf_conn *conntrack,
 
 	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
 	if (!conntrack->master && conntrack->expecting == 0)
-		conntrack->helper = nf_ct_find_helper(newreply);
+		conntrack->helper = __nf_ct_helper_find(newreply);
 	write_unlock_bh(&nf_conntrack_lock);
 }
 
@@ -1201,6 +1322,19 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
 	return 0;
 }
 
+struct nf_conntrack_helper *
+__nf_conntrack_helper_find_byname(const char *name)
+{
+	struct nf_conntrack_helper *h;
+
+	list_for_each_entry(h, &helpers, list) {
+		if (!strcmp(h->name, name))
+			return h;
+	}
+
+	return NULL;
+}
+
 static inline int unhelp(struct nf_conntrack_tuple_hash *i,
 			 const struct nf_conntrack_helper *me)
 {
@@ -1284,6 +1418,51 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
 		nf_conntrack_event_cache(event, skb);
 }
 
+#if defined(CONFIG_NF_CT_NETLINK) || \
+    defined(CONFIG_NF_CT_NETLINK_MODULE)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
+ * in ip_conntrack_core, since we don't want the protocols to autoload
+ * or depend on ctnetlink */
+int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb,
+			       const struct nf_conntrack_tuple *tuple)
+{
+	NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
+		&tuple->src.u.tcp.port);
+	NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
+		&tuple->dst.u.tcp.port);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static const size_t cta_min_proto[CTA_PROTO_MAX] = {
+	[CTA_PROTO_SRC_PORT-1]  = sizeof(u_int16_t),
+	[CTA_PROTO_DST_PORT-1]  = sizeof(u_int16_t)
+};
+
+int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[],
+			       struct nf_conntrack_tuple *t)
+{
+	if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
+		return -EINVAL;
+
+	if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
+		return -EINVAL;
+
+	t->src.u.tcp.port =
+		*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
+	t->dst.u.tcp.port =
+		*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
+
+	return 0;
+}
+#endif
+
 /* Used by ipt_REJECT and ip6t_REJECT. */
 void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
 {
@@ -1366,6 +1545,11 @@ static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
 			   get_order(sizeof(struct list_head) * size));
 }
 
+void nf_conntrack_flush()
+{
+	nf_ct_iterate_cleanup(kill_all, NULL);
+}
+
 /* Mishearing the voices in his head, our hero wonders how he's
    supposed to kill the mall. */
 void nf_conntrack_cleanup(void)
@@ -1379,7 +1563,7 @@ void nf_conntrack_cleanup(void)
 
 	nf_ct_event_cache_flush();
  i_see_dead_people:
-	nf_ct_iterate_cleanup(kill_all, NULL);
+	nf_conntrack_flush();
 	if (atomic_read(&nf_conntrack_count) != 0) {
 		schedule();
 		goto i_see_dead_people;
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
new file mode 100644
index 000000000000..4f2e50952a12
--- /dev/null
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -0,0 +1,1642 @@
+/* Connection tracking via netlink socket. Allows for user space
+ * protocol helpers and general trouble making from userspace.
+ *
+ * (C) 2001 by Jay Schulist <jschlst@samba.org>
+ * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2003 by Patrick Mchardy <kaber@trash.net>
+ * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * I've reworked this stuff to use attributes instead of conntrack 
+ * structures. 5.44 am. I need more tea. --pablo 05/07/11.
+ *
+ * Initial connection tracking via netlink development funded and 
+ * generally made possible by Network Robots, Inc. (www.networkrobots.com)
+ *
+ * Further development of this code funded by Astaro AG (http://www.astaro.com)
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU General Public License, incorporated herein by reference.
+ *
+ * Derived from ip_conntrack_netlink.c: Port by Pablo Neira Ayuso (05/11/14)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+MODULE_LICENSE("GPL");
+
+static char __initdata version[] = "0.92";
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+
+static inline int
+ctnetlink_dump_tuples_proto(struct sk_buff *skb, 
+			    const struct nf_conntrack_tuple *tuple)
+{
+	struct nf_conntrack_protocol *proto;
+	int ret = 0;
+
+	NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum);
+
+	/* If no protocol helper is found, this function will return the
+	 * generic protocol helper, so proto won't *ever* be NULL */
+	proto = nf_ct_proto_find_get(tuple->src.l3num, tuple->dst.protonum);
+	if (likely(proto->tuple_to_nfattr))
+		ret = proto->tuple_to_nfattr(skb, tuple);
+	
+	nf_ct_proto_put(proto);
+
+	return ret;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_dump_tuples(struct sk_buff *skb, 
+		      const struct nf_conntrack_tuple *tuple)
+{
+	struct nfattr *nest_parms;
+	struct nf_conntrack_l3proto *l3proto;
+	int ret = 0;
+	
+	l3proto = nf_ct_l3proto_find_get(tuple->src.l3num);
+	
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_IP);
+	if (likely(l3proto->tuple_to_nfattr))
+		ret = l3proto->tuple_to_nfattr(skb, tuple);
+	NFA_NEST_END(skb, nest_parms);
+
+	nf_ct_l3proto_put(l3proto);
+
+	if (unlikely(ret < 0))
+		return ret;
+
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO);
+	ret = ctnetlink_dump_tuples_proto(skb, tuple);
+	NFA_NEST_END(skb, nest_parms);
+
+	return ret;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	u_int32_t status = htonl((u_int32_t) ct->status);
+	NFA_PUT(skb, CTA_STATUS, sizeof(status), &status);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	long timeout_l = ct->timeout.expires - jiffies;
+	u_int32_t timeout;
+
+	if (timeout_l < 0)
+		timeout = 0;
+	else
+		timeout = htonl(timeout_l / HZ);
+	
+	NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	struct nf_conntrack_protocol *proto = nf_ct_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
+	struct nfattr *nest_proto;
+	int ret;
+
+	if (!proto->to_nfattr) {
+		nf_ct_proto_put(proto);
+		return 0;
+	}
+	
+	nest_proto = NFA_NEST(skb, CTA_PROTOINFO);
+
+	ret = proto->to_nfattr(skb, nest_proto, ct);
+
+	nf_ct_proto_put(proto);
+
+	NFA_NEST_END(skb, nest_proto);
+
+	return ret;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	struct nfattr *nest_helper;
+
+	if (!ct->helper)
+		return 0;
+		
+	nest_helper = NFA_NEST(skb, CTA_HELP);
+	NFA_PUT(skb, CTA_HELP_NAME, strlen(ct->helper->name), ct->helper->name);
+
+	if (ct->helper->to_nfattr)
+		ct->helper->to_nfattr(skb, ct);
+
+	NFA_NEST_END(skb, nest_helper);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+#ifdef CONFIG_NF_CT_ACCT
+static inline int
+ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct,
+			enum ip_conntrack_dir dir)
+{
+	enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
+	struct nfattr *nest_count = NFA_NEST(skb, type);
+	u_int32_t tmp;
+
+	tmp = htonl(ct->counters[dir].packets);
+	NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp);
+
+	tmp = htonl(ct->counters[dir].bytes);
+	NFA_PUT(skb, CTA_COUNTERS32_BYTES, sizeof(u_int32_t), &tmp);
+
+	NFA_NEST_END(skb, nest_count);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+#else
+#define ctnetlink_dump_counters(a, b, c) (0)
+#endif
+
+#ifdef CONFIG_NF_CONNTRACK_MARK
+static inline int
+ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	u_int32_t mark = htonl(ct->mark);
+
+	NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+#else
+#define ctnetlink_dump_mark(a, b) (0)
+#endif
+
+static inline int
+ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	u_int32_t id = htonl(ct->id);
+	NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	u_int32_t use = htonl(atomic_read(&ct->ct_general.use));
+	
+	NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple)
+
+static int
+ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+		    int event, int nowait, 
+		    const struct nf_conn *ct)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	struct nfattr *nest_parms;
+	unsigned char *b;
+
+	b = skb->tail;
+
+	event |= NFNL_SUBSYS_CTNETLINK << 8;
+	nlh    = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
+	nfmsg  = NLMSG_DATA(nlh);
+
+	nlh->nlmsg_flags    = (nowait && pid) ? NLM_F_MULTI : 0;
+	nfmsg->nfgen_family = 
+		ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+	nfmsg->version      = NFNETLINK_V0;
+	nfmsg->res_id	    = 0;
+
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
+	if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+		goto nfattr_failure;
+	NFA_NEST_END(skb, nest_parms);
+	
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
+	if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
+		goto nfattr_failure;
+	NFA_NEST_END(skb, nest_parms);
+
+	if (ctnetlink_dump_status(skb, ct) < 0 ||
+	    ctnetlink_dump_timeout(skb, ct) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+	    ctnetlink_dump_protoinfo(skb, ct) < 0 ||
+	    ctnetlink_dump_helpinfo(skb, ct) < 0 ||
+	    ctnetlink_dump_mark(skb, ct) < 0 ||
+	    ctnetlink_dump_id(skb, ct) < 0 ||
+	    ctnetlink_dump_use(skb, ct) < 0)
+		goto nfattr_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+nfattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static int ctnetlink_conntrack_event(struct notifier_block *this,
+                                     unsigned long events, void *ptr)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	struct nfattr *nest_parms;
+	struct nf_conn *ct = (struct nf_conn *)ptr;
+	struct sk_buff *skb;
+	unsigned int type;
+	unsigned char *b;
+	unsigned int flags = 0, group;
+
+	/* ignore our fake conntrack entry */
+	if (ct == &nf_conntrack_untracked)
+		return NOTIFY_DONE;
+
+	if (events & IPCT_DESTROY) {
+		type = IPCTNL_MSG_CT_DELETE;
+		group = NFNLGRP_CONNTRACK_DESTROY;
+	} else  if (events & (IPCT_NEW | IPCT_RELATED)) {
+		type = IPCTNL_MSG_CT_NEW;
+		flags = NLM_F_CREATE|NLM_F_EXCL;
+		/* dump everything */
+		events = ~0UL;
+		group = NFNLGRP_CONNTRACK_NEW;
+	} else  if (events & (IPCT_STATUS |
+		      IPCT_PROTOINFO |
+		      IPCT_HELPER |
+		      IPCT_HELPINFO |
+		      IPCT_NATINFO)) {
+		type = IPCTNL_MSG_CT_NEW;
+		group = NFNLGRP_CONNTRACK_UPDATE;
+	} else
+		return NOTIFY_DONE;
+	
+  /* FIXME: Check if there are any listeners before, don't hurt performance */
+	
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+	if (!skb)
+		return NOTIFY_DONE;
+
+	b = skb->tail;
+
+	type |= NFNL_SUBSYS_CTNETLINK << 8;
+	nlh   = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
+	nfmsg = NLMSG_DATA(nlh);
+
+	nlh->nlmsg_flags    = flags;
+	nfmsg->nfgen_family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+	nfmsg->version	= NFNETLINK_V0;
+	nfmsg->res_id	= 0;
+
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
+	if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+		goto nfattr_failure;
+	NFA_NEST_END(skb, nest_parms);
+	
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
+	if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
+		goto nfattr_failure;
+	NFA_NEST_END(skb, nest_parms);
+	
+	/* NAT stuff is now a status flag */
+	if ((events & IPCT_STATUS || events & IPCT_NATINFO)
+	    && ctnetlink_dump_status(skb, ct) < 0)
+		goto nfattr_failure;
+	if (events & IPCT_REFRESH
+	    && ctnetlink_dump_timeout(skb, ct) < 0)
+		goto nfattr_failure;
+	if (events & IPCT_PROTOINFO
+	    && ctnetlink_dump_protoinfo(skb, ct) < 0)
+		goto nfattr_failure;
+	if (events & IPCT_HELPINFO
+	    && ctnetlink_dump_helpinfo(skb, ct) < 0)
+		goto nfattr_failure;
+
+	if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
+		goto nfattr_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+	nfnetlink_send(skb, 0, group, 0);
+	return NOTIFY_DONE;
+
+nlmsg_failure:
+nfattr_failure:
+	kfree_skb(skb);
+	return NOTIFY_DONE;
+}
+#endif /* CONFIG_NF_CONNTRACK_EVENTS */
+
+static int ctnetlink_done(struct netlink_callback *cb)
+{
+	DEBUGP("entered %s\n", __FUNCTION__);
+	return 0;
+}
+
+static int
+ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nf_conn *ct = NULL;
+	struct nf_conntrack_tuple_hash *h;
+	struct list_head *i;
+	u_int32_t *id = (u_int32_t *) &cb->args[1];
+
+	DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, 
+			cb->args[0], *id);
+
+	read_lock_bh(&nf_conntrack_lock);
+	for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++, *id = 0) {
+		list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) {
+			h = (struct nf_conntrack_tuple_hash *) i;
+			if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+				continue;
+			ct = nf_ct_tuplehash_to_ctrack(h);
+			if (ct->id <= *id)
+				continue;
+			if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
+		                        	cb->nlh->nlmsg_seq,
+						IPCTNL_MSG_CT_NEW,
+						1, ct) < 0)
+				goto out;
+			*id = ct->id;
+		}
+	}
+out:	
+	read_unlock_bh(&nf_conntrack_lock);
+
+	DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
+
+	return skb->len;
+}
+
+#ifdef CONFIG_NF_CT_ACCT
+static int
+ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nf_conn *ct = NULL;
+	struct nf_conntrack_tuple_hash *h;
+	struct list_head *i;
+	u_int32_t *id = (u_int32_t *) &cb->args[1];
+
+	DEBUGP("entered %s, last bucket=%u id=%u\n", __FUNCTION__, 
+			cb->args[0], *id);
+
+	write_lock_bh(&nf_conntrack_lock);
+	for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++, *id = 0) {
+		list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) {
+			h = (struct nf_conntrack_tuple_hash *) i;
+			if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+				continue;
+			ct = nf_ct_tuplehash_to_ctrack(h);
+			if (ct->id <= *id)
+				continue;
+			if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
+		                        	cb->nlh->nlmsg_seq,
+						IPCTNL_MSG_CT_NEW,
+						1, ct) < 0)
+				goto out;
+			*id = ct->id;
+
+			memset(&ct->counters, 0, sizeof(ct->counters));
+		}
+	}
+out:	
+	write_unlock_bh(&nf_conntrack_lock);
+
+	DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
+
+	return skb->len;
+}
+#endif
+
+static inline int
+ctnetlink_parse_tuple_ip(struct nfattr *attr, struct nf_conntrack_tuple *tuple)
+{
+	struct nfattr *tb[CTA_IP_MAX];
+	struct nf_conntrack_l3proto *l3proto;
+	int ret = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	nfattr_parse_nested(tb, CTA_IP_MAX, attr);
+
+	l3proto = nf_ct_l3proto_find_get(tuple->src.l3num);
+
+	if (likely(l3proto->nfattr_to_tuple))
+		ret = l3proto->nfattr_to_tuple(tb, tuple);
+
+	nf_ct_l3proto_put(l3proto);
+
+	DEBUGP("leaving\n");
+
+	return ret;
+}
+
+static const size_t cta_min_proto[CTA_PROTO_MAX] = {
+	[CTA_PROTO_NUM-1]	= sizeof(u_int8_t),
+};
+
+static inline int
+ctnetlink_parse_tuple_proto(struct nfattr *attr, 
+			    struct nf_conntrack_tuple *tuple)
+{
+	struct nfattr *tb[CTA_PROTO_MAX];
+	struct nf_conntrack_protocol *proto;
+	int ret = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	nfattr_parse_nested(tb, CTA_PROTO_MAX, attr);
+
+	if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
+		return -EINVAL;
+
+	if (!tb[CTA_PROTO_NUM-1])
+		return -EINVAL;
+	tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]);
+
+	proto = nf_ct_proto_find_get(tuple->src.l3num, tuple->dst.protonum);
+
+	if (likely(proto->nfattr_to_tuple))
+		ret = proto->nfattr_to_tuple(tb, tuple);
+
+	nf_ct_proto_put(proto);
+	
+	return ret;
+}
+
+static inline int
+ctnetlink_parse_tuple(struct nfattr *cda[], struct nf_conntrack_tuple *tuple,
+		      enum ctattr_tuple type, u_int8_t l3num)
+{
+	struct nfattr *tb[CTA_TUPLE_MAX];
+	int err;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	memset(tuple, 0, sizeof(*tuple));
+
+	nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]);
+
+	if (!tb[CTA_TUPLE_IP-1])
+		return -EINVAL;
+
+	tuple->src.l3num = l3num;
+
+	err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple);
+	if (err < 0)
+		return err;
+
+	if (!tb[CTA_TUPLE_PROTO-1])
+		return -EINVAL;
+
+	err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple);
+	if (err < 0)
+		return err;
+
+	/* orig and expect tuples get DIR_ORIGINAL */
+	if (type == CTA_TUPLE_REPLY)
+		tuple->dst.dir = IP_CT_DIR_REPLY;
+	else
+		tuple->dst.dir = IP_CT_DIR_ORIGINAL;
+
+	NF_CT_DUMP_TUPLE(tuple);
+
+	DEBUGP("leaving\n");
+
+	return 0;
+}
+
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+static const size_t cta_min_protonat[CTA_PROTONAT_MAX] = {
+	[CTA_PROTONAT_PORT_MIN-1]       = sizeof(u_int16_t),
+	[CTA_PROTONAT_PORT_MAX-1]       = sizeof(u_int16_t),
+};
+
+static int ctnetlink_parse_nat_proto(struct nfattr *attr,
+				     const struct nf_conn *ct,
+				     struct ip_nat_range *range)
+{
+	struct nfattr *tb[CTA_PROTONAT_MAX];
+	struct ip_nat_protocol *npt;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr);
+
+	if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat))
+		return -EINVAL;
+
+	npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
+
+	if (!npt->nfattr_to_range) {
+		ip_nat_proto_put(npt);
+		return 0;
+	}
+
+	/* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */
+	if (npt->nfattr_to_range(tb, range) > 0)
+		range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
+
+	ip_nat_proto_put(npt);
+
+	DEBUGP("leaving\n");
+	return 0;
+}
+
+static const size_t cta_min_nat[CTA_NAT_MAX] = {
+	[CTA_NAT_MINIP-1]       = sizeof(u_int32_t),
+	[CTA_NAT_MAXIP-1]       = sizeof(u_int32_t),
+};
+
+static inline int
+ctnetlink_parse_nat(struct nfattr *cda[],
+		    const struct nf_conn *ct, struct ip_nat_range *range)
+{
+	struct nfattr *tb[CTA_NAT_MAX];
+	int err;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	memset(range, 0, sizeof(*range));
+	
+	nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]);
+
+	if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat))
+		return -EINVAL;
+
+	if (tb[CTA_NAT_MINIP-1])
+		range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]);
+
+	if (!tb[CTA_NAT_MAXIP-1])
+		range->max_ip = range->min_ip;
+	else
+		range->max_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MAXIP-1]);
+
+	if (range->min_ip)
+		range->flags |= IP_NAT_RANGE_MAP_IPS;
+
+	if (!tb[CTA_NAT_PROTO-1])
+		return 0;
+
+	err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range);
+	if (err < 0)
+		return err;
+
+	DEBUGP("leaving\n");
+	return 0;
+}
+#endif
+
+static inline int
+ctnetlink_parse_help(struct nfattr *attr, char **helper_name)
+{
+	struct nfattr *tb[CTA_HELP_MAX];
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	nfattr_parse_nested(tb, CTA_HELP_MAX, attr);
+
+	if (!tb[CTA_HELP_NAME-1])
+		return -EINVAL;
+
+	*helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]);
+
+	return 0;
+}
+
+static const size_t cta_min[CTA_MAX] = {
+	[CTA_STATUS-1] 		= sizeof(u_int32_t),
+	[CTA_TIMEOUT-1] 	= sizeof(u_int32_t),
+	[CTA_MARK-1]		= sizeof(u_int32_t),
+	[CTA_USE-1]		= sizeof(u_int32_t),
+	[CTA_ID-1]		= sizeof(u_int32_t)
+};
+
+static int
+ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, 
+			struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conntrack_tuple tuple;
+	struct nf_conn *ct;
+	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	u_int8_t u3 = nfmsg->nfgen_family;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nfattr_bad_size(cda, CTA_MAX, cta_min))
+		return -EINVAL;
+
+	if (cda[CTA_TUPLE_ORIG-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3);
+	else if (cda[CTA_TUPLE_REPLY-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
+	else {
+		/* Flush the whole table */
+		nf_conntrack_flush();
+		return 0;
+	}
+
+	if (err < 0)
+		return err;
+
+	h = nf_conntrack_find_get(&tuple, NULL);
+	if (!h) {
+		DEBUGP("tuple not found in conntrack hash\n");
+		return -ENOENT;
+	}
+
+	ct = nf_ct_tuplehash_to_ctrack(h);
+	
+	if (cda[CTA_ID-1]) {
+		u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1]));
+		if (ct->id != id) {
+			nf_ct_put(ct);
+			return -ENOENT;
+		}
+	}	
+	if (del_timer(&ct->timeout))
+		ct->timeout.function((unsigned long)ct);
+
+	nf_ct_put(ct);
+	DEBUGP("leaving\n");
+
+	return 0;
+}
+
+static int
+ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, 
+			struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conntrack_tuple tuple;
+	struct nf_conn *ct;
+	struct sk_buff *skb2 = NULL;
+	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	u_int8_t u3 = nfmsg->nfgen_family;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		u32 rlen;
+
+		if (nfmsg->nfgen_family != AF_INET)
+			return -EAFNOSUPPORT;
+
+		if (NFNL_MSG_TYPE(nlh->nlmsg_type) ==
+					IPCTNL_MSG_CT_GET_CTRZERO) {
+#ifdef CONFIG_NF_CT_ACCT
+			if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+						ctnetlink_dump_table_w,
+						ctnetlink_done)) != 0)
+				return -EINVAL;
+#else
+			return -ENOTSUPP;
+#endif
+		} else {
+			if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+		      		                        ctnetlink_dump_table,
+		                                	ctnetlink_done)) != 0)
+			return -EINVAL;
+		}
+
+		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (rlen > skb->len)
+			rlen = skb->len;
+		skb_pull(skb, rlen);
+		return 0;
+	}
+
+	if (nfattr_bad_size(cda, CTA_MAX, cta_min))
+		return -EINVAL;
+
+	if (cda[CTA_TUPLE_ORIG-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3);
+	else if (cda[CTA_TUPLE_REPLY-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
+	else
+		return -EINVAL;
+
+	if (err < 0)
+		return err;
+
+	h = nf_conntrack_find_get(&tuple, NULL);
+	if (!h) {
+		DEBUGP("tuple not found in conntrack hash");
+		return -ENOENT;
+	}
+	DEBUGP("tuple found\n");
+	ct = nf_ct_tuplehash_to_ctrack(h);
+
+	err = -ENOMEM;
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb2) {
+		nf_ct_put(ct);
+		return -ENOMEM;
+	}
+	NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
+
+	err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 
+				  IPCTNL_MSG_CT_NEW, 1, ct);
+	nf_ct_put(ct);
+	if (err <= 0)
+		goto free;
+
+	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+	if (err < 0)
+		goto out;
+
+	DEBUGP("leaving\n");
+	return 0;
+
+free:
+	kfree_skb(skb2);
+out:
+	return err;
+}
+
+static inline int
+ctnetlink_change_status(struct nf_conn *ct, struct nfattr *cda[])
+{
+	unsigned long d;
+	unsigned status = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1]));
+	d = ct->status ^ status;
+
+	if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
+		/* unchangeable */
+		return -EINVAL;
+	
+	if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
+		/* SEEN_REPLY bit can only be set */
+		return -EINVAL;
+
+	
+	if (d & IPS_ASSURED && !(status & IPS_ASSURED))
+		/* ASSURED bit can only be set */
+		return -EINVAL;
+
+	if (cda[CTA_NAT-1]) {
+#ifndef CONFIG_IP_NF_NAT_NEEDED
+		return -EINVAL;
+#else
+		unsigned int hooknum;
+		struct ip_nat_range range;
+
+		if (ctnetlink_parse_nat(cda, ct, &range) < 0)
+			return -EINVAL;
+
+		DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", 
+		       NIPQUAD(range.min_ip), NIPQUAD(range.max_ip),
+		       htons(range.min.all), htons(range.max.all));
+		
+		/* This is tricky but it works. ip_nat_setup_info needs the
+		 * hook number as parameter, so let's do the correct 
+		 * conversion and run away */
+		if (status & IPS_SRC_NAT_DONE)
+			hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */
+		else if (status & IPS_DST_NAT_DONE)
+			hooknum = NF_IP_PRE_ROUTING;  /* IP_NAT_MANIP_DST */
+		else 
+			return -EINVAL; /* Missing NAT flags */
+
+		DEBUGP("NAT status: %lu\n", 
+		       status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
+		
+		if (ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
+			return -EEXIST;
+		ip_nat_setup_info(ct, &range, hooknum);
+
+                DEBUGP("NAT status after setup_info: %lu\n",
+                       ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
+#endif
+	}
+
+	/* Be careful here, modifying NAT bits can screw up things,
+	 * so don't let users modify them directly if they don't pass
+	 * ip_nat_range. */
+	ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK);
+	return 0;
+}
+
+
+static inline int
+ctnetlink_change_helper(struct nf_conn *ct, struct nfattr *cda[])
+{
+	struct nf_conntrack_helper *helper;
+	char *helpname;
+	int err;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	/* don't change helper of sibling connections */
+	if (ct->master)
+		return -EINVAL;
+
+	err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname);
+	if (err < 0)
+		return err;
+
+	helper = __nf_conntrack_helper_find_byname(helpname);
+	if (!helper) {
+		if (!strcmp(helpname, ""))
+			helper = NULL;
+		else
+			return -EINVAL;
+	}
+
+	if (ct->helper) {
+		if (!helper) {
+			/* we had a helper before ... */
+			nf_ct_remove_expectations(ct);
+			ct->helper = NULL;
+		} else {
+			/* need to zero data of old helper */
+			memset(&ct->help, 0, sizeof(ct->help));
+		}
+	}
+	
+	ct->helper = helper;
+
+	return 0;
+}
+
+static inline int
+ctnetlink_change_timeout(struct nf_conn *ct, struct nfattr *cda[])
+{
+	u_int32_t timeout = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));
+	
+	if (!del_timer(&ct->timeout))
+		return -ETIME;
+
+	ct->timeout.expires = jiffies + timeout * HZ;
+	add_timer(&ct->timeout);
+
+	return 0;
+}
+
+static inline int
+ctnetlink_change_protoinfo(struct nf_conn *ct, struct nfattr *cda[])
+{
+	struct nfattr *tb[CTA_PROTOINFO_MAX], *attr = cda[CTA_PROTOINFO-1];
+	struct nf_conntrack_protocol *proto;
+	u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
+	u_int16_t l3num = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+	int err = 0;
+
+	nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr);
+
+	proto = nf_ct_proto_find_get(l3num, npt);
+
+	if (proto->from_nfattr)
+		err = proto->from_nfattr(tb, ct);
+	nf_ct_proto_put(proto); 
+
+	return err;
+}
+
+static int
+ctnetlink_change_conntrack(struct nf_conn *ct, struct nfattr *cda[])
+{
+	int err;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (cda[CTA_HELP-1]) {
+		err = ctnetlink_change_helper(ct, cda);
+		if (err < 0)
+			return err;
+	}
+
+	if (cda[CTA_TIMEOUT-1]) {
+		err = ctnetlink_change_timeout(ct, cda);
+		if (err < 0)
+			return err;
+	}
+
+	if (cda[CTA_STATUS-1]) {
+		err = ctnetlink_change_status(ct, cda);
+		if (err < 0)
+			return err;
+	}
+
+	if (cda[CTA_PROTOINFO-1]) {
+		err = ctnetlink_change_protoinfo(ct, cda);
+		if (err < 0)
+			return err;
+	}
+
+#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
+	if (cda[CTA_MARK-1])
+		ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1]));
+#endif
+
+	DEBUGP("all done\n");
+	return 0;
+}
+
+static int
+ctnetlink_create_conntrack(struct nfattr *cda[], 
+			   struct nf_conntrack_tuple *otuple,
+			   struct nf_conntrack_tuple *rtuple)
+{
+	struct nf_conn *ct;
+	int err = -EINVAL;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	ct = nf_conntrack_alloc(otuple, rtuple);
+	if (ct == NULL || IS_ERR(ct))
+		return -ENOMEM;	
+
+	if (!cda[CTA_TIMEOUT-1])
+		goto err;
+	ct->timeout.expires = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));
+
+	ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
+	ct->status |= IPS_CONFIRMED;
+
+	err = ctnetlink_change_status(ct, cda);
+	if (err < 0)
+		goto err;
+
+	if (cda[CTA_PROTOINFO-1]) {
+		err = ctnetlink_change_protoinfo(ct, cda);
+		if (err < 0)
+			return err;
+	}
+
+#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
+	if (cda[CTA_MARK-1])
+		ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1]));
+#endif
+
+	ct->helper = nf_ct_helper_find_get(rtuple);
+
+	add_timer(&ct->timeout);
+	nf_conntrack_hash_insert(ct);
+
+	if (ct->helper)
+		nf_ct_helper_put(ct->helper);
+
+	DEBUGP("conntrack with id %u inserted\n", ct->id);
+	return 0;
+
+err:	
+	nf_conntrack_free(ct);
+	return err;
+}
+
+static int 
+ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, 
+			struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct nf_conntrack_tuple otuple, rtuple;
+	struct nf_conntrack_tuple_hash *h = NULL;
+	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	u_int8_t u3 = nfmsg->nfgen_family;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nfattr_bad_size(cda, CTA_MAX, cta_min))
+		return -EINVAL;
+
+	if (cda[CTA_TUPLE_ORIG-1]) {
+		err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3);
+		if (err < 0)
+			return err;
+	}
+
+	if (cda[CTA_TUPLE_REPLY-1]) {
+		err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3);
+		if (err < 0)
+			return err;
+	}
+
+	write_lock_bh(&nf_conntrack_lock);
+	if (cda[CTA_TUPLE_ORIG-1])
+		h = __nf_conntrack_find(&otuple, NULL);
+	else if (cda[CTA_TUPLE_REPLY-1])
+		h = __nf_conntrack_find(&rtuple, NULL);
+
+	if (h == NULL) {
+		write_unlock_bh(&nf_conntrack_lock);
+		DEBUGP("no such conntrack, create new\n");
+		err = -ENOENT;
+		if (nlh->nlmsg_flags & NLM_F_CREATE)
+			err = ctnetlink_create_conntrack(cda, &otuple, &rtuple);
+		return err;
+	}
+	/* implicit 'else' */
+
+	/* we only allow nat config for new conntracks */
+	if (cda[CTA_NAT-1]) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* We manipulate the conntrack inside the global conntrack table lock,
+	 * so there's no need to increase the refcount */
+	DEBUGP("conntrack found\n");
+	err = -EEXIST;
+	if (!(nlh->nlmsg_flags & NLM_F_EXCL))
+		err = ctnetlink_change_conntrack(nf_ct_tuplehash_to_ctrack(h), cda);
+
+out_unlock:
+	write_unlock_bh(&nf_conntrack_lock);
+	return err;
+}
+
+/*********************************************************************** 
+ * EXPECT 
+ ***********************************************************************/ 
+
+static inline int
+ctnetlink_exp_dump_tuple(struct sk_buff *skb,
+			 const struct nf_conntrack_tuple *tuple,
+			 enum ctattr_expect type)
+{
+	struct nfattr *nest_parms = NFA_NEST(skb, type);
+	
+	if (ctnetlink_dump_tuples(skb, tuple) < 0)
+		goto nfattr_failure;
+
+	NFA_NEST_END(skb, nest_parms);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}			
+
+static inline int
+ctnetlink_exp_dump_expect(struct sk_buff *skb,
+                          const struct nf_conntrack_expect *exp)
+{
+	struct nf_conn *master = exp->master;
+	u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ);
+	u_int32_t id = htonl(exp->id);
+
+	if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0)
+		goto nfattr_failure;
+	if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0)
+		goto nfattr_failure;
+	if (ctnetlink_exp_dump_tuple(skb,
+				 &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+				 CTA_EXPECT_MASTER) < 0)
+		goto nfattr_failure;
+	
+	NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout);
+	NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id);
+
+	return 0;
+	
+nfattr_failure:
+	return -1;
+}
+
+static int
+ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+		    int event, 
+		    int nowait, 
+		    const struct nf_conntrack_expect *exp)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	unsigned char *b;
+
+	b = skb->tail;
+
+	event |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
+	nlh    = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
+	nfmsg  = NLMSG_DATA(nlh);
+
+	nlh->nlmsg_flags    = (nowait && pid) ? NLM_F_MULTI : 0;
+	nfmsg->nfgen_family = exp->tuple.src.l3num;
+	nfmsg->version	    = NFNETLINK_V0;
+	nfmsg->res_id	    = 0;
+
+	if (ctnetlink_exp_dump_expect(skb, exp) < 0)
+		goto nfattr_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+nfattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static int ctnetlink_expect_event(struct notifier_block *this,
+				  unsigned long events, void *ptr)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	struct nf_conntrack_expect *exp = (struct nf_conntrack_expect *)ptr;
+	struct sk_buff *skb;
+	unsigned int type;
+	unsigned char *b;
+	int flags = 0;
+
+	if (events & IPEXP_NEW) {
+		type = IPCTNL_MSG_EXP_NEW;
+		flags = NLM_F_CREATE|NLM_F_EXCL;
+	} else
+		return NOTIFY_DONE;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+	if (!skb)
+		return NOTIFY_DONE;
+
+	b = skb->tail;
+
+	type |= NFNL_SUBSYS_CTNETLINK << 8;
+	nlh   = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
+	nfmsg = NLMSG_DATA(nlh);
+
+	nlh->nlmsg_flags    = flags;
+	nfmsg->nfgen_family = exp->tuple.src.l3num;
+	nfmsg->version	    = NFNETLINK_V0;
+	nfmsg->res_id	    = 0;
+
+	if (ctnetlink_exp_dump_expect(skb, exp) < 0)
+		goto nfattr_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+	nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0);
+	return NOTIFY_DONE;
+
+nlmsg_failure:
+nfattr_failure:
+	kfree_skb(skb);
+	return NOTIFY_DONE;
+}
+#endif
+
+static int
+ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nf_conntrack_expect *exp = NULL;
+	struct list_head *i;
+	u_int32_t *id = (u_int32_t *) &cb->args[0];
+
+	DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id);
+
+	read_lock_bh(&nf_conntrack_lock);
+	list_for_each_prev(i, &nf_conntrack_expect_list) {
+		exp = (struct nf_conntrack_expect *) i;
+		if (exp->id <= *id)
+			continue;
+		if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid,
+					    cb->nlh->nlmsg_seq,
+					    IPCTNL_MSG_EXP_NEW,
+					    1, exp) < 0)
+			goto out;
+		*id = exp->id;
+	}
+out:	
+	read_unlock_bh(&nf_conntrack_lock);
+
+	DEBUGP("leaving, last id=%llu\n", *id);
+
+	return skb->len;
+}
+
+static const size_t cta_min_exp[CTA_EXPECT_MAX] = {
+	[CTA_EXPECT_TIMEOUT-1]          = sizeof(u_int32_t),
+	[CTA_EXPECT_ID-1]               = sizeof(u_int32_t)
+};
+
+static int
+ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, 
+		     struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct nf_conntrack_tuple tuple;
+	struct nf_conntrack_expect *exp;
+	struct sk_buff *skb2;
+	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	u_int8_t u3 = nfmsg->nfgen_family;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
+		return -EINVAL;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		u32 rlen;
+
+		if (nfmsg->nfgen_family != AF_INET)
+			return -EAFNOSUPPORT;
+
+		if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+		    				ctnetlink_exp_dump_table,
+						ctnetlink_done)) != 0)
+			return -EINVAL;
+		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (rlen > skb->len)
+			rlen = skb->len;
+		skb_pull(skb, rlen);
+		return 0;
+	}
+
+	if (cda[CTA_EXPECT_MASTER-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
+	else
+		return -EINVAL;
+
+	if (err < 0)
+		return err;
+
+	exp = nf_conntrack_expect_find(&tuple);
+	if (!exp)
+		return -ENOENT;
+
+	if (cda[CTA_EXPECT_ID-1]) {
+		u_int32_t id = *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
+		if (exp->id != ntohl(id)) {
+			nf_conntrack_expect_put(exp);
+			return -ENOENT;
+		}
+	}	
+
+	err = -ENOMEM;
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb2)
+		goto out;
+	NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
+	
+	err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, 
+				      nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
+				      1, exp);
+	if (err <= 0)
+		goto free;
+
+	nf_conntrack_expect_put(exp);
+
+	return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+
+free:
+	kfree_skb(skb2);
+out:
+	nf_conntrack_expect_put(exp);
+	return err;
+}
+
+static int
+ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, 
+		     struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct nf_conntrack_expect *exp, *tmp;
+	struct nf_conntrack_tuple tuple;
+	struct nf_conntrack_helper *h;
+	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	u_int8_t u3 = nfmsg->nfgen_family;
+	int err;
+
+	if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
+		return -EINVAL;
+
+	if (cda[CTA_EXPECT_TUPLE-1]) {
+		/* delete a single expect by tuple */
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+		if (err < 0)
+			return err;
+
+		/* bump usage count to 2 */
+		exp = nf_conntrack_expect_find(&tuple);
+		if (!exp)
+			return -ENOENT;
+
+		if (cda[CTA_EXPECT_ID-1]) {
+			u_int32_t id = 
+				*(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
+			if (exp->id != ntohl(id)) {
+				nf_conntrack_expect_put(exp);
+				return -ENOENT;
+			}
+		}
+
+		/* after list removal, usage count == 1 */
+		nf_conntrack_unexpect_related(exp);
+		/* have to put what we 'get' above. 
+		 * after this line usage count == 0 */
+		nf_conntrack_expect_put(exp);
+	} else if (cda[CTA_EXPECT_HELP_NAME-1]) {
+		char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]);
+
+		/* delete all expectations for this helper */
+		write_lock_bh(&nf_conntrack_lock);
+		h = __nf_conntrack_helper_find_byname(name);
+		if (!h) {
+			write_unlock_bh(&nf_conntrack_lock);
+			return -EINVAL;
+		}
+		list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list,
+					 list) {
+			if (exp->master->helper == h 
+			    && del_timer(&exp->timeout)) {
+				nf_ct_unlink_expect(exp);
+				nf_conntrack_expect_put(exp);
+			}
+		}
+		write_unlock_bh(&nf_conntrack_lock);
+	} else {
+		/* This basically means we have to flush everything*/
+		write_lock_bh(&nf_conntrack_lock);
+		list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list,
+					 list) {
+			if (del_timer(&exp->timeout)) {
+				nf_ct_unlink_expect(exp);
+				nf_conntrack_expect_put(exp);
+			}
+		}
+		write_unlock_bh(&nf_conntrack_lock);
+	}
+
+	return 0;
+}
+static int
+ctnetlink_change_expect(struct nf_conntrack_expect *x, struct nfattr *cda[])
+{
+	return -EOPNOTSUPP;
+}
+
+static int
+ctnetlink_create_expect(struct nfattr *cda[], u_int8_t u3)
+{
+	struct nf_conntrack_tuple tuple, mask, master_tuple;
+	struct nf_conntrack_tuple_hash *h = NULL;
+	struct nf_conntrack_expect *exp;
+	struct nf_conn *ct;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	/* caller guarantees that those three CTA_EXPECT_* exist */
+	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+	if (err < 0)
+		return err;
+	err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3);
+	if (err < 0)
+		return err;
+	err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3);
+	if (err < 0)
+		return err;
+
+	/* Look for master conntrack of this expectation */
+	h = nf_conntrack_find_get(&master_tuple, NULL);
+	if (!h)
+		return -ENOENT;
+	ct = nf_ct_tuplehash_to_ctrack(h);
+
+	if (!ct->helper) {
+		/* such conntrack hasn't got any helper, abort */
+		err = -EINVAL;
+		goto out;
+	}
+
+	exp = nf_conntrack_expect_alloc(ct);
+	if (!exp) {
+		err = -ENOMEM;
+		goto out;
+	}
+	
+	exp->expectfn = NULL;
+	exp->flags = 0;
+	exp->master = ct;
+	memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple));
+	memcpy(&exp->mask, &mask, sizeof(struct nf_conntrack_tuple));
+
+	err = nf_conntrack_expect_related(exp);
+	nf_conntrack_expect_put(exp);
+
+out:	
+	nf_ct_put(nf_ct_tuplehash_to_ctrack(h));
+	return err;
+}
+
+static int
+ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
+		     struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct nf_conntrack_tuple tuple;
+	struct nf_conntrack_expect *exp;
+	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	u_int8_t u3 = nfmsg->nfgen_family;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);	
+
+	if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
+		return -EINVAL;
+
+	if (!cda[CTA_EXPECT_TUPLE-1]
+	    || !cda[CTA_EXPECT_MASK-1]
+	    || !cda[CTA_EXPECT_MASTER-1])
+		return -EINVAL;
+
+	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+	if (err < 0)
+		return err;
+
+	write_lock_bh(&nf_conntrack_lock);
+	exp = __nf_conntrack_expect_find(&tuple);
+
+	if (!exp) {
+		write_unlock_bh(&nf_conntrack_lock);
+		err = -ENOENT;
+		if (nlh->nlmsg_flags & NLM_F_CREATE)
+			err = ctnetlink_create_expect(cda, u3);
+		return err;
+	}
+
+	err = -EEXIST;
+	if (!(nlh->nlmsg_flags & NLM_F_EXCL))
+		err = ctnetlink_change_expect(exp, cda);
+	write_unlock_bh(&nf_conntrack_lock);
+
+	DEBUGP("leaving\n");
+	
+	return err;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static struct notifier_block ctnl_notifier = {
+	.notifier_call	= ctnetlink_conntrack_event,
+};
+
+static struct notifier_block ctnl_notifier_exp = {
+	.notifier_call	= ctnetlink_expect_event,
+};
+#endif
+
+static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
+	[IPCTNL_MSG_CT_NEW]		= { .call = ctnetlink_new_conntrack,
+					    .attr_count = CTA_MAX, },
+	[IPCTNL_MSG_CT_GET] 		= { .call = ctnetlink_get_conntrack,
+					    .attr_count = CTA_MAX, },
+	[IPCTNL_MSG_CT_DELETE]  	= { .call = ctnetlink_del_conntrack,
+					    .attr_count = CTA_MAX, },
+	[IPCTNL_MSG_CT_GET_CTRZERO] 	= { .call = ctnetlink_get_conntrack,
+					    .attr_count = CTA_MAX, },
+};
+
+static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
+	[IPCTNL_MSG_EXP_GET]		= { .call = ctnetlink_get_expect,
+					    .attr_count = CTA_EXPECT_MAX, },
+	[IPCTNL_MSG_EXP_NEW]		= { .call = ctnetlink_new_expect,
+					    .attr_count = CTA_EXPECT_MAX, },
+	[IPCTNL_MSG_EXP_DELETE]		= { .call = ctnetlink_del_expect,
+					    .attr_count = CTA_EXPECT_MAX, },
+};
+
+static struct nfnetlink_subsystem ctnl_subsys = {
+	.name				= "conntrack",
+	.subsys_id			= NFNL_SUBSYS_CTNETLINK,
+	.cb_count			= IPCTNL_MSG_MAX,
+	.cb				= ctnl_cb,
+};
+
+static struct nfnetlink_subsystem ctnl_exp_subsys = {
+	.name				= "conntrack_expect",
+	.subsys_id			= NFNL_SUBSYS_CTNETLINK_EXP,
+	.cb_count			= IPCTNL_MSG_EXP_MAX,
+	.cb				= ctnl_exp_cb,
+};
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK);
+
+static int __init ctnetlink_init(void)
+{
+	int ret;
+
+	printk("ctnetlink v%s: registering with nfnetlink.\n", version);
+	ret = nfnetlink_subsys_register(&ctnl_subsys);
+	if (ret < 0) {
+		printk("ctnetlink_init: cannot register with nfnetlink.\n");
+		goto err_out;
+	}
+
+	ret = nfnetlink_subsys_register(&ctnl_exp_subsys);
+	if (ret < 0) {
+		printk("ctnetlink_init: cannot register exp with nfnetlink.\n");
+		goto err_unreg_subsys;
+	}
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+	ret = nf_conntrack_register_notifier(&ctnl_notifier);
+	if (ret < 0) {
+		printk("ctnetlink_init: cannot register notifier.\n");
+		goto err_unreg_exp_subsys;
+	}
+
+	ret = nf_conntrack_expect_register_notifier(&ctnl_notifier_exp);
+	if (ret < 0) {
+		printk("ctnetlink_init: cannot expect register notifier.\n");
+		goto err_unreg_notifier;
+	}
+#endif
+
+	return 0;
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+err_unreg_notifier:
+	nf_conntrack_unregister_notifier(&ctnl_notifier);
+err_unreg_exp_subsys:
+	nfnetlink_subsys_unregister(&ctnl_exp_subsys);
+#endif
+err_unreg_subsys:
+	nfnetlink_subsys_unregister(&ctnl_subsys);
+err_out:
+	return ret;
+}
+
+static void __exit ctnetlink_exit(void)
+{
+	printk("ctnetlink: unregistering from nfnetlink.\n");
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+	nf_conntrack_unregister_notifier(&ctnl_notifier_exp);
+	nf_conntrack_unregister_notifier(&ctnl_notifier);
+#endif
+
+	nfnetlink_subsys_unregister(&ctnl_exp_subsys);
+	nfnetlink_subsys_unregister(&ctnl_subsys);
+	return;
+}
+
+module_init(ctnetlink_init);
+module_exit(ctnetlink_exit);
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 6035633d8225..6167137a5cb5 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1147,6 +1147,63 @@ static int tcp_new(struct nf_conn *conntrack,
 		receiver->td_scale);
 	return 1;
 }
+
+#if defined(CONFIG_NF_CT_NETLINK) || \
+    defined(CONFIG_NF_CT_NETLINK_MODULE)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa,
+			 const struct nf_conn *ct)
+{
+	struct nfattr *nest_parms;
+	
+	read_lock_bh(&tcp_lock);
+	nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP);
+	NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t),
+		&ct->proto.tcp.state);
+	read_unlock_bh(&tcp_lock);
+
+	NFA_NEST_END(skb, nest_parms);
+
+	return 0;
+
+nfattr_failure:
+	read_unlock_bh(&tcp_lock);
+	return -1;
+}
+
+static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = {
+	[CTA_PROTOINFO_TCP_STATE-1]	= sizeof(u_int8_t),
+};
+
+static int nfattr_to_tcp(struct nfattr *cda[], struct nf_conn *ct)
+{
+	struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1];
+	struct nfattr *tb[CTA_PROTOINFO_TCP_MAX];
+
+	/* updates could not contain anything about the private
+	 * protocol info, in that case skip the parsing */
+	if (!attr)
+		return 0;
+
+        nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr);
+
+	if (nfattr_bad_size(tb, CTA_PROTOINFO_TCP_MAX, cta_min_tcp))
+		return -EINVAL;
+
+	if (!tb[CTA_PROTOINFO_TCP_STATE-1])
+		return -EINVAL;
+
+	write_lock_bh(&tcp_lock);
+	ct->proto.tcp.state = 
+		*(u_int8_t *)NFA_DATA(tb[CTA_PROTOINFO_TCP_STATE-1]);
+	write_unlock_bh(&tcp_lock);
+
+	return 0;
+}
+#endif
   
 struct nf_conntrack_protocol nf_conntrack_protocol_tcp4 =
 {
@@ -1160,6 +1217,13 @@ struct nf_conntrack_protocol nf_conntrack_protocol_tcp4 =
 	.packet 		= tcp_packet,
 	.new 			= tcp_new,
 	.error			= tcp_error4,
+#if defined(CONFIG_NF_CT_NETLINK) || \
+    defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.to_nfattr		= tcp_to_nfattr,
+	.from_nfattr		= nfattr_to_tcp,
+	.tuple_to_nfattr	= nf_ct_port_tuple_to_nfattr,
+	.nfattr_to_tuple	= nf_ct_port_nfattr_to_tuple,
+#endif
 };
 
 struct nf_conntrack_protocol nf_conntrack_protocol_tcp6 =
@@ -1174,6 +1238,13 @@ struct nf_conntrack_protocol nf_conntrack_protocol_tcp6 =
 	.packet 		= tcp_packet,
 	.new 			= tcp_new,
 	.error			= tcp_error6,
+#if defined(CONFIG_NF_CT_NETLINK) || \
+    defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.to_nfattr		= tcp_to_nfattr,
+	.from_nfattr		= nfattr_to_tcp,
+	.tuple_to_nfattr	= nf_ct_port_tuple_to_nfattr,
+	.nfattr_to_tuple	= nf_ct_port_nfattr_to_tuple,
+#endif
 };
 
 EXPORT_SYMBOL(nf_conntrack_protocol_tcp4);
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 3cae7ce420dd..1a592a556182 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -196,6 +196,11 @@ struct nf_conntrack_protocol nf_conntrack_protocol_udp4 =
 	.packet			= udp_packet,
 	.new			= udp_new,
 	.error			= udp_error4,
+#if defined(CONFIG_NF_CT_NETLINK) || \
+    defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.tuple_to_nfattr	= nf_ct_port_tuple_to_nfattr,
+	.nfattr_to_tuple	= nf_ct_port_nfattr_to_tuple,
+#endif
 };
 
 struct nf_conntrack_protocol nf_conntrack_protocol_udp6 =
@@ -210,6 +215,11 @@ struct nf_conntrack_protocol nf_conntrack_protocol_udp6 =
 	.packet			= udp_packet,
 	.new			= udp_new,
 	.error			= udp_error6,
+#if defined(CONFIG_NF_CT_NETLINK) || \
+    defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.tuple_to_nfattr	= nf_ct_port_tuple_to_nfattr,
+	.nfattr_to_tuple	= nf_ct_port_nfattr_to_tuple,
+#endif
 };
 
 EXPORT_SYMBOL(nf_conntrack_protocol_udp4);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 5af381f9fe3d..d17e42b28c79 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -161,14 +161,14 @@ static int ct_seq_show(struct seq_file *s, void *v)
 	if (NF_CT_DIRECTION(hash))
 		return 0;
 
-	l3proto = nf_ct_find_l3proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
-				     .tuple.src.l3num);
+	l3proto = __nf_ct_l3proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+				       .tuple.src.l3num);
 
 	NF_CT_ASSERT(l3proto);
-	proto = nf_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
-				 .tuple.src.l3num,
-				 conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
-				 .tuple.dst.protonum);
+	proto = __nf_ct_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+				   .tuple.src.l3num,
+				   conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+				   .tuple.dst.protonum);
 	NF_CT_ASSERT(proto);
 
 	if (seq_printf(s, "%-8s %u %-8s %u %ld ",
@@ -307,9 +307,9 @@ static int exp_seq_show(struct seq_file *s, void *v)
 		   expect->tuple.src.l3num,
 		   expect->tuple.dst.protonum);
 	print_tuple(s, &expect->tuple,
-		    nf_ct_find_l3proto(expect->tuple.src.l3num),
-		    nf_ct_find_proto(expect->tuple.src.l3num,
-				     expect->tuple.dst.protonum));
+		    __nf_ct_l3proto_find(expect->tuple.src.l3num),
+		    __nf_ct_proto_find(expect->tuple.src.l3num,
+				       expect->tuple.dst.protonum));
 	return seq_putc(s, '\n');
 }
 
@@ -847,7 +847,11 @@ EXPORT_SYMBOL(nf_conntrack_helper_unregister);
 EXPORT_SYMBOL(nf_ct_iterate_cleanup);
 EXPORT_SYMBOL(__nf_ct_refresh_acct);
 EXPORT_SYMBOL(nf_ct_protos);
-EXPORT_SYMBOL(nf_ct_find_proto);
+EXPORT_SYMBOL(__nf_ct_proto_find);
+EXPORT_SYMBOL(nf_ct_proto_find_get);
+EXPORT_SYMBOL(nf_ct_proto_put);
+EXPORT_SYMBOL(nf_ct_l3proto_find_get);
+EXPORT_SYMBOL(nf_ct_l3proto_put);
 EXPORT_SYMBOL(nf_ct_l3protos);
 EXPORT_SYMBOL(nf_conntrack_expect_alloc);
 EXPORT_SYMBOL(nf_conntrack_expect_put);
@@ -867,3 +871,21 @@ EXPORT_SYMBOL(nf_ct_get_tuple);
 EXPORT_SYMBOL(nf_ct_invert_tuple);
 EXPORT_SYMBOL(nf_conntrack_in);
 EXPORT_SYMBOL(__nf_conntrack_attach);
+EXPORT_SYMBOL(nf_conntrack_alloc);
+EXPORT_SYMBOL(nf_conntrack_free);
+EXPORT_SYMBOL(nf_conntrack_flush);
+EXPORT_SYMBOL(nf_ct_remove_expectations);
+EXPORT_SYMBOL(nf_ct_helper_find_get);
+EXPORT_SYMBOL(nf_ct_helper_put);
+EXPORT_SYMBOL(__nf_conntrack_helper_find_byname);
+EXPORT_SYMBOL(__nf_conntrack_find);
+EXPORT_SYMBOL(nf_ct_unlink_expect);
+EXPORT_SYMBOL(nf_conntrack_hash_insert);
+EXPORT_SYMBOL(__nf_conntrack_expect_find);
+EXPORT_SYMBOL(nf_conntrack_expect_find);
+EXPORT_SYMBOL(nf_conntrack_expect_list);
+#if defined(CONFIG_NF_CT_NETLINK) || \
+    defined(CONFIG_NF_CT_NETLINK_MODULE)
+EXPORT_SYMBOL(nf_ct_port_tuple_to_nfattr);
+EXPORT_SYMBOL(nf_ct_port_nfattr_to_tuple);
+#endif
-- 
cgit v1.2.3-71-gd317


From a9b305c4e56f97d6a2ae4f21691bc13797498caf Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 5 Jan 2006 12:20:02 -0800
Subject: [NETFILTER]: ctnetlink: Fix dumping of helper name

Properly dump the helper name instead of internal kernel data.
Based on patch by Marcus Sundberg <marcus@ingate.com>.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nfnetlink_conntrack.h | 2 --
 net/ipv4/netfilter/ip_conntrack_netlink.c     | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h
index b8e9a5b6fb1e..668ec946c8e2 100644
--- a/include/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -131,6 +131,4 @@ enum ctattr_help {
 };
 #define CTA_HELP_MAX (__CTA_HELP_MAX - 1)
 
-#define CTA_HELP_MAXNAMESIZE	32
-
 #endif /* _IPCONNTRACK_NETLINK_H */
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index 703f2d2e3464..c9ebbe0d2d9c 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -161,7 +161,7 @@ ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
 		return 0;
 		
 	nest_helper = NFA_NEST(skb, CTA_HELP);
-	NFA_PUT(skb, CTA_HELP_NAME, CTA_HELP_MAXNAMESIZE, &ct->helper->name);
+	NFA_PUT(skb, CTA_HELP_NAME, strlen(ct->helper->name), ct->helper->name);
 
 	if (ct->helper->to_nfattr)
 		ct->helper->to_nfattr(skb, ct);
-- 
cgit v1.2.3-71-gd317


From b777e0ce7437a0e788e2aeb42aca9af2cce1f2e1 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 5 Jan 2006 12:21:16 -0800
Subject: [NETFILTER]: make ipv6_find_hdr() find transport protocol header

The original ipv6_find_hdr() finds the specified header in IPv6 packets.
This makes it possible to get transport header so that we can kill similar
loop in ip6_match_packet().

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv6/ip6_tables.h |   2 +-
 net/ipv6/netfilter/ip6_tables.c           | 106 +++++++++++-------------------
 net/ipv6/netfilter/ip6t_ah.c              |   2 +-
 net/ipv6/netfilter/ip6t_dst.c             |   4 +-
 net/ipv6/netfilter/ip6t_esp.c             |   2 +-
 net/ipv6/netfilter/ip6t_frag.c            |   2 +-
 net/ipv6/netfilter/ip6t_hbh.c             |   4 +-
 net/ipv6/netfilter/ip6t_rt.c              |   2 +-
 8 files changed, 49 insertions(+), 75 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index 2efc046d9e94..a291cb76ef18 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -474,7 +474,7 @@ extern unsigned int ip6t_do_table(struct sk_buff **pskb,
 extern int ip6t_ext_hdr(u8 nexthdr);
 /* find specified header and get offset to it */
 extern int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
-			 u8 target);
+			 int target, unsigned short *fragoff);
 
 #define IP6T_ALIGN(s) (((s) + (__alignof__(struct ip6t_entry)-1)) & ~(__alignof__(struct ip6t_entry)-1))
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index ea43ef1d94a7..13b1a525b92c 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -205,69 +205,21 @@ ip6_packet_match(const struct sk_buff *skb,
 
 	/* look for the desired protocol header */
 	if((ip6info->flags & IP6T_F_PROTO)) {
-		u_int8_t currenthdr = ipv6->nexthdr;
-		struct ipv6_opt_hdr _hdr, *hp;
-		u_int16_t ptr;		/* Header offset in skb */
-		u_int16_t hdrlen;	/* Header */
-		u_int16_t _fragoff = 0, *fp = NULL;
-
-		ptr = IPV6_HDR_LEN;
-
-		while (ip6t_ext_hdr(currenthdr)) {
-	                /* Is there enough space for the next ext header? */
-	                if (skb->len - ptr < IPV6_OPTHDR_LEN)
-	                        return 0;
-
-			/* NONE or ESP: there isn't protocol part */
-			/* If we want to count these packets in '-p all',
-			 * we will change the return 0 to 1*/
-			if ((currenthdr == IPPROTO_NONE) || 
-				(currenthdr == IPPROTO_ESP))
-				break;
+		int protohdr;
+		unsigned short _frag_off;
 
-			hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
-			BUG_ON(hp == NULL);
-
-			/* Size calculation */
-	                if (currenthdr == IPPROTO_FRAGMENT) {
-				fp = skb_header_pointer(skb,
-						   ptr+offsetof(struct frag_hdr,
-								frag_off),
-						   sizeof(_fragoff),
-						   &_fragoff);
-				if (fp == NULL)
-					return 0;
-
-				_fragoff = ntohs(*fp) & ~0x7;
-	                        hdrlen = 8;
-	                } else if (currenthdr == IPPROTO_AH)
-	                        hdrlen = (hp->hdrlen+2)<<2;
-	                else
-	                        hdrlen = ipv6_optlen(hp);
-
-			currenthdr = hp->nexthdr;
-	                ptr += hdrlen;
-			/* ptr is too large */
-	                if ( ptr > skb->len ) 
-				return 0;
-			if (_fragoff) {
-				if (ip6t_ext_hdr(currenthdr))
-					return 0;
-				break;
-			}
-		}
-
-		*protoff = ptr;
-		*fragoff = _fragoff;
+		protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off);
+		if (protohdr < 0)
+			return 0;
 
-		/* currenthdr contains the protocol header */
+		*fragoff = _frag_off;
 
 		dprintf("Packet protocol %hi ?= %s%hi.\n",
-				currenthdr, 
+				protohdr, 
 				ip6info->invflags & IP6T_INV_PROTO ? "!":"",
 				ip6info->proto);
 
-		if (ip6info->proto == currenthdr) {
+		if (ip6info->proto == protohdr) {
 			if(ip6info->invflags & IP6T_INV_PROTO) {
 				return 0;
 			}
@@ -2098,26 +2050,39 @@ static void __exit fini(void)
 }
 
 /*
- * find specified header up to transport protocol header.
- * If found target header, the offset to the header is set to *offset
- * and return 0. otherwise, return -1.
+ * find the offset to specified header or the protocol number of last header
+ * if target < 0. "last header" is transport protocol header, ESP, or
+ * "No next header".
+ *
+ * If target header is found, its offset is set in *offset and return protocol
+ * number. Otherwise, return -1.
+ *
+ * Note that non-1st fragment is special case that "the protocol number
+ * of last header" is "next header" field in Fragment header. In this case,
+ * *offset is meaningless and fragment offset is stored in *fragoff if fragoff
+ * isn't NULL.
  *
- * Notes: - non-1st Fragment Header isn't skipped.
- *	  - ESP header isn't skipped.
- *	  - The target header may be trancated.
  */
-int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, u8 target)
+int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
+		  int target, unsigned short *fragoff)
 {
 	unsigned int start = (u8*)(skb->nh.ipv6h + 1) - skb->data;
 	u8 nexthdr = skb->nh.ipv6h->nexthdr;
 	unsigned int len = skb->len - start;
 
+	if (fragoff)
+		*fragoff = 0;
+
 	while (nexthdr != target) {
 		struct ipv6_opt_hdr _hdr, *hp;
 		unsigned int hdrlen;
 
-		if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE)
+		if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) {
+			if (target < 0)
+				break;
 			return -1;
+		}
+
 		hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
 		if (hp == NULL)
 			return -1;
@@ -2131,8 +2096,17 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, u8 target)
 			if (fp == NULL)
 				return -1;
 
-			if (ntohs(*fp) & ~0x7)
+			_frag_off = ntohs(*fp) & ~0x7;
+			if (_frag_off) {
+				if (target < 0 &&
+				    ((!ipv6_ext_hdr(hp->nexthdr)) ||
+				     nexthdr == NEXTHDR_NONE)) {
+					if (fragoff)
+						*fragoff = _frag_off;
+					return hp->nexthdr;
+				}
 				return -1;
+			}
 			hdrlen = 8;
 		} else if (nexthdr == NEXTHDR_AUTH)
 			hdrlen = (hp->hdrlen + 2) << 2; 
@@ -2145,7 +2119,7 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, u8 target)
 	}
 
 	*offset = start;
-	return 0;
+	return nexthdr;
 }
 
 EXPORT_SYMBOL(ip6t_register_table);
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index 268918d5deea..f5c1a7ff4a1f 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -54,7 +54,7 @@ match(const struct sk_buff *skb,
 	unsigned int ptr;
 	unsigned int hdrlen = 0;
 
-	if (ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH) < 0)
+	if (ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH, NULL) < 0)
 		return 0;
 
 	ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah);
diff --git a/net/ipv6/netfilter/ip6t_dst.c b/net/ipv6/netfilter/ip6t_dst.c
index c450a635e54b..48cf5f9efc95 100644
--- a/net/ipv6/netfilter/ip6t_dst.c
+++ b/net/ipv6/netfilter/ip6t_dst.c
@@ -71,9 +71,9 @@ match(const struct sk_buff *skb,
        unsigned int optlen;
        
 #if HOPBYHOP
-	if (ipv6_find_hdr(skb, &ptr, NEXTHDR_HOP) < 0)
+	if (ipv6_find_hdr(skb, &ptr, NEXTHDR_HOP, NULL) < 0)
 #else
-	if (ipv6_find_hdr(skb, &ptr, NEXTHDR_DEST) < 0)
+	if (ipv6_find_hdr(skb, &ptr, NEXTHDR_DEST, NULL) < 0)
 #endif
 		return 0;
 
diff --git a/net/ipv6/netfilter/ip6t_esp.c b/net/ipv6/netfilter/ip6t_esp.c
index 65937de1b58c..e1828f6d0a40 100644
--- a/net/ipv6/netfilter/ip6t_esp.c
+++ b/net/ipv6/netfilter/ip6t_esp.c
@@ -56,7 +56,7 @@ match(const struct sk_buff *skb,
 	/* Make sure this isn't an evil packet */
 	/*DEBUGP("ipv6_esp entered \n");*/
 
-	if (ipv6_find_hdr(skb, &ptr, NEXTHDR_ESP) < 0)
+	if (ipv6_find_hdr(skb, &ptr, NEXTHDR_ESP, NULL) < 0)
 		return 0;
 
 	eh = skb_header_pointer(skb, ptr, sizeof(_esp), &_esp);
diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c
index 085d5f8eea29..d1549b268669 100644
--- a/net/ipv6/netfilter/ip6t_frag.c
+++ b/net/ipv6/netfilter/ip6t_frag.c
@@ -52,7 +52,7 @@ match(const struct sk_buff *skb,
        const struct ip6t_frag *fraginfo = matchinfo;
        unsigned int ptr;
 
-	if (ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT) < 0)
+	if (ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL) < 0)
 		return 0;
 
 	fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag);
diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c
index 1d09485111d0..e3bc8e2700e7 100644
--- a/net/ipv6/netfilter/ip6t_hbh.c
+++ b/net/ipv6/netfilter/ip6t_hbh.c
@@ -71,9 +71,9 @@ match(const struct sk_buff *skb,
        unsigned int optlen;
        
 #if HOPBYHOP
-	if (ipv6_find_hdr(skb, &ptr, NEXTHDR_HOP) < 0)
+	if (ipv6_find_hdr(skb, &ptr, NEXTHDR_HOP, NULL) < 0)
 #else
-	if (ipv6_find_hdr(skb, &ptr, NEXTHDR_DEST) < 0)
+	if (ipv6_find_hdr(skb, &ptr, NEXTHDR_DEST, NULL) < 0)
 #endif
 		return 0;
 
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
index beb2fd5cebbb..c1e770e45543 100644
--- a/net/ipv6/netfilter/ip6t_rt.c
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -58,7 +58,7 @@ match(const struct sk_buff *skb,
        unsigned int ret = 0;
        struct in6_addr *ap, _addr;
 
-	if (ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING) < 0)
+	if (ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING, NULL) < 0)
 		return 0;
 
        rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route);
-- 
cgit v1.2.3-71-gd317


From 22dea562bb56dbc3430c8f23f60ccd38527b1f5a Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 5 Jan 2006 12:21:34 -0800
Subject: [NETFILTER]: Export ip6_masked_addrcmp, don't pass IPv6 addresses on
 stack

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv6/ip6_tables.h |  4 ++++
 net/ipv6/netfilter/ip6_tables.c           | 18 ++++++++++--------
 2 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index a291cb76ef18..c163ba31aab7 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -476,6 +476,10 @@ extern int ip6t_ext_hdr(u8 nexthdr);
 extern int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
 			 int target, unsigned short *fragoff);
 
+extern int ip6_masked_addrcmp(const struct in6_addr *addr1,
+			      const struct in6_addr *mask,
+			      const struct in6_addr *addr2);
+
 #define IP6T_ALIGN(s) (((s) + (__alignof__(struct ip6t_entry)-1)) & ~(__alignof__(struct ip6t_entry)-1))
 
 #endif /*__KERNEL__*/
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 13b1a525b92c..925b42d48347 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -119,13 +119,14 @@ static LIST_HEAD(ip6t_tables);
 #define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0)
 #endif
 
-static int ip6_masked_addrcmp(struct in6_addr addr1, struct in6_addr mask,
-			      struct in6_addr addr2)
+int
+ip6_masked_addrcmp(const struct in6_addr *addr1, const struct in6_addr *mask,
+                   const struct in6_addr *addr2)
 {
 	int i;
 	for( i = 0; i < 16; i++){
-		if((addr1.s6_addr[i] & mask.s6_addr[i]) != 
-		   (addr2.s6_addr[i] & mask.s6_addr[i]))
+		if((addr1->s6_addr[i] & mask->s6_addr[i]) != 
+		   (addr2->s6_addr[i] & mask->s6_addr[i]))
 			return 1;
 	}
 	return 0;
@@ -159,10 +160,10 @@ ip6_packet_match(const struct sk_buff *skb,
 
 #define FWINV(bool,invflg) ((bool) ^ !!(ip6info->invflags & invflg))
 
-	if (FWINV(ip6_masked_addrcmp(ipv6->saddr,ip6info->smsk,ip6info->src),
-		  IP6T_INV_SRCIP)
-	    || FWINV(ip6_masked_addrcmp(ipv6->daddr,ip6info->dmsk,ip6info->dst),
-		     IP6T_INV_DSTIP)) {
+	if (FWINV(ip6_masked_addrcmp(&ipv6->saddr, &ip6info->smsk,
+	                             &ip6info->src), IP6T_INV_SRCIP)
+	    || FWINV(ip6_masked_addrcmp(&ipv6->daddr, &ip6info->dmsk,
+	                                &ip6info->dst), IP6T_INV_DSTIP)) {
 		dprintf("Source or dest mismatch.\n");
 /*
 		dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr,
@@ -2131,6 +2132,7 @@ EXPORT_SYMBOL(ip6t_register_target);
 EXPORT_SYMBOL(ip6t_unregister_target);
 EXPORT_SYMBOL(ip6t_ext_hdr);
 EXPORT_SYMBOL(ipv6_find_hdr);
+EXPORT_SYMBOL(ip6_masked_addrcmp);
 
 module_init(init);
 module_exit(fini);
-- 
cgit v1.2.3-71-gd317


From ff179c8cf5caa17bf3d407edbb5872aa2eee6900 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Sat, 26 Nov 2005 20:24:59 +0100
Subject: [PATCH] i2c: Drop i2c_driver.flags, 1 of 3

The I2C_DF_DUMMY flag is gone since 2.5.70, it's about time to
drop all ifdef'd out references thereto.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/media/video/tvmixer.c | 4 ----
 include/linux/i2c.h           | 5 -----
 2 files changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/tvmixer.c b/drivers/media/video/tvmixer.c
index 8318bd1aad00..5897e5d4d3d2 100644
--- a/drivers/media/video/tvmixer.c
+++ b/drivers/media/video/tvmixer.c
@@ -232,12 +232,8 @@ static struct i2c_driver driver = {
 #endif
 	.name            = "tv card mixer driver",
 	.id              = I2C_DRIVERID_TVMIXER,
-#ifdef I2C_DF_DUMMY
-	.flags           = I2C_DF_DUMMY,
-#else
 	.flags           = I2C_DF_NOTIFY,
 	.detach_adapter  = tvmixer_adapters,
-#endif
 	.attach_adapter  = tvmixer_adapters,
 	.detach_client   = tvmixer_clients,
 };
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 5e19a7ba69b2..0316ba1294ca 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -252,11 +252,6 @@ static inline void i2c_set_adapdata (struct i2c_adapter *dev, void *data)
 
 /*flags for the driver struct: */
 #define I2C_DF_NOTIFY	0x01		/* notify on bus (de/a)ttaches 	*/
-#if 0
-/* this flag is gone -- there is a (optional) driver->detach_adapter
- * callback now which can be used instead */
-# define I2C_DF_DUMMY	0x02
-#endif
 
 /*flags for the client struct: */
 #define I2C_CLIENT_ALLOW_USE		0x01	/* Client allows access */
-- 
cgit v1.2.3-71-gd317


From 8a9947552d43b0d20d5fa23ac0ba435d526be454 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Sat, 26 Nov 2005 20:28:06 +0100
Subject: [PATCH] i2c: Drop i2c_driver.flags, 2 of 3

Just about every i2c chip driver sets the I2C_DF_NOTIFY flag, so we
can simply make it the default and drop the flag. If any driver really
doesn't want to be notified when i2c adapters are added, that driver
can simply omit to set .attach_adapter. This approach is also more
robust as it prevents accidental NULL pointer dereferences.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/i2c/porting-clients              | 3 +++
 Documentation/i2c/writing-clients              | 5 -----
 arch/arm/mach-pxa/akita-ioexp.c                | 1 -
 drivers/acorn/char/pcf8583.c                   | 1 -
 drivers/hwmon/adm1021.c                        | 1 -
 drivers/hwmon/adm1025.c                        | 1 -
 drivers/hwmon/adm1026.c                        | 1 -
 drivers/hwmon/adm1031.c                        | 1 -
 drivers/hwmon/adm9240.c                        | 1 -
 drivers/hwmon/asb100.c                         | 1 -
 drivers/hwmon/atxp1.c                          | 1 -
 drivers/hwmon/ds1621.c                         | 1 -
 drivers/hwmon/fscher.c                         | 1 -
 drivers/hwmon/fscpos.c                         | 1 -
 drivers/hwmon/gl518sm.c                        | 1 -
 drivers/hwmon/gl520sm.c                        | 1 -
 drivers/hwmon/it87.c                           | 1 -
 drivers/hwmon/lm63.c                           | 1 -
 drivers/hwmon/lm75.c                           | 1 -
 drivers/hwmon/lm77.c                           | 1 -
 drivers/hwmon/lm78.c                           | 1 -
 drivers/hwmon/lm80.c                           | 1 -
 drivers/hwmon/lm83.c                           | 1 -
 drivers/hwmon/lm85.c                           | 1 -
 drivers/hwmon/lm87.c                           | 1 -
 drivers/hwmon/lm90.c                           | 1 -
 drivers/hwmon/lm92.c                           | 1 -
 drivers/hwmon/max1619.c                        | 1 -
 drivers/hwmon/w83781d.c                        | 1 -
 drivers/hwmon/w83792d.c                        | 1 -
 drivers/hwmon/w83l785ts.c                      | 1 -
 drivers/i2c/chips/ds1337.c                     | 1 -
 drivers/i2c/chips/ds1374.c                     | 1 -
 drivers/i2c/chips/eeprom.c                     | 1 -
 drivers/i2c/chips/isp1301_omap.c               | 1 -
 drivers/i2c/chips/m41t00.c                     | 1 -
 drivers/i2c/chips/max6875.c                    | 1 -
 drivers/i2c/chips/pca9539.c                    | 1 -
 drivers/i2c/chips/pcf8574.c                    | 1 -
 drivers/i2c/chips/pcf8591.c                    | 1 -
 drivers/i2c/chips/rtc8564.c                    | 1 -
 drivers/i2c/chips/tps65010.c                   | 1 -
 drivers/i2c/chips/x1205.c                      | 1 -
 drivers/i2c/i2c-core.c                         | 4 ++--
 drivers/i2c/i2c-dev.c                          | 1 -
 drivers/macintosh/therm_adt746x.c              | 1 -
 drivers/macintosh/therm_pm72.c                 | 1 -
 drivers/macintosh/therm_windtunnel.c           | 1 -
 drivers/macintosh/windfarm_lm75_sensor.c       | 1 -
 drivers/media/video/adv7170.c                  | 1 -
 drivers/media/video/adv7175.c                  | 1 -
 drivers/media/video/bt819.c                    | 1 -
 drivers/media/video/bt832.c                    | 1 -
 drivers/media/video/bt856.c                    | 1 -
 drivers/media/video/cs53l32a.c                 | 1 -
 drivers/media/video/cx25840/cx25840-core.c     | 1 -
 drivers/media/video/indycam.c                  | 1 -
 drivers/media/video/ir-kbd-i2c.c               | 1 -
 drivers/media/video/msp3400.c                  | 1 -
 drivers/media/video/ovcamchip/ovcamchip_core.c | 1 -
 drivers/media/video/saa5246a.c                 | 1 -
 drivers/media/video/saa5249.c                  | 1 -
 drivers/media/video/saa6588.c                  | 1 -
 drivers/media/video/saa7110.c                  | 1 -
 drivers/media/video/saa7111.c                  | 1 -
 drivers/media/video/saa7114.c                  | 1 -
 drivers/media/video/saa7115.c                  | 1 -
 drivers/media/video/saa711x.c                  | 1 -
 drivers/media/video/saa7127.c                  | 1 -
 drivers/media/video/saa7134/saa6752hs.c        | 1 -
 drivers/media/video/saa7185.c                  | 1 -
 drivers/media/video/saa7191.c                  | 1 -
 drivers/media/video/tda7432.c                  | 1 -
 drivers/media/video/tda9840.c                  | 1 -
 drivers/media/video/tda9875.c                  | 1 -
 drivers/media/video/tda9887.c                  | 1 -
 drivers/media/video/tea6415c.c                 | 1 -
 drivers/media/video/tea6420.c                  | 1 -
 drivers/media/video/tuner-3036.c               | 1 -
 drivers/media/video/tuner-core.c               | 1 -
 drivers/media/video/tvaudio.c                  | 1 -
 drivers/media/video/tveeprom.c                 | 1 -
 drivers/media/video/tvmixer.c                  | 1 -
 drivers/media/video/tvp5150.c                  | 1 -
 drivers/media/video/vpx3220.c                  | 1 -
 drivers/media/video/wm8775.c                   | 1 -
 drivers/video/matrox/matroxfb_maven.c          | 1 -
 include/linux/i2c.h                            | 1 -
 sound/oss/dmasound/dac3550a.c                  | 1 -
 sound/oss/dmasound/tas_common.c                | 1 -
 sound/ppc/keywest.c                            | 1 -
 91 files changed, 5 insertions(+), 95 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/i2c/porting-clients b/Documentation/i2c/porting-clients
index 184fac2377aa..64c610bf2fbc 100644
--- a/Documentation/i2c/porting-clients
+++ b/Documentation/i2c/porting-clients
@@ -109,6 +109,9 @@ Technical changes:
   there is a MODULE_LICENSE() line, at the bottom of the file
   (after MODULE_AUTHOR() and MODULE_DESCRIPTION(), in this order).
 
+* [Driver] The flags field of the i2c_driver structure is gone.
+  I2C_DF_NOTIFY is now the default behavior.
+
 Coding policy:
 
 * [Copyright] Use (C), not (c), for copyright.
diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients
index d19993cc0604..59d2c169cd61 100644
--- a/Documentation/i2c/writing-clients
+++ b/Documentation/i2c/writing-clients
@@ -27,7 +27,6 @@ address.
 static struct i2c_driver foo_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "Foo version 2.3 driver",
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= &foo_attach_adapter,
 	.detach_client	= &foo_detach_client,
 	.command	= &foo_command /* may be NULL */
@@ -36,10 +35,6 @@ static struct i2c_driver foo_driver = {
 The name field must match the driver name, including the case. It must not
 contain spaces, and may be up to 31 characters long.
 
-Don't worry about the flags field; just put I2C_DF_NOTIFY into it. This
-means that your driver will be notified when new adapters are found.
-This is almost always what you want.
-
 All other fields are for call-back functions which will be explained 
 below.
 
diff --git a/arch/arm/mach-pxa/akita-ioexp.c b/arch/arm/mach-pxa/akita-ioexp.c
index f6d73cc01f78..440ebb3c3db1 100644
--- a/arch/arm/mach-pxa/akita-ioexp.c
+++ b/arch/arm/mach-pxa/akita-ioexp.c
@@ -127,7 +127,6 @@ static struct i2c_driver max7310_i2c_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "akita-max7310",
 	.id		= I2C_DRIVERID_AKITAIOEXP,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= max7310_attach_adapter,
 	.detach_client	= max7310_detach_client,
 };
diff --git a/drivers/acorn/char/pcf8583.c b/drivers/acorn/char/pcf8583.c
index e26f007a1417..befc9469b4f2 100644
--- a/drivers/acorn/char/pcf8583.c
+++ b/drivers/acorn/char/pcf8583.c
@@ -259,7 +259,6 @@ pcf8583_command(struct i2c_client *client, unsigned int cmd, void *arg)
 static struct i2c_driver pcf8583_driver = {
 	.name		= "PCF8583",
 	.id		= I2C_DRIVERID_PCF8583,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= pcf8583_probe,
 	.detach_client	= pcf8583_detach,
 	.command	= pcf8583_command
diff --git a/drivers/hwmon/adm1021.c b/drivers/hwmon/adm1021.c
index 8102876c7c3f..5e6df4b7b857 100644
--- a/drivers/hwmon/adm1021.c
+++ b/drivers/hwmon/adm1021.c
@@ -129,7 +129,6 @@ static struct i2c_driver adm1021_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "adm1021",
 	.id		= I2C_DRIVERID_ADM1021,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= adm1021_attach_adapter,
 	.detach_client	= adm1021_detach_client,
 };
diff --git a/drivers/hwmon/adm1025.c b/drivers/hwmon/adm1025.c
index bf67860e6a20..2be48a7a90b8 100644
--- a/drivers/hwmon/adm1025.c
+++ b/drivers/hwmon/adm1025.c
@@ -121,7 +121,6 @@ static struct i2c_driver adm1025_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "adm1025",
 	.id		= I2C_DRIVERID_ADM1025,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= adm1025_attach_adapter,
 	.detach_client	= adm1025_detach_client,
 };
diff --git a/drivers/hwmon/adm1026.c b/drivers/hwmon/adm1026.c
index 5036b17a39cd..5416db809f97 100644
--- a/drivers/hwmon/adm1026.c
+++ b/drivers/hwmon/adm1026.c
@@ -310,7 +310,6 @@ static void adm1026_init_client(struct i2c_client *client);
 static struct i2c_driver adm1026_driver = {
 	.owner          = THIS_MODULE,
 	.name           = "adm1026",
-	.flags          = I2C_DF_NOTIFY,
 	.attach_adapter = adm1026_attach_adapter,
 	.detach_client  = adm1026_detach_client,
 };
diff --git a/drivers/hwmon/adm1031.c b/drivers/hwmon/adm1031.c
index 7c545d5eee45..1e24428090c4 100644
--- a/drivers/hwmon/adm1031.c
+++ b/drivers/hwmon/adm1031.c
@@ -107,7 +107,6 @@ static struct adm1031_data *adm1031_update_device(struct device *dev);
 static struct i2c_driver adm1031_driver = {
 	.owner = THIS_MODULE,
 	.name = "adm1031",
-	.flags = I2C_DF_NOTIFY,
 	.attach_adapter = adm1031_attach_adapter,
 	.detach_client = adm1031_detach_client,
 };
diff --git a/drivers/hwmon/adm9240.c b/drivers/hwmon/adm9240.c
index 11dc95f8a17e..287733fe2c0d 100644
--- a/drivers/hwmon/adm9240.c
+++ b/drivers/hwmon/adm9240.c
@@ -140,7 +140,6 @@ static struct i2c_driver adm9240_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "adm9240",
 	.id		= I2C_DRIVERID_ADM9240,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= adm9240_attach_adapter,
 	.detach_client	= adm9240_detach_client,
 };
diff --git a/drivers/hwmon/asb100.c b/drivers/hwmon/asb100.c
index 52c469722a65..7227f800bef9 100644
--- a/drivers/hwmon/asb100.c
+++ b/drivers/hwmon/asb100.c
@@ -220,7 +220,6 @@ static struct i2c_driver asb100_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "asb100",
 	.id		= I2C_DRIVERID_ASB100,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= asb100_attach_adapter,
 	.detach_client	= asb100_detach_client,
 };
diff --git a/drivers/hwmon/atxp1.c b/drivers/hwmon/atxp1.c
index 53324f56404e..a60a9f20281b 100644
--- a/drivers/hwmon/atxp1.c
+++ b/drivers/hwmon/atxp1.c
@@ -52,7 +52,6 @@ static int atxp1_detect(struct i2c_adapter *adapter, int address, int kind);
 static struct i2c_driver atxp1_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "atxp1",
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter = atxp1_attach_adapter,
 	.detach_client	= atxp1_detach_client,
 };
diff --git a/drivers/hwmon/ds1621.c b/drivers/hwmon/ds1621.c
index 34f71b7c7f37..0096eb37c663 100644
--- a/drivers/hwmon/ds1621.c
+++ b/drivers/hwmon/ds1621.c
@@ -92,7 +92,6 @@ static struct i2c_driver ds1621_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "ds1621",
 	.id		= I2C_DRIVERID_DS1621,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= ds1621_attach_adapter,
 	.detach_client	= ds1621_detach_client,
 };
diff --git a/drivers/hwmon/fscher.c b/drivers/hwmon/fscher.c
index a02e1c34c757..f56ca06dbf88 100644
--- a/drivers/hwmon/fscher.c
+++ b/drivers/hwmon/fscher.c
@@ -121,7 +121,6 @@ static struct i2c_driver fscher_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "fscher",
 	.id		= I2C_DRIVERID_FSCHER,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= fscher_attach_adapter,
 	.detach_client	= fscher_detach_client,
 };
diff --git a/drivers/hwmon/fscpos.c b/drivers/hwmon/fscpos.c
index 64e4edc64f8d..701dffc2ceed 100644
--- a/drivers/hwmon/fscpos.c
+++ b/drivers/hwmon/fscpos.c
@@ -103,7 +103,6 @@ static struct i2c_driver fscpos_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "fscpos",
 	.id		= I2C_DRIVERID_FSCPOS,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= fscpos_attach_adapter,
 	.detach_client	= fscpos_detach_client,
 };
diff --git a/drivers/hwmon/gl518sm.c b/drivers/hwmon/gl518sm.c
index 2f178dbe3d87..5788bbb77d8d 100644
--- a/drivers/hwmon/gl518sm.c
+++ b/drivers/hwmon/gl518sm.c
@@ -154,7 +154,6 @@ static struct i2c_driver gl518_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "gl518sm",
 	.id		= I2C_DRIVERID_GL518,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= gl518_attach_adapter,
 	.detach_client	= gl518_detach_client,
 };
diff --git a/drivers/hwmon/gl520sm.c b/drivers/hwmon/gl520sm.c
index c39ba1239426..b3998165193d 100644
--- a/drivers/hwmon/gl520sm.c
+++ b/drivers/hwmon/gl520sm.c
@@ -112,7 +112,6 @@ static struct i2c_driver gl520_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "gl520sm",
 	.id		= I2C_DRIVERID_GL520,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= gl520_attach_adapter,
 	.detach_client	= gl520_detach_client,
 };
diff --git a/drivers/hwmon/it87.c b/drivers/hwmon/it87.c
index a61f5d00f10a..d5f0d92378c5 100644
--- a/drivers/hwmon/it87.c
+++ b/drivers/hwmon/it87.c
@@ -237,7 +237,6 @@ static struct i2c_driver it87_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "it87",
 	.id		= I2C_DRIVERID_IT87,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= it87_attach_adapter,
 	.detach_client	= it87_detach_client,
 };
diff --git a/drivers/hwmon/lm63.c b/drivers/hwmon/lm63.c
index 954ec2497249..c2dd4ac8042d 100644
--- a/drivers/hwmon/lm63.c
+++ b/drivers/hwmon/lm63.c
@@ -141,7 +141,6 @@ static void lm63_init_client(struct i2c_client *client);
 static struct i2c_driver lm63_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "lm63",
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= lm63_attach_adapter,
 	.detach_client	= lm63_detach_client,
 };
diff --git a/drivers/hwmon/lm75.c b/drivers/hwmon/lm75.c
index d70f4c8fc1e6..0bcbd6515139 100644
--- a/drivers/hwmon/lm75.c
+++ b/drivers/hwmon/lm75.c
@@ -69,7 +69,6 @@ static struct i2c_driver lm75_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "lm75",
 	.id		= I2C_DRIVERID_LM75,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= lm75_attach_adapter,
 	.detach_client	= lm75_detach_client,
 };
diff --git a/drivers/hwmon/lm77.c b/drivers/hwmon/lm77.c
index 9380fda7dcd1..6a524e92432e 100644
--- a/drivers/hwmon/lm77.c
+++ b/drivers/hwmon/lm77.c
@@ -76,7 +76,6 @@ static struct lm77_data *lm77_update_device(struct device *dev);
 static struct i2c_driver lm77_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "lm77",
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter = lm77_attach_adapter,
 	.detach_client	= lm77_detach_client,
 };
diff --git a/drivers/hwmon/lm78.c b/drivers/hwmon/lm78.c
index 78cdd506439f..18448f83a6b0 100644
--- a/drivers/hwmon/lm78.c
+++ b/drivers/hwmon/lm78.c
@@ -167,7 +167,6 @@ static struct i2c_driver lm78_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "lm78",
 	.id		= I2C_DRIVERID_LM78,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= lm78_attach_adapter,
 	.detach_client	= lm78_detach_client,
 };
diff --git a/drivers/hwmon/lm80.c b/drivers/hwmon/lm80.c
index c359fdea211e..02303fa0c464 100644
--- a/drivers/hwmon/lm80.c
+++ b/drivers/hwmon/lm80.c
@@ -146,7 +146,6 @@ static struct i2c_driver lm80_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "lm80",
 	.id		= I2C_DRIVERID_LM80,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= lm80_attach_adapter,
 	.detach_client	= lm80_detach_client,
 };
diff --git a/drivers/hwmon/lm83.c b/drivers/hwmon/lm83.c
index 9a70611a9f69..96cb34ea2490 100644
--- a/drivers/hwmon/lm83.c
+++ b/drivers/hwmon/lm83.c
@@ -127,7 +127,6 @@ static struct i2c_driver lm83_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "lm83",
 	.id		= I2C_DRIVERID_LM83,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= lm83_attach_adapter,
 	.detach_client	= lm83_detach_client,
 };
diff --git a/drivers/hwmon/lm85.c b/drivers/hwmon/lm85.c
index 3f5544a40f3c..131ecab094ac 100644
--- a/drivers/hwmon/lm85.c
+++ b/drivers/hwmon/lm85.c
@@ -383,7 +383,6 @@ static struct i2c_driver lm85_driver = {
 	.owner          = THIS_MODULE,
 	.name           = "lm85",
 	.id             = I2C_DRIVERID_LM85,
-	.flags          = I2C_DF_NOTIFY,
 	.attach_adapter = lm85_attach_adapter,
 	.detach_client  = lm85_detach_client,
 };
diff --git a/drivers/hwmon/lm87.c b/drivers/hwmon/lm87.c
index eeec18177861..26fd0b33beaf 100644
--- a/drivers/hwmon/lm87.c
+++ b/drivers/hwmon/lm87.c
@@ -164,7 +164,6 @@ static struct i2c_driver lm87_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "lm87",
 	.id		= I2C_DRIVERID_LM87,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= lm87_attach_adapter,
 	.detach_client	= lm87_detach_client,
 };
diff --git a/drivers/hwmon/lm90.c b/drivers/hwmon/lm90.c
index 83cf2e1b09f5..011923b7091d 100644
--- a/drivers/hwmon/lm90.c
+++ b/drivers/hwmon/lm90.c
@@ -189,7 +189,6 @@ static struct i2c_driver lm90_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "lm90",
 	.id		= I2C_DRIVERID_LM90,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= lm90_attach_adapter,
 	.detach_client	= lm90_detach_client,
 };
diff --git a/drivers/hwmon/lm92.c b/drivers/hwmon/lm92.c
index 7a4b3701ed1a..2005a9ee61fb 100644
--- a/drivers/hwmon/lm92.c
+++ b/drivers/hwmon/lm92.c
@@ -413,7 +413,6 @@ static struct i2c_driver lm92_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "lm92",
 	.id		= I2C_DRIVERID_LM92,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= lm92_attach_adapter,
 	.detach_client	= lm92_detach_client,
 };
diff --git a/drivers/hwmon/max1619.c b/drivers/hwmon/max1619.c
index 69e7e125683b..d5aebef126d5 100644
--- a/drivers/hwmon/max1619.c
+++ b/drivers/hwmon/max1619.c
@@ -92,7 +92,6 @@ static struct max1619_data *max1619_update_device(struct device *dev);
 static struct i2c_driver max1619_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "max1619",
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= max1619_attach_adapter,
 	.detach_client	= max1619_detach_client,
 };
diff --git a/drivers/hwmon/w83781d.c b/drivers/hwmon/w83781d.c
index ffdb3a03e2b5..a78929f2b3d8 100644
--- a/drivers/hwmon/w83781d.c
+++ b/drivers/hwmon/w83781d.c
@@ -272,7 +272,6 @@ static struct i2c_driver w83781d_driver = {
 	.owner = THIS_MODULE,
 	.name = "w83781d",
 	.id = I2C_DRIVERID_W83781D,
-	.flags = I2C_DF_NOTIFY,
 	.attach_adapter = w83781d_attach_adapter,
 	.detach_client = w83781d_detach_client,
 };
diff --git a/drivers/hwmon/w83792d.c b/drivers/hwmon/w83792d.c
index 1ba072630361..6824243d90d7 100644
--- a/drivers/hwmon/w83792d.c
+++ b/drivers/hwmon/w83792d.c
@@ -319,7 +319,6 @@ static void w83792d_init_client(struct i2c_client *client);
 static struct i2c_driver w83792d_driver = {
 	.owner = THIS_MODULE,
 	.name = "w83792d",
-	.flags = I2C_DF_NOTIFY,
 	.attach_adapter = w83792d_attach_adapter,
 	.detach_client = w83792d_detach_client,
 };
diff --git a/drivers/hwmon/w83l785ts.c b/drivers/hwmon/w83l785ts.c
index f495b6378668..35172fb455d0 100644
--- a/drivers/hwmon/w83l785ts.c
+++ b/drivers/hwmon/w83l785ts.c
@@ -95,7 +95,6 @@ static struct i2c_driver w83l785ts_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "w83l785ts",
 	.id		= I2C_DRIVERID_W83L785TS,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= w83l785ts_attach_adapter,
 	.detach_client	= w83l785ts_detach_client,
 };
diff --git a/drivers/i2c/chips/ds1337.c b/drivers/i2c/chips/ds1337.c
index 18228957606c..65146cbc8390 100644
--- a/drivers/i2c/chips/ds1337.c
+++ b/drivers/i2c/chips/ds1337.c
@@ -54,7 +54,6 @@ static int ds1337_command(struct i2c_client *client, unsigned int cmd,
 static struct i2c_driver ds1337_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "ds1337",
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= ds1337_attach_adapter,
 	.detach_client	= ds1337_detach_client,
 	.command	= ds1337_command,
diff --git a/drivers/i2c/chips/ds1374.c b/drivers/i2c/chips/ds1374.c
index da488b735abf..5a270d60b699 100644
--- a/drivers/i2c/chips/ds1374.c
+++ b/drivers/i2c/chips/ds1374.c
@@ -235,7 +235,6 @@ static struct i2c_driver ds1374_driver = {
 	.owner = THIS_MODULE,
 	.name = DS1374_DRV_NAME,
 	.id = I2C_DRIVERID_DS1374,
-	.flags = I2C_DF_NOTIFY,
 	.attach_adapter = ds1374_attach,
 	.detach_client = ds1374_detach,
 };
diff --git a/drivers/i2c/chips/eeprom.c b/drivers/i2c/chips/eeprom.c
index 4baf573fa04f..9bb1f8b3f38c 100644
--- a/drivers/i2c/chips/eeprom.c
+++ b/drivers/i2c/chips/eeprom.c
@@ -71,7 +71,6 @@ static struct i2c_driver eeprom_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "eeprom",
 	.id		= I2C_DRIVERID_EEPROM,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= eeprom_attach_adapter,
 	.detach_client	= eeprom_detach_client,
 };
diff --git a/drivers/i2c/chips/isp1301_omap.c b/drivers/i2c/chips/isp1301_omap.c
index d2a100d77839..4f472ba66a02 100644
--- a/drivers/i2c/chips/isp1301_omap.c
+++ b/drivers/i2c/chips/isp1301_omap.c
@@ -1636,7 +1636,6 @@ static struct i2c_driver isp1301_driver = {
 	.name		= "isp1301_omap",
 	.id		= 1301,		/* FIXME "official", i2c-ids.h */
 	.class		= I2C_CLASS_HWMON,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= isp1301_scan_bus,
 	.detach_client	= isp1301_detach_client,
 };
diff --git a/drivers/i2c/chips/m41t00.c b/drivers/i2c/chips/m41t00.c
index 3df309ae44a6..13e67836b348 100644
--- a/drivers/i2c/chips/m41t00.c
+++ b/drivers/i2c/chips/m41t00.c
@@ -214,7 +214,6 @@ static struct i2c_driver m41t00_driver = {
 	.owner		= THIS_MODULE,
 	.name		= M41T00_DRV_NAME,
 	.id		= I2C_DRIVERID_STM41T00,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= m41t00_attach,
 	.detach_client	= m41t00_detach,
 };
diff --git a/drivers/i2c/chips/max6875.c b/drivers/i2c/chips/max6875.c
index b376a006883c..7e61019e72dd 100644
--- a/drivers/i2c/chips/max6875.c
+++ b/drivers/i2c/chips/max6875.c
@@ -69,7 +69,6 @@ static int max6875_detach_client(struct i2c_client *client);
 static struct i2c_driver max6875_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "max6875",
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= max6875_attach_adapter,
 	.detach_client	= max6875_detach_client,
 };
diff --git a/drivers/i2c/chips/pca9539.c b/drivers/i2c/chips/pca9539.c
index 59a930346229..26feb7a4f942 100644
--- a/drivers/i2c/chips/pca9539.c
+++ b/drivers/i2c/chips/pca9539.c
@@ -40,7 +40,6 @@ static int pca9539_detach_client(struct i2c_client *client);
 static struct i2c_driver pca9539_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "pca9539",
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= pca9539_attach_adapter,
 	.detach_client	= pca9539_detach_client,
 };
diff --git a/drivers/i2c/chips/pcf8574.c b/drivers/i2c/chips/pcf8574.c
index c323c2de236c..2fae640cf329 100644
--- a/drivers/i2c/chips/pcf8574.c
+++ b/drivers/i2c/chips/pcf8574.c
@@ -68,7 +68,6 @@ static struct i2c_driver pcf8574_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "pcf8574",
 	.id		= I2C_DRIVERID_PCF8574,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= pcf8574_attach_adapter,
 	.detach_client	= pcf8574_detach_client,
 };
diff --git a/drivers/i2c/chips/pcf8591.c b/drivers/i2c/chips/pcf8591.c
index ce420a67560b..8750f71278e1 100644
--- a/drivers/i2c/chips/pcf8591.c
+++ b/drivers/i2c/chips/pcf8591.c
@@ -91,7 +91,6 @@ static struct i2c_driver pcf8591_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "pcf8591",
 	.id		= I2C_DRIVERID_PCF8591,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= pcf8591_attach_adapter,
 	.detach_client	= pcf8591_detach_client,
 };
diff --git a/drivers/i2c/chips/rtc8564.c b/drivers/i2c/chips/rtc8564.c
index 26e498d921da..e586f75dd024 100644
--- a/drivers/i2c/chips/rtc8564.c
+++ b/drivers/i2c/chips/rtc8564.c
@@ -362,7 +362,6 @@ static struct i2c_driver rtc8564_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "RTC8564",
 	.id		= I2C_DRIVERID_RTC8564,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter = rtc8564_probe,
 	.detach_client	= rtc8564_detach,
 	.command	= rtc8564_command
diff --git a/drivers/i2c/chips/tps65010.c b/drivers/i2c/chips/tps65010.c
index 280dd7a45db6..439bf6ceb789 100644
--- a/drivers/i2c/chips/tps65010.c
+++ b/drivers/i2c/chips/tps65010.c
@@ -639,7 +639,6 @@ static int __init tps65010_scan_bus(struct i2c_adapter *bus)
 static struct i2c_driver tps65010_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "tps65010",
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= tps65010_scan_bus,
 	.detach_client	= __exit_p(tps65010_detach_client),
 };
diff --git a/drivers/i2c/chips/x1205.c b/drivers/i2c/chips/x1205.c
index 7da366cdc18c..c5ff2cee15ae 100644
--- a/drivers/i2c/chips/x1205.c
+++ b/drivers/i2c/chips/x1205.c
@@ -107,7 +107,6 @@ static int x1205_command(struct i2c_client *client, unsigned int cmd,
 static struct i2c_driver x1205_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "x1205",
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter = &x1205_attach,
 	.detach_client	= &x1205_detach,
 };
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 82ea1b7ec914..ad68ac00d910 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -197,7 +197,7 @@ int i2c_add_adapter(struct i2c_adapter *adap)
 	/* inform drivers of new adapters */
 	list_for_each(item,&drivers) {
 		driver = list_entry(item, struct i2c_driver, list);
-		if (driver->flags & I2C_DF_NOTIFY)
+		if (driver->attach_adapter)
 			/* We ignore the return code; if it fails, too bad */
 			driver->attach_adapter(adap);
 	}
@@ -309,7 +309,7 @@ int i2c_add_driver(struct i2c_driver *driver)
 	pr_debug("i2c-core: driver [%s] registered\n", driver->name);
 
 	/* now look for instances of driver on our adapters */
-	if (driver->flags & I2C_DF_NOTIFY) {
+	if (driver->attach_adapter) {
 		list_for_each(item,&adapters) {
 			adapter = list_entry(item, struct i2c_adapter, list);
 			driver->attach_adapter(adapter);
diff --git a/drivers/i2c/i2c-dev.c b/drivers/i2c/i2c-dev.c
index 8af0bd1424d2..9da51eb37c06 100644
--- a/drivers/i2c/i2c-dev.c
+++ b/drivers/i2c/i2c-dev.c
@@ -484,7 +484,6 @@ static struct i2c_driver i2cdev_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "dev_driver",
 	.id		= I2C_DRIVERID_I2CDEV,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= i2cdev_attach_adapter,
 	.detach_adapter	= i2cdev_detach_adapter,
 	.detach_client	= i2cdev_detach_client,
diff --git a/drivers/macintosh/therm_adt746x.c b/drivers/macintosh/therm_adt746x.c
index f38696622eb4..f62c16fab42b 100644
--- a/drivers/macintosh/therm_adt746x.c
+++ b/drivers/macintosh/therm_adt746x.c
@@ -173,7 +173,6 @@ detach_thermostat(struct i2c_adapter *adapter)
 static struct i2c_driver thermostat_driver = {  
 	.owner		= THIS_MODULE,
 	.name		= "therm_adt746x",
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= attach_thermostat,
 	.detach_adapter	= detach_thermostat,
 };
diff --git a/drivers/macintosh/therm_pm72.c b/drivers/macintosh/therm_pm72.c
index 190878eef990..df00c960fc5a 100644
--- a/drivers/macintosh/therm_pm72.c
+++ b/drivers/macintosh/therm_pm72.c
@@ -285,7 +285,6 @@ static struct i2c_driver therm_pm72_driver =
 {
 	.owner		= THIS_MODULE,
 	.name		= "therm_pm72",
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= therm_pm72_attach,
 	.detach_adapter	= therm_pm72_detach,
 };
diff --git a/drivers/macintosh/therm_windtunnel.c b/drivers/macintosh/therm_windtunnel.c
index 6aaa1df1a64e..f3bae0d00ed2 100644
--- a/drivers/macintosh/therm_windtunnel.c
+++ b/drivers/macintosh/therm_windtunnel.c
@@ -357,7 +357,6 @@ static struct i2c_driver g4fan_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "therm_windtunnel",
 	.id		= I2C_DRIVERID_G4FAN,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter = do_attach,
 	.detach_client	= do_detach,
 };
diff --git a/drivers/macintosh/windfarm_lm75_sensor.c b/drivers/macintosh/windfarm_lm75_sensor.c
index a0a41ad0f2b5..2392789ccf32 100644
--- a/drivers/macintosh/windfarm_lm75_sensor.c
+++ b/drivers/macintosh/windfarm_lm75_sensor.c
@@ -49,7 +49,6 @@ static int wf_lm75_detach(struct i2c_client *client);
 static struct i2c_driver wf_lm75_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "wf_lm75",
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= wf_lm75_attach,
 	.detach_client	= wf_lm75_detach,
 };
diff --git a/drivers/media/video/adv7170.c b/drivers/media/video/adv7170.c
index 1ca2b67aedfb..c4f2265167a2 100644
--- a/drivers/media/video/adv7170.c
+++ b/drivers/media/video/adv7170.c
@@ -502,7 +502,6 @@ static struct i2c_driver i2c_driver_adv7170 = {
 	.name = "adv7170",	/* name */
 
 	.id = I2C_DRIVERID_ADV7170,
-	.flags = I2C_DF_NOTIFY,
 
 	.attach_adapter = adv7170_attach_adapter,
 	.detach_client = adv7170_detach_client,
diff --git a/drivers/media/video/adv7175.c b/drivers/media/video/adv7175.c
index 173bca1e0295..4fc08b17d4d0 100644
--- a/drivers/media/video/adv7175.c
+++ b/drivers/media/video/adv7175.c
@@ -552,7 +552,6 @@ static struct i2c_driver i2c_driver_adv7175 = {
 	.name = "adv7175",	/* name */
 
 	.id = I2C_DRIVERID_ADV7175,
-	.flags = I2C_DF_NOTIFY,
 
 	.attach_adapter = adv7175_attach_adapter,
 	.detach_client = adv7175_detach_client,
diff --git a/drivers/media/video/bt819.c b/drivers/media/video/bt819.c
index 3ee0afca76a7..7bba69793b78 100644
--- a/drivers/media/video/bt819.c
+++ b/drivers/media/video/bt819.c
@@ -627,7 +627,6 @@ static struct i2c_driver i2c_driver_bt819 = {
 	.name = "bt819",
 
 	.id = I2C_DRIVERID_BT819,
-	.flags = I2C_DF_NOTIFY,
 
 	.attach_adapter = bt819_attach_adapter,
 	.detach_client = bt819_detach_client,
diff --git a/drivers/media/video/bt832.c b/drivers/media/video/bt832.c
index 3ca1d768bfd3..0ba8652357e2 100644
--- a/drivers/media/video/bt832.c
+++ b/drivers/media/video/bt832.c
@@ -233,7 +233,6 @@ static struct i2c_driver driver = {
 	.owner          = THIS_MODULE,
 	.name           = "i2c bt832 driver",
 	.id             = -1, /* FIXME */
-	.flags          = I2C_DF_NOTIFY,
 	.attach_adapter = bt832_probe,
 	.detach_client  = bt832_detach,
 	.command        = bt832_command,
diff --git a/drivers/media/video/bt856.c b/drivers/media/video/bt856.c
index 8eb871d0e85b..4c9acd1e2c70 100644
--- a/drivers/media/video/bt856.c
+++ b/drivers/media/video/bt856.c
@@ -409,7 +409,6 @@ static struct i2c_driver i2c_driver_bt856 = {
 	.name = "bt856",
 
 	.id = I2C_DRIVERID_BT856,
-	.flags = I2C_DF_NOTIFY,
 
 	.attach_adapter = bt856_attach_adapter,
 	.detach_client = bt856_detach_client,
diff --git a/drivers/media/video/cs53l32a.c b/drivers/media/video/cs53l32a.c
index 780b352ec119..fce5d89b7b15 100644
--- a/drivers/media/video/cs53l32a.c
+++ b/drivers/media/video/cs53l32a.c
@@ -218,7 +218,6 @@ static int cs53l32a_detach(struct i2c_client *client)
 static struct i2c_driver i2c_driver = {
 	.name = "cs53l32a",
 	.id = I2C_DRIVERID_CS53L32A,
-	.flags = I2C_DF_NOTIFY,
 	.attach_adapter = cs53l32a_probe,
 	.detach_client = cs53l32a_detach,
 	.command = cs53l32a_command,
diff --git a/drivers/media/video/cx25840/cx25840-core.c b/drivers/media/video/cx25840/cx25840-core.c
index 5b93723a1768..c66bc147ee71 100644
--- a/drivers/media/video/cx25840/cx25840-core.c
+++ b/drivers/media/video/cx25840/cx25840-core.c
@@ -847,7 +847,6 @@ static struct i2c_driver i2c_driver_cx25840 = {
 	.name = "cx25840",
 
 	.id = I2C_DRIVERID_CX25840,
-	.flags = I2C_DF_NOTIFY,
 
 	.attach_adapter = cx25840_attach_adapter,
 	.detach_client = cx25840_detach_client,
diff --git a/drivers/media/video/indycam.c b/drivers/media/video/indycam.c
index deeef125eb92..3eba514cdb29 100644
--- a/drivers/media/video/indycam.c
+++ b/drivers/media/video/indycam.c
@@ -454,7 +454,6 @@ static struct i2c_driver i2c_driver_indycam = {
 	.owner		= THIS_MODULE,
 	.name		= "indycam",
 	.id		= I2C_DRIVERID_INDYCAM,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter = indycam_probe,
 	.detach_client	= indycam_detach,
 	.command	= indycam_command,
diff --git a/drivers/media/video/ir-kbd-i2c.c b/drivers/media/video/ir-kbd-i2c.c
index 740e543311af..2e2f78a4afc8 100644
--- a/drivers/media/video/ir-kbd-i2c.c
+++ b/drivers/media/video/ir-kbd-i2c.c
@@ -280,7 +280,6 @@ static int ir_probe(struct i2c_adapter *adap);
 static struct i2c_driver driver = {
 	.name           = "ir remote kbd driver",
 	.id             = I2C_DRIVERID_INFRARED,
-	.flags          = I2C_DF_NOTIFY,
 	.attach_adapter = ir_probe,
 	.detach_client  = ir_detach,
 };
diff --git a/drivers/media/video/msp3400.c b/drivers/media/video/msp3400.c
index d86f8e92e534..46328fb6fe80 100644
--- a/drivers/media/video/msp3400.c
+++ b/drivers/media/video/msp3400.c
@@ -1564,7 +1564,6 @@ static struct i2c_driver driver = {
 	.owner          = THIS_MODULE,
 	.name           = "msp3400",
 	.id             = I2C_DRIVERID_MSP3400,
-	.flags          = I2C_DF_NOTIFY,
 	.attach_adapter = msp_probe,
 	.detach_client  = msp_detach,
 	.command        = msp_command,
diff --git a/drivers/media/video/ovcamchip/ovcamchip_core.c b/drivers/media/video/ovcamchip/ovcamchip_core.c
index 2de34ebf0673..390d0d6c7838 100644
--- a/drivers/media/video/ovcamchip/ovcamchip_core.c
+++ b/drivers/media/video/ovcamchip/ovcamchip_core.c
@@ -414,7 +414,6 @@ static struct i2c_driver driver = {
 	.name =			"ovcamchip",
 	.id =			I2C_DRIVERID_OVCAMCHIP,
 	.class =		I2C_CLASS_CAM_DIGITAL,
-	.flags =		I2C_DF_NOTIFY,
 	.attach_adapter =	ovcamchip_attach,
 	.detach_client =	ovcamchip_detach,
 	.command =		ovcamchip_command,
diff --git a/drivers/media/video/saa5246a.c b/drivers/media/video/saa5246a.c
index b8054da31ffd..9bf686989aab 100644
--- a/drivers/media/video/saa5246a.c
+++ b/drivers/media/video/saa5246a.c
@@ -166,7 +166,6 @@ static struct i2c_driver i2c_driver_videotext =
 	.owner 		= THIS_MODULE,
 	.name 		= IF_NAME,		/* name */
 	.id 		= I2C_DRIVERID_SAA5249, /* in i2c.h */
-	.flags 		= I2C_DF_NOTIFY,
 	.attach_adapter = saa5246a_probe,
 	.detach_client  = saa5246a_detach,
 	.command 	= saa5246a_command
diff --git a/drivers/media/video/saa5249.c b/drivers/media/video/saa5249.c
index 7ffa2e9a9bf3..811e86396030 100644
--- a/drivers/media/video/saa5249.c
+++ b/drivers/media/video/saa5249.c
@@ -239,7 +239,6 @@ static struct i2c_driver i2c_driver_videotext =
 	.owner 		= THIS_MODULE,
 	.name 		= IF_NAME,		/* name */
 	.id 		= I2C_DRIVERID_SAA5249, /* in i2c.h */
-	.flags 		= I2C_DF_NOTIFY,
 	.attach_adapter = saa5249_probe,
 	.detach_client  = saa5249_detach,
 	.command 	= saa5249_command
diff --git a/drivers/media/video/saa6588.c b/drivers/media/video/saa6588.c
index 923322503e8f..18a0b7143e8b 100644
--- a/drivers/media/video/saa6588.c
+++ b/drivers/media/video/saa6588.c
@@ -498,7 +498,6 @@ static struct i2c_driver driver = {
 	.owner = THIS_MODULE,
 	.name = "i2c saa6588 driver",
 	.id = -1,		/* FIXME */
-	.flags = I2C_DF_NOTIFY,
 	.attach_adapter = saa6588_probe,
 	.detach_client = saa6588_detach,
 	.command = saa6588_command,
diff --git a/drivers/media/video/saa7110.c b/drivers/media/video/saa7110.c
index e116bdbed310..f266b35ceb35 100644
--- a/drivers/media/video/saa7110.c
+++ b/drivers/media/video/saa7110.c
@@ -591,7 +591,6 @@ static struct i2c_driver i2c_driver_saa7110 = {
 	.name = "saa7110",
 
 	.id = I2C_DRIVERID_SAA7110,
-	.flags = I2C_DF_NOTIFY,
 
 	.attach_adapter = saa7110_attach_adapter,
 	.detach_client = saa7110_detach_client,
diff --git a/drivers/media/video/saa7111.c b/drivers/media/video/saa7111.c
index fe8a5e453969..687beaf11adc 100644
--- a/drivers/media/video/saa7111.c
+++ b/drivers/media/video/saa7111.c
@@ -594,7 +594,6 @@ static struct i2c_driver i2c_driver_saa7111 = {
 	.name = "saa7111",
 
 	.id = I2C_DRIVERID_SAA7111A,
-	.flags = I2C_DF_NOTIFY,
 
 	.attach_adapter = saa7111_attach_adapter,
 	.detach_client = saa7111_detach_client,
diff --git a/drivers/media/video/saa7114.c b/drivers/media/video/saa7114.c
index d9f50e2f7b92..4748cf0598c0 100644
--- a/drivers/media/video/saa7114.c
+++ b/drivers/media/video/saa7114.c
@@ -1208,7 +1208,6 @@ static struct i2c_driver i2c_driver_saa7114 = {
 	.name = "saa7114",
 
 	.id = I2C_DRIVERID_SAA7114,
-	.flags = I2C_DF_NOTIFY,
 
 	.attach_adapter = saa7114_attach_adapter,
 	.detach_client = saa7114_detach_client,
diff --git a/drivers/media/video/saa7115.c b/drivers/media/video/saa7115.c
index e717e30d8187..b1079de938b7 100644
--- a/drivers/media/video/saa7115.c
+++ b/drivers/media/video/saa7115.c
@@ -1356,7 +1356,6 @@ static int saa7115_detach(struct i2c_client *client)
 static struct i2c_driver i2c_driver_saa7115 = {
 	.name = "saa7115",
 	.id = I2C_DRIVERID_SAA711X,
-	.flags = I2C_DF_NOTIFY,
 	.attach_adapter = saa7115_probe,
 	.detach_client = saa7115_detach,
 	.command = saa7115_command,
diff --git a/drivers/media/video/saa711x.c b/drivers/media/video/saa711x.c
index 31f7b950b01c..734a70919080 100644
--- a/drivers/media/video/saa711x.c
+++ b/drivers/media/video/saa711x.c
@@ -569,7 +569,6 @@ static struct i2c_driver i2c_driver_saa711x = {
 	.name = "saa711x",
 
 	.id = I2C_DRIVERID_SAA711X,
-	.flags = I2C_DF_NOTIFY,
 
 	.attach_adapter = saa711x_attach_adapter,
 	.detach_client = saa711x_detach_client,
diff --git a/drivers/media/video/saa7127.c b/drivers/media/video/saa7127.c
index c36f014f1fdf..a2fab9837507 100644
--- a/drivers/media/video/saa7127.c
+++ b/drivers/media/video/saa7127.c
@@ -821,7 +821,6 @@ static int saa7127_detach(struct i2c_client *client)
 static struct i2c_driver i2c_driver_saa7127 = {
 	.name = "saa7127",
 	.id = I2C_DRIVERID_SAA7127,
-	.flags = I2C_DF_NOTIFY,
 	.attach_adapter = saa7127_probe,
 	.detach_client = saa7127_detach,
 	.command = saa7127_command,
diff --git a/drivers/media/video/saa7134/saa6752hs.c b/drivers/media/video/saa7134/saa6752hs.c
index a61d24f588f7..6fc298e0a03a 100644
--- a/drivers/media/video/saa7134/saa6752hs.c
+++ b/drivers/media/video/saa7134/saa6752hs.c
@@ -600,7 +600,6 @@ static struct i2c_driver driver = {
 	.owner          = THIS_MODULE,
 	.name           = "i2c saa6752hs MPEG encoder",
 	.id             = I2C_DRIVERID_SAA6752HS,
-	.flags          = I2C_DF_NOTIFY,
 	.attach_adapter = saa6752hs_probe,
 	.detach_client  = saa6752hs_detach,
 	.command        = saa6752hs_command,
diff --git a/drivers/media/video/saa7185.c b/drivers/media/video/saa7185.c
index 132aa7943c16..e24aa16f2d8c 100644
--- a/drivers/media/video/saa7185.c
+++ b/drivers/media/video/saa7185.c
@@ -491,7 +491,6 @@ static struct i2c_driver i2c_driver_saa7185 = {
 	.name = "saa7185",	/* name */
 
 	.id = I2C_DRIVERID_SAA7185B,
-	.flags = I2C_DF_NOTIFY,
 
 	.attach_adapter = saa7185_attach_adapter,
 	.detach_client = saa7185_detach_client,
diff --git a/drivers/media/video/saa7191.c b/drivers/media/video/saa7191.c
index cbca896e8cfa..6be98fc0fe24 100644
--- a/drivers/media/video/saa7191.c
+++ b/drivers/media/video/saa7191.c
@@ -791,7 +791,6 @@ static struct i2c_driver i2c_driver_saa7191 = {
 	.owner		= THIS_MODULE,
 	.name		= "saa7191",
 	.id		= I2C_DRIVERID_SAA7191,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter = saa7191_probe,
 	.detach_client	= saa7191_detach,
 	.command	= saa7191_command
diff --git a/drivers/media/video/tda7432.c b/drivers/media/video/tda7432.c
index d32737dd2142..239a58666a12 100644
--- a/drivers/media/video/tda7432.c
+++ b/drivers/media/video/tda7432.c
@@ -504,7 +504,6 @@ static struct i2c_driver driver = {
 	.owner           = THIS_MODULE,
 	.name            = "i2c tda7432 driver",
 	.id              = I2C_DRIVERID_TDA7432,
-	.flags           = I2C_DF_NOTIFY,
 	.attach_adapter  = tda7432_probe,
 	.detach_client   = tda7432_detach,
 	.command         = tda7432_command,
diff --git a/drivers/media/video/tda9840.c b/drivers/media/video/tda9840.c
index 1794686612c6..f29fb507075d 100644
--- a/drivers/media/video/tda9840.c
+++ b/drivers/media/video/tda9840.c
@@ -224,7 +224,6 @@ static struct i2c_driver driver = {
 	.owner	= THIS_MODULE,
 	.name	= "tda9840",
 	.id	= I2C_DRIVERID_TDA9840,
-	.flags	= I2C_DF_NOTIFY,
 	.attach_adapter	= attach,
 	.detach_client	= detach,
 	.command	= command,
diff --git a/drivers/media/video/tda9875.c b/drivers/media/video/tda9875.c
index a5e37dc91f39..d053b6445502 100644
--- a/drivers/media/video/tda9875.c
+++ b/drivers/media/video/tda9875.c
@@ -375,7 +375,6 @@ static struct i2c_driver driver = {
 	.owner          = THIS_MODULE,
 	.name           = "i2c tda9875 driver",
 	.id             = I2C_DRIVERID_TDA9875,
-	.flags          = I2C_DF_NOTIFY,
 	.attach_adapter = tda9875_probe,
 	.detach_client  = tda9875_detach,
 	.command        = tda9875_command,
diff --git a/drivers/media/video/tda9887.c b/drivers/media/video/tda9887.c
index 2f2414e90e8b..049b44e0767b 100644
--- a/drivers/media/video/tda9887.c
+++ b/drivers/media/video/tda9887.c
@@ -822,7 +822,6 @@ static struct i2c_driver driver = {
 	.owner          = THIS_MODULE,
 	.name           = "i2c tda9887 driver",
 	.id             = -1, /* FIXME */
-	.flags          = I2C_DF_NOTIFY,
 	.attach_adapter = tda9887_probe,
 	.detach_client  = tda9887_detach,
 	.command        = tda9887_command,
diff --git a/drivers/media/video/tea6415c.c b/drivers/media/video/tea6415c.c
index ee3688348b66..96d88ce60c98 100644
--- a/drivers/media/video/tea6415c.c
+++ b/drivers/media/video/tea6415c.c
@@ -193,7 +193,6 @@ static struct i2c_driver driver = {
 	.owner	= THIS_MODULE,
 	.name 	= "tea6415c",
 	.id 	= I2C_DRIVERID_TEA6415C,
-	.flags 	= I2C_DF_NOTIFY,
 	.attach_adapter	= attach,
 	.detach_client	= detach,
 	.command	= command,
diff --git a/drivers/media/video/tea6420.c b/drivers/media/video/tea6420.c
index 17975c19da5e..fd417de95847 100644
--- a/drivers/media/video/tea6420.c
+++ b/drivers/media/video/tea6420.c
@@ -170,7 +170,6 @@ static struct i2c_driver driver = {
 	.owner	= THIS_MODULE,
 	.name	= "tea6420",
 	.id	= I2C_DRIVERID_TEA6420,
-	.flags	= I2C_DF_NOTIFY,
 	.attach_adapter	= attach,
 	.detach_client	= detach,
 	.command	= command,
diff --git a/drivers/media/video/tuner-3036.c b/drivers/media/video/tuner-3036.c
index 79203595b9c1..3505cec2e65a 100644
--- a/drivers/media/video/tuner-3036.c
+++ b/drivers/media/video/tuner-3036.c
@@ -178,7 +178,6 @@ i2c_driver_tuner =
 	.owner		=	THIS_MODULE,
 	.name		=	"sab3036",
 	.id		=	I2C_DRIVERID_SAB3036,
-        .flags		=	I2C_DF_NOTIFY,
 	.attach_adapter =	tuner_probe,
 	.detach_client  =	tuner_detach,
 	.command	=	tuner_command
diff --git a/drivers/media/video/tuner-core.c b/drivers/media/video/tuner-core.c
index e58abdfcaab8..3c75121f6383 100644
--- a/drivers/media/video/tuner-core.c
+++ b/drivers/media/video/tuner-core.c
@@ -745,7 +745,6 @@ static struct i2c_driver driver = {
 	.owner = THIS_MODULE,
 	.name = "tuner",
 	.id = I2C_DRIVERID_TUNER,
-	.flags = I2C_DF_NOTIFY,
 	.attach_adapter = tuner_probe,
 	.detach_client = tuner_detach,
 	.command = tuner_command,
diff --git a/drivers/media/video/tvaudio.c b/drivers/media/video/tvaudio.c
index 5b20e8177cad..3565f35be7a1 100644
--- a/drivers/media/video/tvaudio.c
+++ b/drivers/media/video/tvaudio.c
@@ -1705,7 +1705,6 @@ static struct i2c_driver driver = {
 	.owner           = THIS_MODULE,
 	.name            = "generic i2c audio driver",
 	.id              = I2C_DRIVERID_TVAUDIO,
-	.flags           = I2C_DF_NOTIFY,
 	.attach_adapter  = chip_probe,
 	.detach_client   = chip_detach,
 	.command         = chip_command,
diff --git a/drivers/media/video/tveeprom.c b/drivers/media/video/tveeprom.c
index 5ac235365dd8..195bc51d4576 100644
--- a/drivers/media/video/tveeprom.c
+++ b/drivers/media/video/tveeprom.c
@@ -782,7 +782,6 @@ static struct i2c_driver i2c_driver_tveeprom = {
 	.owner          = THIS_MODULE,
 	.name           = "tveeprom",
 	.id             = I2C_DRIVERID_TVEEPROM,
-	.flags          = I2C_DF_NOTIFY,
 	.attach_adapter = tveeprom_attach_adapter,
 	.detach_client  = tveeprom_detach_client,
 	.command        = tveeprom_command,
diff --git a/drivers/media/video/tvmixer.c b/drivers/media/video/tvmixer.c
index 5897e5d4d3d2..936e01d2c785 100644
--- a/drivers/media/video/tvmixer.c
+++ b/drivers/media/video/tvmixer.c
@@ -232,7 +232,6 @@ static struct i2c_driver driver = {
 #endif
 	.name            = "tv card mixer driver",
 	.id              = I2C_DRIVERID_TVMIXER,
-	.flags           = I2C_DF_NOTIFY,
 	.detach_adapter  = tvmixer_adapters,
 	.attach_adapter  = tvmixer_adapters,
 	.detach_client   = tvmixer_clients,
diff --git a/drivers/media/video/tvp5150.c b/drivers/media/video/tvp5150.c
index 97431e26d229..4f3ee2091611 100644
--- a/drivers/media/video/tvp5150.c
+++ b/drivers/media/video/tvp5150.c
@@ -806,7 +806,6 @@ static struct i2c_driver driver = {
 
 	/* FIXME */
 	.id = I2C_DRIVERID_SAA7110,
-	.flags = I2C_DF_NOTIFY,
 
 	.attach_adapter = tvp5150_attach_adapter,
 	.detach_client = tvp5150_detach_client,
diff --git a/drivers/media/video/vpx3220.c b/drivers/media/video/vpx3220.c
index 137b58f2c666..c66d28505bcd 100644
--- a/drivers/media/video/vpx3220.c
+++ b/drivers/media/video/vpx3220.c
@@ -726,7 +726,6 @@ static struct i2c_driver vpx3220_i2c_driver = {
 	.name = "vpx3220",
 
 	.id = I2C_DRIVERID_VPX3220,
-	.flags = I2C_DF_NOTIFY,
 
 	.attach_adapter = vpx3220_attach_adapter,
 	.detach_client = vpx3220_detach_client,
diff --git a/drivers/media/video/wm8775.c b/drivers/media/video/wm8775.c
index a6936ad74fcf..7b07717a3c67 100644
--- a/drivers/media/video/wm8775.c
+++ b/drivers/media/video/wm8775.c
@@ -236,7 +236,6 @@ static struct i2c_driver i2c_driver = {
 	.name = "wm8775",
 
 	.id = I2C_DRIVERID_WM8775,
-	.flags = I2C_DF_NOTIFY,
 
 	.attach_adapter = wm8775_probe,
 	.detach_client = wm8775_detach,
diff --git a/drivers/video/matrox/matroxfb_maven.c b/drivers/video/matrox/matroxfb_maven.c
index ad60bbb16cdf..78994c5fe932 100644
--- a/drivers/video/matrox/matroxfb_maven.c
+++ b/drivers/video/matrox/matroxfb_maven.c
@@ -1296,7 +1296,6 @@ static struct i2c_driver maven_driver={
 	.owner		= THIS_MODULE,
 	.name		= "maven",
 	.id		= I2C_DRIVERID_MGATVO,
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= maven_attach_adapter,
 	.detach_client	= maven_detach_client,
 	.command	= maven_command,
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 0316ba1294ca..99399fadf13f 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -251,7 +251,6 @@ static inline void i2c_set_adapdata (struct i2c_adapter *dev, void *data)
 }
 
 /*flags for the driver struct: */
-#define I2C_DF_NOTIFY	0x01		/* notify on bus (de/a)ttaches 	*/
 
 /*flags for the client struct: */
 #define I2C_CLIENT_ALLOW_USE		0x01	/* Client allows access */
diff --git a/sound/oss/dmasound/dac3550a.c b/sound/oss/dmasound/dac3550a.c
index 533895eba0eb..3402a663d07f 100644
--- a/sound/oss/dmasound/dac3550a.c
+++ b/sound/oss/dmasound/dac3550a.c
@@ -44,7 +44,6 @@ struct i2c_driver daca_driver = {
 	.owner			= THIS_MODULE,
 	.name			= "DAC3550A driver  V " DACA_VERSION,
 	.id			= I2C_DRIVERID_DACA,
-	.flags			= I2C_DF_NOTIFY,
 	.attach_adapter		= daca_attach_adapter,
 	.detach_client		= daca_detach_client,
 };
diff --git a/sound/oss/dmasound/tas_common.c b/sound/oss/dmasound/tas_common.c
index d36a1fe2fcf3..7e3d517af4b9 100644
--- a/sound/oss/dmasound/tas_common.c
+++ b/sound/oss/dmasound/tas_common.c
@@ -49,7 +49,6 @@ static int tas_detach_client(struct i2c_client *);
 struct i2c_driver tas_driver = {
 	.owner		= THIS_MODULE,
 	.name		= "tas",
-	.flags		= I2C_DF_NOTIFY,
 	.attach_adapter	= tas_attach_adapter,
 	.detach_client	= tas_detach_client,
 };
diff --git a/sound/ppc/keywest.c b/sound/ppc/keywest.c
index 097fbcfc5d45..fd8e2e6062f6 100644
--- a/sound/ppc/keywest.c
+++ b/sound/ppc/keywest.c
@@ -43,7 +43,6 @@ static int keywest_detach_client(struct i2c_client *client);
 struct i2c_driver keywest_driver = {  
 	.name = "PMac Keywest Audio",
 	.id = I2C_DRIVERID_KEYWEST,
-	.flags = I2C_DF_NOTIFY,
 	.attach_adapter = &keywest_attach_adapter,
 	.detach_client = &keywest_detach_client,
 };
-- 
cgit v1.2.3-71-gd317


From 5d7b851dcced3611e4a4432308618b1ed1a9fc31 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Sun, 27 Nov 2005 08:57:10 +0100
Subject: [PATCH] i2c: Drop i2c_driver.flags, 3 of 3

The flags member of the i2c_driver structure is no more used. Drop it.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/i2c.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 99399fadf13f..3c16a8fb95f4 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -112,7 +112,6 @@ struct i2c_driver {
 	char name[32];
 	int id;
 	unsigned int class;
-	unsigned int flags;		/* div., see below		*/
 
 	/* Notifies the driver that a new bus has appeared. This routine
 	 * can be used by the driver to test if the bus meets its conditions
@@ -250,8 +249,6 @@ static inline void i2c_set_adapdata (struct i2c_adapter *dev, void *data)
 	dev_set_drvdata (&dev->dev, data);
 }
 
-/*flags for the driver struct: */
-
 /*flags for the client struct: */
 #define I2C_CLIENT_ALLOW_USE		0x01	/* Client allows access */
 #define I2C_CLIENT_ALLOW_MULTIPLE_USE 	0x02  	/* Allow multiple access-locks */
-- 
cgit v1.2.3-71-gd317


From cb748fb20186d4b345c68a7f580429f379fdd268 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Sat, 26 Nov 2005 20:58:35 +0100
Subject: [PATCH] i2c: Rework client usage count, 1 of 3

No i2c client uses the I2C_CLIENT_ALLOW_MULTIPLE_USE flag, drop it.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/i2c/i2c-core.c | 4 +---
 include/linux/i2c.h    | 2 --
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index ad68ac00d910..2f0bc9529376 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -500,9 +500,7 @@ int i2c_use_client(struct i2c_client *client)
 		return ret;
 
 	if (client->flags & I2C_CLIENT_ALLOW_USE) {
-		if (client->flags & I2C_CLIENT_ALLOW_MULTIPLE_USE)
-			client->usage_count++;
-		else if (client->usage_count > 0) 
+		if (client->usage_count > 0)
 			goto busy;
 		else 
 			client->usage_count++;
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 3c16a8fb95f4..4487c5189747 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -251,8 +251,6 @@ static inline void i2c_set_adapdata (struct i2c_adapter *dev, void *data)
 
 /*flags for the client struct: */
 #define I2C_CLIENT_ALLOW_USE		0x01	/* Client allows access */
-#define I2C_CLIENT_ALLOW_MULTIPLE_USE 	0x02  	/* Allow multiple access-locks */
-						/* on an i2c_client */
 #define I2C_CLIENT_PEC  0x04			/* Use Packet Error Checking */
 #define I2C_CLIENT_TEN	0x10			/* we have a ten bit chip address	*/
 						/* Must equal I2C_M_TEN below */
-- 
cgit v1.2.3-71-gd317


From cde7859bda0d1124392b44e50aa11df99707e1d9 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Sat, 26 Nov 2005 21:00:54 +0100
Subject: [PATCH] i2c: Rework client usage count, 2 of 3

Make I2C_CLIENT_ALLOW_USE the default for all i2c clients. It doesn't
hurt if the usage count is actually never used for any given driver,
and allows for nice code simplifications in i2c-core.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/arm/mach-pxa/akita-ioexp.c            |  1 -
 drivers/i2c/chips/rtc8564.c                |  1 -
 drivers/i2c/i2c-core.c                     | 28 ++++++++++------------------
 drivers/media/video/adv7170.c              |  1 -
 drivers/media/video/adv7175.c              |  1 -
 drivers/media/video/bt819.c                |  1 -
 drivers/media/video/bt832.c                |  1 -
 drivers/media/video/bt856.c                |  1 -
 drivers/media/video/cs53l32a.c             |  1 -
 drivers/media/video/cx25840/cx25840-core.c |  1 -
 drivers/media/video/em28xx/em28xx-i2c.c    |  1 -
 drivers/media/video/msp3400.c              |  1 -
 drivers/media/video/saa6588.c              |  1 -
 drivers/media/video/saa7110.c              |  1 -
 drivers/media/video/saa7111.c              |  1 -
 drivers/media/video/saa7114.c              |  1 -
 drivers/media/video/saa7115.c              |  1 -
 drivers/media/video/saa711x.c              |  1 -
 drivers/media/video/saa7127.c              |  1 -
 drivers/media/video/saa7134/saa6752hs.c    |  1 -
 drivers/media/video/saa7185.c              |  1 -
 drivers/media/video/tda9887.c              |  1 -
 drivers/media/video/tuner-core.c           |  1 -
 drivers/media/video/tvaudio.c              |  1 -
 drivers/media/video/tveeprom.c             |  1 -
 drivers/media/video/tvp5150.c              |  1 -
 drivers/media/video/vpx3220.c              |  1 -
 drivers/media/video/wm8775.c               |  1 -
 include/linux/i2c.h                        |  1 -
 29 files changed, 10 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-pxa/akita-ioexp.c b/arch/arm/mach-pxa/akita-ioexp.c
index 440ebb3c3db1..b6bff550c8e9 100644
--- a/arch/arm/mach-pxa/akita-ioexp.c
+++ b/arch/arm/mach-pxa/akita-ioexp.c
@@ -133,7 +133,6 @@ static struct i2c_driver max7310_i2c_driver = {
 
 static struct i2c_client max7310_template = {
 	name:   "akita-max7310",
-	flags:  I2C_CLIENT_ALLOW_USE,
 	driver: &max7310_i2c_driver,
 };
 
diff --git a/drivers/i2c/chips/rtc8564.c b/drivers/i2c/chips/rtc8564.c
index e586f75dd024..07494d394381 100644
--- a/drivers/i2c/chips/rtc8564.c
+++ b/drivers/i2c/chips/rtc8564.c
@@ -155,7 +155,6 @@ static int rtc8564_attach(struct i2c_adapter *adap, int addr, int kind)
 
 	strlcpy(new_client->name, "RTC8564", I2C_NAME_SIZE);
 	i2c_set_clientdata(new_client, d);
-	new_client->flags = I2C_CLIENT_ALLOW_USE;
 	new_client->addr = addr;
 	new_client->adapter = adap;
 	new_client->driver = &rtc8564_driver;
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 2f0bc9529376..d16b4998c4c2 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -419,8 +419,7 @@ int i2c_attach_client(struct i2c_client *client)
 		}
 	}
 
-	if (client->flags & I2C_CLIENT_ALLOW_USE)
-		client->usage_count = 0;
+	client->usage_count = 0;
 
 	client->dev.parent = &client->adapter->dev;
 	client->dev.driver = &client->driver->driver;
@@ -443,8 +442,7 @@ int i2c_detach_client(struct i2c_client *client)
 	struct i2c_adapter *adapter = client->adapter;
 	int res = 0;
 	
-	if ((client->flags & I2C_CLIENT_ALLOW_USE)
-	 && (client->usage_count > 0)) {
+	if (client->usage_count > 0) {
 		dev_warn(&client->dev, "Client [%s] still busy, "
 			 "can't detach\n", client->name);
 		return -EBUSY;
@@ -499,12 +497,9 @@ int i2c_use_client(struct i2c_client *client)
 	if (ret)
 		return ret;
 
-	if (client->flags & I2C_CLIENT_ALLOW_USE) {
-		if (client->usage_count > 0)
-			goto busy;
-		else 
-			client->usage_count++;
-	}
+	if (client->usage_count > 0)
+		goto busy;
+	client->usage_count++;
 
 	return 0;
  busy:
@@ -514,16 +509,13 @@ int i2c_use_client(struct i2c_client *client)
 
 int i2c_release_client(struct i2c_client *client)
 {
-	if(client->flags & I2C_CLIENT_ALLOW_USE) {
-		if(client->usage_count>0)
-			client->usage_count--;
-		else {
-			pr_debug("i2c-core: %s used one too many times\n",
-				__FUNCTION__);
-			return -EPERM;
-		}
+	if (!client->usage_count) {
+		pr_debug("i2c-core: %s used one too many times\n",
+			 __FUNCTION__);
+		return -EPERM;
 	}
 	
+	client->usage_count--;
 	i2c_dec_use_client(client);
 	
 	return 0;
diff --git a/drivers/media/video/adv7170.c b/drivers/media/video/adv7170.c
index c4f2265167a2..622b1619a7e3 100644
--- a/drivers/media/video/adv7170.c
+++ b/drivers/media/video/adv7170.c
@@ -420,7 +420,6 @@ adv7170_detect_client (struct i2c_adapter *adapter,
 	client->addr = address;
 	client->adapter = adapter;
 	client->driver = &i2c_driver_adv7170;
-	client->flags = I2C_CLIENT_ALLOW_USE;
 	if ((client->addr == I2C_ADV7170 >> 1) ||
 	    (client->addr == (I2C_ADV7170 >> 1) + 1)) {
 		dname = adv7170_name;
diff --git a/drivers/media/video/adv7175.c b/drivers/media/video/adv7175.c
index 4fc08b17d4d0..d4859b445715 100644
--- a/drivers/media/video/adv7175.c
+++ b/drivers/media/video/adv7175.c
@@ -470,7 +470,6 @@ adv7175_detect_client (struct i2c_adapter *adapter,
 	client->addr = address;
 	client->adapter = adapter;
 	client->driver = &i2c_driver_adv7175;
-	client->flags = I2C_CLIENT_ALLOW_USE;
 	if ((client->addr == I2C_ADV7175 >> 1) ||
 	    (client->addr == (I2C_ADV7175 >> 1) + 1)) {
 		dname = adv7175_name;
diff --git a/drivers/media/video/bt819.c b/drivers/media/video/bt819.c
index 7bba69793b78..741e59af0991 100644
--- a/drivers/media/video/bt819.c
+++ b/drivers/media/video/bt819.c
@@ -535,7 +535,6 @@ bt819_detect_client (struct i2c_adapter *adapter,
 	client->addr = address;
 	client->adapter = adapter;
 	client->driver = &i2c_driver_bt819;
-	client->flags = I2C_CLIENT_ALLOW_USE;
 
 	decoder = kmalloc(sizeof(struct bt819), GFP_KERNEL);
 	if (decoder == NULL) {
diff --git a/drivers/media/video/bt832.c b/drivers/media/video/bt832.c
index 0ba8652357e2..4ed13860b523 100644
--- a/drivers/media/video/bt832.c
+++ b/drivers/media/video/bt832.c
@@ -240,7 +240,6 @@ static struct i2c_driver driver = {
 static struct i2c_client client_template =
 {
 	.name       = "bt832",
-	.flags      = I2C_CLIENT_ALLOW_USE,
 	.driver     = &driver,
 };
 
diff --git a/drivers/media/video/bt856.c b/drivers/media/video/bt856.c
index 4c9acd1e2c70..d4bba8efac69 100644
--- a/drivers/media/video/bt856.c
+++ b/drivers/media/video/bt856.c
@@ -323,7 +323,6 @@ bt856_detect_client (struct i2c_adapter *adapter,
 	client->addr = address;
 	client->adapter = adapter;
 	client->driver = &i2c_driver_bt856;
-	client->flags = I2C_CLIENT_ALLOW_USE;
 	strlcpy(I2C_NAME(client), "bt856", sizeof(I2C_NAME(client)));
 
 	encoder = kmalloc(sizeof(struct bt856), GFP_KERNEL);
diff --git a/drivers/media/video/cs53l32a.c b/drivers/media/video/cs53l32a.c
index fce5d89b7b15..f442ce3ba74b 100644
--- a/drivers/media/video/cs53l32a.c
+++ b/drivers/media/video/cs53l32a.c
@@ -154,7 +154,6 @@ static int cs53l32a_attach(struct i2c_adapter *adapter, int address, int kind)
 	client->addr = address;
 	client->adapter = adapter;
 	client->driver = &i2c_driver;
-	client->flags = I2C_CLIENT_ALLOW_USE;
 	snprintf(client->name, sizeof(client->name) - 1, "cs53l32a");
 
 	cs53l32a_info("chip found @ 0x%x (%s)\n", address << 1, adapter->name);
diff --git a/drivers/media/video/cx25840/cx25840-core.c b/drivers/media/video/cx25840/cx25840-core.c
index c66bc147ee71..0b278abe879d 100644
--- a/drivers/media/video/cx25840/cx25840-core.c
+++ b/drivers/media/video/cx25840/cx25840-core.c
@@ -773,7 +773,6 @@ static int cx25840_detect_client(struct i2c_adapter *adapter, int address,
 	client->addr = address;
 	client->adapter = adapter;
 	client->driver = &i2c_driver_cx25840;
-	client->flags = I2C_CLIENT_ALLOW_USE;
 	snprintf(client->name, sizeof(client->name) - 1, "cx25840");
 
 	cx25840_dbg("detecting cx25840 client on address 0x%x\n", address << 1);
diff --git a/drivers/media/video/em28xx/em28xx-i2c.c b/drivers/media/video/em28xx/em28xx-i2c.c
index 7f5603054f02..d14bcf4ceaea 100644
--- a/drivers/media/video/em28xx/em28xx-i2c.c
+++ b/drivers/media/video/em28xx/em28xx-i2c.c
@@ -497,7 +497,6 @@ static struct i2c_adapter em28xx_adap_template = {
 
 static struct i2c_client em28xx_client_template = {
 	.name = "em28xx internal",
-	.flags = I2C_CLIENT_ALLOW_USE,
 };
 
 /* ----------------------------------------------------------- */
diff --git a/drivers/media/video/msp3400.c b/drivers/media/video/msp3400.c
index 46328fb6fe80..c5e8ad3aac4a 100644
--- a/drivers/media/video/msp3400.c
+++ b/drivers/media/video/msp3400.c
@@ -1576,7 +1576,6 @@ static struct i2c_driver driver = {
 static struct i2c_client client_template =
 {
 	.name      = "(unset)",
-	.flags     = I2C_CLIENT_ALLOW_USE,
 	.driver    = &driver,
 };
 
diff --git a/drivers/media/video/saa6588.c b/drivers/media/video/saa6588.c
index 18a0b7143e8b..3d4076ca67c3 100644
--- a/drivers/media/video/saa6588.c
+++ b/drivers/media/video/saa6588.c
@@ -505,7 +505,6 @@ static struct i2c_driver driver = {
 
 static struct i2c_client client_template = {
 	.name = "saa6588",
-	.flags = I2C_CLIENT_ALLOW_USE,
 	.driver = &driver,
 };
 
diff --git a/drivers/media/video/saa7110.c b/drivers/media/video/saa7110.c
index f266b35ceb35..8affa63c8b24 100644
--- a/drivers/media/video/saa7110.c
+++ b/drivers/media/video/saa7110.c
@@ -501,7 +501,6 @@ saa7110_detect_client (struct i2c_adapter *adapter,
 	client->addr = address;
 	client->adapter = adapter;
 	client->driver = &i2c_driver_saa7110;
-	client->flags = I2C_CLIENT_ALLOW_USE;
 	strlcpy(I2C_NAME(client), "saa7110", sizeof(I2C_NAME(client)));
 
 	decoder = kmalloc(sizeof(struct saa7110), GFP_KERNEL);
diff --git a/drivers/media/video/saa7111.c b/drivers/media/video/saa7111.c
index 687beaf11adc..2b2204564514 100644
--- a/drivers/media/video/saa7111.c
+++ b/drivers/media/video/saa7111.c
@@ -518,7 +518,6 @@ saa7111_detect_client (struct i2c_adapter *adapter,
 	client->addr = address;
 	client->adapter = adapter;
 	client->driver = &i2c_driver_saa7111;
-	client->flags = I2C_CLIENT_ALLOW_USE;
 	strlcpy(I2C_NAME(client), "saa7111", sizeof(I2C_NAME(client)));
 
 	decoder = kmalloc(sizeof(struct saa7111), GFP_KERNEL);
diff --git a/drivers/media/video/saa7114.c b/drivers/media/video/saa7114.c
index 4748cf0598c0..285f6c7a8f71 100644
--- a/drivers/media/video/saa7114.c
+++ b/drivers/media/video/saa7114.c
@@ -859,7 +859,6 @@ saa7114_detect_client (struct i2c_adapter *adapter,
 	client->addr = address;
 	client->adapter = adapter;
 	client->driver = &i2c_driver_saa7114;
-	client->flags = I2C_CLIENT_ALLOW_USE;
 	strlcpy(I2C_NAME(client), "saa7114", sizeof(I2C_NAME(client)));
 
 	decoder = kmalloc(sizeof(struct saa7114), GFP_KERNEL);
diff --git a/drivers/media/video/saa7115.c b/drivers/media/video/saa7115.c
index b1079de938b7..79aadd2d408f 100644
--- a/drivers/media/video/saa7115.c
+++ b/drivers/media/video/saa7115.c
@@ -1270,7 +1270,6 @@ static int saa7115_attach(struct i2c_adapter *adapter, int address, int kind)
 	client->addr = address;
 	client->adapter = adapter;
 	client->driver = &i2c_driver_saa7115;
-	client->flags = I2C_CLIENT_ALLOW_USE;
 	snprintf(client->name, sizeof(client->name) - 1, "saa7115");
 
 	saa7115_dbg("detecting saa7115 client on address 0x%x\n", address << 1);
diff --git a/drivers/media/video/saa711x.c b/drivers/media/video/saa711x.c
index 734a70919080..44bfc047704c 100644
--- a/drivers/media/video/saa711x.c
+++ b/drivers/media/video/saa711x.c
@@ -494,7 +494,6 @@ saa711x_detect_client (struct i2c_adapter *adapter,
 	client->addr = address;
 	client->adapter = adapter;
 	client->driver = &i2c_driver_saa711x;
-	client->flags = I2C_CLIENT_ALLOW_USE;
 	strlcpy(I2C_NAME(client), "saa711x", sizeof(I2C_NAME(client)));
 	decoder = kmalloc(sizeof(struct saa711x), GFP_KERNEL);
 	if (decoder == NULL) {
diff --git a/drivers/media/video/saa7127.c b/drivers/media/video/saa7127.c
index a2fab9837507..1f4b41599445 100644
--- a/drivers/media/video/saa7127.c
+++ b/drivers/media/video/saa7127.c
@@ -719,7 +719,6 @@ static int saa7127_attach(struct i2c_adapter *adapter, int address, int kind)
 	client->addr = address;
 	client->adapter = adapter;
 	client->driver = &i2c_driver_saa7127;
-	client->flags = I2C_CLIENT_ALLOW_USE;
 	snprintf(client->name, sizeof(client->name) - 1, "saa7127");
 
 	saa7127_dbg("detecting saa7127 client on address 0x%x\n", address << 1);
diff --git a/drivers/media/video/saa7134/saa6752hs.c b/drivers/media/video/saa7134/saa6752hs.c
index 6fc298e0a03a..68206060c92d 100644
--- a/drivers/media/video/saa7134/saa6752hs.c
+++ b/drivers/media/video/saa7134/saa6752hs.c
@@ -608,7 +608,6 @@ static struct i2c_driver driver = {
 static struct i2c_client client_template =
 {
 	.name       = "saa6752hs",
-	.flags      = I2C_CLIENT_ALLOW_USE,
 	.driver     = &driver,
 };
 
diff --git a/drivers/media/video/saa7185.c b/drivers/media/video/saa7185.c
index e24aa16f2d8c..9f37585d3f73 100644
--- a/drivers/media/video/saa7185.c
+++ b/drivers/media/video/saa7185.c
@@ -415,7 +415,6 @@ saa7185_detect_client (struct i2c_adapter *adapter,
 	client->addr = address;
 	client->adapter = adapter;
 	client->driver = &i2c_driver_saa7185;
-	client->flags = I2C_CLIENT_ALLOW_USE;
 	strlcpy(I2C_NAME(client), "saa7185", sizeof(I2C_NAME(client)));
 
 	encoder = kmalloc(sizeof(struct saa7185), GFP_KERNEL);
diff --git a/drivers/media/video/tda9887.c b/drivers/media/video/tda9887.c
index 049b44e0767b..324f61bf714e 100644
--- a/drivers/media/video/tda9887.c
+++ b/drivers/media/video/tda9887.c
@@ -833,7 +833,6 @@ static struct i2c_driver driver = {
 static struct i2c_client client_template =
 {
 	.name      = "tda9887",
-	.flags     = I2C_CLIENT_ALLOW_USE,
 	.driver    = &driver,
 };
 
diff --git a/drivers/media/video/tuner-core.c b/drivers/media/video/tuner-core.c
index 3c75121f6383..6328f0954e70 100644
--- a/drivers/media/video/tuner-core.c
+++ b/drivers/media/video/tuner-core.c
@@ -755,7 +755,6 @@ static struct i2c_driver driver = {
 };
 static struct i2c_client client_template = {
 	.name = "(tuner unset)",
-	.flags = I2C_CLIENT_ALLOW_USE,
 	.driver = &driver,
 };
 
diff --git a/drivers/media/video/tvaudio.c b/drivers/media/video/tvaudio.c
index 3565f35be7a1..4f1f339283e0 100644
--- a/drivers/media/video/tvaudio.c
+++ b/drivers/media/video/tvaudio.c
@@ -1713,7 +1713,6 @@ static struct i2c_driver driver = {
 static struct i2c_client client_template =
 {
 	.name       = "(unset)",
-	.flags      = I2C_CLIENT_ALLOW_USE,
 	.driver     = &driver,
 };
 
diff --git a/drivers/media/video/tveeprom.c b/drivers/media/video/tveeprom.c
index 195bc51d4576..d833b651073a 100644
--- a/drivers/media/video/tveeprom.c
+++ b/drivers/media/video/tveeprom.c
@@ -751,7 +751,6 @@ tveeprom_detect_client(struct i2c_adapter *adapter,
 	client->addr = address;
 	client->adapter = adapter;
 	client->driver = &i2c_driver_tveeprom;
-	client->flags = I2C_CLIENT_ALLOW_USE;
 	snprintf(client->name, sizeof(client->name), "tveeprom");
 	i2c_attach_client(client);
 
diff --git a/drivers/media/video/tvp5150.c b/drivers/media/video/tvp5150.c
index 4f3ee2091611..3734554fc73b 100644
--- a/drivers/media/video/tvp5150.c
+++ b/drivers/media/video/tvp5150.c
@@ -714,7 +714,6 @@ static struct i2c_driver driver;
 
 static struct i2c_client client_template = {
 	.name = "(unset)",
-	.flags = I2C_CLIENT_ALLOW_USE,
 	.driver = &driver,
 };
 
diff --git a/drivers/media/video/vpx3220.c b/drivers/media/video/vpx3220.c
index c66d28505bcd..54bc888c3891 100644
--- a/drivers/media/video/vpx3220.c
+++ b/drivers/media/video/vpx3220.c
@@ -631,7 +631,6 @@ vpx3220_detect_client (struct i2c_adapter *adapter,
 	client->addr = address;
 	client->adapter = adapter;
 	client->driver = &vpx3220_i2c_driver;
-	client->flags = I2C_CLIENT_ALLOW_USE;
 
 	/* Check for manufacture ID and part number */
 	if (kind < 0) {
diff --git a/drivers/media/video/wm8775.c b/drivers/media/video/wm8775.c
index 7b07717a3c67..527c2591749a 100644
--- a/drivers/media/video/wm8775.c
+++ b/drivers/media/video/wm8775.c
@@ -168,7 +168,6 @@ static int wm8775_attach(struct i2c_adapter *adapter, int address, int kind)
 	client->addr = address;
 	client->adapter = adapter;
 	client->driver = &i2c_driver;
-	client->flags = I2C_CLIENT_ALLOW_USE;
 	snprintf(client->name, sizeof(client->name) - 1, "wm8775");
 
 	wm8775_info("chip found @ 0x%x (%s)\n", address << 1, adapter->name);
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 4487c5189747..8b4d4695de0e 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -250,7 +250,6 @@ static inline void i2c_set_adapdata (struct i2c_adapter *dev, void *data)
 }
 
 /*flags for the client struct: */
-#define I2C_CLIENT_ALLOW_USE		0x01	/* Client allows access */
 #define I2C_CLIENT_PEC  0x04			/* Use Packet Error Checking */
 #define I2C_CLIENT_TEN	0x10			/* we have a ten bit chip address	*/
 						/* Must equal I2C_M_TEN below */
-- 
cgit v1.2.3-71-gd317


From cf02df770228350254251fde520007a2709db785 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Sat, 26 Nov 2005 21:03:41 +0100
Subject: [PATCH] i2c: Rework client usage count, 3 of 3

Do not limit the usage count of i2c clients to 1. In other words,
change the client usage count behavior from the old I2C_CLIENT_ALLOW_USE
to the old I2C_CLIENT_ALLOW_MULTIPLE_USE. The rationale is that no
driver actually needs the limiting behavior, and the unlimiting
behavior is slightly easier to implement.

Update the documentation to reflect this change.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/i2c/porting-clients | 1 +
 drivers/i2c/i2c-core.c            | 5 -----
 include/linux/i2c.h               | 4 +---
 3 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/i2c/porting-clients b/Documentation/i2c/porting-clients
index 64c610bf2fbc..6b07f23039d2 100644
--- a/Documentation/i2c/porting-clients
+++ b/Documentation/i2c/porting-clients
@@ -92,6 +92,7 @@ Technical changes:
   Drop client->id.
   Drop any 24RF08 corruption prevention you find, as this is now done
   at the i2c-core level, and doing it twice voids it.
+  Don't add I2C_CLIENT_ALLOW_USE to client->flags, it's the default now.
 
 * [Init] Limits must not be set by the driver (can be done later in
   user-space). Chip should not be reset default (although a module
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index d16b4998c4c2..a1c5dff85431 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -497,14 +497,9 @@ int i2c_use_client(struct i2c_client *client)
 	if (ret)
 		return ret;
 
-	if (client->usage_count > 0)
-		goto busy;
 	client->usage_count++;
 
 	return 0;
- busy:
-	i2c_dec_use_client(client);
-	return -EBUSY;
 }
 
 int i2c_release_client(struct i2c_client *client)
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 8b4d4695de0e..85c517a9b05b 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -307,9 +307,7 @@ extern struct i2c_client *i2c_get_client(int driver_id, int adapter_id,
    extern struct i2c_client *i2c_get_client(int,int,struct i2c_client *);
    to make sure that client-struct is valid and that it is okay to access
    the i2c-client. 
-   returns -EACCES if client doesn't allow use (default)
-   returns -EBUSY if client doesn't allow multiple use (default) and 
-   usage_count >0 */
+   returns -ENODEV if client has gone in the meantime */
 extern int i2c_use_client(struct i2c_client *);
 extern int i2c_release_client(struct i2c_client *);
 
-- 
cgit v1.2.3-71-gd317


From 482c788ded0aa9710722eaf9cf60886d3b923218 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Sat, 26 Nov 2005 21:06:08 +0100
Subject: [PATCH] i2c: i2c_get_client is gone

The i2c_get_client function doesn't exist anymore, so we shouldn't
have a definition for it in i2c.h.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/i2c.h | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 85c517a9b05b..a9cea62fd486 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -296,17 +296,8 @@ extern int i2c_del_driver(struct i2c_driver *);
 extern int i2c_attach_client(struct i2c_client *);
 extern int i2c_detach_client(struct i2c_client *);
 
-/* New function: This is to get an i2c_client-struct for controlling the 
-   client either by using i2c_control-function or having the 
-   client-module export functions that can be used with the i2c_client
-   -struct. */
-extern struct i2c_client *i2c_get_client(int driver_id, int adapter_id, 
-					struct i2c_client *prev);
-
-/* Should be used with new function
-   extern struct i2c_client *i2c_get_client(int,int,struct i2c_client *);
-   to make sure that client-struct is valid and that it is okay to access
-   the i2c-client. 
+/* Should be used to make sure that client-struct is valid and that it
+   is okay to access the i2c-client.
    returns -ENODEV if client has gone in the meantime */
 extern int i2c_use_client(struct i2c_client *);
 extern int i2c_release_client(struct i2c_client *);
-- 
cgit v1.2.3-71-gd317


From 35d8b2e6b8e86b0d5126f36613b5202d4eb978b6 Mon Sep 17 00:00:00 2001
From: Laurent Riffard <laurent.riffard@free.fr>
Date: Sat, 26 Nov 2005 20:34:05 +0100
Subject: [PATCH] i2c: Drop i2c_driver.{owner,name}, 1 of 11

We should use the i2c_driver.driver's .name and .owner fields
instead of the i2c_driver's ones.

This patch updates the core of the i2c drivers: it removes .name and
.owner fields from the struct i2c_device and modify various
functions to use struct device fields instead.

Signed-off-by: Laurent Riffard <laurent.riffard@free.fr>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/i2c/busses/i2c-isa.c |  4 +---
 drivers/i2c/i2c-core.c       | 22 +++++++++++-----------
 drivers/i2c/i2c-dev.c        |  6 ++++--
 include/linux/i2c.h          |  5 +++--
 4 files changed, 19 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/busses/i2c-isa.c b/drivers/i2c/busses/i2c-isa.c
index 03672c9ca409..9f93fb85d06e 100644
--- a/drivers/i2c/busses/i2c-isa.c
+++ b/drivers/i2c/busses/i2c-isa.c
@@ -92,8 +92,6 @@ int i2c_isa_add_driver(struct i2c_driver *driver)
 	int res;
 
 	/* Add the driver to the list of i2c drivers in the driver core */
-	driver->driver.name = driver->name;
-	driver->driver.owner = driver->owner;
 	driver->driver.bus = &i2c_bus_type;
 	driver->driver.probe = i2c_isa_device_probe;
 	driver->driver.remove = i2c_isa_device_remove;
@@ -124,7 +122,7 @@ int i2c_isa_del_driver(struct i2c_driver *driver)
 		if ((res = driver->detach_client(client))) {
 			dev_err(&isa_adapter.dev, "Failed, driver "
 				"%s not unregistered!\n",
-				driver->name);
+				driver->driver.name);
 			return res;
 		}
 	}
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index a1c5dff85431..4ce5f0f32fba 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -235,7 +235,8 @@ int i2c_del_adapter(struct i2c_adapter *adap)
 		if (driver->detach_adapter)
 			if ((res = driver->detach_adapter(adap))) {
 				dev_err(&adap->dev, "detach_adapter failed "
-					"for driver [%s]\n", driver->name);
+					"for driver [%s]\n",
+					driver->driver.name);
 				goto out_unlock;
 			}
 	}
@@ -295,8 +296,6 @@ int i2c_add_driver(struct i2c_driver *driver)
 	down(&core_lists);
 
 	/* add the driver to the list of i2c drivers in the driver core */
-	driver->driver.owner = driver->owner;
-	driver->driver.name = driver->name;
 	driver->driver.bus = &i2c_bus_type;
 	driver->driver.probe = i2c_device_probe;
 	driver->driver.remove = i2c_device_remove;
@@ -306,7 +305,7 @@ int i2c_add_driver(struct i2c_driver *driver)
 		goto out_unlock;
 	
 	list_add_tail(&driver->list,&drivers);
-	pr_debug("i2c-core: driver [%s] registered\n", driver->name);
+	pr_debug("i2c-core: driver [%s] registered\n", driver->driver.name);
 
 	/* now look for instances of driver on our adapters */
 	if (driver->attach_adapter) {
@@ -344,7 +343,8 @@ int i2c_del_driver(struct i2c_driver *driver)
 		if (driver->detach_adapter) {
 			if ((res = driver->detach_adapter(adap))) {
 				dev_err(&adap->dev, "detach_adapter failed "
-					"for driver [%s]\n", driver->name);
+					"for driver [%s]\n",
+					driver->driver.name);
 				goto out_unlock;
 			}
 		} else {
@@ -368,7 +368,7 @@ int i2c_del_driver(struct i2c_driver *driver)
 
 	driver_unregister(&driver->driver);
 	list_del(&driver->list);
-	pr_debug("i2c-core: driver [%s] unregistered\n", driver->name);
+	pr_debug("i2c-core: driver [%s] unregistered\n", driver->driver.name);
 
  out_unlock:
 	up(&core_lists);
@@ -473,10 +473,10 @@ int i2c_detach_client(struct i2c_client *client)
 static int i2c_inc_use_client(struct i2c_client *client)
 {
 
-	if (!try_module_get(client->driver->owner))
+	if (!try_module_get(client->driver->driver.owner))
 		return -ENODEV;
 	if (!try_module_get(client->adapter->owner)) {
-		module_put(client->driver->owner);
+		module_put(client->driver->driver.owner);
 		return -ENODEV;
 	}
 
@@ -485,7 +485,7 @@ static int i2c_inc_use_client(struct i2c_client *client)
 
 static void i2c_dec_use_client(struct i2c_client *client)
 {
-	module_put(client->driver->owner);
+	module_put(client->driver->driver.owner);
 	module_put(client->adapter->owner);
 }
 
@@ -524,14 +524,14 @@ void i2c_clients_command(struct i2c_adapter *adap, unsigned int cmd, void *arg)
 	down(&adap->clist_lock);
 	list_for_each(item,&adap->clients) {
 		client = list_entry(item, struct i2c_client, list);
-		if (!try_module_get(client->driver->owner))
+		if (!try_module_get(client->driver->driver.owner))
 			continue;
 		if (NULL != client->driver->command) {
 			up(&adap->clist_lock);
 			client->driver->command(client,cmd,arg);
 			down(&adap->clist_lock);
 		}
-		module_put(client->driver->owner);
+		module_put(client->driver->driver.owner);
        }
        up(&adap->clist_lock);
 }
diff --git a/drivers/i2c/i2c-dev.c b/drivers/i2c/i2c-dev.c
index 9da51eb37c06..9715217a0343 100644
--- a/drivers/i2c/i2c-dev.c
+++ b/drivers/i2c/i2c-dev.c
@@ -481,8 +481,10 @@ static int i2cdev_command(struct i2c_client *client, unsigned int cmd,
 }
 
 static struct i2c_driver i2cdev_driver = {
-	.owner		= THIS_MODULE,
-	.name		= "dev_driver",
+	.driver = {
+		.owner	= THIS_MODULE,
+		.name	= "dev_driver",
+	},
 	.id		= I2C_DRIVERID_I2CDEV,
 	.attach_adapter	= i2cdev_attach_adapter,
 	.detach_adapter	= i2cdev_detach_adapter,
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index a9cea62fd486..75aa18e865da 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -105,11 +105,12 @@ extern s32 i2c_smbus_read_i2c_block_data(struct i2c_client * client,
  * A driver is capable of handling one or more physical devices present on
  * I2C adapters. This information is used to inform the driver of adapter
  * events.
+ *
+ * The driver.owner field should be set to the module owner of this driver.
+ * The driver.name field should be set to the name of this driver.
  */
 
 struct i2c_driver {
-	struct module *owner;
-	char name[32];
 	int id;
 	unsigned int class;
 
-- 
cgit v1.2.3-71-gd317


From de59cf9ed44f64991e52a9de6094729537f0420c Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Tue, 6 Dec 2005 15:33:15 -0800
Subject: [PATCH] I2C: Make i2c_add_driver automatically set the proper module
 owner

This prevents i2c drivers from messing up and forgetting to set the
module owner of their driver.  It also reduces the size of their drivers
by one line :)

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Jean Delvare <khali@linux-fr.org>
---
 drivers/i2c/i2c-core.c | 5 +++--
 include/linux/i2c.h    | 7 ++++++-
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 4ce5f0f32fba..c23443ee1b33 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -287,7 +287,7 @@ int i2c_del_adapter(struct i2c_adapter *adap)
  * chips.
  */
 
-int i2c_add_driver(struct i2c_driver *driver)
+int i2c_register_driver(struct module *owner, struct i2c_driver *driver)
 {
 	struct list_head   *item;
 	struct i2c_adapter *adapter;
@@ -296,6 +296,7 @@ int i2c_add_driver(struct i2c_driver *driver)
 	down(&core_lists);
 
 	/* add the driver to the list of i2c drivers in the driver core */
+	driver->driver.owner = owner;
 	driver->driver.bus = &i2c_bus_type;
 	driver->driver.probe = i2c_device_probe;
 	driver->driver.remove = i2c_device_remove;
@@ -319,6 +320,7 @@ int i2c_add_driver(struct i2c_driver *driver)
 	up(&core_lists);
 	return res;
 }
+EXPORT_SYMBOL(i2c_register_driver);
 
 int i2c_del_driver(struct i2c_driver *driver)
 {
@@ -1132,7 +1134,6 @@ EXPORT_SYMBOL_GPL(i2c_bus_type);
 
 EXPORT_SYMBOL(i2c_add_adapter);
 EXPORT_SYMBOL(i2c_del_adapter);
-EXPORT_SYMBOL(i2c_add_driver);
 EXPORT_SYMBOL(i2c_del_driver);
 EXPORT_SYMBOL(i2c_attach_client);
 EXPORT_SYMBOL(i2c_detach_client);
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 75aa18e865da..7863a59bd598 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -291,9 +291,14 @@ struct i2c_client_address_data {
 extern int i2c_add_adapter(struct i2c_adapter *);
 extern int i2c_del_adapter(struct i2c_adapter *);
 
-extern int i2c_add_driver(struct i2c_driver *);
+extern int i2c_register_driver(struct module *, struct i2c_driver *);
 extern int i2c_del_driver(struct i2c_driver *);
 
+static inline int i2c_add_driver(struct i2c_driver *driver)
+{
+	return i2c_register_driver(THIS_MODULE, driver);
+}
+
 extern int i2c_attach_client(struct i2c_client *);
 extern int i2c_detach_client(struct i2c_client *);
 
-- 
cgit v1.2.3-71-gd317


From 734a12a36676ad3b9418fa5a829c518afec02b8a Mon Sep 17 00:00:00 2001
From: Rudolf Marek <r.marek@sh.cvut.cz>
Date: Sun, 18 Dec 2005 16:36:52 +0100
Subject: [PATCH] hwmon: add VRM/VID support for some VIA CPUs

This patch adds the VIA CENTAUR CPUs to detection table.
Table was updated to treat future Intel x86 CPUs as VRD10.
Stepping field was added, because some VIA CPUs have
different VRM specs across stepping. I changed the vrm type
to u8 because all drivers use u8 anyway.

Signed-off-by: Rudolf Marek <r.marek@sh.cvut.cz>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/hwmon/hwmon-vid.c | 69 ++++++++++++++++++++++++++++-------------------
 include/linux/hwmon-vid.h |  6 ++---
 2 files changed, 45 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwmon/hwmon-vid.c b/drivers/hwmon/hwmon-vid.c
index 312769ad4dab..e497274916ce 100644
--- a/drivers/hwmon/hwmon-vid.c
+++ b/drivers/hwmon/hwmon-vid.c
@@ -49,20 +49,22 @@
         . . . .
        11110  =  0.800 V
        11111  =  0.000 V (off)
+
+    The 17 specification is in fact Intel Mobile Voltage Positioning -
+    (IMVP-II). You can find more information in the datasheet of Max1718
+    http://www.maxim-ic.com/quick_view2.cfm/qv_pk/2452
+
 */
 
 /* vrm is the VRM/VRD document version multiplied by 10.
    val is the 4-, 5- or 6-bit VID code.
    Returned value is in mV to avoid floating point in the kernel. */
-int vid_from_reg(int val, int vrm)
+int vid_from_reg(int val, u8 vrm)
 {
 	int vid;
 
 	switch(vrm) {
 
-	case  0:
-		return 0;
-
 	case 100:               /* VRD 10.0 */
 		if((val & 0x1f) == 0x1f)
 			return 0;
@@ -91,10 +93,16 @@ int vid_from_reg(int val, int vrm)
 	case 84:		/* VRM 8.4 */
 		val &= 0x0f;
 				/* fall through */
-	default:		/* VRM 8.2 */
+	case 82:		/* VRM 8.2 */
 		return(val == 0x1f ? 0 :
 		       val & 0x10  ? 5100 - (val) * 100 :
 		                     2050 - (val) * 50);
+	case 17:		/* Intel IMVP-II */
+		return(val & 0x10 ? 975 - (val & 0xF) * 25 :
+				    1750 - val * 50);
+	default:		/* report 0 for unknown */
+		printk(KERN_INFO "hwmon-vid: requested unknown VRM version\n");
+		return 0;
 	}
 }
 
@@ -108,30 +116,36 @@ struct vrm_model {
 	u8 vendor;
 	u8 eff_family;
 	u8 eff_model;
-	int vrm_type;
+	u8 eff_stepping;
+	u8 vrm_type;
 };
 
 #define ANY 0xFF
 
 #ifdef CONFIG_X86
 
+/* the stepping parameter is highest acceptable stepping for current line */
+
 static struct vrm_model vrm_models[] = {
-	{X86_VENDOR_AMD, 0x6, ANY, 90},		/* Athlon Duron etc */
-	{X86_VENDOR_AMD, 0xF, ANY, 24},		/* Athlon 64, Opteron */
-	{X86_VENDOR_INTEL, 0x6, 0x9, 85},	/* 0.13um too */
-	{X86_VENDOR_INTEL, 0x6, 0xB, 85},	/* Tualatin */
-	{X86_VENDOR_INTEL, 0x6, ANY, 82},	/* any P6 */
-	{X86_VENDOR_INTEL, 0x7, ANY, 0},	/* Itanium */
-	{X86_VENDOR_INTEL, 0xF, 0x0, 90},	/* P4 */
-	{X86_VENDOR_INTEL, 0xF, 0x1, 90},	/* P4 Willamette */
-	{X86_VENDOR_INTEL, 0xF, 0x2, 90},	/* P4 Northwood */
-	{X86_VENDOR_INTEL, 0xF, 0x3, 100},	/* P4 Prescott */
-	{X86_VENDOR_INTEL, 0xF, 0x4, 100},	/* P4 Prescott */
-	{X86_VENDOR_INTEL, 0x10,ANY, 0},	/* Itanium 2 */
-	{X86_VENDOR_UNKNOWN, ANY, ANY, 0}	/* stop here */
+	{X86_VENDOR_AMD, 0x6, ANY, ANY, 90},		/* Athlon Duron etc */
+	{X86_VENDOR_AMD, 0xF, ANY, ANY, 24},		/* Athlon 64, Opteron and above VRM 24 */
+	{X86_VENDOR_INTEL, 0x6, 0x9, ANY, 85},		/* 0.13um too */
+	{X86_VENDOR_INTEL, 0x6, 0xB, ANY, 85},		/* Tualatin */
+	{X86_VENDOR_INTEL, 0x6, ANY, ANY, 82},		/* any P6 */
+	{X86_VENDOR_INTEL, 0x7, ANY, ANY, 0},		/* Itanium */
+	{X86_VENDOR_INTEL, 0xF, 0x0, ANY, 90},		/* P4 */
+	{X86_VENDOR_INTEL, 0xF, 0x1, ANY, 90},		/* P4 Willamette */
+	{X86_VENDOR_INTEL, 0xF, 0x2, ANY, 90},		/* P4 Northwood */
+	{X86_VENDOR_INTEL, 0xF, ANY, ANY, 100},		/* Prescott and above assume VRD 10 */
+	{X86_VENDOR_INTEL, 0x10, ANY, ANY, 0},		/* Itanium 2 */
+	{X86_VENDOR_CENTAUR, 0x6, 0x7, ANY, 85},	/* Eden ESP/Ezra */
+	{X86_VENDOR_CENTAUR, 0x6, 0x8, 0x7, 85},	/* Ezra T */
+	{X86_VENDOR_CENTAUR, 0x6, 0x9, 0x7, 85},	/* Nemiah */
+	{X86_VENDOR_CENTAUR, 0x6, 0x9, ANY, 17},	/* C3-M */
+	{X86_VENDOR_UNKNOWN, ANY, ANY, ANY, 0}		/* stop here */
 };
 
-static int find_vrm(u8 eff_family, u8 eff_model, u8 vendor)
+static u8 find_vrm(u8 eff_family, u8 eff_model, u8 eff_stepping, u8 vendor)
 {
 	int i = 0;
 
@@ -139,7 +153,8 @@ static int find_vrm(u8 eff_family, u8 eff_model, u8 vendor)
 		if (vrm_models[i].vendor==vendor)
 			if ((vrm_models[i].eff_family==eff_family)
 			 && ((vrm_models[i].eff_model==eff_model) ||
-			     (vrm_models[i].eff_model==ANY)))
+			     (vrm_models[i].eff_model==ANY)) &&
+			     (eff_stepping <= vrm_models[i].eff_stepping))
 				return vrm_models[i].vrm_type;
 		i++;
 	}
@@ -147,12 +162,11 @@ static int find_vrm(u8 eff_family, u8 eff_model, u8 vendor)
 	return 0;
 }
 
-int vid_which_vrm(void)
+u8 vid_which_vrm(void)
 {
 	struct cpuinfo_x86 *c = cpu_data;
 	u32 eax;
-	u8 eff_family, eff_model;
-	int vrm_ret;
+	u8 eff_family, eff_model, eff_stepping, vrm_ret;
 
 	if (c->x86 < 6)		/* Any CPU with family lower than 6 */
 		return 0;	/* doesn't have VID and/or CPUID */
@@ -160,20 +174,21 @@ int vid_which_vrm(void)
 	eax = cpuid_eax(1);
 	eff_family = ((eax & 0x00000F00)>>8);
 	eff_model  = ((eax & 0x000000F0)>>4);
+	eff_stepping = eax & 0xF;
 	if (eff_family == 0xF) {	/* use extended model & family */
 		eff_family += ((eax & 0x00F00000)>>20);
 		eff_model += ((eax & 0x000F0000)>>16)<<4;
 	}
-	vrm_ret = find_vrm(eff_family,eff_model,c->x86_vendor);
+	vrm_ret = find_vrm(eff_family, eff_model, eff_stepping, c->x86_vendor);
 	if (vrm_ret == 0)
 		printk(KERN_INFO "hwmon-vid: Unknown VRM version of your "
 		       "x86 CPU\n");
 	return vrm_ret;
 }
 
-/* and now something completely different for the non-x86 world */
+/* and now for something completely different for the non-x86 world */
 #else
-int vid_which_vrm(void)
+u8 vid_which_vrm(void)
 {
 	printk(KERN_INFO "hwmon-vid: Unknown VRM version of your CPU\n");
 	return 0;
diff --git a/include/linux/hwmon-vid.h b/include/linux/hwmon-vid.h
index cd4b7a042b86..f346e4d5381c 100644
--- a/include/linux/hwmon-vid.h
+++ b/include/linux/hwmon-vid.h
@@ -23,14 +23,14 @@
 #ifndef _LINUX_HWMON_VID_H
 #define _LINUX_HWMON_VID_H
 
-int vid_from_reg(int val, int vrm);
-int vid_which_vrm(void);
+int vid_from_reg(int val, u8 vrm);
+u8 vid_which_vrm(void);
 
 /* vrm is the VRM/VRD document version multiplied by 10.
    val is in mV to avoid floating point in the kernel.
    Returned value is the 4-, 5- or 6-bit VID code.
    Note that only VRM 9.x is supported for now. */
-static inline int vid_to_reg(int val, int vrm)
+static inline int vid_to_reg(int val, u8 vrm)
 {
 	switch (vrm) {
 	case 91:		/* VRM 9.1 */
-- 
cgit v1.2.3-71-gd317


From 04b4b8434a92b9ef127985113c0bd961957778b7 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Sun, 18 Dec 2005 16:42:35 +0100
Subject: [PATCH] i2c: driver ID list cleanups

Cleanups to i2c driver ID list:
* Remove mostly bogus comments about driver ID ranges.
* Drop experimental driver IDs, as the concept is pretty broken.
* Drop now unused IDs of non-I2C (ISA) drivers.
* Drop a few more IDs which are no more used.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/i2c-id.h | 20 --------------------
 1 file changed, 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h
index 006c81ef4d50..fb46f8d56999 100644
--- a/include/linux/i2c-id.h
+++ b/include/linux/i2c-id.h
@@ -25,12 +25,6 @@
 
 /*
  * ---- Driver types -----------------------------------------------------
- *       device id name + number        function description, i2c address(es)
- *
- *  Range 1000-1999 range is defined in sensors/sensors.h
- *  Range 0x100 - 0x1ff is for V4L2 Common Components
- *  Range 0xf000 - 0xffff is reserved for local experimentation, and should
- *        never be used in official drivers
  */
 
 #define I2C_DRIVERID_MSP3400	 1
@@ -110,13 +104,7 @@
 #define I2C_DRIVERID_AKITAIOEXP	74	/* IO Expander on Sharp SL-C1000 */
 #define I2C_DRIVERID_INFRARED	75	/* I2C InfraRed on Video boards */
 
-#define I2C_DRIVERID_EXP0	0xF0	/* experimental use id's	*/
-#define I2C_DRIVERID_EXP1	0xF1
-#define I2C_DRIVERID_EXP2	0xF2
-#define I2C_DRIVERID_EXP3	0xF3
-
 #define I2C_DRIVERID_I2CDEV	900
-#define I2C_DRIVERID_I2CPROC	901
 #define I2C_DRIVERID_ARP        902    /* SMBus ARP Client              */
 #define I2C_DRIVERID_ALERT      903    /* SMBus Alert Responder Client  */
 
@@ -131,15 +119,12 @@
 #define I2C_DRIVERID_ADM1021 1008
 #define I2C_DRIVERID_ADM9240 1009
 #define I2C_DRIVERID_LTC1710 1010
-#define I2C_DRIVERID_SIS5595 1011
 #define I2C_DRIVERID_ICSPLL 1012
 #define I2C_DRIVERID_BT869 1013
 #define I2C_DRIVERID_MAXILIFE 1014
 #define I2C_DRIVERID_MATORB 1015
 #define I2C_DRIVERID_GL520 1016
 #define I2C_DRIVERID_THMC50 1017
-#define I2C_DRIVERID_DDCMON 1018
-#define I2C_DRIVERID_VIA686A 1019
 #define I2C_DRIVERID_ADM1025 1020
 #define I2C_DRIVERID_LM87 1021
 #define I2C_DRIVERID_PCF8574 1022
@@ -151,21 +136,16 @@
 #define I2C_DRIVERID_FSCPOS 1028
 #define I2C_DRIVERID_FSCSCY 1029
 #define I2C_DRIVERID_PCF8591 1030
-#define I2C_DRIVERID_SMSC47M1 1031
-#define I2C_DRIVERID_VT1211 1032
 #define I2C_DRIVERID_LM92 1033
-#define I2C_DRIVERID_VT8231 1034
 #define I2C_DRIVERID_SMARTBATT 1035
 #define I2C_DRIVERID_BMCSENSORS 1036
 #define I2C_DRIVERID_FS451 1037
-#define I2C_DRIVERID_W83627HF 1038
 #define I2C_DRIVERID_LM85 1039
 #define I2C_DRIVERID_LM83 1040
 #define I2C_DRIVERID_LM90 1042
 #define I2C_DRIVERID_ASB100 1043
 #define I2C_DRIVERID_FSCHER 1046
 #define I2C_DRIVERID_W83L785TS 1047
-#define I2C_DRIVERID_SMSC47B397 1050
 
 /*
  * ---- Adapter types ----------------------------------------------------
-- 
cgit v1.2.3-71-gd317


From 7c72ccf09b6debe55b8e049377ad3183ed4f4cb3 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Sun, 18 Dec 2005 17:25:18 +0100
Subject: [PATCH] i2c: i2c-nforce2 add nforce4 MCP-04 device ID

One more supported PCI ID for the i2c-nforce2 driver.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/i2c/busses/i2c-nforce2 | 3 ++-
 drivers/i2c/busses/i2c-nforce2.c     | 2 ++
 include/linux/pci_ids.h              | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/i2c/busses/i2c-nforce2 b/Documentation/i2c/busses/i2c-nforce2
index e379e182e64f..d751282d9b2a 100644
--- a/Documentation/i2c/busses/i2c-nforce2
+++ b/Documentation/i2c/busses/i2c-nforce2
@@ -5,7 +5,8 @@ Supported adapters:
   * nForce2 Ultra 400 MCP      10de:0084 
   * nForce3 Pro150 MCP         10de:00D4 
   * nForce3 250Gb MCP          10de:00E4 
-  * nForce4 MCP	       	       10de:0052
+  * nForce4 MCP                10de:0052
+  * nForce4 MCP-04             10de:0034
 
 Datasheet: not publically available, but seems to be similar to the
            AMD-8111 SMBus 2.0 adapter.
diff --git a/drivers/i2c/busses/i2c-nforce2.c b/drivers/i2c/busses/i2c-nforce2.c
index 4d18e6e5f159..2d80eb26f688 100644
--- a/drivers/i2c/busses/i2c-nforce2.c
+++ b/drivers/i2c/busses/i2c-nforce2.c
@@ -30,6 +30,7 @@
     nForce3 Pro150 MCP		00D4
     nForce3 250Gb MCP		00E4
     nForce4 MCP			0052
+    nForce4 MCP-04		0034
 
     This driver supports the 2 SMBuses that are included in the MCP of the
     nForce2/3/4 chipsets.
@@ -257,6 +258,7 @@ static struct pci_device_id nforce2_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE3_SMBUS) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE3S_SMBUS) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE4_SMBUS) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_SMBUS) },
 	{ 0 }
 };
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 4f01710485cd..96a0403f61f6 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -976,6 +976,7 @@
 #define PCI_DEVICE_ID_NVIDIA_TNT_UNKNOWN        0x002a
 #define PCI_DEVICE_ID_NVIDIA_VTNT2		0x002C
 #define PCI_DEVICE_ID_NVIDIA_UVTNT2		0x002D
+#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_SMBUS	0x0034
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_IDE	0x0035
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_SATA	0x0036
 #define PCI_DEVICE_ID_NVIDIA_NVENET_10		0x0037
-- 
cgit v1.2.3-71-gd317


From 8ffdc6550c47f75ca4e6c9f30a2a89063e035cf2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:49:03 +0100
Subject: [BLOCK] add @uptodate to end_that_request_last() and @error to
 rq_end_io_fn()

add @uptodate argument to end_that_request_last() and @error
to rq_end_io_fn().  there's no generic way to pass error code
to request completion function, making generic error handling
of non-fs request difficult (rq->errors is driver-specific and
each driver uses it differently).  this patch adds @uptodate
to end_that_request_last() and @error to rq_end_io_fn().

for fs requests, this doesn't really matter, so just using the
same uptodate argument used in the last call to
end_that_request_first() should suffice.  imho, this can also
help the generic command-carrying request jens is working on.

Signed-off-by: tejun heo <htejun@gmail.com>
Signed-Off-By: Jens Axboe <axboe@suse.de>
---
 block/elevator.c                |  2 +-
 block/ll_rw_blk.c               | 22 +++++++++++++++-------
 drivers/block/DAC960.c          |  2 +-
 drivers/block/cciss.c           |  2 +-
 drivers/block/cpqarray.c        |  2 +-
 drivers/block/floppy.c          |  2 +-
 drivers/block/nbd.c             |  2 +-
 drivers/block/sx8.c             |  2 +-
 drivers/block/ub.c              |  2 +-
 drivers/block/viodasd.c         |  2 +-
 drivers/cdrom/cdu31a.c          |  2 +-
 drivers/ide/ide-cd.c            |  4 ++--
 drivers/ide/ide-io.c            |  6 +++---
 drivers/message/i2o/i2o_block.c |  2 +-
 drivers/mmc/mmc_block.c         |  4 ++--
 drivers/s390/block/dasd.c       |  2 +-
 drivers/s390/char/tape_block.c  |  2 +-
 drivers/scsi/ide-scsi.c         |  4 ++--
 drivers/scsi/scsi_lib.c         |  2 +-
 drivers/scsi/sd.c               |  2 +-
 include/linux/blkdev.h          |  6 +++---
 21 files changed, 42 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/block/elevator.c b/block/elevator.c
index 6c3fc8a10bf2..85a11cee7d1c 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -498,7 +498,7 @@ struct request *elv_next_request(request_queue_t *q)
 			blkdev_dequeue_request(rq);
 			rq->flags |= REQ_QUIET;
 			end_that_request_chunk(rq, 0, nr_bytes);
-			end_that_request_last(rq);
+			end_that_request_last(rq, 0);
 		} else {
 			printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__,
 								ret);
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index e02c88ca8fb5..8b1ae69bc5ac 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -344,7 +344,7 @@ EXPORT_SYMBOL(blk_queue_issue_flush_fn);
 /*
  * Cache flushing for ordered writes handling
  */
-static void blk_pre_flush_end_io(struct request *flush_rq)
+static void blk_pre_flush_end_io(struct request *flush_rq, int error)
 {
 	struct request *rq = flush_rq->end_io_data;
 	request_queue_t *q = rq->q;
@@ -362,7 +362,7 @@ static void blk_pre_flush_end_io(struct request *flush_rq)
 	}
 }
 
-static void blk_post_flush_end_io(struct request *flush_rq)
+static void blk_post_flush_end_io(struct request *flush_rq, int error)
 {
 	struct request *rq = flush_rq->end_io_data;
 	request_queue_t *q = rq->q;
@@ -2317,7 +2317,7 @@ EXPORT_SYMBOL(blk_rq_map_kern);
  */
 void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk,
 			   struct request *rq, int at_head,
-			   void (*done)(struct request *))
+			   rq_end_io_fn *done)
 {
 	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
 
@@ -2521,7 +2521,7 @@ EXPORT_SYMBOL(blk_put_request);
  * blk_end_sync_rq - executes a completion event on a request
  * @rq: request to complete
  */
-void blk_end_sync_rq(struct request *rq)
+void blk_end_sync_rq(struct request *rq, int error)
 {
 	struct completion *waiting = rq->waiting;
 
@@ -3183,9 +3183,17 @@ EXPORT_SYMBOL(end_that_request_chunk);
 /*
  * queue lock must be held
  */
-void end_that_request_last(struct request *req)
+void end_that_request_last(struct request *req, int uptodate)
 {
 	struct gendisk *disk = req->rq_disk;
+	int error;
+
+	/*
+	 * extend uptodate bool to allow < 0 value to be direct io error
+	 */
+	error = 0;
+	if (end_io_error(uptodate))
+		error = !uptodate ? -EIO : uptodate;
 
 	if (unlikely(laptop_mode) && blk_fs_request(req))
 		laptop_io_completion();
@@ -3200,7 +3208,7 @@ void end_that_request_last(struct request *req)
 		disk->in_flight--;
 	}
 	if (req->end_io)
-		req->end_io(req);
+		req->end_io(req, error);
 	else
 		__blk_put_request(req->q, req);
 }
@@ -3212,7 +3220,7 @@ void end_request(struct request *req, int uptodate)
 	if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) {
 		add_disk_randomness(req->rq_disk);
 		blkdev_dequeue_request(req);
-		end_that_request_last(req);
+		end_that_request_last(req, uptodate);
 	}
 }
 
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 70eaa5c7ac08..21097a39a057 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -3471,7 +3471,7 @@ static inline boolean DAC960_ProcessCompletedRequest(DAC960_Command_T *Command,
 
 	 if (!end_that_request_first(Request, UpToDate, Command->BlockCount)) {
 
- 	 	end_that_request_last(Request);
+ 	 	end_that_request_last(Request, UpToDate);
 
 		if (Command->Completion) {
 			complete(Command->Completion);
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index c3441b3f086e..d2815b7a9150 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2310,7 +2310,7 @@ static inline void complete_command( ctlr_info_t *h, CommandList_struct *cmd,
 	printk("Done with %p\n", cmd->rq);
 #endif /* CCISS_DEBUG */ 
 
-	end_that_request_last(cmd->rq);
+	end_that_request_last(cmd->rq, status ? 1 : -EIO);
 	cmd_free(h,cmd,1);
 }
 
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index cf1822a6361c..9bddb6874873 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -1036,7 +1036,7 @@ static inline void complete_command(cmdlist_t *cmd, int timeout)
 	complete_buffers(cmd->rq->bio, ok);
 
         DBGPX(printk("Done with %p\n", cmd->rq););
-	end_that_request_last(cmd->rq);
+	end_that_request_last(cmd->rq, ok ? 1 : -EIO);
 }
 
 /*
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index f7e765a1d313..a5b857c5c4b8 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2301,7 +2301,7 @@ static void floppy_end_request(struct request *req, int uptodate)
 	add_disk_randomness(req->rq_disk);
 	floppy_off((long)req->rq_disk->private_data);
 	blkdev_dequeue_request(req);
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 
 	/* We're done with the request */
 	current_req = NULL;
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 9e268ddedfbd..485345c8e632 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -136,7 +136,7 @@ static void nbd_end_request(struct request *req)
 
 	spin_lock_irqsave(q->queue_lock, flags);
 	if (!end_that_request_first(req, uptodate, req->nr_sectors)) {
-		end_that_request_last(req);
+		end_that_request_last(req, uptodate);
 	}
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 1ded3b433459..9251f4131b53 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -770,7 +770,7 @@ static inline void carm_end_request_queued(struct carm_host *host,
 	rc = end_that_request_first(req, uptodate, req->hard_nr_sectors);
 	assert(rc == 0);
 
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 
 	rc = carm_put_request(host, crq);
 	assert(rc == 0);
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 10740a065088..a05fe5843e6c 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -951,7 +951,7 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 static void ub_end_rq(struct request *rq, int uptodate)
 {
 	end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
-	end_that_request_last(rq);
+	end_that_request_last(rq, uptodate);
 }
 
 static int ub_rw_cmd_retry(struct ub_dev *sc, struct ub_lun *lun,
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index 2d518aa2720a..063f0304a163 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -305,7 +305,7 @@ static void viodasd_end_request(struct request *req, int uptodate,
 	if (end_that_request_first(req, uptodate, num_sectors))
 		return;
 	add_disk_randomness(req->rq_disk);
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 }
 
 /*
diff --git a/drivers/cdrom/cdu31a.c b/drivers/cdrom/cdu31a.c
index ac96de15d833..378e88d20757 100644
--- a/drivers/cdrom/cdu31a.c
+++ b/drivers/cdrom/cdu31a.c
@@ -1402,7 +1402,7 @@ static void do_cdu31a_request(request_queue_t * q)
 			if (!end_that_request_first(req, 1, nblock)) {
 				spin_lock_irq(q->queue_lock);
 				blkdev_dequeue_request(req);
-				end_that_request_last(req);
+				end_that_request_last(req, 1);
 				spin_unlock_irq(q->queue_lock);
 			}
 			continue;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 70aeb3a60120..d31117eb95aa 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -614,7 +614,7 @@ static void cdrom_end_request (ide_drive_t *drive, int uptodate)
 			 */
 			spin_lock_irqsave(&ide_lock, flags);
 			end_that_request_chunk(failed, 0, failed->data_len);
-			end_that_request_last(failed);
+			end_that_request_last(failed, 0);
 			spin_unlock_irqrestore(&ide_lock, flags);
 		}
 
@@ -1735,7 +1735,7 @@ end_request:
 
 	spin_lock_irqsave(&ide_lock, flags);
 	blkdev_dequeue_request(rq);
-	end_that_request_last(rq);
+	end_that_request_last(rq, 1);
 	HWGROUP(drive)->rq = NULL;
 	spin_unlock_irqrestore(&ide_lock, flags);
 	return ide_stopped;
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index ecfafcdafea4..8435b44a700b 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -89,7 +89,7 @@ int __ide_end_request(ide_drive_t *drive, struct request *rq, int uptodate,
 
 		blkdev_dequeue_request(rq);
 		HWGROUP(drive)->rq = NULL;
-		end_that_request_last(rq);
+		end_that_request_last(rq, uptodate);
 		ret = 0;
 	}
 	return ret;
@@ -247,7 +247,7 @@ static void ide_complete_pm_request (ide_drive_t *drive, struct request *rq)
 	}
 	blkdev_dequeue_request(rq);
 	HWGROUP(drive)->rq = NULL;
-	end_that_request_last(rq);
+	end_that_request_last(rq, 1);
 	spin_unlock_irqrestore(&ide_lock, flags);
 }
 
@@ -379,7 +379,7 @@ void ide_end_drive_cmd (ide_drive_t *drive, u8 stat, u8 err)
 	blkdev_dequeue_request(rq);
 	HWGROUP(drive)->rq = NULL;
 	rq->errors = err;
-	end_that_request_last(rq);
+	end_that_request_last(rq, !rq->errors);
 	spin_unlock_irqrestore(&ide_lock, flags);
 }
 
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index f283b5bafdd3..4f522527b7ed 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -466,7 +466,7 @@ static void i2o_block_end_request(struct request *req, int uptodate,
 
 	spin_lock_irqsave(q->queue_lock, flags);
 
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 
 	if (likely(dev)) {
 		dev->open_queue_depth--;
diff --git a/drivers/mmc/mmc_block.c b/drivers/mmc/mmc_block.c
index abcf19116d70..8e380c14bf65 100644
--- a/drivers/mmc/mmc_block.c
+++ b/drivers/mmc/mmc_block.c
@@ -263,7 +263,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 			 */
 			add_disk_randomness(req->rq_disk);
 			blkdev_dequeue_request(req);
-			end_that_request_last(req);
+			end_that_request_last(req, 1);
 		}
 		spin_unlock_irq(&md->lock);
 	} while (ret);
@@ -289,7 +289,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 
 	add_disk_randomness(req->rq_disk);
 	blkdev_dequeue_request(req);
-	end_that_request_last(req);
+	end_that_request_last(req, 0);
 	spin_unlock_irq(&md->lock);
 
 	return 0;
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 7008d32433bf..fdb61380c523 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1035,7 +1035,7 @@ dasd_end_request(struct request *req, int uptodate)
 	if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
 		BUG();
 	add_disk_randomness(req->rq_disk);
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 }
 
 /*
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index 1efc9f21229e..559d51490e2f 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -78,7 +78,7 @@ tapeblock_end_request(struct request *req, int uptodate)
 {
 	if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
 		BUG();
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 }
 
 static void
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index 4cb1f3ed9100..3c688ef54660 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -1046,7 +1046,7 @@ static int idescsi_eh_reset (struct scsi_cmnd *cmd)
 
 	/* kill current request */
 	blkdev_dequeue_request(req);
-	end_that_request_last(req);
+	end_that_request_last(req, 0);
 	if (req->flags & REQ_SENSE)
 		kfree(scsi->pc->buffer);
 	kfree(scsi->pc);
@@ -1056,7 +1056,7 @@ static int idescsi_eh_reset (struct scsi_cmnd *cmd)
 	/* now nuke the drive queue */
 	while ((req = elv_next_request(drive->queue))) {
 		blkdev_dequeue_request(req);
-		end_that_request_last(req);
+		end_that_request_last(req, 0);
 	}
 
 	HWGROUP(drive)->rq = NULL;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index a7f3f0c84db7..53551f1dfe21 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -791,7 +791,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int uptodate,
 	spin_lock_irqsave(q->queue_lock, flags);
 	if (blk_rq_tagged(req))
 		blk_queue_end_tag(q, req);
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
 	/*
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 3d3ad7d1b779..d651150ee76d 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -748,7 +748,7 @@ static void sd_end_flush(request_queue_t *q, struct request *flush_rq)
 		 * force journal abort of barriers
 		 */
 		end_that_request_first(rq, -EOPNOTSUPP, rq->hard_nr_sectors);
-		end_that_request_last(rq);
+		end_that_request_last(rq, -EOPNOTSUPP);
 	}
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a18500d196e1..a0ce8c585165 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -102,7 +102,7 @@ void copy_io_context(struct io_context **pdst, struct io_context **psrc);
 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2);
 
 struct request;
-typedef void (rq_end_io_fn)(struct request *);
+typedef void (rq_end_io_fn)(struct request *, int);
 
 struct request_list {
 	int count[2];
@@ -560,7 +560,7 @@ extern void register_disk(struct gendisk *dev);
 extern void generic_make_request(struct bio *bio);
 extern void blk_put_request(struct request *);
 extern void __blk_put_request(request_queue_t *, struct request *);
-extern void blk_end_sync_rq(struct request *rq);
+extern void blk_end_sync_rq(struct request *rq, int error);
 extern void blk_attempt_remerge(request_queue_t *, struct request *);
 extern struct request *blk_get_request(request_queue_t *, int, gfp_t);
 extern void blk_insert_request(request_queue_t *, struct request *, int, void *);
@@ -614,7 +614,7 @@ static inline void blk_run_address_space(struct address_space *mapping)
  */
 extern int end_that_request_first(struct request *, int, int);
 extern int end_that_request_chunk(struct request *, int, int);
-extern void end_that_request_last(struct request *);
+extern void end_that_request_last(struct request *, int);
 extern void end_request(struct request *req, int uptodate);
 
 /*
-- 
cgit v1.2.3-71-gd317


From 797e7dbbee0a91fa1349192f18ad5c454997d876 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:51:03 +0100
Subject: [BLOCK] reimplement handling of barrier request

Reimplement handling of barrier requests.

* Flexible handling to deal with various capabilities of
  target devices.
* Retry support for falling back.
* Tagged queues which don't support ordered tag can do ordered.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/elevator.c         |  84 +++++++----
 block/ll_rw_blk.c        | 384 ++++++++++++++++++++++++++++++-----------------
 include/linux/blkdev.h   |  82 +++++++---
 include/linux/elevator.h |   1 +
 4 files changed, 359 insertions(+), 192 deletions(-)

(limited to 'include/linux')

diff --git a/block/elevator.c b/block/elevator.c
index 85a11cee7d1c..39dcccc82ada 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -304,15 +304,7 @@ void elv_requeue_request(request_queue_t *q, struct request *rq)
 
 	rq->flags &= ~REQ_STARTED;
 
-	/*
-	 * if this is the flush, requeue the original instead and drop the flush
-	 */
-	if (rq->flags & REQ_BAR_FLUSH) {
-		clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
-		rq = rq->end_io_data;
-	}
-
-	__elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
+	__elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE, 0);
 }
 
 static void elv_drain_elevator(request_queue_t *q)
@@ -332,7 +324,18 @@ static void elv_drain_elevator(request_queue_t *q)
 void __elv_add_request(request_queue_t *q, struct request *rq, int where,
 		       int plug)
 {
+	struct list_head *pos;
+	unsigned ordseq;
+
+	if (q->ordcolor)
+		rq->flags |= REQ_ORDERED_COLOR;
+
 	if (rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
+		/*
+		 * toggle ordered color
+		 */
+		q->ordcolor ^= 1;
+
 		/*
 		 * barriers implicitly indicate back insertion
 		 */
@@ -393,6 +396,30 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where,
 		q->elevator->ops->elevator_add_req_fn(q, rq);
 		break;
 
+	case ELEVATOR_INSERT_REQUEUE:
+		/*
+		 * If ordered flush isn't in progress, we do front
+		 * insertion; otherwise, requests should be requeued
+		 * in ordseq order.
+		 */
+		rq->flags |= REQ_SOFTBARRIER;
+
+		if (q->ordseq == 0) {
+			list_add(&rq->queuelist, &q->queue_head);
+			break;
+		}
+
+		ordseq = blk_ordered_req_seq(rq);
+
+		list_for_each(pos, &q->queue_head) {
+			struct request *pos_rq = list_entry_rq(pos);
+			if (ordseq <= blk_ordered_req_seq(pos_rq))
+				break;
+		}
+
+		list_add_tail(&rq->queuelist, pos);
+		break;
+
 	default:
 		printk(KERN_ERR "%s: bad insertion point %d\n",
 		       __FUNCTION__, where);
@@ -422,25 +449,16 @@ static inline struct request *__elv_next_request(request_queue_t *q)
 {
 	struct request *rq;
 
-	if (unlikely(list_empty(&q->queue_head) &&
-		     !q->elevator->ops->elevator_dispatch_fn(q, 0)))
-		return NULL;
-
-	rq = list_entry_rq(q->queue_head.next);
-
-	/*
-	 * if this is a barrier write and the device has to issue a
-	 * flush sequence to support it, check how far we are
-	 */
-	if (blk_fs_request(rq) && blk_barrier_rq(rq)) {
-		BUG_ON(q->ordered == QUEUE_ORDERED_NONE);
+	while (1) {
+		while (!list_empty(&q->queue_head)) {
+			rq = list_entry_rq(q->queue_head.next);
+			if (blk_do_ordered(q, &rq))
+				return rq;
+		}
 
-		if (q->ordered == QUEUE_ORDERED_FLUSH &&
-		    !blk_barrier_preflush(rq))
-			rq = blk_start_pre_flush(q, rq);
+		if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
+			return NULL;
 	}
-
-	return rq;
 }
 
 struct request *elv_next_request(request_queue_t *q)
@@ -593,7 +611,21 @@ void elv_completed_request(request_queue_t *q, struct request *rq)
 	 * request is released from the driver, io must be done
 	 */
 	if (blk_account_rq(rq)) {
+		struct request *first_rq = list_entry_rq(q->queue_head.next);
+
 		q->in_flight--;
+
+		/*
+		 * Check if the queue is waiting for fs requests to be
+		 * drained for flush sequence.
+		 */
+		if (q->ordseq && q->in_flight == 0 &&
+		    blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
+		    blk_ordered_req_seq(first_rq) > QUEUE_ORDSEQ_DRAIN) {
+			blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
+			q->request_fn(q);
+		}
+
 		if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
 			e->ops->elevator_completed_req_fn(q, rq);
 	}
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 65c4efc02adf..91d3b4828c49 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -290,8 +290,8 @@ static inline void rq_init(request_queue_t *q, struct request *rq)
 
 /**
  * blk_queue_ordered - does this queue support ordered writes
- * @q:     the request queue
- * @flag:  see below
+ * @q:        the request queue
+ * @ordered:  one of QUEUE_ORDERED_*
  *
  * Description:
  *   For journalled file systems, doing ordered writes on a commit
@@ -300,28 +300,30 @@ static inline void rq_init(request_queue_t *q, struct request *rq)
  *   feature should call this function and indicate so.
  *
  **/
-void blk_queue_ordered(request_queue_t *q, int flag)
-{
-	switch (flag) {
-		case QUEUE_ORDERED_NONE:
-			if (q->flush_rq)
-				kmem_cache_free(request_cachep, q->flush_rq);
-			q->flush_rq = NULL;
-			q->ordered = flag;
-			break;
-		case QUEUE_ORDERED_TAG:
-			q->ordered = flag;
-			break;
-		case QUEUE_ORDERED_FLUSH:
-			q->ordered = flag;
-			if (!q->flush_rq)
-				q->flush_rq = kmem_cache_alloc(request_cachep,
-								GFP_KERNEL);
-			break;
-		default:
-			printk("blk_queue_ordered: bad value %d\n", flag);
-			break;
+int blk_queue_ordered(request_queue_t *q, unsigned ordered,
+		      prepare_flush_fn *prepare_flush_fn)
+{
+	if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) &&
+	    prepare_flush_fn == NULL) {
+		printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n");
+		return -EINVAL;
+	}
+
+	if (ordered != QUEUE_ORDERED_NONE &&
+	    ordered != QUEUE_ORDERED_DRAIN &&
+	    ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
+	    ordered != QUEUE_ORDERED_DRAIN_FUA &&
+	    ordered != QUEUE_ORDERED_TAG &&
+	    ordered != QUEUE_ORDERED_TAG_FLUSH &&
+	    ordered != QUEUE_ORDERED_TAG_FUA) {
+		printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
+		return -EINVAL;
 	}
+
+	q->next_ordered = ordered;
+	q->prepare_flush_fn = prepare_flush_fn;
+
+	return 0;
 }
 
 EXPORT_SYMBOL(blk_queue_ordered);
@@ -346,167 +348,265 @@ EXPORT_SYMBOL(blk_queue_issue_flush_fn);
 /*
  * Cache flushing for ordered writes handling
  */
-static void blk_pre_flush_end_io(struct request *flush_rq, int error)
+inline unsigned blk_ordered_cur_seq(request_queue_t *q)
 {
-	struct request *rq = flush_rq->end_io_data;
-	request_queue_t *q = rq->q;
-
-	elv_completed_request(q, flush_rq);
-
-	rq->flags |= REQ_BAR_PREFLUSH;
-
-	if (!flush_rq->errors)
-		elv_requeue_request(q, rq);
-	else {
-		q->end_flush_fn(q, flush_rq);
-		clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
-		q->request_fn(q);
-	}
+	if (!q->ordseq)
+		return 0;
+	return 1 << ffz(q->ordseq);
 }
 
-static void blk_post_flush_end_io(struct request *flush_rq, int error)
+unsigned blk_ordered_req_seq(struct request *rq)
 {
-	struct request *rq = flush_rq->end_io_data;
 	request_queue_t *q = rq->q;
 
-	elv_completed_request(q, flush_rq);
+	BUG_ON(q->ordseq == 0);
 
-	rq->flags |= REQ_BAR_POSTFLUSH;
+	if (rq == &q->pre_flush_rq)
+		return QUEUE_ORDSEQ_PREFLUSH;
+	if (rq == &q->bar_rq)
+		return QUEUE_ORDSEQ_BAR;
+	if (rq == &q->post_flush_rq)
+		return QUEUE_ORDSEQ_POSTFLUSH;
 
-	q->end_flush_fn(q, flush_rq);
-	clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
-	q->request_fn(q);
+	if ((rq->flags & REQ_ORDERED_COLOR) ==
+	    (q->orig_bar_rq->flags & REQ_ORDERED_COLOR))
+		return QUEUE_ORDSEQ_DRAIN;
+	else
+		return QUEUE_ORDSEQ_DONE;
 }
 
-struct request *blk_start_pre_flush(request_queue_t *q, struct request *rq)
+void blk_ordered_complete_seq(request_queue_t *q, unsigned seq, int error)
 {
-	struct request *flush_rq = q->flush_rq;
-
-	BUG_ON(!blk_barrier_rq(rq));
+	struct request *rq;
+	int uptodate;
 
-	if (test_and_set_bit(QUEUE_FLAG_FLUSH, &q->queue_flags))
-		return NULL;
+	if (error && !q->orderr)
+		q->orderr = error;
 
-	rq_init(q, flush_rq);
-	flush_rq->elevator_private = NULL;
-	flush_rq->flags = REQ_BAR_FLUSH;
-	flush_rq->rq_disk = rq->rq_disk;
-	flush_rq->rl = NULL;
+	BUG_ON(q->ordseq & seq);
+	q->ordseq |= seq;
 
-	/*
-	 * prepare_flush returns 0 if no flush is needed, just mark both
-	 * pre and post flush as done in that case
-	 */
-	if (!q->prepare_flush_fn(q, flush_rq)) {
-		rq->flags |= REQ_BAR_PREFLUSH | REQ_BAR_POSTFLUSH;
-		clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
-		return rq;
-	}
+	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
+		return;
 
 	/*
-	 * some drivers dequeue requests right away, some only after io
-	 * completion. make sure the request is dequeued.
+	 * Okay, sequence complete.
 	 */
-	if (!list_empty(&rq->queuelist))
-		blkdev_dequeue_request(rq);
+	rq = q->orig_bar_rq;
+	uptodate = q->orderr ? q->orderr : 1;
 
-	flush_rq->end_io_data = rq;
-	flush_rq->end_io = blk_pre_flush_end_io;
+	q->ordseq = 0;
 
-	__elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
-	return flush_rq;
+	end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
+	end_that_request_last(rq, uptodate);
 }
 
-static void blk_start_post_flush(request_queue_t *q, struct request *rq)
+static void pre_flush_end_io(struct request *rq, int error)
 {
-	struct request *flush_rq = q->flush_rq;
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
+}
 
-	BUG_ON(!blk_barrier_rq(rq));
+static void bar_end_io(struct request *rq, int error)
+{
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
+}
 
-	rq_init(q, flush_rq);
-	flush_rq->elevator_private = NULL;
-	flush_rq->flags = REQ_BAR_FLUSH;
-	flush_rq->rq_disk = rq->rq_disk;
-	flush_rq->rl = NULL;
+static void post_flush_end_io(struct request *rq, int error)
+{
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
+}
 
-	if (q->prepare_flush_fn(q, flush_rq)) {
-		flush_rq->end_io_data = rq;
-		flush_rq->end_io = blk_post_flush_end_io;
+static void queue_flush(request_queue_t *q, unsigned which)
+{
+	struct request *rq;
+	rq_end_io_fn *end_io;
 
-		__elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
-		q->request_fn(q);
+	if (which == QUEUE_ORDERED_PREFLUSH) {
+		rq = &q->pre_flush_rq;
+		end_io = pre_flush_end_io;
+	} else {
+		rq = &q->post_flush_rq;
+		end_io = post_flush_end_io;
 	}
+
+	rq_init(q, rq);
+	rq->flags = REQ_HARDBARRIER;
+	rq->elevator_private = NULL;
+	rq->rq_disk = q->bar_rq.rq_disk;
+	rq->rl = NULL;
+	rq->end_io = end_io;
+	q->prepare_flush_fn(q, rq);
+
+	__elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
 }
 
-static inline int blk_check_end_barrier(request_queue_t *q, struct request *rq,
-					int sectors)
+static inline struct request *start_ordered(request_queue_t *q,
+					    struct request *rq)
 {
-	if (sectors > rq->nr_sectors)
-		sectors = rq->nr_sectors;
+	q->bi_size = 0;
+	q->orderr = 0;
+	q->ordered = q->next_ordered;
+	q->ordseq |= QUEUE_ORDSEQ_STARTED;
+
+	/*
+	 * Prep proxy barrier request.
+	 */
+	blkdev_dequeue_request(rq);
+	q->orig_bar_rq = rq;
+	rq = &q->bar_rq;
+	rq_init(q, rq);
+	rq->flags = bio_data_dir(q->orig_bar_rq->bio);
+	rq->flags |= q->ordered & QUEUE_ORDERED_FUA ? REQ_FUA : 0;
+	rq->elevator_private = NULL;
+	rq->rl = NULL;
+	init_request_from_bio(rq, q->orig_bar_rq->bio);
+	rq->end_io = bar_end_io;
+
+	/*
+	 * Queue ordered sequence.  As we stack them at the head, we
+	 * need to queue in reverse order.  Note that we rely on that
+	 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
+	 * request gets inbetween ordered sequence.
+	 */
+	if (q->ordered & QUEUE_ORDERED_POSTFLUSH)
+		queue_flush(q, QUEUE_ORDERED_POSTFLUSH);
+	else
+		q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
+
+	__elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
+
+	if (q->ordered & QUEUE_ORDERED_PREFLUSH) {
+		queue_flush(q, QUEUE_ORDERED_PREFLUSH);
+		rq = &q->pre_flush_rq;
+	} else
+		q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
 
-	rq->nr_sectors -= sectors;
-	return rq->nr_sectors;
+	if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0)
+		q->ordseq |= QUEUE_ORDSEQ_DRAIN;
+	else
+		rq = NULL;
+
+	return rq;
 }
 
-static int __blk_complete_barrier_rq(request_queue_t *q, struct request *rq,
-				     int sectors, int queue_locked)
+int blk_do_ordered(request_queue_t *q, struct request **rqp)
 {
-	if (q->ordered != QUEUE_ORDERED_FLUSH)
-		return 0;
-	if (!blk_fs_request(rq) || !blk_barrier_rq(rq))
-		return 0;
-	if (blk_barrier_postflush(rq))
-		return 0;
+	struct request *rq = *rqp, *allowed_rq;
+	int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
 
-	if (!blk_check_end_barrier(q, rq, sectors)) {
-		unsigned long flags = 0;
+	if (!q->ordseq) {
+		if (!is_barrier)
+			return 1;
 
-		if (!queue_locked)
-			spin_lock_irqsave(q->queue_lock, flags);
+		if (q->next_ordered != QUEUE_ORDERED_NONE) {
+			*rqp = start_ordered(q, rq);
+			return 1;
+		} else {
+			/*
+			 * This can happen when the queue switches to
+			 * ORDERED_NONE while this request is on it.
+			 */
+			blkdev_dequeue_request(rq);
+			end_that_request_first(rq, -EOPNOTSUPP,
+					       rq->hard_nr_sectors);
+			end_that_request_last(rq, -EOPNOTSUPP);
+			*rqp = NULL;
+			return 0;
+		}
+	}
 
-		blk_start_post_flush(q, rq);
+	if (q->ordered & QUEUE_ORDERED_TAG) {
+		if (is_barrier && rq != &q->bar_rq)
+			*rqp = NULL;
+		return 1;
+	}
 
-		if (!queue_locked)
-			spin_unlock_irqrestore(q->queue_lock, flags);
+	switch (blk_ordered_cur_seq(q)) {
+	case QUEUE_ORDSEQ_PREFLUSH:
+		allowed_rq = &q->pre_flush_rq;
+		break;
+	case QUEUE_ORDSEQ_BAR:
+		allowed_rq = &q->bar_rq;
+		break;
+	case QUEUE_ORDSEQ_POSTFLUSH:
+		allowed_rq = &q->post_flush_rq;
+		break;
+	default:
+		allowed_rq = NULL;
+		break;
 	}
 
+	if (rq != allowed_rq &&
+	    (blk_fs_request(rq) || rq == &q->pre_flush_rq ||
+	     rq == &q->post_flush_rq))
+		*rqp = NULL;
+
 	return 1;
 }
 
-/**
- * blk_complete_barrier_rq - complete possible barrier request
- * @q:  the request queue for the device
- * @rq:  the request
- * @sectors:  number of sectors to complete
- *
- * Description:
- *   Used in driver end_io handling to determine whether to postpone
- *   completion of a barrier request until a post flush has been done. This
- *   is the unlocked variant, used if the caller doesn't already hold the
- *   queue lock.
- **/
-int blk_complete_barrier_rq(request_queue_t *q, struct request *rq, int sectors)
+static int flush_dry_bio_endio(struct bio *bio, unsigned int bytes, int error)
 {
-	return __blk_complete_barrier_rq(q, rq, sectors, 0);
+	request_queue_t *q = bio->bi_private;
+	struct bio_vec *bvec;
+	int i;
+
+	/*
+	 * This is dry run, restore bio_sector and size.  We'll finish
+	 * this request again with the original bi_end_io after an
+	 * error occurs or post flush is complete.
+	 */
+	q->bi_size += bytes;
+
+	if (bio->bi_size)
+		return 1;
+
+	/* Rewind bvec's */
+	bio->bi_idx = 0;
+	bio_for_each_segment(bvec, bio, i) {
+		bvec->bv_len += bvec->bv_offset;
+		bvec->bv_offset = 0;
+	}
+
+	/* Reset bio */
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio->bi_size = q->bi_size;
+	bio->bi_sector -= (q->bi_size >> 9);
+	q->bi_size = 0;
+
+	return 0;
 }
-EXPORT_SYMBOL(blk_complete_barrier_rq);
 
-/**
- * blk_complete_barrier_rq_locked - complete possible barrier request
- * @q:  the request queue for the device
- * @rq:  the request
- * @sectors:  number of sectors to complete
- *
- * Description:
- *   See blk_complete_barrier_rq(). This variant must be used if the caller
- *   holds the queue lock.
- **/
-int blk_complete_barrier_rq_locked(request_queue_t *q, struct request *rq,
-				   int sectors)
+static inline int ordered_bio_endio(struct request *rq, struct bio *bio,
+				    unsigned int nbytes, int error)
 {
-	return __blk_complete_barrier_rq(q, rq, sectors, 1);
+	request_queue_t *q = rq->q;
+	bio_end_io_t *endio;
+	void *private;
+
+	if (&q->bar_rq != rq)
+		return 0;
+
+	/*
+	 * Okay, this is the barrier request in progress, dry finish it.
+	 */
+	if (error && !q->orderr)
+		q->orderr = error;
+
+	endio = bio->bi_end_io;
+	private = bio->bi_private;
+	bio->bi_end_io = flush_dry_bio_endio;
+	bio->bi_private = q;
+
+	bio_endio(bio, nbytes, error);
+
+	bio->bi_end_io = endio;
+	bio->bi_private = private;
+
+	return 1;
 }
-EXPORT_SYMBOL(blk_complete_barrier_rq_locked);
 
 /**
  * blk_queue_bounce_limit - set bounce buffer limit for queue
@@ -1047,6 +1147,7 @@ static const char * const rq_flags[] = {
 	"REQ_SORTED",
 	"REQ_SOFTBARRIER",
 	"REQ_HARDBARRIER",
+	"REQ_FUA",
 	"REQ_CMD",
 	"REQ_NOMERGE",
 	"REQ_STARTED",
@@ -1066,6 +1167,7 @@ static const char * const rq_flags[] = {
 	"REQ_PM_SUSPEND",
 	"REQ_PM_RESUME",
 	"REQ_PM_SHUTDOWN",
+	"REQ_ORDERED_COLOR",
 };
 
 void blk_dump_rq_flags(struct request *rq, char *msg)
@@ -1643,8 +1745,6 @@ void blk_cleanup_queue(request_queue_t * q)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
-	blk_queue_ordered(q, QUEUE_ORDERED_NONE);
-
 	kmem_cache_free(requestq_cachep, q);
 }
 
@@ -2714,7 +2814,7 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 	spin_lock_prefetch(q->queue_lock);
 
 	barrier = bio_barrier(bio);
-	if (unlikely(barrier) && (q->ordered == QUEUE_ORDERED_NONE)) {
+	if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
 		err = -EOPNOTSUPP;
 		goto end_io;
 	}
@@ -3075,7 +3175,8 @@ static int __end_that_request_first(struct request *req, int uptodate,
 		if (nr_bytes >= bio->bi_size) {
 			req->bio = bio->bi_next;
 			nbytes = bio->bi_size;
-			bio_endio(bio, nbytes, error);
+			if (!ordered_bio_endio(req, bio, nbytes, error))
+				bio_endio(bio, nbytes, error);
 			next_idx = 0;
 			bio_nbytes = 0;
 		} else {
@@ -3130,7 +3231,8 @@ static int __end_that_request_first(struct request *req, int uptodate,
 	 * if the request wasn't completed, update state
 	 */
 	if (bio_nbytes) {
-		bio_endio(bio, bio_nbytes, error);
+		if (!ordered_bio_endio(req, bio, bio_nbytes, error))
+			bio_endio(bio, bio_nbytes, error);
 		bio->bi_idx += next_idx;
 		bio_iovec(bio)->bv_offset += nr_bytes;
 		bio_iovec(bio)->bv_len -= nr_bytes;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a0ce8c585165..15db0f112d0a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -207,6 +207,7 @@ enum rq_flag_bits {
 	__REQ_SORTED,		/* elevator knows about this request */
 	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
 	__REQ_HARDBARRIER,	/* may not be passed by drive either */
+	__REQ_FUA,		/* forced unit access */
 	__REQ_CMD,		/* is a regular fs rw request */
 	__REQ_NOMERGE,		/* don't touch this for merging */
 	__REQ_STARTED,		/* drive already may have started this one */
@@ -230,9 +231,7 @@ enum rq_flag_bits {
 	__REQ_PM_SUSPEND,	/* suspend request */
 	__REQ_PM_RESUME,	/* resume request */
 	__REQ_PM_SHUTDOWN,	/* shutdown request */
-	__REQ_BAR_PREFLUSH,	/* barrier pre-flush done */
-	__REQ_BAR_POSTFLUSH,	/* barrier post-flush */
-	__REQ_BAR_FLUSH,	/* rq is the flush request */
+	__REQ_ORDERED_COLOR,	/* is before or after barrier */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -241,6 +240,7 @@ enum rq_flag_bits {
 #define REQ_SORTED	(1 << __REQ_SORTED)
 #define REQ_SOFTBARRIER	(1 << __REQ_SOFTBARRIER)
 #define REQ_HARDBARRIER	(1 << __REQ_HARDBARRIER)
+#define REQ_FUA		(1 << __REQ_FUA)
 #define REQ_CMD		(1 << __REQ_CMD)
 #define REQ_NOMERGE	(1 << __REQ_NOMERGE)
 #define REQ_STARTED	(1 << __REQ_STARTED)
@@ -260,9 +260,7 @@ enum rq_flag_bits {
 #define REQ_PM_SUSPEND	(1 << __REQ_PM_SUSPEND)
 #define REQ_PM_RESUME	(1 << __REQ_PM_RESUME)
 #define REQ_PM_SHUTDOWN	(1 << __REQ_PM_SHUTDOWN)
-#define REQ_BAR_PREFLUSH	(1 << __REQ_BAR_PREFLUSH)
-#define REQ_BAR_POSTFLUSH	(1 << __REQ_BAR_POSTFLUSH)
-#define REQ_BAR_FLUSH	(1 << __REQ_BAR_FLUSH)
+#define REQ_ORDERED_COLOR	(1 << __REQ_ORDERED_COLOR)
 
 /*
  * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME
@@ -292,8 +290,7 @@ struct bio_vec;
 typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *);
 typedef void (activity_fn) (void *data, int rw);
 typedef int (issue_flush_fn) (request_queue_t *, struct gendisk *, sector_t *);
-typedef int (prepare_flush_fn) (request_queue_t *, struct request *);
-typedef void (end_flush_fn) (request_queue_t *, struct request *);
+typedef void (prepare_flush_fn) (request_queue_t *, struct request *);
 
 enum blk_queue_state {
 	Queue_down,
@@ -335,7 +332,6 @@ struct request_queue
 	activity_fn		*activity_fn;
 	issue_flush_fn		*issue_flush_fn;
 	prepare_flush_fn	*prepare_flush_fn;
-	end_flush_fn		*end_flush_fn;
 
 	/*
 	 * Dispatch queue sorting
@@ -420,14 +416,11 @@ struct request_queue
 	/*
 	 * reserved for flush operations
 	 */
-	struct request		*flush_rq;
-	unsigned char		ordered;
-};
-
-enum {
-	QUEUE_ORDERED_NONE,
-	QUEUE_ORDERED_TAG,
-	QUEUE_ORDERED_FLUSH,
+	unsigned int		ordered, next_ordered, ordseq;
+	int			orderr, ordcolor;
+	struct request		pre_flush_rq, bar_rq, post_flush_rq;
+	struct request		*orig_bar_rq;
+	unsigned int		bi_size;
 };
 
 #define RQ_INACTIVE		(-1)
@@ -445,12 +438,51 @@ enum {
 #define QUEUE_FLAG_REENTER	6	/* Re-entrancy avoidance */
 #define QUEUE_FLAG_PLUGGED	7	/* queue is plugged */
 #define QUEUE_FLAG_ELVSWITCH	8	/* don't use elevator, just do FIFO */
-#define QUEUE_FLAG_FLUSH	9	/* doing barrier flush sequence */
+
+enum {
+	/*
+	 * Hardbarrier is supported with one of the following methods.
+	 *
+	 * NONE		: hardbarrier unsupported
+	 * DRAIN	: ordering by draining is enough
+	 * DRAIN_FLUSH	: ordering by draining w/ pre and post flushes
+	 * DRAIN_FUA	: ordering by draining w/ pre flush and FUA write
+	 * TAG		: ordering by tag is enough
+	 * TAG_FLUSH	: ordering by tag w/ pre and post flushes
+	 * TAG_FUA	: ordering by tag w/ pre flush and FUA write
+	 */
+	QUEUE_ORDERED_NONE	= 0x00,
+	QUEUE_ORDERED_DRAIN	= 0x01,
+	QUEUE_ORDERED_TAG	= 0x02,
+
+	QUEUE_ORDERED_PREFLUSH	= 0x10,
+	QUEUE_ORDERED_POSTFLUSH	= 0x20,
+	QUEUE_ORDERED_FUA	= 0x40,
+
+	QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN |
+			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
+	QUEUE_ORDERED_DRAIN_FUA	= QUEUE_ORDERED_DRAIN |
+			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
+	QUEUE_ORDERED_TAG_FLUSH	= QUEUE_ORDERED_TAG |
+			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
+	QUEUE_ORDERED_TAG_FUA	= QUEUE_ORDERED_TAG |
+			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
+
+	/*
+	 * Ordered operation sequence
+	 */
+	QUEUE_ORDSEQ_STARTED	= 0x01,	/* flushing in progress */
+	QUEUE_ORDSEQ_DRAIN	= 0x02,	/* waiting for the queue to be drained */
+	QUEUE_ORDSEQ_PREFLUSH	= 0x04,	/* pre-flushing in progress */
+	QUEUE_ORDSEQ_BAR	= 0x08,	/* original barrier req in progress */
+	QUEUE_ORDSEQ_POSTFLUSH	= 0x10,	/* post-flushing in progress */
+	QUEUE_ORDSEQ_DONE	= 0x20,
+};
 
 #define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
-#define blk_queue_flushing(q)	test_bit(QUEUE_FLAG_FLUSH, &(q)->queue_flags)
+#define blk_queue_flushing(q)	((q)->ordseq)
 
 #define blk_fs_request(rq)	((rq)->flags & REQ_CMD)
 #define blk_pc_request(rq)	((rq)->flags & REQ_BLOCK_PC)
@@ -466,8 +498,7 @@ enum {
 
 #define blk_sorted_rq(rq)	((rq)->flags & REQ_SORTED)
 #define blk_barrier_rq(rq)	((rq)->flags & REQ_HARDBARRIER)
-#define blk_barrier_preflush(rq)	((rq)->flags & REQ_BAR_PREFLUSH)
-#define blk_barrier_postflush(rq)	((rq)->flags & REQ_BAR_POSTFLUSH)
+#define blk_fua_rq(rq)		((rq)->flags & REQ_FUA)
 
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 
@@ -665,11 +696,12 @@ extern void blk_queue_prep_rq(request_queue_t *, prep_rq_fn *pfn);
 extern void blk_queue_merge_bvec(request_queue_t *, merge_bvec_fn *);
 extern void blk_queue_dma_alignment(request_queue_t *, int);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
-extern void blk_queue_ordered(request_queue_t *, int);
+extern int blk_queue_ordered(request_queue_t *, unsigned, prepare_flush_fn *);
 extern void blk_queue_issue_flush_fn(request_queue_t *, issue_flush_fn *);
-extern struct request *blk_start_pre_flush(request_queue_t *,struct request *);
-extern int blk_complete_barrier_rq(request_queue_t *, struct request *, int);
-extern int blk_complete_barrier_rq_locked(request_queue_t *, struct request *, int);
+extern int blk_do_ordered(request_queue_t *, struct request **);
+extern unsigned blk_ordered_cur_seq(request_queue_t *);
+extern unsigned blk_ordered_req_seq(struct request *);
+extern void blk_ordered_complete_seq(request_queue_t *, unsigned, int);
 
 extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index a74c27e460ba..fb80fa44c4dd 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -130,6 +130,7 @@ extern int elv_try_last_merge(request_queue_t *, struct bio *);
 #define ELEVATOR_INSERT_FRONT	1
 #define ELEVATOR_INSERT_BACK	2
 #define ELEVATOR_INSERT_SORT	3
+#define ELEVATOR_INSERT_REQUEUE	4
 
 /*
  * return values from elevator_may_queue_fn
-- 
cgit v1.2.3-71-gd317


From 9a3dccc42556537a48f39ee9a9e7ab90a933f766 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:56:18 +0100
Subject: [BLOCK] add FUA support to libata

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/scsi/libata-core.c | 31 +++++++++++++++++++++++++------
 drivers/scsi/libata-scsi.c | 32 ++++++++++++++++++++++++++------
 drivers/scsi/libata.h      |  4 +++-
 include/linux/ata.h        |  6 +++++-
 include/linux/libata.h     |  3 ++-
 5 files changed, 61 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 9ea102587914..bdfb0a88cd6f 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -562,16 +562,28 @@ static const u8 ata_rw_cmds[] = {
 	ATA_CMD_WRITE_MULTI,
 	ATA_CMD_READ_MULTI_EXT,
 	ATA_CMD_WRITE_MULTI_EXT,
+	0,
+	0,
+	0,
+	ATA_CMD_WRITE_MULTI_FUA_EXT,
 	/* pio */
 	ATA_CMD_PIO_READ,
 	ATA_CMD_PIO_WRITE,
 	ATA_CMD_PIO_READ_EXT,
 	ATA_CMD_PIO_WRITE_EXT,
+	0,
+	0,
+	0,
+	0,
 	/* dma */
 	ATA_CMD_READ,
 	ATA_CMD_WRITE,
 	ATA_CMD_READ_EXT,
-	ATA_CMD_WRITE_EXT
+	ATA_CMD_WRITE_EXT,
+	0,
+	0,
+	0,
+	ATA_CMD_WRITE_FUA_EXT
 };
 
 /**
@@ -584,25 +596,32 @@ static const u8 ata_rw_cmds[] = {
  *	LOCKING:
  *	caller.
  */
-void ata_rwcmd_protocol(struct ata_queued_cmd *qc)
+int ata_rwcmd_protocol(struct ata_queued_cmd *qc)
 {
 	struct ata_taskfile *tf = &qc->tf;
 	struct ata_device *dev = qc->dev;
+	u8 cmd;
 
-	int index, lba48, write;
+	int index, fua, lba48, write;
  
+	fua = (tf->flags & ATA_TFLAG_FUA) ? 4 : 0;
 	lba48 = (tf->flags & ATA_TFLAG_LBA48) ? 2 : 0;
 	write = (tf->flags & ATA_TFLAG_WRITE) ? 1 : 0;
 
 	if (dev->flags & ATA_DFLAG_PIO) {
 		tf->protocol = ATA_PROT_PIO;
-		index = dev->multi_count ? 0 : 4;
+		index = dev->multi_count ? 0 : 8;
 	} else {
 		tf->protocol = ATA_PROT_DMA;
-		index = 8;
+		index = 16;
 	}
 
-	tf->command = ata_rw_cmds[index + lba48 + write];
+	cmd = ata_rw_cmds[index + fua + lba48 + write];
+	if (cmd) {
+		tf->command = cmd;
+		return 0;
+	}
+	return -1;
 }
 
 static const char * const xfer_mode_str[] = {
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index e0439be4b573..2c644cbb6e9c 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -1080,11 +1080,13 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc, const u8 *scsicm
 	    scsicmd[0] == WRITE_16)
 		tf->flags |= ATA_TFLAG_WRITE;
 
-	/* Calculate the SCSI LBA and transfer length. */
+	/* Calculate the SCSI LBA, transfer length and FUA. */
 	switch (scsicmd[0]) {
 	case READ_10:
 	case WRITE_10:
 		scsi_10_lba_len(scsicmd, &block, &n_block);
+		if (unlikely(scsicmd[1] & (1 << 3)))
+			tf->flags |= ATA_TFLAG_FUA;
 		break;
 	case READ_6:
 	case WRITE_6:
@@ -1099,6 +1101,8 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc, const u8 *scsicm
 	case READ_16:
 	case WRITE_16:
 		scsi_16_lba_len(scsicmd, &block, &n_block);
+		if (unlikely(scsicmd[1] & (1 << 3)))
+			tf->flags |= ATA_TFLAG_FUA;
 		break;
 	default:
 		DPRINTK("no-byte command\n");
@@ -1142,7 +1146,8 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc, const u8 *scsicm
 			tf->device |= (block >> 24) & 0xf;
 		}
 
-		ata_rwcmd_protocol(qc);
+		if (unlikely(ata_rwcmd_protocol(qc) < 0))
+			goto invalid_fld;
 
 		qc->nsect = n_block;
 		tf->nsect = n_block & 0xff;
@@ -1160,7 +1165,8 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc, const u8 *scsicm
 		if ((block >> 28) || (n_block > 256))
 			goto out_of_range;
 
-		ata_rwcmd_protocol(qc);
+		if (unlikely(ata_rwcmd_protocol(qc) < 0))
+			goto invalid_fld;
 
 		/* Convert LBA to CHS */
 		track = (u32)block / dev->sectors;
@@ -1695,6 +1701,7 @@ static unsigned int ata_msense_rw_recovery(u8 **ptr_io, const u8 *last)
 unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf,
 				  unsigned int buflen)
 {
+	struct ata_device *dev = args->dev;
 	u8 *scsicmd = args->cmd->cmnd, *p, *last;
 	const u8 sat_blk_desc[] = {
 		0, 0, 0, 0,	/* number of blocks: sat unspecified */
@@ -1703,6 +1710,7 @@ unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf,
 	};
 	u8 pg, spg;
 	unsigned int ebd, page_control, six_byte, output_len, alloc_len, minlen;
+	u8 dpofua;
 
 	VPRINTK("ENTER\n");
 
@@ -1771,9 +1779,17 @@ unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf,
 
 	if (minlen < 1)
 		return 0;
+
+	dpofua = 0;
+	if (ata_id_has_fua(args->id) && dev->flags & ATA_DFLAG_LBA48 &&
+	    (!(dev->flags & ATA_DFLAG_PIO) || dev->multi_count))
+		dpofua = 1 << 4;
+
 	if (six_byte) {
 		output_len--;
 		rbuf[0] = output_len;
+		if (minlen > 2)
+			rbuf[2] |= dpofua;
 		if (ebd) {
 			if (minlen > 3)
 				rbuf[3] = sizeof(sat_blk_desc);
@@ -1786,6 +1802,8 @@ unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf,
 		rbuf[0] = output_len >> 8;
 		if (minlen > 1)
 			rbuf[1] = output_len;
+		if (minlen > 3)
+			rbuf[3] |= dpofua;
 		if (ebd) {
 			if (minlen > 7)
 				rbuf[7] = sizeof(sat_blk_desc);
@@ -2446,7 +2464,7 @@ int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
 		if (xlat_func)
 			ata_scsi_translate(ap, dev, cmd, done, xlat_func);
 		else
-			ata_scsi_simulate(dev->id, cmd, done);
+			ata_scsi_simulate(ap, dev, cmd, done);
 	} else
 		ata_scsi_translate(ap, dev, cmd, done, atapi_xlat);
 
@@ -2469,14 +2487,16 @@ out_unlock:
  *	spin_lock_irqsave(host_set lock)
  */
 
-void ata_scsi_simulate(u16 *id,
+void ata_scsi_simulate(struct ata_port *ap, struct ata_device *dev,
 		      struct scsi_cmnd *cmd,
 		      void (*done)(struct scsi_cmnd *))
 {
 	struct ata_scsi_args args;
 	const u8 *scsicmd = cmd->cmnd;
 
-	args.id = id;
+	args.ap = ap;
+	args.dev = dev;
+	args.id = dev->id;
 	args.cmd = cmd;
 	args.done = done;
 
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index 251e53bdc6e0..e03ce48b7b4b 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -32,6 +32,8 @@
 #define DRV_VERSION	"1.20"	/* must be exactly four chars */
 
 struct ata_scsi_args {
+	struct ata_port		*ap;
+	struct ata_device	*dev;
 	u16			*id;
 	struct scsi_cmnd	*cmd;
 	void			(*done)(struct scsi_cmnd *);
@@ -41,7 +43,7 @@ struct ata_scsi_args {
 extern int atapi_enabled;
 extern struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap,
 				      struct ata_device *dev);
-extern void ata_rwcmd_protocol(struct ata_queued_cmd *qc);
+extern int ata_rwcmd_protocol(struct ata_queued_cmd *qc);
 extern void ata_qc_free(struct ata_queued_cmd *qc);
 extern int ata_qc_issue(struct ata_queued_cmd *qc);
 extern int ata_check_atapi_dma(struct ata_queued_cmd *qc);
diff --git a/include/linux/ata.h b/include/linux/ata.h
index d2873b732bb1..f63dad4165b1 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -129,6 +129,7 @@ enum {
 	ATA_CMD_READ_EXT	= 0x25,
 	ATA_CMD_WRITE		= 0xCA,
 	ATA_CMD_WRITE_EXT	= 0x35,
+	ATA_CMD_WRITE_FUA_EXT	= 0x3D,
 	ATA_CMD_PIO_READ	= 0x20,
 	ATA_CMD_PIO_READ_EXT	= 0x24,
 	ATA_CMD_PIO_WRITE	= 0x30,
@@ -137,6 +138,7 @@ enum {
 	ATA_CMD_READ_MULTI_EXT	= 0x29,
 	ATA_CMD_WRITE_MULTI	= 0xC5,
 	ATA_CMD_WRITE_MULTI_EXT	= 0x39,
+	ATA_CMD_WRITE_MULTI_FUA_EXT = 0xCE,
 	ATA_CMD_SET_FEATURES	= 0xEF,
 	ATA_CMD_PACKET		= 0xA0,
 	ATA_CMD_VERIFY		= 0x40,
@@ -192,6 +194,7 @@ enum {
 	ATA_TFLAG_DEVICE	= (1 << 2), /* enable r/w to device reg */
 	ATA_TFLAG_WRITE		= (1 << 3), /* data dir: host->dev==1 (write) */
 	ATA_TFLAG_LBA		= (1 << 4), /* enable LBA */
+	ATA_TFLAG_FUA		= (1 << 5), /* enable FUA */
 };
 
 enum ata_tf_protocols {
@@ -245,7 +248,8 @@ struct ata_taskfile {
 #define ata_id_is_sata(id)	((id)[93] == 0)
 #define ata_id_rahead_enabled(id) ((id)[85] & (1 << 6))
 #define ata_id_wcache_enabled(id) ((id)[85] & (1 << 5))
-#define ata_id_has_flush(id) ((id)[83] & (1 << 12))
+#define ata_id_has_fua(id)	((id)[84] & (1 << 6))
+#define ata_id_has_flush(id)	((id)[83] & (1 << 12))
 #define ata_id_has_flush_ext(id) ((id)[83] & (1 << 13))
 #define ata_id_has_lba48(id)	((id)[83] & (1 << 10))
 #define ata_id_has_wcache(id)	((id)[82] & (1 << 5))
diff --git a/include/linux/libata.h b/include/linux/libata.h
index e828e172ccbf..6db2c0845731 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -480,7 +480,8 @@ extern u8   ata_bmdma_status(struct ata_port *ap);
 extern void ata_bmdma_irq_clear(struct ata_port *ap);
 extern void ata_qc_complete(struct ata_queued_cmd *qc);
 extern void ata_eng_timeout(struct ata_port *ap);
-extern void ata_scsi_simulate(u16 *id, struct scsi_cmnd *cmd,
+extern void ata_scsi_simulate(struct ata_port *ap, struct ata_device *dev,
+			      struct scsi_cmnd *cmd,
 			      void (*done)(struct scsi_cmnd *));
 extern int ata_std_bios_param(struct scsi_device *sdev,
 			      struct block_device *bdev,
-- 
cgit v1.2.3-71-gd317


From 15fc858a0067c800f410a24551a7b461978abf0b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Fri, 6 Jan 2006 10:00:50 +0100
Subject: [BLOCK] Correct blk_execute_rq_nowait() prototype

---
 include/linux/blkdev.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 15db0f112d0a..fb0985377421 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -613,8 +613,7 @@ extern int blk_rq_map_user_iov(request_queue_t *, struct request *, struct sg_io
 extern int blk_execute_rq(request_queue_t *, struct gendisk *,
 			  struct request *, int);
 extern void blk_execute_rq_nowait(request_queue_t *, struct gendisk *,
-				  struct request *, int,
-				  void (*done)(struct request *));
+				  struct request *, int, rq_end_io_fn *);
 
 static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
 {
-- 
cgit v1.2.3-71-gd317


From 4b2f0260c74324abca76ccaa42d426af163125e7 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 6 Jan 2006 00:09:47 -0800
Subject: [PATCH] nbd: fix TX/RX race condition

Janos Haar of First NetCenter Bt.  reported numerous crashes involving the
NBD driver.  With his help, this was tracked down to bogus bio vectors
which in turn was the result of a race condition between the
receive/transmit routines in the NBD driver.

The bug manifests itself like this:

CPU0				CPU1
do_nbd_request
	add req to queuelist
	nbd_send_request
		send req head
		for each bio
			kmap
			send
				nbd_read_stat
					nbd_find_request
					nbd_end_request
			kunmap

When CPU1 finishes nbd_end_request, the request and all its associated
bio's are freed.  So when CPU0 calls kunmap whose argument is derived from
the last bio, it may crash.

Under normal circumstances, the race occurs only on the last bio.  However,
if an error is encountered on the remote NBD server (such as an incorrect
magic number in the request), or if there were a bug in the server, it is
possible for the nbd_end_request to occur any time after the request's
addition to the queuelist.

The following patch fixes this problem by making sure that requests are not
added to the queuelist until after they have been completed transmission.

In order for the receiving side to be ready for responses involving
requests still being transmitted, the patch introduces the concept of the
active request.

When a response matches the current active request, its processing is
delayed until after the tranmission has come to a stop.

This has been tested by Janos and it has been successful in curing this
race condition.

From: Herbert Xu <herbert@gondor.apana.org.au>

  Here is an updated patch which removes the active_req wait in
  nbd_clear_queue and the associated memory barrier.

  I've also clarified this in the comment.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Cc: <djani22@dynamicweb.hu>
Cc: Paul Clements <Paul.Clements@SteelEye.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/nbd.c | 122 ++++++++++++++++++++++++++--------------------------
 include/linux/nbd.h |   8 ++++
 2 files changed, 68 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 9e268ddedfbd..d5c8ee7d9815 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -54,11 +54,15 @@
 #include <linux/errno.h>
 #include <linux/file.h>
 #include <linux/ioctl.h>
+#include <linux/compiler.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
 #include <net/sock.h>
 
 #include <linux/devfs_fs_kernel.h>
 
 #include <asm/uaccess.h>
+#include <asm/system.h>
 #include <asm/types.h>
 
 #include <linux/nbd.h>
@@ -230,14 +234,6 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
 	request.len = htonl(size);
 	memcpy(request.handle, &req, sizeof(req));
 
-	down(&lo->tx_lock);
-
-	if (!sock || !lo->sock) {
-		printk(KERN_ERR "%s: Attempted send on closed socket\n",
-				lo->disk->disk_name);
-		goto error_out;
-	}
-
 	dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%luB)\n",
 			lo->disk->disk_name, req,
 			nbdcmd_to_ascii(nbd_cmd(req)),
@@ -276,11 +272,9 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
 			}
 		}
 	}
-	up(&lo->tx_lock);
 	return 0;
 
 error_out:
-	up(&lo->tx_lock);
 	return 1;
 }
 
@@ -289,9 +283,14 @@ static struct request *nbd_find_request(struct nbd_device *lo, char *handle)
 	struct request *req;
 	struct list_head *tmp;
 	struct request *xreq;
+	int err;
 
 	memcpy(&xreq, handle, sizeof(xreq));
 
+	err = wait_event_interruptible(lo->active_wq, lo->active_req != xreq);
+	if (unlikely(err))
+		goto out;
+
 	spin_lock(&lo->queue_lock);
 	list_for_each(tmp, &lo->queue_head) {
 		req = list_entry(tmp, struct request, queuelist);
@@ -302,7 +301,11 @@ static struct request *nbd_find_request(struct nbd_device *lo, char *handle)
 		return req;
 	}
 	spin_unlock(&lo->queue_lock);
-	return NULL;
+
+	err = -ENOENT;
+
+out:
+	return ERR_PTR(err);
 }
 
 static inline int sock_recv_bvec(struct socket *sock, struct bio_vec *bvec)
@@ -331,7 +334,11 @@ static struct request *nbd_read_stat(struct nbd_device *lo)
 		goto harderror;
 	}
 	req = nbd_find_request(lo, reply.handle);
-	if (req == NULL) {
+	if (unlikely(IS_ERR(req))) {
+		result = PTR_ERR(req);
+		if (result != -ENOENT)
+			goto harderror;
+
 		printk(KERN_ERR "%s: Unexpected reply (%p)\n",
 				lo->disk->disk_name, reply.handle);
 		result = -EBADR;
@@ -395,19 +402,24 @@ static void nbd_clear_que(struct nbd_device *lo)
 
 	BUG_ON(lo->magic != LO_MAGIC);
 
-	do {
-		req = NULL;
-		spin_lock(&lo->queue_lock);
-		if (!list_empty(&lo->queue_head)) {
-			req = list_entry(lo->queue_head.next, struct request, queuelist);
-			list_del_init(&req->queuelist);
-		}
-		spin_unlock(&lo->queue_lock);
-		if (req) {
-			req->errors++;
-			nbd_end_request(req);
-		}
-	} while (req);
+	/*
+	 * Because we have set lo->sock to NULL under the tx_lock, all
+	 * modifications to the list must have completed by now.  For
+	 * the same reason, the active_req must be NULL.
+	 *
+	 * As a consequence, we don't need to take the spin lock while
+	 * purging the list here.
+	 */
+	BUG_ON(lo->sock);
+	BUG_ON(lo->active_req);
+
+	while (!list_empty(&lo->queue_head)) {
+		req = list_entry(lo->queue_head.next, struct request,
+				 queuelist);
+		list_del_init(&req->queuelist);
+		req->errors++;
+		nbd_end_request(req);
+	}
 }
 
 /*
@@ -435,11 +447,6 @@ static void do_nbd_request(request_queue_t * q)
 
 		BUG_ON(lo->magic != LO_MAGIC);
 
-		if (!lo->file) {
-			printk(KERN_ERR "%s: Request when not-ready\n",
-					lo->disk->disk_name);
-			goto error_out;
-		}
 		nbd_cmd(req) = NBD_CMD_READ;
 		if (rq_data_dir(req) == WRITE) {
 			nbd_cmd(req) = NBD_CMD_WRITE;
@@ -453,32 +460,34 @@ static void do_nbd_request(request_queue_t * q)
 		req->errors = 0;
 		spin_unlock_irq(q->queue_lock);
 
-		spin_lock(&lo->queue_lock);
-
-		if (!lo->file) {
-			spin_unlock(&lo->queue_lock);
-			printk(KERN_ERR "%s: failed between accept and semaphore, file lost\n",
-					lo->disk->disk_name);
+		down(&lo->tx_lock);
+		if (unlikely(!lo->sock)) {
+			up(&lo->tx_lock);
+			printk(KERN_ERR "%s: Attempted send on closed socket\n",
+			       lo->disk->disk_name);
 			req->errors++;
 			nbd_end_request(req);
 			spin_lock_irq(q->queue_lock);
 			continue;
 		}
 
-		list_add(&req->queuelist, &lo->queue_head);
-		spin_unlock(&lo->queue_lock);
+		lo->active_req = req;
 
 		if (nbd_send_req(lo, req) != 0) {
 			printk(KERN_ERR "%s: Request send failed\n",
 					lo->disk->disk_name);
-			if (nbd_find_request(lo, (char *)&req) != NULL) {
-				/* we still own req */
-				req->errors++;
-				nbd_end_request(req);
-			} else /* we're racing with nbd_clear_que */
-				printk(KERN_DEBUG "nbd: can't find req\n");
+			req->errors++;
+			nbd_end_request(req);
+		} else {
+			spin_lock(&lo->queue_lock);
+			list_add(&req->queuelist, &lo->queue_head);
+			spin_unlock(&lo->queue_lock);
 		}
 
+		lo->active_req = NULL;
+		up(&lo->tx_lock);
+		wake_up_all(&lo->active_wq);
+
 		spin_lock_irq(q->queue_lock);
 		continue;
 
@@ -529,17 +538,10 @@ static int nbd_ioctl(struct inode *inode, struct file *file,
 		down(&lo->tx_lock);
 		lo->sock = NULL;
 		up(&lo->tx_lock);
-		spin_lock(&lo->queue_lock);
 		file = lo->file;
 		lo->file = NULL;
-		spin_unlock(&lo->queue_lock);
 		nbd_clear_que(lo);
-		spin_lock(&lo->queue_lock);
-		if (!list_empty(&lo->queue_head)) {
-			printk(KERN_ERR "nbd: disconnect: some requests are in progress -> please try again.\n");
-			error = -EBUSY;
-		}
-		spin_unlock(&lo->queue_lock);
+		BUG_ON(!list_empty(&lo->queue_head));
 		if (file)
 			fput(file);
 		return error;
@@ -598,24 +600,19 @@ static int nbd_ioctl(struct inode *inode, struct file *file,
 			lo->sock = NULL;
 		}
 		up(&lo->tx_lock);
-		spin_lock(&lo->queue_lock);
 		file = lo->file;
 		lo->file = NULL;
-		spin_unlock(&lo->queue_lock);
 		nbd_clear_que(lo);
 		printk(KERN_WARNING "%s: queue cleared\n", lo->disk->disk_name);
 		if (file)
 			fput(file);
 		return lo->harderror;
 	case NBD_CLEAR_QUE:
-		down(&lo->tx_lock);
-		if (lo->sock) {
-			up(&lo->tx_lock);
-			return 0; /* probably should be error, but that would
-				   * break "nbd-client -d", so just return 0 */
-		}
-		up(&lo->tx_lock);
-		nbd_clear_que(lo);
+		/*
+		 * This is for compatibility only.  The queue is always cleared
+		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
+		 */
+		BUG_ON(!lo->sock && !list_empty(&lo->queue_head));
 		return 0;
 	case NBD_PRINT_DEBUG:
 		printk(KERN_INFO "%s: next = %p, prev = %p, head = %p\n",
@@ -688,6 +685,7 @@ static int __init nbd_init(void)
 		spin_lock_init(&nbd_dev[i].queue_lock);
 		INIT_LIST_HEAD(&nbd_dev[i].queue_head);
 		init_MUTEX(&nbd_dev[i].tx_lock);
+		init_waitqueue_head(&nbd_dev[i].active_wq);
 		nbd_dev[i].blksize = 1024;
 		nbd_dev[i].bytesize = 0x7ffffc00ULL << 10; /* 2TB */
 		disk->major = NBD_MAJOR;
diff --git a/include/linux/nbd.h b/include/linux/nbd.h
index 090e210e98f0..f95d51fae733 100644
--- a/include/linux/nbd.h
+++ b/include/linux/nbd.h
@@ -37,18 +37,26 @@ enum {
 /* userspace doesn't need the nbd_device structure */
 #ifdef __KERNEL__
 
+#include <linux/wait.h>
+
 /* values for flags field */
 #define NBD_READ_ONLY 0x0001
 #define NBD_WRITE_NOCHK 0x0002
 
+struct request;
+
 struct nbd_device {
 	int flags;
 	int harderror;		/* Code of hard error			*/
 	struct socket * sock;
 	struct file * file; 	/* If == NULL, device is not ready, yet	*/
 	int magic;
+
 	spinlock_t queue_lock;
 	struct list_head queue_head;/* Requests are added here...	*/
+	struct request *active_req;
+	wait_queue_head_t active_wq;
+
 	struct semaphore tx_lock;
 	struct gendisk *disk;
 	int blksize;
-- 
cgit v1.2.3-71-gd317


From d7339071f6a8b50101d7ba327926b770f22d5d8b Mon Sep 17 00:00:00 2001
From: Hans Reiser <reiser@namesys.com>
Date: Fri, 6 Jan 2006 00:10:36 -0800
Subject: [PATCH] reiser4: vfs: add truncate_inode_pages_range()

This patch makes truncate_inode_pages_range from truncate_inode_pages.
truncate_inode_pages became a one-liner call to truncate_inode_pages_range.

Reiser4 needs truncate_inode_pages_ranges because it tries to keep
correspondence between existences of metadata pointing to data pages and pages
to which those metadata point to.  So, when metadata of certain part of file
is removed from filesystem tree, only pages of corresponding range are to be
truncated.

(Needed by the madvise(MADV_REMOVE) patch)

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h |  2 ++
 mm/truncate.c      | 44 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 39 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a06a84d347fb..92acae9f1f4c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -896,6 +896,8 @@ extern unsigned long do_brk(unsigned long, unsigned long);
 /* filemap.c */
 extern unsigned long page_unuse(struct page *);
 extern void truncate_inode_pages(struct address_space *, loff_t);
+extern void truncate_inode_pages_range(struct address_space *,
+				       loff_t lstart, loff_t lend);
 
 /* generic vm_area_ops exported for stackable file systems */
 extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
diff --git a/mm/truncate.c b/mm/truncate.c
index 9173ab500604..7dee32745901 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -82,12 +82,15 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
 }
 
 /**
- * truncate_inode_pages - truncate *all* the pages from an offset
+ * truncate_inode_pages - truncate range of pages specified by start and
+ * end byte offsets
  * @mapping: mapping to truncate
  * @lstart: offset from which to truncate
+ * @lend: offset to which to truncate
  *
- * Truncate the page cache at a set offset, removing the pages that are beyond
- * that offset (and zeroing out partial pages).
+ * Truncate the page cache, removing the pages that are between
+ * specified offsets (and zeroing out partial page
+ * (if lstart is not page aligned)).
  *
  * Truncate takes two passes - the first pass is nonblocking.  It will not
  * block on page locks and it will not block on writeback.  The second pass
@@ -101,12 +104,12 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
  * We pass down the cache-hot hint to the page freeing code.  Even if the
  * mapping is large, it is probably the case that the final pages are the most
  * recently touched, and freeing happens in ascending file offset order.
- *
- * Called under (and serialised by) inode->i_sem.
  */
-void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+void truncate_inode_pages_range(struct address_space *mapping,
+				loff_t lstart, loff_t lend)
 {
 	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+	pgoff_t end;
 	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
 	struct pagevec pvec;
 	pgoff_t next;
@@ -115,13 +118,22 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
 	if (mapping->nrpages == 0)
 		return;
 
+	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+	end = (lend >> PAGE_CACHE_SHIFT);
+
 	pagevec_init(&pvec, 0);
 	next = start;
-	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+	while (next <= end &&
+	       pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index = page->index;
 
+			if (page_index > end) {
+				next = page_index;
+				break;
+			}
+
 			if (page_index > next)
 				next = page_index;
 			next++;
@@ -157,9 +169,15 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
 			next = start;
 			continue;
 		}
+		if (pvec.pages[0]->index > end) {
+			pagevec_release(&pvec);
+			break;
+		}
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
+			if (page->index > end)
+				break;
 			lock_page(page);
 			wait_on_page_writeback(page);
 			if (page->index > next)
@@ -171,7 +189,19 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
 		pagevec_release(&pvec);
 	}
 }
+EXPORT_SYMBOL(truncate_inode_pages_range);
 
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
+ * Called under (and serialised by) inode->i_sem.
+ */
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+{
+	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
+}
 EXPORT_SYMBOL(truncate_inode_pages);
 
 /**
-- 
cgit v1.2.3-71-gd317


From f6b3ec238d12c8cc6cc71490c6e3127988460349 Mon Sep 17 00:00:00 2001
From: Badari Pulavarty <pbadari@us.ibm.com>
Date: Fri, 6 Jan 2006 00:10:38 -0800
Subject: [PATCH] madvise(MADV_REMOVE): remove pages from tmpfs shm backing
 store

Here is the patch to implement madvise(MADV_REMOVE) - which frees up a
given range of pages & its associated backing store.  Current
implementation supports only shmfs/tmpfs and other filesystems return
-ENOSYS.

"Some app allocates large tmpfs files, then when some task quits and some
client disconnect, some memory can be released.  However the only way to
release tmpfs-swap is to MADV_REMOVE". - Andrea Arcangeli

Databases want to use this feature to drop a section of their bufferpool
(shared memory segments) - without writing back to disk/swap space.

This feature is also useful for supporting hot-plug memory on UML.

Concerns raised by Andrew Morton:

- "We have no plan for holepunching!  If we _do_ have such a plan (or
  might in the future) then what would the API look like?  I think
  sys_holepunch(fd, start, len), so we should start out with that."

- Using madvise is very weird, because people will ask "why do I need to
  mmap my file before I can stick a hole in it?"

- None of the other madvise operations call into the filesystem in this
  manner.  A broad question is: is this capability an MM operation or a
  filesytem operation?  truncate, for example, is a filesystem operation
  which sometimes has MM side-effects.  madvise is an mm operation and with
  this patch, it gains FS side-effects, only they're really, really
  significant ones."

Comments:

- Andrea suggested the fs operation too but then it's more efficient to
  have it as a mm operation with fs side effects, because they don't
  immediatly know fd and physical offset of the range.  It's possible to
  fixup in userland and to use the fs operation but it's more expensive,
  the vmas are already in the kernel and we can use them.

Short term plan &  Future Direction:

- We seem to need this interface only for shmfs/tmpfs files in the short
  term.  We have to add hooks into the filesystem for correctness and
  completeness.  This is what this patch does.

- In the future, plan is to support both fs and mmap apis also.  This
  also involves (other) filesystem specific functions to be implemented.

- Current patch doesn't support VM_NONLINEAR - which can be addressed in
  the future.

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Andrea Arcangeli <andrea@suse.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/mman.h   |  1 +
 include/asm-arm/mman.h     |  1 +
 include/asm-arm26/mman.h   |  1 +
 include/asm-cris/mman.h    |  1 +
 include/asm-frv/mman.h     |  1 +
 include/asm-h8300/mman.h   |  1 +
 include/asm-i386/mman.h    |  1 +
 include/asm-ia64/mman.h    |  1 +
 include/asm-m32r/mman.h    |  1 +
 include/asm-m68k/mman.h    |  1 +
 include/asm-mips/mman.h    |  1 +
 include/asm-parisc/mman.h  |  1 +
 include/asm-powerpc/mman.h |  1 +
 include/asm-s390/mman.h    |  1 +
 include/asm-sh/mman.h      |  1 +
 include/asm-sparc/mman.h   |  1 +
 include/asm-sparc64/mman.h |  1 +
 include/asm-v850/mman.h    |  1 +
 include/asm-x86_64/mman.h  |  1 +
 include/asm-xtensa/mman.h  |  1 +
 include/linux/fs.h         |  1 +
 include/linux/mm.h         |  1 +
 mm/madvise.c               | 35 +++++++++++++++++++++++++++++++++++
 mm/memory.c                | 25 ++++++++++++++++++++++++-
 mm/shmem.c                 | 32 ++++++++++++++++++++++++--------
 25 files changed, 105 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-alpha/mman.h b/include/asm-alpha/mman.h
index eb9c279045ef..f6439532a262 100644
--- a/include/asm-alpha/mman.h
+++ b/include/asm-alpha/mman.h
@@ -42,6 +42,7 @@
 #define MADV_WILLNEED	3		/* will need these pages */
 #define	MADV_SPACEAVAIL	5		/* ensure resources are available */
 #define MADV_DONTNEED	6		/* don't need these pages */
+#define MADV_REMOVE	7		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-arm/mman.h b/include/asm-arm/mman.h
index 8e4f69c4fa5f..f0bebca2ac21 100644
--- a/include/asm-arm/mman.h
+++ b/include/asm-arm/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-arm26/mman.h b/include/asm-arm26/mman.h
index cc27b8240265..0ed7780541fa 100644
--- a/include/asm-arm26/mman.h
+++ b/include/asm-arm26/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-cris/mman.h b/include/asm-cris/mman.h
index 8570e72b9502..5a382b8bf3f7 100644
--- a/include/asm-cris/mman.h
+++ b/include/asm-cris/mman.h
@@ -37,6 +37,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-frv/mman.h b/include/asm-frv/mman.h
index c684720dfbdd..8af4a41c255e 100644
--- a/include/asm-frv/mman.h
+++ b/include/asm-frv/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-h8300/mman.h b/include/asm-h8300/mman.h
index 63f727a59850..744a8fb485c2 100644
--- a/include/asm-h8300/mman.h
+++ b/include/asm-h8300/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-i386/mman.h b/include/asm-i386/mman.h
index 196619a83854..ba4941e6f643 100644
--- a/include/asm-i386/mman.h
+++ b/include/asm-i386/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-ia64/mman.h b/include/asm-ia64/mman.h
index 1c0a73af1461..828beb24a20e 100644
--- a/include/asm-ia64/mman.h
+++ b/include/asm-ia64/mman.h
@@ -43,6 +43,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-m32r/mman.h b/include/asm-m32r/mman.h
index 011f6d9ec5cc..12e29747bc84 100644
--- a/include/asm-m32r/mman.h
+++ b/include/asm-m32r/mman.h
@@ -37,6 +37,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-m68k/mman.h b/include/asm-m68k/mman.h
index f831c4eeae6e..ea262ab88b3b 100644
--- a/include/asm-m68k/mman.h
+++ b/include/asm-m68k/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-mips/mman.h b/include/asm-mips/mman.h
index 62060957ba93..dd17c8bd62a1 100644
--- a/include/asm-mips/mman.h
+++ b/include/asm-mips/mman.h
@@ -65,6 +65,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON       MAP_ANONYMOUS
diff --git a/include/asm-parisc/mman.h b/include/asm-parisc/mman.h
index e829607eb8bc..736b0abcac05 100644
--- a/include/asm-parisc/mman.h
+++ b/include/asm-parisc/mman.h
@@ -38,6 +38,7 @@
 #define MADV_SPACEAVAIL 5               /* insure that resources are reserved */
 #define MADV_VPS_PURGE  6               /* Purge pages from VM page cache */
 #define MADV_VPS_INHERIT 7              /* Inherit parents page size */
+#define MADV_REMOVE     8		/* remove these pages & resources */
 
 /* The range 12-64 is reserved for page size specification. */
 #define MADV_4K_PAGES   12              /* Use 4K pages  */
diff --git a/include/asm-powerpc/mman.h b/include/asm-powerpc/mman.h
index f5e5342fcac5..a2e34c21b44f 100644
--- a/include/asm-powerpc/mman.h
+++ b/include/asm-powerpc/mman.h
@@ -44,6 +44,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-s390/mman.h b/include/asm-s390/mman.h
index ea86bd12204f..c8d5409b5d56 100644
--- a/include/asm-s390/mman.h
+++ b/include/asm-s390/mman.h
@@ -43,6 +43,7 @@
 #define MADV_SEQUENTIAL        0x2             /* read-ahead aggressively */
 #define MADV_WILLNEED  0x3              /* pre-fault pages */
 #define MADV_DONTNEED  0x4              /* discard these pages */
+#define MADV_REMOVE    0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-sh/mman.h b/include/asm-sh/mman.h
index 3ebab5f79db7..693bd55a3710 100644
--- a/include/asm-sh/mman.h
+++ b/include/asm-sh/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-sparc/mman.h b/include/asm-sparc/mman.h
index 138eb81dd70d..98435ad8619e 100644
--- a/include/asm-sparc/mman.h
+++ b/include/asm-sparc/mman.h
@@ -54,6 +54,7 @@
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
 #define MADV_FREE	0x5		/* (Solaris) contents can be freed */
+#define MADV_REMOVE	0x6		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-sparc64/mman.h b/include/asm-sparc64/mman.h
index 01cecf54357b..cb4b6156194d 100644
--- a/include/asm-sparc64/mman.h
+++ b/include/asm-sparc64/mman.h
@@ -54,6 +54,7 @@
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
 #define MADV_FREE	0x5		/* (Solaris) contents can be freed */
+#define MADV_REMOVE	0x6		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-v850/mman.h b/include/asm-v850/mman.h
index e2b90081b56f..edc79965193a 100644
--- a/include/asm-v850/mman.h
+++ b/include/asm-v850/mman.h
@@ -32,6 +32,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-x86_64/mman.h b/include/asm-x86_64/mman.h
index 78e60a4fd4ee..d0e97b74f735 100644
--- a/include/asm-x86_64/mman.h
+++ b/include/asm-x86_64/mman.h
@@ -36,6 +36,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-xtensa/mman.h b/include/asm-xtensa/mman.h
index 9a95a45df996..082a7504925e 100644
--- a/include/asm-xtensa/mman.h
+++ b/include/asm-xtensa/mman.h
@@ -72,6 +72,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON       MAP_ANONYMOUS
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ed9a41a71e8b..115e72be25d0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1050,6 +1050,7 @@ struct inode_operations {
 	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
+	void (*truncate_range)(struct inode *, loff_t, loff_t);
 };
 
 struct seq_file;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 92acae9f1f4c..6c9be99429f3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -690,6 +690,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 }
 
 extern int vmtruncate(struct inode * inode, loff_t offset);
+extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
 extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
 extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
 extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
diff --git a/mm/madvise.c b/mm/madvise.c
index 2b7cf0400a21..ae0ae3ea299a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -140,6 +140,36 @@ static long madvise_dontneed(struct vm_area_struct * vma,
 	return 0;
 }
 
+/*
+ * Application wants to free up the pages and associated backing store.
+ * This is effectively punching a hole into the middle of a file.
+ *
+ * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
+ * Other filesystems return -ENOSYS.
+ */
+static long madvise_remove(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end)
+{
+	struct address_space *mapping;
+        loff_t offset, endoff;
+
+	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
+		return -EINVAL;
+
+	if (!vma->vm_file || !vma->vm_file->f_mapping
+		|| !vma->vm_file->f_mapping->host) {
+			return -EINVAL;
+	}
+
+	mapping = vma->vm_file->f_mapping;
+
+	offset = (loff_t)(start - vma->vm_start)
+			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+	endoff = (loff_t)(end - vma->vm_start - 1)
+			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+	return  vmtruncate_range(mapping->host, offset, endoff);
+}
+
 static long
 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		unsigned long start, unsigned long end, int behavior)
@@ -152,6 +182,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	case MADV_RANDOM:
 		error = madvise_behavior(vma, prev, start, end, behavior);
 		break;
+	case MADV_REMOVE:
+		error = madvise_remove(vma, start, end);
+		break;
 
 	case MADV_WILLNEED:
 		error = madvise_willneed(vma, prev, start, end);
@@ -190,6 +223,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  *		some pages ahead.
  *  MADV_DONTNEED - the application is finished with the given range,
  *		so the kernel can free resources associated with it.
+ *  MADV_REMOVE - the application wants to free up the given range of
+ *		pages and associated backing store.
  *
  * return values:
  *  zero    - success
diff --git a/mm/memory.c b/mm/memory.c
index d8dde07a3656..e249088908c4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1770,9 +1770,32 @@ out_big:
 out_busy:
 	return -ETXTBSY;
 }
-
 EXPORT_SYMBOL(vmtruncate);
 
+int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
+{
+	struct address_space *mapping = inode->i_mapping;
+
+	/*
+	 * If the underlying filesystem is not going to provide
+	 * a way to truncate a range of blocks (punch a hole) -
+	 * we should return failure right now.
+	 */
+	if (!inode->i_op || !inode->i_op->truncate_range)
+		return -ENOSYS;
+
+	down(&inode->i_sem);
+	down_write(&inode->i_alloc_sem);
+	unmap_mapping_range(mapping, offset, (end - offset), 1);
+	truncate_inode_pages_range(mapping, offset, end);
+	inode->i_op->truncate_range(inode, offset, end);
+	up_write(&inode->i_alloc_sem);
+	up(&inode->i_sem);
+
+	return 0;
+}
+EXPORT_SYMBOL(vmtruncate_range);
+
 /* 
  * Primitive swap readahead code. We simply read an aligned block of
  * (1 << page_cluster) entries in the swap area. This method is chosen
diff --git a/mm/shmem.c b/mm/shmem.c
index d9fc277940da..65c148efa2ed 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -457,7 +457,7 @@ static void shmem_free_pages(struct list_head *next)
 	} while (next);
 }
 
-static void shmem_truncate(struct inode *inode)
+static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	unsigned long idx;
@@ -475,18 +475,27 @@ static void shmem_truncate(struct inode *inode)
 	long nr_swaps_freed = 0;
 	int offset;
 	int freed;
+	int punch_hole = 0;
 
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-	idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (idx >= info->next_index)
 		return;
 
 	spin_lock(&info->lock);
 	info->flags |= SHMEM_TRUNCATE;
-	limit = info->next_index;
-	info->next_index = idx;
+	if (likely(end == (loff_t) -1)) {
+		limit = info->next_index;
+		info->next_index = idx;
+	} else {
+		limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		if (limit > info->next_index)
+			limit = info->next_index;
+		punch_hole = 1;
+	}
+
 	topdir = info->i_indirect;
-	if (topdir && idx <= SHMEM_NR_DIRECT) {
+	if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
 		info->i_indirect = NULL;
 		nr_pages_to_free++;
 		list_add(&topdir->lru, &pages_to_free);
@@ -573,11 +582,12 @@ static void shmem_truncate(struct inode *inode)
 			set_page_private(subdir, page_private(subdir) - freed);
 			if (offset)
 				spin_unlock(&info->lock);
-			BUG_ON(page_private(subdir) > offset);
+			if (!punch_hole)
+				BUG_ON(page_private(subdir) > offset);
 		}
 		if (offset)
 			offset = 0;
-		else if (subdir) {
+		else if (subdir && !page_private(subdir)) {
 			dir[diroff] = NULL;
 			nr_pages_to_free++;
 			list_add(&subdir->lru, &pages_to_free);
@@ -594,7 +604,7 @@ done2:
 		 * Also, though shmem_getpage checks i_size before adding to
 		 * cache, no recheck after: so fix the narrow window there too.
 		 */
-		truncate_inode_pages(inode->i_mapping, inode->i_size);
+		truncate_inode_pages_range(inode->i_mapping, start, end);
 	}
 
 	spin_lock(&info->lock);
@@ -614,6 +624,11 @@ done2:
 	}
 }
 
+static void shmem_truncate(struct inode *inode)
+{
+	shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
+}
+
 static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
@@ -2083,6 +2098,7 @@ static struct file_operations shmem_file_operations = {
 static struct inode_operations shmem_inode_operations = {
 	.truncate	= shmem_truncate,
 	.setattr	= shmem_notify_change,
+	.truncate_range	= shmem_truncate_range,
 };
 
 static struct inode_operations shmem_dir_inode_operations = {
-- 
cgit v1.2.3-71-gd317


From 5da7ca86078964cbfe6c83efc1205904587706fe Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Fri, 6 Jan 2006 00:10:46 -0800
Subject: [PATCH] Add NUMA policy support for huge pages.

The huge_zonelist() function in the memory policy layer provides an list of
zones ordered by NUMA distance.  The hugetlb layer will walk that list looking
for a zone that has available huge pages but is also in the nodeset of the
current cpuset.

This patch does not contain the folding of find_or_alloc_huge_page() that was
controversial in the earlier discussion.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Acked-by: William Lee Irwin III <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hugetlb.h   |  4 ++--
 include/linux/mempolicy.h |  8 ++++++++
 mm/hugetlb.c              | 24 ++++++++++++++----------
 mm/mempolicy.c            | 39 ++++++++++++++++++++++++++++++---------
 4 files changed, 54 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 1056717ee501..68d82ad6b17c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -22,7 +22,7 @@ int hugetlb_report_meminfo(char *);
 int hugetlb_report_node_meminfo(int, char *);
 int is_hugepage_mem_enough(size_t);
 unsigned long hugetlb_total_pages(void);
-struct page *alloc_huge_page(void);
+struct page *alloc_huge_page(struct vm_area_struct *, unsigned long);
 void free_huge_page(struct page *);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
@@ -97,7 +97,7 @@ static inline unsigned long hugetlb_total_pages(void)
 #define is_hugepage_only_range(mm, addr, len)	0
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
 						do { } while (0)
-#define alloc_huge_page()			({ NULL; })
+#define alloc_huge_page(vma, addr)		({ NULL; })
 #define free_huge_page(p)			({ (void)(p); BUG(); })
 #define hugetlb_fault(mm, vma, addr, write)	({ BUG(); 0; })
 
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 8b67cf837ca9..817db6427113 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -156,6 +156,8 @@ extern void numa_default_policy(void);
 extern void numa_policy_init(void);
 extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new);
 extern struct mempolicy default_policy;
+extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
+		unsigned long addr);
 
 #else
 
@@ -232,6 +234,12 @@ static inline void numa_policy_rebind(const nodemask_t *old,
 {
 }
 
+static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
+		unsigned long addr)
+{
+	return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+}
+
 #endif /* CONFIG_NUMA */
 #endif /* __KERNEL__ */
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e93bd63462f0..eb405565949d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,8 @@
 #include <linux/highmem.h>
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
+#include <linux/mempolicy.h>
+
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
@@ -36,11 +38,12 @@ static void enqueue_huge_page(struct page *page)
 	free_huge_pages_node[nid]++;
 }
 
-static struct page *dequeue_huge_page(void)
+static struct page *dequeue_huge_page(struct vm_area_struct *vma,
+				unsigned long address)
 {
 	int nid = numa_node_id();
 	struct page *page = NULL;
-	struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists;
+	struct zonelist *zonelist = huge_zonelist(vma, address);
 	struct zone **z;
 
 	for (z = zonelist->zones; *z; z++) {
@@ -87,13 +90,13 @@ void free_huge_page(struct page *page)
 	spin_unlock(&hugetlb_lock);
 }
 
-struct page *alloc_huge_page(void)
+struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
 	struct page *page;
 	int i;
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page();
+	page = dequeue_huge_page(vma, addr);
 	if (!page) {
 		spin_unlock(&hugetlb_lock);
 		return NULL;
@@ -196,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
 	spin_lock(&hugetlb_lock);
 	try_to_free_low(count);
 	while (count < nr_huge_pages) {
-		struct page *page = dequeue_huge_page();
+		struct page *page = dequeue_huge_page(NULL, 0);
 		if (!page)
 			break;
 		update_and_free_page(page);
@@ -365,8 +368,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	flush_tlb_range(vma, start, end);
 }
 
-static struct page *find_or_alloc_huge_page(struct address_space *mapping,
-				unsigned long idx, int shared)
+static struct page *find_or_alloc_huge_page(struct vm_area_struct *vma,
+			unsigned long addr, struct address_space *mapping,
+			unsigned long idx, int shared)
 {
 	struct page *page;
 	int err;
@@ -378,7 +382,7 @@ retry:
 
 	if (hugetlb_get_quota(mapping))
 		goto out;
-	page = alloc_huge_page();
+	page = alloc_huge_page(vma, addr);
 	if (!page) {
 		hugetlb_put_quota(mapping);
 		goto out;
@@ -418,7 +422,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	page_cache_get(old_page);
-	new_page = alloc_huge_page();
+	new_page = alloc_huge_page(vma, address);
 
 	if (!new_page) {
 		page_cache_release(old_page);
@@ -467,7 +471,7 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Use page lock to guard against racing truncation
 	 * before we get page_table_lock.
 	 */
-	page = find_or_alloc_huge_page(mapping, idx,
+	page = find_or_alloc_huge_page(vma, address, mapping, idx,
 			vma->vm_flags & VM_SHARED);
 	if (!page)
 		goto out;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 72f402cc9c9a..45c51ac63443 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -785,6 +785,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
 	return nid;
 }
 
+/* Determine a node number for interleave */
+static inline unsigned interleave_nid(struct mempolicy *pol,
+		 struct vm_area_struct *vma, unsigned long addr, int shift)
+{
+	if (vma) {
+		unsigned long off;
+
+		off = vma->vm_pgoff;
+		off += (addr - vma->vm_start) >> shift;
+		return offset_il_node(pol, vma, off);
+	} else
+		return interleave_nodes(pol);
+}
+
+/* Return a zonelist suitable for a huge page allocation. */
+struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
+{
+	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+
+	if (pol->policy == MPOL_INTERLEAVE) {
+		unsigned nid;
+
+		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
+		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+	}
+	return zonelist_policy(GFP_HIGHUSER, pol);
+}
+
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */
 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -833,15 +861,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 
 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
 		unsigned nid;
-		if (vma) {
-			unsigned long off;
-			off = vma->vm_pgoff;
-			off += (addr - vma->vm_start) >> PAGE_SHIFT;
-			nid = offset_il_node(pol, vma, off);
-		} else {
-			/* fall back to process interleaving */
-			nid = interleave_nodes(pol);
-		}
+
+		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
 		return alloc_page_interleave(gfp, 0, nid);
 	}
 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
-- 
cgit v1.2.3-71-gd317


From 21abb1478a87e26f5fa71dbcb7cf4264272c2248 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Fri, 6 Jan 2006 00:10:47 -0800
Subject: [PATCH] Remove old node based policy interface from mempolicy.c

mempolicy.c contains provisional interface for huge page allocation based on
node numbers.  This is in use in SLES9 but was never used (AFAIK) in upstream
versions of Linux.

Huge page allocations now use zonelists to figure out where to allocate pages.
 The use of zonelists allows us to find the closest hugepage which was the
consideration of the NUMA distance for huge page allocations.

Remove the obsolete functions.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Acked-by: William Lee Irwin III <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h | 19 -------------------
 mm/mempolicy.c            | 48 -----------------------------------------------
 2 files changed, 67 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 817db6427113..b972f985a3c5 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -109,14 +109,6 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
 
 #define mpol_set_vma_default(vma) ((vma)->vm_policy = NULL)
 
-/*
- * Hugetlb policy. i386 hugetlb so far works with node numbers
- * instead of zone lists, so give it special interfaces for now.
- */
-extern int mpol_first_node(struct vm_area_struct *vma, unsigned long addr);
-extern int mpol_node_valid(int nid, struct vm_area_struct *vma,
-			unsigned long addr);
-
 /*
  * Tree of shared policies for a shared memory region.
  * Maintain the policies in a pseudo mm that contains vmas. The vmas
@@ -184,17 +176,6 @@ static inline struct mempolicy *mpol_copy(struct mempolicy *old)
 	return NULL;
 }
 
-static inline int mpol_first_node(struct vm_area_struct *vma, unsigned long a)
-{
-	return numa_node_id();
-}
-
-static inline int
-mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long a)
-{
-	return 1;
-}
-
 struct shared_policy {};
 
 static inline int mpol_set_shared_policy(struct shared_policy *info,
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 45c51ac63443..96714e2646ad 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -960,54 +960,6 @@ void __mpol_free(struct mempolicy *p)
 	kmem_cache_free(policy_cache, p);
 }
 
-/*
- * Hugetlb policy. Same as above, just works with node numbers instead of
- * zonelists.
- */
-
-/* Find first node suitable for an allocation */
-int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
-{
-	struct mempolicy *pol = get_vma_policy(current, vma, addr);
-
-	switch (pol->policy) {
-	case MPOL_DEFAULT:
-		return numa_node_id();
-	case MPOL_BIND:
-		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
-	case MPOL_INTERLEAVE:
-		return interleave_nodes(pol);
-	case MPOL_PREFERRED:
-		return pol->v.preferred_node >= 0 ?
-				pol->v.preferred_node : numa_node_id();
-	}
-	BUG();
-	return 0;
-}
-
-/* Find secondary valid nodes for an allocation */
-int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
-{
-	struct mempolicy *pol = get_vma_policy(current, vma, addr);
-
-	switch (pol->policy) {
-	case MPOL_PREFERRED:
-	case MPOL_DEFAULT:
-	case MPOL_INTERLEAVE:
-		return 1;
-	case MPOL_BIND: {
-		struct zone **z;
-		for (z = pol->v.zonelist->zones; *z; z++)
-			if ((*z)->zone_pgdat->node_id == nid)
-				return 1;
-		return 0;
-	}
-	default:
-		BUG();
-		return 0;
-	}
-}
-
 /*
  * Shared memory backing store policy support.
  *
-- 
cgit v1.2.3-71-gd317


From 9f3fd602aef96c2a490e3bfd669d06475aeba8d8 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Fri, 6 Jan 2006 00:10:50 -0800
Subject: [PATCH] mm: kvaddr_to_nid not used in common code

kvaddr_to_nid() isn't used in common code nor in i386 code.  Remove these
definitions.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/mmzone.h | 5 -----
 include/linux/mmzone.h    | 5 -----
 2 files changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
index 620a90641ea8..74f595d80579 100644
--- a/include/asm-i386/mmzone.h
+++ b/include/asm-i386/mmzone.h
@@ -76,11 +76,6 @@ static inline int pfn_to_nid(unsigned long pfn)
  * Following are macros that each numa implmentation must define.
  */
 
-/*
- * Given a kernel address, find the home node of the underlying memory.
- */
-#define kvaddr_to_nid(kaddr)	pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
-
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)						\
 ({									\
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9f22090df7dd..3c49f786f90c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -564,11 +564,6 @@ static inline int valid_section_nr(unsigned long nr)
 	return valid_section(__nr_to_section(nr));
 }
 
-/*
- * Given a kernel address, find the home node of the underlying memory.
- */
-#define kvaddr_to_nid(kaddr)	pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
-
 static inline struct mem_section *__pfn_to_section(unsigned long pfn)
 {
 	return __nr_to_section(pfn_to_section_nr(pfn));
-- 
cgit v1.2.3-71-gd317


From d5afa6dcf74c0efb60ce07c63d0a727be93c67c5 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Fri, 6 Jan 2006 00:10:50 -0800
Subject: [PATCH] mm: pfn_to_pgdat not used in common code

pfn_to_pgdat() isn't used in common code.  Remove definition.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3c49f786f90c..28f8496abcb9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -596,11 +596,6 @@ static inline int pfn_valid(unsigned long pfn)
 #define pfn_to_nid		early_pfn_to_nid
 #endif
 
-#define pfn_to_pgdat(pfn)						\
-({									\
-	NODE_DATA(pfn_to_nid(pfn));					\
-})
-
 #define early_pfn_valid(pfn)	pfn_valid(pfn)
 void sparse_init(void);
 #else
-- 
cgit v1.2.3-71-gd317


From a94b3ab7eab4edcc9b2cb474b188f774c331adf7 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <kravetz@us.ibm.com>
Date: Fri, 6 Jan 2006 00:10:51 -0800
Subject: [PATCH] mm: remove arch independent NODES_SPAN_OTHER_NODES

The NODES_SPAN_OTHER_NODES config option was created so that DISCONTIGMEM
could handle pSeries numa layouts.  However, support for DISCONTIGMEM has
been replaced by SPARSEMEM on powerpc.  As a result, this config option and
supporting code is no longer needed.

I have already sent a patch to Paul that removes the option from powerpc
specific code.  This removes the arch independent piece.  Doesn't really
matter which is applied first.

Signed-off-by: Mike Kravetz <kravetz@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 6 ------
 mm/page_alloc.c        | 2 --
 2 files changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 28f8496abcb9..d294b57a4016 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -603,12 +603,6 @@ void sparse_init(void);
 #define sparse_index_init(_sec, _nid)  do {} while (0)
 #endif /* CONFIG_SPARSEMEM */
 
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-#define early_pfn_in_nid(pfn, nid)	(early_pfn_to_nid(pfn) == (nid))
-#else
-#define early_pfn_in_nid(pfn, nid)	(1)
-#endif
-
 #ifndef early_pfn_valid
 #define early_pfn_valid(pfn)	(1)
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1e49dc7cd619..07825c637a58 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1708,8 +1708,6 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 	for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
 		if (!early_pfn_valid(pfn))
 			continue;
-		if (!early_pfn_in_nid(pfn, nid))
-			continue;
 		page = pfn_to_page(pfn);
 		set_page_links(page, zone, nid, pfn);
 		set_page_count(page, 1);
-- 
cgit v1.2.3-71-gd317


From 03b00ebcc804180829d513df9e92e5fe8f72aacf Mon Sep 17 00:00:00 2001
From: Russell King <rmk@arm.linux.org.uk>
Date: Fri, 6 Jan 2006 00:10:52 -0800
Subject: [PATCH] Shut up warnings in ipc/shm.c

Fix two warnings in ipc/shm.c

ipc/shm.c:122: warning: statement with no effect
ipc/shm.c:560: warning: statement with no effect

by converting the macros to empty inline functions.  For safety, let's do
all three.  This also has the advantage that typechecking gets performed
even without CONFIG_SHMEM enabled.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6c9be99429f3..75ec04e2f184 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -634,9 +634,24 @@ struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
 int shmem_lock(struct file *file, int lock, struct user_struct *user);
 #else
 #define shmem_nopage filemap_nopage
-#define shmem_lock(a, b, c) 	({0;})	/* always in memory, no need to lock */
-#define shmem_set_policy(a, b)	(0)
-#define shmem_get_policy(a, b)	(NULL)
+
+static inline int shmem_lock(struct file *file, int lock,
+			     struct user_struct *user)
+{
+	return 0;
+}
+
+static inline int shmem_set_policy(struct vm_area_struct *vma,
+				   struct mempolicy *new)
+{
+	return 0;
+}
+
+static inline struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
+						 unsigned long addr)
+{
+	return NULL;
+}
 #endif
 struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags);
 
-- 
cgit v1.2.3-71-gd317


From 2bdaf115b1c364d89484b59d5b937973f1c5a5c3 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Fri, 6 Jan 2006 00:10:53 -0800
Subject: [PATCH] flatmem split out memory model

There are three places we define pfn_to_nid().  Two in linux/mmzone.h and one
in asm/mmzone.h.  These in essence represent the three memory models.  The
definition in linux/mmzone.h under !NEED_MULTIPLE_NODES is both the FLATMEM
definition and the optimisation for single NUMA nodes; the one under SPARSEMEM
is the NUMA sparsemem one; the one in asm/mmzone.h under DISCONTIGMEM is the
discontigmem one.  This is not in the least bit obvious, particularly the
connection between the non-NUMA optimisations and the memory models.

Two patches:

flatmem-split-out-memory-model: simplifies the selection of pfn_to_nid()
implementations.  The selection is based primarily off the memory model
selected.  Optimisations for non-NUMA are applied where needed.

sparse-provide-pfn_to_nid: implement pfn_to_nid() for SPARSEMEM

This patch:

pfn_to_nid is memory model specific

The pfn_to_nid() call is memory model specific.  It represents the locality
identifier for the memory passed.  Classically this would be a NUMA node,
but not a chunk of memory under DISCONTIGMEM.

The SPARSEMEM and FLATMEM memory model non-NUMA versions of pfn_to_nid()
are folded together under NEED_MULTIPLE_NODES, while DISCONTIGMEM has its
own optimisation.  This is all very confusing.

This patch splits out each implementation of pfn_to_nid() so that we can
see them and the optimisations to each.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d294b57a4016..ee9f7b74e613 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -435,7 +435,6 @@ extern struct pglist_data contig_page_data;
 #define NODE_DATA(nid)		(&contig_page_data)
 #define NODE_MEM_MAP(nid)	mem_map
 #define MAX_NODES_SHIFT		1
-#define pfn_to_nid(pfn)		(0)
 
 #else /* CONFIG_NEED_MULTIPLE_NODES */
 
@@ -470,6 +469,10 @@ extern struct pglist_data contig_page_data;
 #define early_pfn_to_nid(nid)  (0UL)
 #endif
 
+#ifdef CONFIG_FLATMEM
+#define pfn_to_nid(pfn)		(0)
+#endif
+
 #define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT)
 #define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT)
 
@@ -594,6 +597,8 @@ static inline int pfn_valid(unsigned long pfn)
  */
 #ifdef CONFIG_NUMA
 #define pfn_to_nid		early_pfn_to_nid
+#else
+#define pfn_to_nid(pfn)		(0)
 #endif
 
 #define early_pfn_valid(pfn)	pfn_valid(pfn)
-- 
cgit v1.2.3-71-gd317


From 161599ff39a3c3cdea0a1be05ac53accd2c45cdd Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Fri, 6 Jan 2006 00:10:54 -0800
Subject: [PATCH] sparsemem: provide pfn_to_nid

Before SPARSEMEM is initialised we cannot provide an efficient pfn_to_nid()
implmentation; before initialisation is complete we use early_pfn_to_nid()
to provide location information.  Until recently there was no non-init user
of this functionality.  Provide a post init pfn_to_nid() implementation.

Note that this implmentation assumes that the pfn passed has been validated
with pfn_valid().  The current single user of this function already has
this check.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ee9f7b74e613..8cba76c6a28c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -596,7 +596,11 @@ static inline int pfn_valid(unsigned long pfn)
  * this restriction.
  */
 #ifdef CONFIG_NUMA
-#define pfn_to_nid		early_pfn_to_nid
+#define pfn_to_nid(pfn)							\
+({									\
+	unsigned long __pfn_to_nid_pfn = (pfn);				\
+	page_to_nid(pfn_to_page(__pfn_to_nid_pfn));			\
+})
 #else
 #define pfn_to_nid(pfn)		(0)
 #endif
-- 
cgit v1.2.3-71-gd317


From 2d92c5c9150a2a9ca3dc25da58d5042e17a96b6a Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:10:59 -0800
Subject: [PATCH] mm: remove pcp low

struct per_cpu_pages.low is useless.  Remove it.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 1 -
 mm/page_alloc.c        | 9 ++-------
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8cba76c6a28c..0d1a5981bb94 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -46,7 +46,6 @@ struct zone_padding {
 
 struct per_cpu_pages {
 	int count;		/* number of pages in the list */
-	int low;		/* low watermark, refill needed */
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
 	struct list_head list;	/* the list of pages */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 088712f2ac02..7cff958e7813 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -740,7 +740,7 @@ again:
 		page = NULL;
 		pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 		local_irq_save(flags);
-		if (pcp->count <= pcp->low)
+		if (!pcp->count)
 			pcp->count += rmqueue_bulk(zone, 0,
 						pcp->batch, &pcp->list);
 		if (likely(pcp->count)) {
@@ -1345,10 +1345,9 @@ void show_free_areas(void)
 			pageset = zone_pcp(zone, cpu);
 
 			for (temperature = 0; temperature < 2; temperature++)
-				printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
+				printk("cpu %d %s: high %d, batch %d used:%d\n",
 					cpu,
 					temperature ? "cold" : "hot",
-					pageset->pcp[temperature].low,
 					pageset->pcp[temperature].high,
 					pageset->pcp[temperature].batch,
 					pageset->pcp[temperature].count);
@@ -1790,14 +1789,12 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 
 	pcp = &p->pcp[0];		/* hot */
 	pcp->count = 0;
-	pcp->low = 0;
 	pcp->high = 6 * batch;
 	pcp->batch = max(1UL, 1 * batch);
 	INIT_LIST_HEAD(&pcp->list);
 
 	pcp = &p->pcp[1];		/* cold*/
 	pcp->count = 0;
-	pcp->low = 0;
 	pcp->high = 2 * batch;
 	pcp->batch = max(1UL, batch/2);
 	INIT_LIST_HEAD(&pcp->list);
@@ -2193,12 +2190,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 				seq_printf(m,
 					   "\n    cpu: %i pcp: %i"
 					   "\n              count: %i"
-					   "\n              low:   %i"
 					   "\n              high:  %i"
 					   "\n              batch: %i",
 					   i, j,
 					   pageset->pcp[j].count,
-					   pageset->pcp[j].low,
 					   pageset->pcp[j].high,
 					   pageset->pcp[j].batch);
 			}
-- 
cgit v1.2.3-71-gd317


From 008857c1a49ccffc31a54c3ea7e182833bd61304 Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Fri, 6 Jan 2006 00:11:01 -0800
Subject: [PATCH] Cleanup bootmem allocator and fix alloc_bootmem_low

Patch cleans up the alloc_bootmem fix for swiotlb.  Patch removes
alloc_bootmem_*_limit api and fixes alloc_boot_*low api to do the right
thing -- allocate from low32 memory.

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bootmem.h | 46 ++++++++++++----------------------------------
 lib/swiotlb.c           |  3 +--
 mm/bootmem.c            | 38 +++++++++++++++++++++++++++++++-------
 3 files changed, 44 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 3b03b0b868dd..993da8cc9706 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -43,50 +43,38 @@ typedef struct bootmem_data {
 extern unsigned long __init bootmem_bootmap_pages (unsigned long);
 extern unsigned long __init init_bootmem (unsigned long addr, unsigned long memend);
 extern void __init free_bootmem (unsigned long addr, unsigned long size);
-extern void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal, unsigned long limit);
+extern void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal);
+extern void * __init __alloc_bootmem_low(unsigned long size,
+					 unsigned long align,
+					 unsigned long goal);
+extern void * __init __alloc_bootmem_low_node(pg_data_t *pgdat,
+					      unsigned long size,
+					      unsigned long align,
+					      unsigned long goal);
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void __init reserve_bootmem (unsigned long addr, unsigned long size);
 #define alloc_bootmem(x) \
 	__alloc_bootmem((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low(x) \
-	__alloc_bootmem((x), SMP_CACHE_BYTES, 0)
+	__alloc_bootmem_low((x), SMP_CACHE_BYTES, 0)
 #define alloc_bootmem_pages(x) \
 	__alloc_bootmem((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low_pages(x) \
-	__alloc_bootmem((x), PAGE_SIZE, 0)
-
-#define alloc_bootmem_limit(x, limit)						\
-	__alloc_bootmem_limit((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS), (limit))
-#define alloc_bootmem_low_limit(x, limit)			\
-	__alloc_bootmem_limit((x), SMP_CACHE_BYTES, 0, (limit))
-#define alloc_bootmem_pages_limit(x, limit)					\
-	__alloc_bootmem_limit((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS), (limit))
-#define alloc_bootmem_low_pages_limit(x, limit)		\
-	__alloc_bootmem_limit((x), PAGE_SIZE, 0, (limit))
-
+	__alloc_bootmem_low((x), PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 extern unsigned long __init free_all_bootmem (void);
-
+extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal);
 extern unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn);
 extern void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size);
 extern void __init free_bootmem_node (pg_data_t *pgdat, unsigned long addr, unsigned long size);
 extern unsigned long __init free_all_bootmem_node (pg_data_t *pgdat);
-extern void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit);
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 #define alloc_bootmem_node(pgdat, x) \
 	__alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_pages_node(pgdat, x) \
 	__alloc_bootmem_node((pgdat), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low_pages_node(pgdat, x) \
-	__alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0)
-
-#define alloc_bootmem_node_limit(pgdat, x, limit)				\
-	__alloc_bootmem_node_limit((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS), (limit))
-#define alloc_bootmem_pages_node_limit(pgdat, x, limit)				\
-	__alloc_bootmem_node_limit((pgdat), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS), (limit))
-#define alloc_bootmem_low_pages_node_limit(pgdat, x, limit)		\
-	__alloc_bootmem_node_limit((pgdat), (x), PAGE_SIZE, 0, (limit))
-
+	__alloc_bootmem_low_node((pgdat), (x), PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
 #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
@@ -123,15 +111,5 @@ extern void *__init alloc_large_system_hash(const char *tablename,
 #endif
 extern int __initdata hashdist;		/* Distribute hashes across NUMA nodes? */
 
-static inline void *__alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal)
-{
-	return __alloc_bootmem_limit(size, align, goal, 0);
-}
-
-static inline void *__alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align,
-				     unsigned long goal)
-{
-	return __alloc_bootmem_node_limit(pgdat, size, align, goal, 0);
-}
 
 #endif /* _LINUX_BOOTMEM_H */
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 1ff8dcebf7c6..3b482052f403 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -142,8 +142,7 @@ swiotlb_init_with_default_size (size_t default_size)
 	/*
 	 * Get IO TLB memory from the low pages
 	 */
-	io_tlb_start = alloc_bootmem_low_pages_limit(io_tlb_nslabs *
-					     (1 << IO_TLB_SHIFT), 0x100000000);
+	io_tlb_start = alloc_bootmem_low_pages(io_tlb_nslabs * (1 << IO_TLB_SHIFT));
 	if (!io_tlb_start)
 		panic("Cannot allocate SWIOTLB buffer");
 	io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 16b9465eb4eb..cbb82ee14fb5 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -393,15 +393,14 @@ unsigned long __init free_all_bootmem (void)
 	return(free_all_bootmem_core(NODE_DATA(0)));
 }
 
-void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal,
-				unsigned long limit)
+void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
 {
 	pg_data_t *pgdat = pgdat_list;
 	void *ptr;
 
 	for_each_pgdat(pgdat)
 		if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
-						 align, goal, limit)))
+						 align, goal, 0)))
 			return(ptr);
 
 	/*
@@ -413,15 +412,40 @@ void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, un
 }
 
 
-void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align,
-				     unsigned long goal, unsigned long limit)
+void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align,
+				   unsigned long goal)
 {
 	void *ptr;
 
-	ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit);
+	ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
 	if (ptr)
 		return (ptr);
 
-	return __alloc_bootmem_limit(size, align, goal, limit);
+	return __alloc_bootmem(size, align, goal);
 }
 
+#define LOW32LIMIT 0xffffffff
+
+void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
+{
+	pg_data_t *pgdat = pgdat_list;
+	void *ptr;
+
+	for_each_pgdat(pgdat)
+		if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+						 align, goal, LOW32LIMIT)))
+			return(ptr);
+
+	/*
+	 * Whoops, we cannot satisfy the allocation request.
+	 */
+	printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
+	panic("Out of low memory");
+	return NULL;
+}
+
+void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
+				       unsigned long align, unsigned long goal)
+{
+	return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT);
+}
-- 
cgit v1.2.3-71-gd317


From 7756b9e4e321c3c83c7aa5b9532d3e7fd7ddeb4a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 6 Jan 2006 00:11:09 -0800
Subject: [PATCH] kill last zone_reclaim() bits

Remove the last bits of Martin's ill-fated sys_set_zone_reclaim().

Cc: Martin Hicks <mort@wildopensource.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/unistd.h |  2 +-
 include/asm-ia64/unistd.h |  2 +-
 include/linux/swap.h      |  1 -
 mm/vmscan.c               | 80 -----------------------------------------------
 4 files changed, 2 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 0f92e78dfea1..fe38b9a96233 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -256,7 +256,7 @@
 #define __NR_io_submit		248
 #define __NR_io_cancel		249
 #define __NR_fadvise64		250
-#define __NR_set_zone_reclaim	251
+/* 251 is available for reuse (was briefly sys_set_zone_reclaim) */
 #define __NR_exit_group		252
 #define __NR_lookup_dcookie	253
 #define __NR_epoll_create	254
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index 6d96a67439be..2bf543493cb8 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -265,7 +265,7 @@
 #define __NR_keyctl			1273
 #define __NR_ioprio_set			1274
 #define __NR_ioprio_get			1275
-#define __NR_set_zone_reclaim		1276
+/* 1276 is available for reuse (was briefly sys_set_zone_reclaim) */
 #define __NR_inotify_init		1277
 #define __NR_inotify_add_watch		1278
 #define __NR_inotify_rm_watch		1279
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 508668f840b6..bd6641784107 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -172,7 +172,6 @@ extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
 extern int try_to_free_pages(struct zone **, gfp_t);
-extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
 extern int shrink_all_memory(int);
 extern int vm_swappiness;
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 795a050fe471..b2baca7645d7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -74,9 +74,6 @@ struct scan_control {
 
 	int may_writepage;
 
-	/* Can pages be swapped as part of reclaim? */
-	int may_swap;
-
 	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
 	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
 	 * In this context, it doesn't matter that we scan the
@@ -430,8 +427,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		 * Try to allocate it some swap space here.
 		 */
 		if (PageAnon(page) && !PageSwapCache(page)) {
-			if (!sc->may_swap)
-				goto keep_locked;
 			if (!add_to_swap(page))
 				goto activate_locked;
 		}
@@ -952,7 +947,6 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 
 	sc.gfp_mask = gfp_mask;
 	sc.may_writepage = 0;
-	sc.may_swap = 1;
 
 	inc_page_state(allocstall);
 
@@ -1055,7 +1049,6 @@ loop_again:
 	total_reclaimed = 0;
 	sc.gfp_mask = GFP_KERNEL;
 	sc.may_writepage = 0;
-	sc.may_swap = 1;
 	sc.nr_mapped = read_page_state(nr_mapped);
 
 	inc_page_state(pageoutrun);
@@ -1353,76 +1346,3 @@ static int __init kswapd_init(void)
 }
 
 module_init(kswapd_init)
-
-
-/*
- * Try to free up some pages from this zone through reclaim.
- */
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
-{
-	struct scan_control sc;
-	int nr_pages = 1 << order;
-	int total_reclaimed = 0;
-
-	/* The reclaim may sleep, so don't do it if sleep isn't allowed */
-	if (!(gfp_mask & __GFP_WAIT))
-		return 0;
-	if (zone->all_unreclaimable)
-		return 0;
-
-	sc.gfp_mask = gfp_mask;
-	sc.may_writepage = 0;
-	sc.may_swap = 0;
-	sc.nr_mapped = read_page_state(nr_mapped);
-	sc.nr_scanned = 0;
-	sc.nr_reclaimed = 0;
-	/* scan at the highest priority */
-	sc.priority = 0;
-	disable_swap_token();
-
-	if (nr_pages > SWAP_CLUSTER_MAX)
-		sc.swap_cluster_max = nr_pages;
-	else
-		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
-
-	/* Don't reclaim the zone if there are other reclaimers active */
-	if (atomic_read(&zone->reclaim_in_progress) > 0)
-		goto out;
-
-	shrink_zone(zone, &sc);
-	total_reclaimed = sc.nr_reclaimed;
-
- out:
-	return total_reclaimed;
-}
-
-asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
-				     unsigned int state)
-{
-	struct zone *z;
-	int i;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
-
-	if (node >= MAX_NUMNODES || !node_online(node))
-		return -EINVAL;
-
-	/* This will break if we ever add more zones */
-	if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
-		return -EINVAL;
-
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		if (!(zone & 1<<i))
-			continue;
-
-		z = &NODE_DATA(node)->node_zones[i];
-
-		if (state)
-			z->reclaim_pages = 1;
-		else
-			z->reclaim_pages = 0;
-	}
-
-	return 0;
-}
-- 
cgit v1.2.3-71-gd317


From 9328b8faae922e52073785ed6c1eaa8565648a0e Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:11:10 -0800
Subject: [PATCH] mm: dma32 zone statistics

Add dma32 to zone statistics.  Also attempt to arrange struct page_state a
bit better (visually).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h     | 11 +++++++++++
 include/linux/page-flags.h | 38 ++++++++++++++++++++++++--------------
 mm/page_alloc.c            | 14 +++++++++++---
 3 files changed, 46 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0d1a5981bb94..8d6caa414c4c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -397,6 +397,7 @@ static inline int is_normal_idx(int idx)
 {
 	return (idx == ZONE_NORMAL);
 }
+
 /**
  * is_highmem - helper function to quickly check if a struct zone is a 
  *              highmem zone or not.  This is an attempt to keep references
@@ -413,6 +414,16 @@ static inline int is_normal(struct zone *zone)
 	return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
 }
 
+static inline int is_dma32(struct zone *zone)
+{
+	return zone == zone->zone_pgdat->node_zones + ZONE_DMA32;
+}
+
+static inline int is_dma(struct zone *zone)
+{
+	return zone == zone->zone_pgdat->node_zones + ZONE_DMA;
+}
+
 /* These two functions are used to setup the per zone pages min values */
 struct ctl_table;
 struct file;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 343083fec258..32d09c8d952b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -97,32 +97,40 @@ struct page_state {
 	unsigned long pgpgout;		/* Disk writes */
 	unsigned long pswpin;		/* swap reads */
 	unsigned long pswpout;		/* swap writes */
-	unsigned long pgalloc_high;	/* page allocations */
 
+	unsigned long pgalloc_high;	/* page allocations */
 	unsigned long pgalloc_normal;
+	unsigned long pgalloc_dma32;
 	unsigned long pgalloc_dma;
+
 	unsigned long pgfree;		/* page freeings */
 	unsigned long pgactivate;	/* pages moved inactive->active */
 	unsigned long pgdeactivate;	/* pages moved active->inactive */
 
 	unsigned long pgfault;		/* faults (major+minor) */
 	unsigned long pgmajfault;	/* faults (major only) */
+
 	unsigned long pgrefill_high;	/* inspected in refill_inactive_zone */
 	unsigned long pgrefill_normal;
+	unsigned long pgrefill_dma32;
 	unsigned long pgrefill_dma;
 
 	unsigned long pgsteal_high;	/* total highmem pages reclaimed */
 	unsigned long pgsteal_normal;
+	unsigned long pgsteal_dma32;
 	unsigned long pgsteal_dma;
+
 	unsigned long pgscan_kswapd_high;/* total highmem pages scanned */
 	unsigned long pgscan_kswapd_normal;
-
+	unsigned long pgscan_kswapd_dma32;
 	unsigned long pgscan_kswapd_dma;
+
 	unsigned long pgscan_direct_high;/* total highmem pages scanned */
 	unsigned long pgscan_direct_normal;
+	unsigned long pgscan_direct_dma32;
 	unsigned long pgscan_direct_dma;
-	unsigned long pginodesteal;	/* pages reclaimed via inode freeing */
 
+	unsigned long pginodesteal;	/* pages reclaimed via inode freeing */
 	unsigned long slabs_scanned;	/* slab objects scanned */
 	unsigned long kswapd_steal;	/* pages reclaimed by kswapd */
 	unsigned long kswapd_inodesteal;/* reclaimed via kswapd inode freeing */
@@ -150,17 +158,19 @@ extern void __mod_page_state(unsigned long offset, unsigned long delta);
 #define add_page_state(member,delta) mod_page_state(member, (delta))
 #define sub_page_state(member,delta) mod_page_state(member, 0UL - (delta))
 
-#define mod_page_state_zone(zone, member, delta)				\
-	do {									\
-		unsigned offset;						\
-		if (is_highmem(zone))						\
-			offset = offsetof(struct page_state, member##_high);	\
-		else if (is_normal(zone))					\
-			offset = offsetof(struct page_state, member##_normal);	\
-		else								\
-			offset = offsetof(struct page_state, member##_dma);	\
-		__mod_page_state(offset, (delta));				\
-	} while (0)
+#define mod_page_state_zone(zone, member, delta)			\
+ do {									\
+	unsigned offset;						\
+	if (is_highmem(zone))						\
+		offset = offsetof(struct page_state, member##_high);	\
+	else if (is_normal(zone))					\
+		offset = offsetof(struct page_state, member##_normal);	\
+	else if (is_dma32(zone))					\
+		offset = offsetof(struct page_state, member##_dma32);	\
+	else								\
+		offset = offsetof(struct page_state, member##_dma);	\
+	__mod_page_state(offset, (delta));				\
+ } while (0)
 
 /*
  * Manipulation of page state flags
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cdad3249cf7f..e12154d9c4ed 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2277,32 +2277,40 @@ static char *vmstat_text[] = {
 	"pgpgout",
 	"pswpin",
 	"pswpout",
-	"pgalloc_high",
 
+	"pgalloc_high",
 	"pgalloc_normal",
+	"pgalloc_dma32",
 	"pgalloc_dma",
+
 	"pgfree",
 	"pgactivate",
 	"pgdeactivate",
 
 	"pgfault",
 	"pgmajfault",
+
 	"pgrefill_high",
 	"pgrefill_normal",
+	"pgrefill_dma32",
 	"pgrefill_dma",
 
 	"pgsteal_high",
 	"pgsteal_normal",
+	"pgsteal_dma32",
 	"pgsteal_dma",
+
 	"pgscan_kswapd_high",
 	"pgscan_kswapd_normal",
-
+	"pgscan_kswapd_dma32",
 	"pgscan_kswapd_dma",
+
 	"pgscan_direct_high",
 	"pgscan_direct_normal",
+	"pgscan_direct_dma32",
 	"pgscan_direct_dma",
-	"pginodesteal",
 
+	"pginodesteal",
 	"slabs_scanned",
 	"kswapd_steal",
 	"kswapd_inodesteal",
-- 
cgit v1.2.3-71-gd317


From 9617d95e6e9ffd883cf90a89724fe60d7ab22f9a Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:11:12 -0800
Subject: [PATCH] mm: rmap optimisation

Optimise rmap functions by minimising atomic operations when we know there
will be no concurrent modifications.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c            |  2 +-
 include/linux/rmap.h |  1 +
 mm/memory.c          |  6 +++---
 mm/rmap.c            | 49 ++++++++++++++++++++++++++++++++++++++-----------
 4 files changed, 43 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 22533cce0611..e75a9548da8e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -324,7 +324,7 @@ void install_arg_page(struct vm_area_struct *vma,
 	lru_cache_add_active(page);
 	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
 					page, vma->vm_page_prot))));
-	page_add_anon_rmap(page, vma, address);
+	page_add_new_anon_rmap(page, vma, address);
 	pte_unmap_unlock(pte, ptl);
 
 	/* no need for flush_tlb */
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 33261f1d2239..9d6fbeef2104 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -71,6 +71,7 @@ void __anon_vma_link(struct vm_area_struct *);
  * rmap interfaces called when adding or removing pte of page
  */
 void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
 void page_add_file_rmap(struct page *);
 void page_remove_rmap(struct page *);
 
diff --git a/mm/memory.c b/mm/memory.c
index e249088908c4..d7ca7de10f4d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1498,7 +1498,7 @@ gotten:
 		update_mmu_cache(vma, address, entry);
 		lazy_mmu_prot_update(entry);
 		lru_cache_add_active(new_page);
-		page_add_anon_rmap(new_page, vma, address);
+		page_add_new_anon_rmap(new_page, vma, address);
 
 		/* Free the old page.. */
 		new_page = old_page;
@@ -1978,7 +1978,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		inc_mm_counter(mm, anon_rss);
 		lru_cache_add_active(page);
 		SetPageReferenced(page);
-		page_add_anon_rmap(page, vma, address);
+		page_add_new_anon_rmap(page, vma, address);
 	} else {
 		/* Map the ZERO_PAGE - vm_page_prot is readonly */
 		page = ZERO_PAGE(address);
@@ -2109,7 +2109,7 @@ retry:
 		if (anon) {
 			inc_mm_counter(mm, anon_rss);
 			lru_cache_add_active(new_page);
-			page_add_anon_rmap(new_page, vma, address);
+			page_add_new_anon_rmap(new_page, vma, address);
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
diff --git a/mm/rmap.c b/mm/rmap.c
index f853c6def159..4107f64ff749 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -434,6 +434,26 @@ int page_referenced(struct page *page, int is_locked)
 	return referenced;
 }
 
+/**
+ * page_set_anon_rmap - setup new anonymous rmap
+ * @page:	the page to add the mapping to
+ * @vma:	the vm area in which the mapping is added
+ * @address:	the user virtual address mapped
+ */
+static void __page_set_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+
+	BUG_ON(!anon_vma);
+	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+	page->mapping = (struct address_space *) anon_vma;
+
+	page->index = linear_page_index(vma, address);
+
+	inc_page_state(nr_mapped);
+}
+
 /**
  * page_add_anon_rmap - add pte mapping to an anonymous page
  * @page:	the page to add the mapping to
@@ -445,20 +465,27 @@ int page_referenced(struct page *page, int is_locked)
 void page_add_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address)
 {
-	if (atomic_inc_and_test(&page->_mapcount)) {
-		struct anon_vma *anon_vma = vma->anon_vma;
-
-		BUG_ON(!anon_vma);
-		anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
-		page->mapping = (struct address_space *) anon_vma;
-
-		page->index = linear_page_index(vma, address);
-
-		inc_page_state(nr_mapped);
-	}
+	if (atomic_inc_and_test(&page->_mapcount))
+		__page_set_anon_rmap(page, vma, address);
 	/* else checking page index and mapping is racy */
 }
 
+/*
+ * page_add_new_anon_rmap - add pte mapping to a new anonymous page
+ * @page:	the page to add the mapping to
+ * @vma:	the vm area in which the mapping is added
+ * @address:	the user virtual address mapped
+ *
+ * Same as page_add_anon_rmap but must only be called on *new* pages.
+ * This means the inc-and-test can be bypassed.
+ */
+void page_add_new_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address)
+{
+	atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
+	__page_set_anon_rmap(page, vma, address);
+}
+
 /**
  * page_add_file_rmap - add pte mapping to a file page
  * @page: the page to add the mapping to
-- 
cgit v1.2.3-71-gd317


From f3fe65122da05e1cd4c9140340d96ea2f95d0c49 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Fri, 6 Jan 2006 00:11:15 -0800
Subject: [PATCH] mm: add populated_zone() helper

There are numerous places we check whether a zone is populated or not.

Provide a helper function to check for populated zones and convert all
checks for zone->present_pages.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 5 +++++
 mm/page_alloc.c        | 8 ++++----
 mm/vmscan.c            | 8 ++++----
 3 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8d6caa414c4c..c34f4a2c62f8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -388,6 +388,11 @@ static inline struct zone *next_zone(struct zone *zone)
 #define for_each_zone(zone) \
 	for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
 
+static inline int populated_zone(struct zone *zone)
+{
+	return (!!zone->present_pages);
+}
+
 static inline int is_highmem_idx(int idx)
 {
 	return (idx == ZONE_HIGHMEM);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b9fd2c238f13..8f3de5af92dd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1358,7 +1358,7 @@ void show_free_areas(void)
 		show_node(zone);
 		printk("%s per-cpu:", zone->name);
 
-		if (!zone->present_pages) {
+		if (!populated_zone(zone)) {
 			printk(" empty\n");
 			continue;
 		} else
@@ -1435,7 +1435,7 @@ void show_free_areas(void)
 
 		show_node(zone);
 		printk("%s: ", zone->name);
-		if (!zone->present_pages) {
+		if (!populated_zone(zone)) {
 			printk("empty\n");
 			continue;
 		}
@@ -2134,7 +2134,7 @@ static int frag_show(struct seq_file *m, void *arg)
 	int order;
 
 	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
-		if (!zone->present_pages)
+		if (!populated_zone(zone))
 			continue;
 
 		spin_lock_irqsave(&zone->lock, flags);
@@ -2167,7 +2167,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
 		int i;
 
-		if (!zone->present_pages)
+		if (!populated_zone(zone))
 			continue;
 
 		spin_lock_irqsave(&zone->lock, flags);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5c8a412b43f4..7681d8ee04fe 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -897,7 +897,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
-		if (zone->present_pages == 0)
+		if (!populated_zone(zone))
 			continue;
 
 		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
@@ -1069,7 +1069,7 @@ loop_again:
 			for (i = pgdat->nr_zones - 1; i >= 0; i--) {
 				struct zone *zone = pgdat->node_zones + i;
 
-				if (zone->present_pages == 0)
+				if (!populated_zone(zone))
 					continue;
 
 				if (zone->all_unreclaimable &&
@@ -1106,7 +1106,7 @@ scan:
 			struct zone *zone = pgdat->node_zones + i;
 			int nr_slab;
 
-			if (zone->present_pages == 0)
+			if (!populated_zone(zone))
 				continue;
 
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
@@ -1258,7 +1258,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 {
 	pg_data_t *pgdat;
 
-	if (zone->present_pages == 0)
+	if (!populated_zone(zone))
 		return;
 
 	pgdat = zone->zone_pgdat;
-- 
cgit v1.2.3-71-gd317


From 4be38e351c5f455f6f490f5aff29053e33ab4f99 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Fri, 6 Jan 2006 00:11:17 -0800
Subject: [PATCH] mm: move determination of policy_zone into page allocator

Currently the function to build a zonelist for a BIND policy has the side
effect to set the policy_zone.  This seems to be a bit strange.  policy
zone seems to not be initialized elsewhere and therefore 0.  Do we police
ZONE_DMA if no bind policy has been used yet?

This patch moves the determination of the zone to apply policies to into
the page allocator.  We determine the zone while building the zonelist for
nodes.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h | 11 +++++++++++
 mm/mempolicy.c            | 15 +++------------
 mm/page_alloc.c           |  2 ++
 3 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index b972f985a3c5..ed00b278cb93 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -151,6 +151,14 @@ extern struct mempolicy default_policy;
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 		unsigned long addr);
 
+extern int policy_zone;
+
+static inline void check_highest_zone(int k)
+{
+	if (k > policy_zone)
+		policy_zone = k;
+}
+
 #else
 
 struct mempolicy {};
@@ -221,6 +229,9 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 	return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER);
 }
 
+static inline void check_highest_zone(int k)
+{
+}
 #endif /* CONFIG_NUMA */
 #endif /* __KERNEL__ */
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 96714e2646ad..0f1d2b8a952b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,7 +93,7 @@ static kmem_cache_t *sn_cache;
 
 /* Highest zone. An specific allocation for a zone below that is not
    policied. */
-static int policy_zone;
+int policy_zone = ZONE_DMA;
 
 struct mempolicy default_policy = {
 	.refcnt = ATOMIC_INIT(1), /* never free it */
@@ -131,17 +131,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 	if (!zl)
 		return NULL;
 	num = 0;
-	for_each_node_mask(nd, *nodes) {
-		int k;
-		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
-			struct zone *z = &NODE_DATA(nd)->node_zones[k];
-			if (!z->present_pages)
-				continue;
-			zl->zones[num++] = z;
-			if (k > policy_zone)
-				policy_zone = k;
-		}
-	}
+	for_each_node_mask(nd, *nodes)
+		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
 	zl->zones[num] = NULL;
 	return zl;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7adc9526d329..512e3f4d4963 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -36,6 +36,7 @@
 #include <linux/memory_hotplug.h>
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
+#include <linux/mempolicy.h>
 
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -1470,6 +1471,7 @@ static int __init build_zonelists_node(pg_data_t *pgdat,
 			BUG_ON(zone - pgdat->node_zones > ZONE_NORMAL);
 #endif
 			zonelist->zones[j++] = zone;
+			check_highest_zone(k);
 		}
 	}
 	return j;
-- 
cgit v1.2.3-71-gd317


From d3cb487149bd706aa6aeb02042332a450978dc1c Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Fri, 6 Jan 2006 00:11:20 -0800
Subject: [PATCH] atomic_long_t & include/asm-generic/atomic.h V2

Several counters already have the need to use 64 atomic variables on 64 bit
platforms (see mm_counter_t in sched.h).  We have to do ugly ifdefs to fall
back to 32 bit atomic on 32 bit platforms.

The VM statistics patch that I am working on will also make more extensive
use of atomic64.

This patch introduces a new type atomic_long_t by providing definitions in
asm-generic/atomic.h that works similar to the c "long" type.  Its 32 bits
on 32 bit platforms and 64 bits on 64 bit platforms.

Also cleans up the determination of the mm_counter_t in sched.h.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/atomic.h     |   1 +
 include/asm-arm/atomic.h       |   1 +
 include/asm-arm26/atomic.h     |   1 +
 include/asm-cris/atomic.h      |   1 +
 include/asm-frv/atomic.h       |   1 +
 include/asm-generic/atomic.h   | 116 +++++++++++++++++++++++++++++++++++++++++
 include/asm-h8300/atomic.h     |   1 +
 include/asm-i386/atomic.h      |   1 +
 include/asm-ia64/atomic.h      |   1 +
 include/asm-m32r/atomic.h      |   1 +
 include/asm-m68k/atomic.h      |   1 +
 include/asm-m68knommu/atomic.h |   1 +
 include/asm-mips/atomic.h      |   1 +
 include/asm-parisc/atomic.h    |   1 +
 include/asm-powerpc/atomic.h   |   1 +
 include/asm-s390/atomic.h      |   1 +
 include/asm-sh/atomic.h        |   1 +
 include/asm-sh64/atomic.h      |   1 +
 include/asm-sparc/atomic.h     |   1 +
 include/asm-sparc64/atomic.h   |   1 +
 include/asm-v850/atomic.h      |   1 +
 include/asm-x86_64/atomic.h    |   1 +
 include/asm-xtensa/atomic.h    |   1 +
 include/linux/sched.h          |  25 +++------
 24 files changed, 144 insertions(+), 19 deletions(-)
 create mode 100644 include/asm-generic/atomic.h

(limited to 'include/linux')

diff --git a/include/asm-alpha/atomic.h b/include/asm-alpha/atomic.h
index 6183eab006d4..cb03bbe92cdf 100644
--- a/include/asm-alpha/atomic.h
+++ b/include/asm-alpha/atomic.h
@@ -216,4 +216,5 @@ static __inline__ long atomic64_sub_return(long i, atomic64_t * v)
 #define smp_mb__before_atomic_inc()	smp_mb()
 #define smp_mb__after_atomic_inc()	smp_mb()
 
+#include <asm-generic/atomic.h>
 #endif /* _ALPHA_ATOMIC_H */
diff --git a/include/asm-arm/atomic.h b/include/asm-arm/atomic.h
index d586f65c8228..f72b63309bc5 100644
--- a/include/asm-arm/atomic.h
+++ b/include/asm-arm/atomic.h
@@ -205,5 +205,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif
 #endif
diff --git a/include/asm-arm26/atomic.h b/include/asm-arm26/atomic.h
index a47cadc59686..3074b0e76343 100644
--- a/include/asm-arm26/atomic.h
+++ b/include/asm-arm26/atomic.h
@@ -118,5 +118,6 @@ static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif
 #endif
diff --git a/include/asm-cris/atomic.h b/include/asm-cris/atomic.h
index 683b05a57d88..2df2c7aa19b7 100644
--- a/include/asm-cris/atomic.h
+++ b/include/asm-cris/atomic.h
@@ -156,4 +156,5 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 #define smp_mb__before_atomic_inc()    barrier()
 #define smp_mb__after_atomic_inc()     barrier()
 
+#include <asm-generic/atomic.h>
 #endif
diff --git a/include/asm-frv/atomic.h b/include/asm-frv/atomic.h
index f6539ff569c5..3f54fea2b051 100644
--- a/include/asm-frv/atomic.h
+++ b/include/asm-frv/atomic.h
@@ -426,4 +426,5 @@ extern uint32_t __cmpxchg_32(uint32_t *v, uint32_t test, uint32_t new);
 })
 #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
 
+#include <asm-generic/atomic.h>
 #endif /* _ASM_ATOMIC_H */
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
new file mode 100644
index 000000000000..e0a28b925ef0
--- /dev/null
+++ b/include/asm-generic/atomic.h
@@ -0,0 +1,116 @@
+#ifndef _ASM_GENERIC_ATOMIC_H
+#define _ASM_GENERIC_ATOMIC_H
+/*
+ * Copyright (C) 2005 Silicon Graphics, Inc.
+ *	Christoph Lameter <clameter@sgi.com>
+ *
+ * Allows to provide arch independent atomic definitions without the need to
+ * edit all arch specific atomic.h files.
+ */
+
+
+/*
+ * Suppport for atomic_long_t
+ *
+ * Casts for parameters are avoided for existing atomic functions in order to
+ * avoid issues with cast-as-lval under gcc 4.x and other limitations that the
+ * macros of a platform may have.
+ */
+
+#if BITS_PER_LONG == 64
+
+typedef atomic64_t atomic_long_t;
+
+#define ATOMIC_LONG_INIT(i)	ATOMIC64_INIT(i)
+
+static inline long atomic_long_read(atomic_long_t *l)
+{
+	atomic64_t *v = (atomic64_t *)l;
+
+	return (long)atomic64_read(v);
+}
+
+static inline void atomic_long_set(atomic_long_t *l, long i)
+{
+	atomic64_t *v = (atomic64_t *)l;
+
+	atomic_set(v, i);
+}
+
+static inline void atomic_long_inc(atomic_long_t *l)
+{
+	atomic64_t *v = (atomic64_t *)l;
+
+	atomic64_inc(v);
+}
+
+static inline void atomic_long_dec(atomic_long_t *l)
+{
+	atomic64_t *v = (atomic64_t *)l;
+
+	atomic64_dec(v);
+}
+
+static inline void atomic_long_add(long i, atomic_long_t *l)
+{
+	atomic64_t *v = (atomic64_t *)l;
+
+	atomic64_add(i, v);
+}
+
+static inline void atomic_long_sub(long i, atomic_long_t *l)
+{
+	atomic64_t *v = (atomic64_t *)l;
+
+	atomic64_sub(i, v);
+}
+
+#else
+
+typedef atomic_t atomic_long_t;
+
+#define ATOMIC_LONG_INIT(i)	ATOMIC_INIT(i)
+static inline long atomic_long_read(atomic_long_t *l)
+{
+	atomic_t *v = (atomic_t *)l;
+
+	return (long)atomic_read(v);
+}
+
+static inline void atomic_long_set(atomic_long_t *l, long i)
+{
+	atomic_t *v = (atomic_t *)l;
+
+	atomic_set(v, i);
+}
+
+static inline void atomic_long_inc(atomic_long_t *l)
+{
+	atomic_t *v = (atomic_t *)l;
+
+	atomic_inc(v);
+}
+
+static inline void atomic_long_dec(atomic_long_t *l)
+{
+	atomic_t *v = (atomic_t *)l;
+
+	atomic_dec(v);
+}
+
+static inline void atomic_long_add(long i, atomic_long_t *l)
+{
+	atomic_t *v = (atomic_t *)l;
+
+	atomic_add(i, v);
+}
+
+static inline void atomic_long_sub(long i, atomic_long_t *l)
+{
+	atomic_t *v = (atomic_t *)l;
+
+	atomic_sub(i, v);
+}
+
+#endif
+#endif
diff --git a/include/asm-h8300/atomic.h b/include/asm-h8300/atomic.h
index f23d86819ea8..d891541e89c3 100644
--- a/include/asm-h8300/atomic.h
+++ b/include/asm-h8300/atomic.h
@@ -137,4 +137,5 @@ static __inline__ void atomic_set_mask(unsigned long mask, unsigned long *v)
 #define smp_mb__before_atomic_inc()    barrier()
 #define smp_mb__after_atomic_inc() barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* __ARCH_H8300_ATOMIC __ */
diff --git a/include/asm-i386/atomic.h b/include/asm-i386/atomic.h
index c68557aa04b2..7a5472d77091 100644
--- a/include/asm-i386/atomic.h
+++ b/include/asm-i386/atomic.h
@@ -254,4 +254,5 @@ __asm__ __volatile__(LOCK "orl %0,%1" \
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif
diff --git a/include/asm-ia64/atomic.h b/include/asm-ia64/atomic.h
index 2fbebf85c31d..15cf7984c48e 100644
--- a/include/asm-ia64/atomic.h
+++ b/include/asm-ia64/atomic.h
@@ -192,4 +192,5 @@ atomic64_add_negative (__s64 i, atomic64_t *v)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* _ASM_IA64_ATOMIC_H */
diff --git a/include/asm-m32r/atomic.h b/include/asm-m32r/atomic.h
index ef1fb8ea4726..70761278b6cb 100644
--- a/include/asm-m32r/atomic.h
+++ b/include/asm-m32r/atomic.h
@@ -313,4 +313,5 @@ static __inline__ void atomic_set_mask(unsigned long  mask, atomic_t *addr)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif	/* _ASM_M32R_ATOMIC_H */
diff --git a/include/asm-m68k/atomic.h b/include/asm-m68k/atomic.h
index e3c962eeabf3..b8a4e75d679d 100644
--- a/include/asm-m68k/atomic.h
+++ b/include/asm-m68k/atomic.h
@@ -157,4 +157,5 @@ static inline void atomic_set_mask(unsigned long mask, unsigned long *v)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* __ARCH_M68K_ATOMIC __ */
diff --git a/include/asm-m68knommu/atomic.h b/include/asm-m68knommu/atomic.h
index 3c1cc153c415..1702dbe9318c 100644
--- a/include/asm-m68knommu/atomic.h
+++ b/include/asm-m68knommu/atomic.h
@@ -143,4 +143,5 @@ static inline int atomic_sub_return(int i, atomic_t * v)
 #define atomic_dec_return(v) atomic_sub_return(1,(v))
 #define atomic_inc_return(v) atomic_add_return(1,(v))
 
+#include <asm-generic/atomic.h>
 #endif /* __ARCH_M68KNOMMU_ATOMIC __ */
diff --git a/include/asm-mips/atomic.h b/include/asm-mips/atomic.h
index 55c37c106ef0..92256e43a938 100644
--- a/include/asm-mips/atomic.h
+++ b/include/asm-mips/atomic.h
@@ -713,4 +713,5 @@ static __inline__ long atomic64_sub_if_positive(long i, atomic64_t * v)
 #define smp_mb__before_atomic_inc()	smp_mb()
 #define smp_mb__after_atomic_inc()	smp_mb()
 
+#include <asm-generic/atomic.h>
 #endif /* _ASM_ATOMIC_H */
diff --git a/include/asm-parisc/atomic.h b/include/asm-parisc/atomic.h
index 983e9a2b6042..64ebd086c40d 100644
--- a/include/asm-parisc/atomic.h
+++ b/include/asm-parisc/atomic.h
@@ -216,4 +216,5 @@ static __inline__ int atomic_read(const atomic_t *v)
 #define smp_mb__before_atomic_inc()	smp_mb()
 #define smp_mb__after_atomic_inc()	smp_mb()
 
+#include <asm-generic/atomic.h>
 #endif
diff --git a/include/asm-powerpc/atomic.h b/include/asm-powerpc/atomic.h
index ec4b14468959..ae395a0632a6 100644
--- a/include/asm-powerpc/atomic.h
+++ b/include/asm-powerpc/atomic.h
@@ -402,5 +402,6 @@ static __inline__ long atomic64_dec_if_positive(atomic64_t *v)
 
 #endif /* __powerpc64__ */
 
+#include <asm-generic/atomic.h>
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_ATOMIC_H_ */
diff --git a/include/asm-s390/atomic.h b/include/asm-s390/atomic.h
index b3bd4f679f72..6d07c7df4b40 100644
--- a/include/asm-s390/atomic.h
+++ b/include/asm-s390/atomic.h
@@ -215,5 +215,6 @@ atomic_compare_and_swap(int expected_oldval,int new_val,atomic_t *v)
 #define smp_mb__before_atomic_inc()	smp_mb()
 #define smp_mb__after_atomic_inc()	smp_mb()
 
+#include <asm-generic/atomic.h>
 #endif /* __KERNEL__ */
 #endif /* __ARCH_S390_ATOMIC__  */
diff --git a/include/asm-sh/atomic.h b/include/asm-sh/atomic.h
index aabfd334462c..618d8e0de348 100644
--- a/include/asm-sh/atomic.h
+++ b/include/asm-sh/atomic.h
@@ -140,4 +140,5 @@ static __inline__ void atomic_set_mask(unsigned int mask, atomic_t *v)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* __ASM_SH_ATOMIC_H */
diff --git a/include/asm-sh64/atomic.h b/include/asm-sh64/atomic.h
index 927a2bc27b30..f3ce5c0df13a 100644
--- a/include/asm-sh64/atomic.h
+++ b/include/asm-sh64/atomic.h
@@ -152,4 +152,5 @@ static __inline__ void atomic_set_mask(unsigned int mask, atomic_t *v)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* __ASM_SH64_ATOMIC_H */
diff --git a/include/asm-sparc/atomic.h b/include/asm-sparc/atomic.h
index 62bec7ad271c..accb4967e9d2 100644
--- a/include/asm-sparc/atomic.h
+++ b/include/asm-sparc/atomic.h
@@ -159,4 +159,5 @@ static inline int __atomic24_sub(int i, atomic24_t *v)
 
 #endif /* !(__KERNEL__) */
 
+#include <asm-generic/atomic.h>
 #endif /* !(__ARCH_SPARC_ATOMIC__) */
diff --git a/include/asm-sparc64/atomic.h b/include/asm-sparc64/atomic.h
index 3789fe315992..11f5aa5d108c 100644
--- a/include/asm-sparc64/atomic.h
+++ b/include/asm-sparc64/atomic.h
@@ -96,4 +96,5 @@ extern int atomic64_sub_ret(int, atomic64_t *);
 #define smp_mb__after_atomic_inc()	barrier()
 #endif
 
+#include <asm-generic/atomic.h>
 #endif /* !(__ARCH_SPARC64_ATOMIC__) */
diff --git a/include/asm-v850/atomic.h b/include/asm-v850/atomic.h
index bede3172ce7f..f5b9ab6f4e70 100644
--- a/include/asm-v850/atomic.h
+++ b/include/asm-v850/atomic.h
@@ -126,4 +126,5 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* __V850_ATOMIC_H__ */
diff --git a/include/asm-x86_64/atomic.h b/include/asm-x86_64/atomic.h
index 50db9f39274f..72eb071488c7 100644
--- a/include/asm-x86_64/atomic.h
+++ b/include/asm-x86_64/atomic.h
@@ -424,4 +424,5 @@ __asm__ __volatile__(LOCK "orl %0,%1" \
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif
diff --git a/include/asm-xtensa/atomic.h b/include/asm-xtensa/atomic.h
index 3670cc7695da..e2ce06b101ad 100644
--- a/include/asm-xtensa/atomic.h
+++ b/include/asm-xtensa/atomic.h
@@ -286,6 +286,7 @@ static inline void atomic_set_mask(unsigned int mask, atomic_t *v)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* __KERNEL__ */
 
 #endif /* _XTENSA_ATOMIC_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b0ad6f30679e..7da33619d5d0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -254,25 +254,12 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
  * The mm counters are not protected by its page_table_lock,
  * so must be incremented atomically.
  */
-#ifdef ATOMIC64_INIT
-#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member)
-typedef atomic64_t mm_counter_t;
-#else /* !ATOMIC64_INIT */
-/*
- * The counters wrap back to 0 at 2^32 * PAGE_SIZE,
- * that is, at 16TB if using 4kB page size.
- */
-#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member)
-typedef atomic_t mm_counter_t;
-#endif /* !ATOMIC64_INIT */
+#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
+#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
+#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
+#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
+typedef atomic_long_t mm_counter_t;
 
 #else  /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
 /*
-- 
cgit v1.2.3-71-gd317


From a74609fafa2e5cc31d558012abaaa55ec9ad9da4 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:11:20 -0800
Subject: [PATCH] mm: page_state opt

Optimise page_state manipulations by introducing interrupt unsafe accessors
to page_state fields.  Callers must provide their own locking (either
disable interrupts or not update from interrupt context).

Switch over the hot callsites that can easily be moved under interrupts off
sections.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/page-flags.h | 43 ++++++++++++++++------
 mm/page_alloc.c            | 89 ++++++++++++++++++++++++++--------------------
 mm/rmap.c                  | 10 ++++--
 mm/vmscan.c                | 27 +++++++-------
 4 files changed, 104 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 32d09c8d952b..dede8d412dca 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -144,22 +144,33 @@ struct page_state {
 extern void get_page_state(struct page_state *ret);
 extern void get_page_state_node(struct page_state *ret, int node);
 extern void get_full_page_state(struct page_state *ret);
-extern unsigned long __read_page_state(unsigned long offset);
-extern void __mod_page_state(unsigned long offset, unsigned long delta);
+extern unsigned long read_page_state_offset(unsigned long offset);
+extern void mod_page_state_offset(unsigned long offset, unsigned long delta);
+extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
 
 #define read_page_state(member) \
-	__read_page_state(offsetof(struct page_state, member))
+	read_page_state_offset(offsetof(struct page_state, member))
 
 #define mod_page_state(member, delta)	\
-	__mod_page_state(offsetof(struct page_state, member), (delta))
+	mod_page_state_offset(offsetof(struct page_state, member), (delta))
 
-#define inc_page_state(member)	mod_page_state(member, 1UL)
-#define dec_page_state(member)	mod_page_state(member, 0UL - 1)
-#define add_page_state(member,delta) mod_page_state(member, (delta))
-#define sub_page_state(member,delta) mod_page_state(member, 0UL - (delta))
+#define __mod_page_state(member, delta)	\
+	__mod_page_state_offset(offsetof(struct page_state, member), (delta))
 
-#define mod_page_state_zone(zone, member, delta)			\
- do {									\
+#define inc_page_state(member)		mod_page_state(member, 1UL)
+#define dec_page_state(member)		mod_page_state(member, 0UL - 1)
+#define add_page_state(member,delta)	mod_page_state(member, (delta))
+#define sub_page_state(member,delta)	mod_page_state(member, 0UL - (delta))
+
+#define __inc_page_state(member)	__mod_page_state(member, 1UL)
+#define __dec_page_state(member)	__mod_page_state(member, 0UL - 1)
+#define __add_page_state(member,delta)	__mod_page_state(member, (delta))
+#define __sub_page_state(member,delta)	__mod_page_state(member, 0UL - (delta))
+
+#define page_state(member) (*__page_state(offsetof(struct page_state, member)))
+
+#define state_zone_offset(zone, member)					\
+({									\
 	unsigned offset;						\
 	if (is_highmem(zone))						\
 		offset = offsetof(struct page_state, member##_high);	\
@@ -169,7 +180,17 @@ extern void __mod_page_state(unsigned long offset, unsigned long delta);
 		offset = offsetof(struct page_state, member##_dma32);	\
 	else								\
 		offset = offsetof(struct page_state, member##_dma);	\
-	__mod_page_state(offset, (delta));				\
+	offset;								\
+})
+
+#define __mod_page_state_zone(zone, member, delta)			\
+ do {									\
+	__mod_page_state_offset(state_zone_offset(zone, member), (delta)); \
+ } while (0)
+
+#define mod_page_state_zone(zone, member, delta)			\
+ do {									\
+	mod_page_state_offset(state_zone_offset(zone, member), (delta)); \
  } while (0)
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7f580779abdb..fd47494cb989 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -424,9 +424,9 @@ void __free_pages_ok(struct page *page, unsigned int order)
 		return;
 
 	list_add(&page->lru, &list);
-	mod_page_state(pgfree, 1 << order);
 	kernel_map_pages(page, 1<<order, 0);
 	local_irq_save(flags);
+	__mod_page_state(pgfree, 1 << order);
 	free_pages_bulk(page_zone(page), 1, &list, order);
 	local_irq_restore(flags);
 }
@@ -674,18 +674,14 @@ void drain_local_pages(void)
 }
 #endif /* CONFIG_PM */
 
-static void zone_statistics(struct zonelist *zonelist, struct zone *z)
+static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
 {
 #ifdef CONFIG_NUMA
-	unsigned long flags;
-	int cpu;
 	pg_data_t *pg = z->zone_pgdat;
 	pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
 	struct per_cpu_pageset *p;
 
-	local_irq_save(flags);
-	cpu = smp_processor_id();
-	p = zone_pcp(z,cpu);
+	p = zone_pcp(z, cpu);
 	if (pg == orig) {
 		p->numa_hit++;
 	} else {
@@ -696,7 +692,6 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
 		p->local_node++;
 	else
 		p->other_node++;
-	local_irq_restore(flags);
 #endif
 }
 
@@ -716,11 +711,11 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 	if (free_pages_check(page))
 		return;
 
-	inc_page_state(pgfree);
 	kernel_map_pages(page, 1, 0);
 
 	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 	local_irq_save(flags);
+	__inc_page_state(pgfree);
 	list_add(&page->lru, &pcp->list);
 	pcp->count++;
 	if (pcp->count >= pcp->high)
@@ -753,49 +748,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  * or two.
  */
-static struct page *
-buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
+static struct page *buffered_rmqueue(struct zonelist *zonelist,
+			struct zone *zone, int order, gfp_t gfp_flags)
 {
 	unsigned long flags;
 	struct page *page;
 	int cold = !!(gfp_flags & __GFP_COLD);
+	int cpu;
 
 again:
+	cpu  = get_cpu();
 	if (order == 0) {
 		struct per_cpu_pages *pcp;
 
-		page = NULL;
-		pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
+		pcp = &zone_pcp(zone, cpu)->pcp[cold];
 		local_irq_save(flags);
-		if (!pcp->count)
+		if (!pcp->count) {
 			pcp->count += rmqueue_bulk(zone, 0,
 						pcp->batch, &pcp->list);
-		if (likely(pcp->count)) {
-			page = list_entry(pcp->list.next, struct page, lru);
-			list_del(&page->lru);
-			pcp->count--;
+			if (unlikely(!pcp->count))
+				goto failed;
 		}
-		local_irq_restore(flags);
-		put_cpu();
+		page = list_entry(pcp->list.next, struct page, lru);
+		list_del(&page->lru);
+		pcp->count--;
 	} else {
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order);
-		spin_unlock_irqrestore(&zone->lock, flags);
+		spin_unlock(&zone->lock);
+		if (!page)
+			goto failed;
 	}
 
-	if (page != NULL) {
-		BUG_ON(bad_range(zone, page));
-		mod_page_state_zone(zone, pgalloc, 1 << order);
-		if (prep_new_page(page, order))
-			goto again;
+	__mod_page_state_zone(zone, pgalloc, 1 << order);
+	zone_statistics(zonelist, zone, cpu);
+	local_irq_restore(flags);
+	put_cpu();
 
-		if (gfp_flags & __GFP_ZERO)
-			prep_zero_page(page, order, gfp_flags);
+	BUG_ON(bad_range(zone, page));
+	if (prep_new_page(page, order))
+		goto again;
 
-		if (order && (gfp_flags & __GFP_COMP))
-			prep_compound_page(page, order);
-	}
+	if (gfp_flags & __GFP_ZERO)
+		prep_zero_page(page, order, gfp_flags);
+
+	if (order && (gfp_flags & __GFP_COMP))
+		prep_compound_page(page, order);
 	return page;
+
+failed:
+	local_irq_restore(flags);
+	put_cpu();
+	return NULL;
 }
 
 #define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
@@ -871,9 +875,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 				continue;
 		}
 
-		page = buffered_rmqueue(*z, order, gfp_mask);
+		page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
 		if (page) {
-			zone_statistics(zonelist, *z);
 			break;
 		}
 	} while (*(++z) != NULL);
@@ -1248,7 +1251,7 @@ void get_full_page_state(struct page_state *ret)
 	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
 }
 
-unsigned long __read_page_state(unsigned long offset)
+unsigned long read_page_state_offset(unsigned long offset)
 {
 	unsigned long ret = 0;
 	int cpu;
@@ -1262,18 +1265,26 @@ unsigned long __read_page_state(unsigned long offset)
 	return ret;
 }
 
-void __mod_page_state(unsigned long offset, unsigned long delta)
+void __mod_page_state_offset(unsigned long offset, unsigned long delta)
+{
+	void *ptr;
+
+	ptr = &__get_cpu_var(page_states);
+	*(unsigned long *)(ptr + offset) += delta;
+}
+EXPORT_SYMBOL(__mod_page_state_offset);
+
+void mod_page_state_offset(unsigned long offset, unsigned long delta)
 {
 	unsigned long flags;
-	void* ptr;
+	void *ptr;
 
 	local_irq_save(flags);
 	ptr = &__get_cpu_var(page_states);
-	*(unsigned long*)(ptr + offset) += delta;
+	*(unsigned long *)(ptr + offset) += delta;
 	local_irq_restore(flags);
 }
-
-EXPORT_SYMBOL(__mod_page_state);
+EXPORT_SYMBOL(mod_page_state_offset);
 
 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
 			unsigned long *free, struct pglist_data *pgdat)
diff --git a/mm/rmap.c b/mm/rmap.c
index 4107f64ff749..6f3f7db27128 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -451,7 +451,11 @@ static void __page_set_anon_rmap(struct page *page,
 
 	page->index = linear_page_index(vma, address);
 
-	inc_page_state(nr_mapped);
+	/*
+	 * nr_mapped state can be updated without turning off
+	 * interrupts because it is not modified via interrupt.
+	 */
+	__inc_page_state(nr_mapped);
 }
 
 /**
@@ -498,7 +502,7 @@ void page_add_file_rmap(struct page *page)
 	BUG_ON(!pfn_valid(page_to_pfn(page)));
 
 	if (atomic_inc_and_test(&page->_mapcount))
-		inc_page_state(nr_mapped);
+		__inc_page_state(nr_mapped);
 }
 
 /**
@@ -522,7 +526,7 @@ void page_remove_rmap(struct page *page)
 		 */
 		if (page_test_and_clear_dirty(page))
 			set_page_dirty(page);
-		dec_page_state(nr_mapped);
+		__dec_page_state(nr_mapped);
 	}
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7681d8ee04fe..be8235fb1939 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -645,16 +645,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
 			goto done;
 
 		max_scan -= nr_scan;
-		if (current_is_kswapd())
-			mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
-		else
-			mod_page_state_zone(zone, pgscan_direct, nr_scan);
 		nr_freed = shrink_list(&page_list, sc);
-		if (current_is_kswapd())
-			mod_page_state(kswapd_steal, nr_freed);
-		mod_page_state_zone(zone, pgsteal, nr_freed);
 
-		spin_lock_irq(&zone->lru_lock);
+		local_irq_disable();
+		if (current_is_kswapd()) {
+			__mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
+			__mod_page_state(kswapd_steal, nr_freed);
+		} else
+			__mod_page_state_zone(zone, pgscan_direct, nr_scan);
+		__mod_page_state_zone(zone, pgsteal, nr_freed);
+
+		spin_lock(&zone->lru_lock);
 		/*
 		 * Put back any unfreeable pages.
 		 */
@@ -816,11 +817,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 		}
 	}
 	zone->nr_active += pgmoved;
-	spin_unlock_irq(&zone->lru_lock);
-	pagevec_release(&pvec);
+	spin_unlock(&zone->lru_lock);
+
+	__mod_page_state_zone(zone, pgrefill, pgscanned);
+	__mod_page_state(pgdeactivate, pgdeactivate);
+	local_irq_enable();
 
-	mod_page_state_zone(zone, pgrefill, pgscanned);
-	mod_page_state(pgdeactivate, pgdeactivate);
+	pagevec_release(&pvec);
 }
 
 /*
-- 
cgit v1.2.3-71-gd317


From b09eb1c06a14641209e6b86e9a5b28ea8287f193 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:11:21 -0800
Subject: [PATCH] mm: page_state opt docs

Comment the new locking rules for page_state statistics.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/page-flags.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index dede8d412dca..d52999c43336 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -79,13 +79,23 @@
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
  * allowed.
+ *
+ * - Fields can be modified with xxx_page_state and xxx_page_state_zone at
+ * any time safely (which protects the instance from modification by
+ * interrupt.
+ * - The __xxx_page_state variants can be used safely when interrupts are
+ * disabled.
+ * - The __xxx_page_state variants can be used if the field is only
+ * modified from process context, or only modified from interrupt context.
+ * In this case, the field should be commented here.
  */
 struct page_state {
 	unsigned long nr_dirty;		/* Dirty writeable pages */
 	unsigned long nr_writeback;	/* Pages under writeback */
 	unsigned long nr_unstable;	/* NFS unstable pages */
 	unsigned long nr_page_table_pages;/* Pages used for pagetables */
-	unsigned long nr_mapped;	/* mapped into pagetables */
+	unsigned long nr_mapped;	/* mapped into pagetables.
+					 * only modified from process context */
 	unsigned long nr_slab;		/* In slab */
 #define GET_PAGE_STATE_LAST nr_slab
 
-- 
cgit v1.2.3-71-gd317


From 8d9067bda99c68e1a17d93e78cf3a5a3f67e0c35 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Jan 2006 00:11:24 -0800
Subject: [PATCH] Keys: Remove key duplication

Remove the key duplication stuff since there's nothing that uses it, no way
to get at it and it's awkward to deal with for LSM purposes.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/keys.txt       | 18 -------------
 include/keys/user-type.h     |  1 -
 include/linux/key.h          |  8 ------
 security/keys/key.c          | 56 +++-----------------------------------
 security/keys/keyring.c      | 64 --------------------------------------------
 security/keys/user_defined.c | 33 -----------------------
 6 files changed, 3 insertions(+), 177 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 31154882000a..6304db59bfe4 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -860,24 +860,6 @@ The structure has a number of fields, some of which are mandatory:
      It is safe to sleep in this method.
 
 
- (*) int (*duplicate)(struct key *key, const struct key *source);
-
-     If this type of key can be duplicated, then this method should be
-     provided. It is called to copy the payload attached to the source into the
-     new key. The data length on the new key will have been updated and the
-     quota adjusted already.
-
-     This method will be called with the source key's semaphore read-locked to
-     prevent its payload from being changed, thus RCU constraints need not be
-     applied to the source key.
-
-     This method does not have to lock the destination key in order to attach a
-     payload. The fact that KEY_FLAG_INSTANTIATED is not set in key->flags
-     prevents anything else from gaining access to the key.
-
-     It is safe to sleep in this method.
-
-
  (*) int (*update)(struct key *key, const void *data, size_t datalen);
 
      If this type of key can be updated, then this method should be provided.
diff --git a/include/keys/user-type.h b/include/keys/user-type.h
index 26f6ec38577a..a3dae1803f45 100644
--- a/include/keys/user-type.h
+++ b/include/keys/user-type.h
@@ -35,7 +35,6 @@ struct user_key_payload {
 extern struct key_type key_type_user;
 
 extern int user_instantiate(struct key *key, const void *data, size_t datalen);
-extern int user_duplicate(struct key *key, const struct key *source);
 extern int user_update(struct key *key, const void *data, size_t datalen);
 extern int user_match(const struct key *key, const void *criterion);
 extern void user_destroy(struct key *key);
diff --git a/include/linux/key.h b/include/linux/key.h
index 53513a3be53b..4d189e51bc6c 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -193,14 +193,6 @@ struct key_type {
 	 */
 	int (*instantiate)(struct key *key, const void *data, size_t datalen);
 
-	/* duplicate a key of this type (optional)
-	 * - the source key will be locked against change
-	 * - the new description will be attached
-	 * - the quota will have been adjusted automatically from
-	 *   source->quotalen
-	 */
-	int (*duplicate)(struct key *key, const struct key *source);
-
 	/* update a key of this type (optional)
 	 * - this method should call key_payload_reserve() to recalculate the
 	 *   quota consumption
diff --git a/security/keys/key.c b/security/keys/key.c
index 01bcfecb7eae..bb036623d0a8 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -240,9 +240,9 @@ static inline void key_alloc_serial(struct key *key)
 /*
  * allocate a key of the specified type
  * - update the user's quota to reflect the existence of the key
- * - called from a key-type operation with key_types_sem read-locked by either
- *   key_create_or_update() or by key_duplicate(); this prevents unregistration
- *   of the key type
+ * - called from a key-type operation with key_types_sem read-locked by
+ *   key_create_or_update()
+ *   - this prevents unregistration of the key type
  * - upon return the key is as yet uninstantiated; the caller needs to either
  *   instantiate the key or discard it before returning
  */
@@ -887,56 +887,6 @@ int key_update(key_ref_t key_ref, const void *payload, size_t plen)
 
 EXPORT_SYMBOL(key_update);
 
-/*****************************************************************************/
-/*
- * duplicate a key, potentially with a revised description
- * - must be supported by the keytype (keyrings for instance can be duplicated)
- */
-struct key *key_duplicate(struct key *source, const char *desc)
-{
-	struct key *key;
-	int ret;
-
-	key_check(source);
-
-	if (!desc)
-		desc = source->description;
-
-	down_read(&key_types_sem);
-
-	ret = -EINVAL;
-	if (!source->type->duplicate)
-		goto error;
-
-	/* allocate and instantiate a key */
-	key = key_alloc(source->type, desc, current->fsuid, current->fsgid,
-			source->perm, 0);
-	if (IS_ERR(key))
-		goto error_k;
-
-	down_read(&source->sem);
-	ret = key->type->duplicate(key, source);
-	up_read(&source->sem);
-	if (ret < 0)
-		goto error2;
-
-	atomic_inc(&key->user->nikeys);
-	set_bit(KEY_FLAG_INSTANTIATED, &key->flags);
-
- error_k:
-	up_read(&key_types_sem);
- out:
-	return key;
-
- error2:
-	key_put(key);
- error:
-	up_read(&key_types_sem);
-	key = ERR_PTR(ret);
-	goto out;
-
-} /* end key_duplicate() */
-
 /*****************************************************************************/
 /*
  * revoke a key
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 4e9fa8be44b8..0acecbd4fa37 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -48,7 +48,6 @@ static inline unsigned keyring_hash(const char *desc)
  */
 static int keyring_instantiate(struct key *keyring,
 			       const void *data, size_t datalen);
-static int keyring_duplicate(struct key *keyring, const struct key *source);
 static int keyring_match(const struct key *keyring, const void *criterion);
 static void keyring_destroy(struct key *keyring);
 static void keyring_describe(const struct key *keyring, struct seq_file *m);
@@ -59,7 +58,6 @@ struct key_type key_type_keyring = {
 	.name		= "keyring",
 	.def_datalen	= sizeof(struct keyring_list),
 	.instantiate	= keyring_instantiate,
-	.duplicate	= keyring_duplicate,
 	.match		= keyring_match,
 	.destroy	= keyring_destroy,
 	.describe	= keyring_describe,
@@ -118,68 +116,6 @@ static int keyring_instantiate(struct key *keyring,
 
 } /* end keyring_instantiate() */
 
-/*****************************************************************************/
-/*
- * duplicate the list of subscribed keys from a source keyring into this one
- */
-static int keyring_duplicate(struct key *keyring, const struct key *source)
-{
-	struct keyring_list *sklist, *klist;
-	unsigned max;
-	size_t size;
-	int loop, ret;
-
-	const unsigned limit =
-		(PAGE_SIZE - sizeof(*klist)) / sizeof(struct key *);
-
-	ret = 0;
-
-	/* find out how many keys are currently linked */
-	rcu_read_lock();
-	sklist = rcu_dereference(source->payload.subscriptions);
-	max = 0;
-	if (sklist)
-		max = sklist->nkeys;
-	rcu_read_unlock();
-
-	/* allocate a new payload and stuff load with key links */
-	if (max > 0) {
-		BUG_ON(max > limit);
-
-		max = (max + 3) & ~3;
-		if (max > limit)
-			max = limit;
-
-		ret = -ENOMEM;
-		size = sizeof(*klist) + sizeof(struct key *) * max;
-		klist = kmalloc(size, GFP_KERNEL);
-		if (!klist)
-			goto error;
-
-		/* set links */
-		rcu_read_lock();
-		sklist = rcu_dereference(source->payload.subscriptions);
-
-		klist->maxkeys = max;
-		klist->nkeys = sklist->nkeys;
-		memcpy(klist->keys,
-		       sklist->keys,
-		       sklist->nkeys * sizeof(struct key *));
-
-		for (loop = klist->nkeys - 1; loop >= 0; loop--)
-			atomic_inc(&klist->keys[loop]->usage);
-
-		rcu_read_unlock();
-
-		rcu_assign_pointer(keyring->payload.subscriptions, klist);
-		ret = 0;
-	}
-
- error:
-	return ret;
-
-} /* end keyring_duplicate() */
-
 /*****************************************************************************/
 /*
  * match keyrings on their name
diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c
index cbda3b2780a1..8e71895b97a7 100644
--- a/security/keys/user_defined.c
+++ b/security/keys/user_defined.c
@@ -26,7 +26,6 @@
 struct key_type key_type_user = {
 	.name		= "user",
 	.instantiate	= user_instantiate,
-	.duplicate	= user_duplicate,
 	.update		= user_update,
 	.match		= user_match,
 	.destroy	= user_destroy,
@@ -68,40 +67,8 @@ error:
 	return ret;
 
 } /* end user_instantiate() */
-
 EXPORT_SYMBOL_GPL(user_instantiate);
 
-/*****************************************************************************/
-/*
- * duplicate a user defined key
- * - both keys' semaphores are locked against further modification
- * - the new key cannot yet be accessed
- */
-int user_duplicate(struct key *key, const struct key *source)
-{
-	struct user_key_payload *upayload, *spayload;
-	int ret;
-
-	/* just copy the payload */
-	ret = -ENOMEM;
-	upayload = kmalloc(sizeof(*upayload) + source->datalen, GFP_KERNEL);
-	if (upayload) {
-		spayload = rcu_dereference(source->payload.data);
-		BUG_ON(source->datalen != spayload->datalen);
-
-		upayload->datalen = key->datalen = spayload->datalen;
-		memcpy(upayload->data, spayload->data, key->datalen);
-
-		key->payload.data = upayload;
-		ret = 0;
-	}
-
-	return ret;
-
-} /* end user_duplicate() */
-
-EXPORT_SYMBOL_GPL(user_duplicate);
-
 /*****************************************************************************/
 /*
  * dispose of the old data from an updated user defined key
-- 
cgit v1.2.3-71-gd317


From 642fb4d1f1dd2417aa69189fe5ceb81e4fb72900 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Jan 2006 00:11:41 -0800
Subject: [PATCH] NOMMU: Provide shared-writable mmap support on ramfs

The attached patch makes ramfs support shared-writable mmaps by:

 (1) Attempting to perform a contiguous block allocation to the requested size
     when truncate attempts to increase the file from zero size, such as
     happens when:

	fd = shm_open("/file/on/ramfs", ...):
	ftruncate(fd, size_requested);
	addr = mmap(NULL, subsize, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_SHARED,
		    fd, offset);

 (2) Permitting any shared-writable mapping over any contiguous set of extant
     pages. get_unmapped_area() will return the address into the actual ramfs
     pages. The mapping may start anywhere and be of any size, but may not go
     over the end of file. Multiple mappings may overlap in any way.

 (3) Not permitting a file to be shrunk if it would truncate any shared
     mappings (private mappings are copied).

Thus this patch provides support for POSIX shared memory on NOMMU kernels,
with certain limitations such as there being a large enough block of pages
available to support the allocation and it only working on directly mappable
filesystems.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ramfs/Makefile     |   4 +-
 fs/ramfs/file-mmu.c   |  57 ++++++++++
 fs/ramfs/file-nommu.c | 292 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ramfs/inode.c      |  22 +---
 fs/ramfs/internal.h   |  15 +++
 include/linux/ramfs.h |  10 ++
 6 files changed, 378 insertions(+), 22 deletions(-)
 create mode 100644 fs/ramfs/file-mmu.c
 create mode 100644 fs/ramfs/file-nommu.c
 create mode 100644 fs/ramfs/internal.h

(limited to 'include/linux')

diff --git a/fs/ramfs/Makefile b/fs/ramfs/Makefile
index f096f3007091..5a0236e02ee1 100644
--- a/fs/ramfs/Makefile
+++ b/fs/ramfs/Makefile
@@ -4,4 +4,6 @@
 
 obj-$(CONFIG_RAMFS) += ramfs.o
 
-ramfs-objs := inode.o
+file-mmu-y := file-nommu.o
+file-mmu-$(CONFIG_MMU) := file-mmu.o
+ramfs-objs += inode.o $(file-mmu-y)
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
new file mode 100644
index 000000000000..2115383dcc8d
--- /dev/null
+++ b/fs/ramfs/file-mmu.c
@@ -0,0 +1,57 @@
+/* file-mmu.c: ramfs MMU-based file operations
+ *
+ * Resizable simple ram filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ *               2000 Transmeta Corp.
+ *
+ * Usage limits added by David Gibson, Linuxcare Australia.
+ * This file is released under the GPL.
+ */
+
+/*
+ * NOTE! This filesystem is probably most useful
+ * not as a real filesystem, but as an example of
+ * how virtual filesystems can be written.
+ *
+ * It doesn't get much simpler than this. Consider
+ * that this file implements the full semantics of
+ * a POSIX-compliant read-write filesystem.
+ *
+ * Note in particular how the filesystem does not
+ * need to implement any data structures of its own
+ * to keep track of the virtual data: using the VFS
+ * caches is sufficient.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/ramfs.h>
+
+#include <asm/uaccess.h>
+#include "internal.h"
+
+struct address_space_operations ramfs_aops = {
+	.readpage	= simple_readpage,
+	.prepare_write	= simple_prepare_write,
+	.commit_write	= simple_commit_write
+};
+
+struct file_operations ramfs_file_operations = {
+	.read		= generic_file_read,
+	.write		= generic_file_write,
+	.mmap		= generic_file_mmap,
+	.fsync		= simple_sync_file,
+	.sendfile	= generic_file_sendfile,
+	.llseek		= generic_file_llseek,
+};
+
+struct inode_operations ramfs_file_inode_operations = {
+	.getattr	= simple_getattr,
+};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
new file mode 100644
index 000000000000..3f810acd0bfa
--- /dev/null
+++ b/fs/ramfs/file-nommu.c
@@ -0,0 +1,292 @@
+/* file-nommu.c: no-MMU version of ramfs
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/ramfs.h>
+#include <linux/quotaops.h>
+#include <linux/pagevec.h>
+#include <linux/mman.h>
+
+#include <asm/uaccess.h>
+#include "internal.h"
+
+static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
+
+struct address_space_operations ramfs_aops = {
+	.readpage		= simple_readpage,
+	.prepare_write		= simple_prepare_write,
+	.commit_write		= simple_commit_write
+};
+
+struct file_operations ramfs_file_operations = {
+	.mmap			= ramfs_nommu_mmap,
+	.get_unmapped_area	= ramfs_nommu_get_unmapped_area,
+	.read			= generic_file_read,
+	.write			= generic_file_write,
+	.fsync			= simple_sync_file,
+	.sendfile		= generic_file_sendfile,
+	.llseek			= generic_file_llseek,
+};
+
+struct inode_operations ramfs_file_inode_operations = {
+	.setattr		= ramfs_nommu_setattr,
+	.getattr		= simple_getattr,
+};
+
+/*****************************************************************************/
+/*
+ * add a contiguous set of pages into a ramfs inode when it's truncated from
+ * size 0 on the assumption that it's going to be used for an mmap of shared
+ * memory
+ */
+static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
+{
+	struct pagevec lru_pvec;
+	unsigned long npages, xpages, loop, limit;
+	struct page *pages;
+	unsigned order;
+	void *data;
+	int ret;
+
+	/* make various checks */
+	order = get_order(newsize);
+	if (unlikely(order >= MAX_ORDER))
+		goto too_big;
+
+	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+	if (limit != RLIM_INFINITY && newsize > limit)
+		goto fsize_exceeded;
+
+	if (newsize > inode->i_sb->s_maxbytes)
+		goto too_big;
+
+	i_size_write(inode, newsize);
+
+	/* allocate enough contiguous pages to be able to satisfy the
+	 * request */
+	pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order);
+	if (!pages)
+		return -ENOMEM;
+
+	/* split the high-order page into an array of single pages */
+	xpages = 1UL << order;
+	npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	for (loop = 0; loop < npages; loop++)
+		set_page_count(pages + loop, 1);
+
+	/* trim off any pages we don't actually require */
+	for (loop = npages; loop < xpages; loop++)
+		__free_page(pages + loop);
+
+	/* clear the memory we allocated */
+	newsize = PAGE_SIZE * npages;
+	data = page_address(pages);
+	memset(data, 0, newsize);
+
+	/* attach all the pages to the inode's address space */
+	pagevec_init(&lru_pvec, 0);
+	for (loop = 0; loop < npages; loop++) {
+		struct page *page = pages + loop;
+
+		ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL);
+		if (ret < 0)
+			goto add_error;
+
+		if (!pagevec_add(&lru_pvec, page))
+			__pagevec_lru_add(&lru_pvec);
+
+		unlock_page(page);
+	}
+
+	pagevec_lru_add(&lru_pvec);
+	return 0;
+
+ fsize_exceeded:
+	send_sig(SIGXFSZ, current, 0);
+ too_big:
+	return -EFBIG;
+
+ add_error:
+	page_cache_release(pages + loop);
+	for (loop++; loop < npages; loop++)
+		__free_page(pages + loop);
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * check that file shrinkage doesn't leave any VMAs dangling in midair
+ */
+static int ramfs_nommu_check_mappings(struct inode *inode,
+				      size_t newsize, size_t size)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+
+	/* search for VMAs that fall within the dead zone */
+	vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
+			      newsize >> PAGE_SHIFT,
+			      (size + PAGE_SIZE - 1) >> PAGE_SHIFT
+			      ) {
+		/* found one - only interested if it's shared out of the page
+		 * cache */
+		if (vma->vm_flags & VM_SHARED)
+			return -ETXTBSY; /* not quite true, but near enough */
+	}
+
+	return 0;
+}
+
+/*****************************************************************************/
+/*
+ *
+ */
+static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
+{
+	int ret;
+
+	/* assume a truncate from zero size is going to be for the purposes of
+	 * shared mmap */
+	if (size == 0) {
+		if (unlikely(newsize >> 32))
+			return -EFBIG;
+
+		return ramfs_nommu_expand_for_mapping(inode, newsize);
+	}
+
+	/* check that a decrease in size doesn't cut off any shared mappings */
+	if (newsize < size) {
+		ret = ramfs_nommu_check_mappings(inode, newsize, size);
+		if (ret < 0)
+			return ret;
+	}
+
+	ret = vmtruncate(inode, size);
+
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * handle a change of attributes
+ * - we're specifically interested in a change of size
+ */
+static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
+{
+	struct inode *inode = dentry->d_inode;
+	unsigned int old_ia_valid = ia->ia_valid;
+	int ret = 0;
+
+	/* by providing our own setattr() method, we skip this quotaism */
+	if ((old_ia_valid & ATTR_UID && ia->ia_uid != inode->i_uid) ||
+	    (old_ia_valid & ATTR_GID && ia->ia_gid != inode->i_gid))
+		ret = DQUOT_TRANSFER(inode, ia) ? -EDQUOT : 0;
+
+	/* pick out size-changing events */
+	if (ia->ia_valid & ATTR_SIZE) {
+		loff_t size = i_size_read(inode);
+		if (ia->ia_size != size) {
+			ret = ramfs_nommu_resize(inode, ia->ia_size, size);
+			if (ret < 0 || ia->ia_valid == ATTR_SIZE)
+				goto out;
+		} else {
+			/* we skipped the truncate but must still update
+			 * timestamps
+			 */
+			ia->ia_valid |= ATTR_MTIME|ATTR_CTIME;
+		}
+	}
+
+	ret = inode_setattr(inode, ia);
+ out:
+	ia->ia_valid = old_ia_valid;
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * try to determine where a shared mapping can be made
+ * - we require that:
+ *   - the pages to be mapped must exist
+ *   - the pages be physically contiguous in sequence
+ */
+unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+					    unsigned long addr, unsigned long len,
+					    unsigned long pgoff, unsigned long flags)
+{
+	unsigned long maxpages, lpages, nr, loop, ret;
+	struct inode *inode = file->f_dentry->d_inode;
+	struct page **pages = NULL, **ptr, *page;
+	loff_t isize;
+
+	if (!(flags & MAP_SHARED))
+		return addr;
+
+	/* the mapping mustn't extend beyond the EOF */
+	lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	isize = i_size_read(inode);
+
+	ret = -EINVAL;
+	maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (pgoff >= maxpages)
+		goto out;
+
+	if (maxpages - pgoff < lpages)
+		goto out;
+
+	/* gang-find the pages */
+	ret = -ENOMEM;
+	pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		goto out;
+
+	nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
+	if (nr != lpages)
+		goto out; /* leave if some pages were missing */
+
+	/* check the pages for physical adjacency */
+	ptr = pages;
+	page = *ptr++;
+	page++;
+	for (loop = lpages; loop > 1; loop--)
+		if (*ptr++ != page++)
+			goto out;
+
+	/* okay - all conditions fulfilled */
+	ret = (unsigned long) page_address(pages[0]);
+
+ out:
+	if (pages) {
+		ptr = pages;
+		for (loop = lpages; loop > 0; loop--)
+			put_page(*ptr++);
+		kfree(pages);
+	}
+
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * set up a mapping
+ */
+int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	return 0;
+}
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 0a88917605ae..c66bd5e4c05c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -34,13 +34,12 @@
 #include <linux/ramfs.h>
 
 #include <asm/uaccess.h>
+#include "internal.h"
 
 /* some random number */
 #define RAMFS_MAGIC	0x858458f6
 
 static struct super_operations ramfs_ops;
-static struct address_space_operations ramfs_aops;
-static struct inode_operations ramfs_file_inode_operations;
 static struct inode_operations ramfs_dir_inode_operations;
 
 static struct backing_dev_info ramfs_backing_dev_info = {
@@ -142,25 +141,6 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
 	return error;
 }
 
-static struct address_space_operations ramfs_aops = {
-	.readpage	= simple_readpage,
-	.prepare_write	= simple_prepare_write,
-	.commit_write	= simple_commit_write
-};
-
-struct file_operations ramfs_file_operations = {
-	.read		= generic_file_read,
-	.write		= generic_file_write,
-	.mmap		= generic_file_mmap,
-	.fsync		= simple_sync_file,
-	.sendfile	= generic_file_sendfile,
-	.llseek		= generic_file_llseek,
-};
-
-static struct inode_operations ramfs_file_inode_operations = {
-	.getattr	= simple_getattr,
-};
-
 static struct inode_operations ramfs_dir_inode_operations = {
 	.create		= ramfs_create,
 	.lookup		= simple_lookup,
diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h
new file mode 100644
index 000000000000..272c8a7120b0
--- /dev/null
+++ b/fs/ramfs/internal.h
@@ -0,0 +1,15 @@
+/* internal.h: ramfs internal definitions
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+extern struct address_space_operations ramfs_aops;
+extern struct file_operations ramfs_file_operations;
+extern struct inode_operations ramfs_file_inode_operations;
diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h
index e0a4faa9610c..953b6df5d037 100644
--- a/include/linux/ramfs.h
+++ b/include/linux/ramfs.h
@@ -5,6 +5,16 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev);
 struct super_block *ramfs_get_sb(struct file_system_type *fs_type,
 	 int flags, const char *dev_name, void *data);
 
+#ifndef CONFIG_MMU
+extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+						   unsigned long addr,
+						   unsigned long len,
+						   unsigned long pgoff,
+						   unsigned long flags);
+
+extern int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
+#endif
+
 extern struct file_operations ramfs_file_operations;
 extern struct vm_operations_struct generic_file_vm_ops;
 
-- 
cgit v1.2.3-71-gd317


From b0e15190ead07056ab0c3844a499ff35e66d27cc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Jan 2006 00:11:42 -0800
Subject: [PATCH] NOMMU: Make SYSV IPC SHM use ramfs facilities on NOMMU

The attached patch makes the SYSV IPC shared memory facilities use the new
ramfs facilities on a no-MMU kernel.

The following changes are made:

 (1) There are now shmem_mmap() and shmem_get_unmapped_area() functions to
     allow the IPC SHM facilities to commune with the tiny-shmem and shmem
     code.

 (2) ramfs files now need resizing using do_truncate() rather than by modifying
     the inode size directly (see shmem_file_setup()). This causes ramfs to
     attempt to bind a block of pages of sufficient size to the inode.

 (3) CONFIG_SYSVIPC is no longer contingent on CONFIG_MMU.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h |  9 +++++++++
 init/Kconfig       |  1 -
 ipc/shm.c          | 18 +++++++++++++-----
 mm/nommu.c         |  7 +++++++
 mm/shmem.c         |  2 +-
 mm/tiny-shmem.c    | 29 ++++++++++++++++++++++++++++-
 6 files changed, 58 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 75ec04e2f184..26f3094911a5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -654,9 +654,18 @@ static inline struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
 }
 #endif
 struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags);
+extern int shmem_mmap(struct file *file, struct vm_area_struct *vma);
 
 int shmem_zero_setup(struct vm_area_struct *);
 
+#ifndef CONFIG_MMU
+extern unsigned long shmem_get_unmapped_area(struct file *file,
+					     unsigned long addr,
+					     unsigned long len,
+					     unsigned long pgoff,
+					     unsigned long flags);
+#endif
+
 static inline int can_do_mlock(void)
 {
 	if (capable(CAP_IPC_LOCK))
diff --git a/init/Kconfig b/init/Kconfig
index ce737e02c5a2..24e0f7c756c0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -105,7 +105,6 @@ config SWAP
 
 config SYSVIPC
 	bool "System V IPC"
-	depends on MMU
 	---help---
 	  Inter Process Communication is a suite of library functions and
 	  system calls which let processes (running programs) synchronize and
diff --git a/ipc/shm.c b/ipc/shm.c
index 587d836d80d9..0ef4a1cf3e27 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -157,14 +157,22 @@ static void shm_close (struct vm_area_struct *shmd)
 
 static int shm_mmap(struct file * file, struct vm_area_struct * vma)
 {
-	file_accessed(file);
-	vma->vm_ops = &shm_vm_ops;
-	shm_inc(file->f_dentry->d_inode->i_ino);
-	return 0;
+	int ret;
+
+	ret = shmem_mmap(file, vma);
+	if (ret == 0) {
+		vma->vm_ops = &shm_vm_ops;
+		shm_inc(file->f_dentry->d_inode->i_ino);
+	}
+
+	return ret;
 }
 
 static struct file_operations shm_file_operations = {
-	.mmap	= shm_mmap
+	.mmap	= shm_mmap,
+#ifndef CONFIG_MMU
+	.get_unmapped_area = shmem_get_unmapped_area,
+#endif
 };
 
 static struct vm_operations_struct shm_vm_ops = {
diff --git a/mm/nommu.c b/mm/nommu.c
index c1196812876b..c10262d68232 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1177,3 +1177,10 @@ int in_gate_area_no_task(unsigned long addr)
 {
 	return 0;
 }
+
+struct page *filemap_nopage(struct vm_area_struct *area,
+			unsigned long address, int *type)
+{
+	BUG();
+	return NULL;
+}
diff --git a/mm/shmem.c b/mm/shmem.c
index 65c148efa2ed..a1f2f02af724 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1270,7 +1270,7 @@ out_nomem:
 	return retval;
 }
 
-static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	file_accessed(file);
 	vma->vm_ops = &shmem_vm_ops;
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index b58abcf44ed6..cdc6d431972b 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -81,13 +81,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
 		goto close_file;
 
 	d_instantiate(dentry, inode);
-	inode->i_size = size;
 	inode->i_nlink = 0;	/* It is unlinked */
+
 	file->f_vfsmnt = mntget(shm_mnt);
 	file->f_dentry = dentry;
 	file->f_mapping = inode->i_mapping;
 	file->f_op = &ramfs_file_operations;
 	file->f_mode = FMODE_WRITE | FMODE_READ;
+
+	/* notify everyone as to the change of file size */
+	error = do_truncate(dentry, size, file);
+	if (error < 0)
+		goto close_file;
+
 	return file;
 
 close_file:
@@ -123,3 +129,24 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
 {
 	return 0;
 }
+
+int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	file_accessed(file);
+#ifndef CONFIG_MMU
+	return ramfs_nommu_mmap(file, vma);
+#else
+	return 0;
+#endif
+}
+
+#ifndef CONFIG_MMU
+unsigned long shmem_get_unmapped_area(struct file *file,
+				      unsigned long addr,
+				      unsigned long len,
+				      unsigned long pgoff,
+				      unsigned long flags)
+{
+	return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
+}
+#endif
-- 
cgit v1.2.3-71-gd317


From 7ee1dd3fee22f15728f545d266403fc977e1eb99 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Jan 2006 00:11:44 -0800
Subject: [PATCH] FRV: Make futex code compilable on nommu [try #2]

Make the futex code compilable and usable on NOMMU by making the attempt to
handle page faults conditional on CONFIG_MMU.  If this is not enabled, then
we can assume that EFAULT returned from futex_atomic_op_inuser() is not
recoverable, and that the address lies outside of valid memory.

handle_mm_fault() is made to BUG if called on NOMMU without attempting to
invoke the actual handler (__handle_mm_fault).

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 22 +++++++++++++++++++---
 kernel/futex.c     |  7 +++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 26f3094911a5..bc01fff3aa01 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -717,12 +717,28 @@ extern int vmtruncate(struct inode * inode, loff_t offset);
 extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
 extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
 extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
-extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
 
-static inline int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access)
+#ifdef CONFIG_MMU
+extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma,
+			unsigned long address, int write_access);
+
+static inline int handle_mm_fault(struct mm_struct *mm,
+			struct vm_area_struct *vma, unsigned long address,
+			int write_access)
 {
-	return __handle_mm_fault(mm, vma, address, write_access) & (~VM_FAULT_WRITE);
+	return __handle_mm_fault(mm, vma, address, write_access) &
+				(~VM_FAULT_WRITE);
 }
+#else
+static inline int handle_mm_fault(struct mm_struct *mm,
+			struct vm_area_struct *vma, unsigned long address,
+			int write_access)
+{
+	/* should never happen if there's no MMU */
+	BUG();
+	return VM_FAULT_SIGBUS;
+}
+#endif
 
 extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
diff --git a/kernel/futex.c b/kernel/futex.c
index 5e71a6bf6f6b..5efa2f978032 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -356,6 +356,13 @@ retry:
 		if (bh1 != bh2)
 			spin_unlock(&bh2->lock);
 
+#ifndef CONFIG_MMU
+		/* we don't get EFAULT from MMU faults if we don't have an MMU,
+		 * but we might get them from range checking */
+		ret = op_ret;
+		goto out;
+#endif
+
 		if (unlikely(op_ret != -EFAULT)) {
 			ret = op_ret;
 			goto out;
-- 
cgit v1.2.3-71-gd317


From f90b8116032f4216d260e31f966a3585319387ac Mon Sep 17 00:00:00 2001
From: Jordan Crouse <jordan.crouse@amd.com>
Date: Fri, 6 Jan 2006 00:12:14 -0800
Subject: [PATCH] Base support for AMD Geode GX/LX processors

Provide basic support for the AMD Geode GX and LX processors.

Signed-off-by: Jordan Crouse <jordan.crouse@amd.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 MAINTAINERS                  |  7 +++++++
 arch/i386/Kconfig.cpu        | 14 ++++++++++----
 arch/i386/kernel/cpu/amd.c   |  7 ++++++-
 arch/i386/kernel/cpu/cyrix.c | 27 ++++++++++++++++++++++++++-
 include/asm-i386/module.h    |  4 +++-
 include/linux/pci_ids.h      | 10 ++++++++++
 6 files changed, 62 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index e9db0d6b928a..cb536bbed9ff 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -258,6 +258,13 @@ P:	Ivan Kokshaysky
 M:	ink@jurassic.park.msu.ru
 S:	Maintained for 2.4; PCI support for 2.6.
 
+AMD GEODE PROCESSOR/CHIPSET SUPPORT
+P:      Jordan Crouse
+M:      info-linux@geode.amd.com
+L:	info-linux@geode.amd.com
+W:	http://www.amd.com/us-en/ConnectivitySolutions/TechnicalResources/0,,50_2334_2452_11363,00.html
+S:	Supported
+
 APM DRIVER
 P:	Stephen Rothwell
 M:	sfr@canb.auug.org.au
diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu
index 53bbb3c008ee..79603b3471f9 100644
--- a/arch/i386/Kconfig.cpu
+++ b/arch/i386/Kconfig.cpu
@@ -39,6 +39,7 @@ config M386
 	  - "Winchip-2" for IDT Winchip 2.
 	  - "Winchip-2A" for IDT Winchips with 3dNow! capabilities.
 	  - "GeodeGX1" for Geode GX1 (Cyrix MediaGX).
+	  - "Geode GX/LX" For AMD Geode GX and LX processors.
 	  - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3.
 	  - "VIA C3-2 for VIA C3-2 "Nehemiah" (model 9 and above).
 
@@ -171,6 +172,11 @@ config MGEODEGX1
 	help
 	  Select this for a Geode GX1 (Cyrix MediaGX) chip.
 
+config MGEODE_LX
+       bool "Geode GX/LX"
+       help
+         Select this for AMD Geode GX and LX processors.
+
 config MCYRIXIII
 	bool "CyrixIII/VIA-C3"
 	help
@@ -220,8 +226,8 @@ config X86_XADD
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || X86_GENERIC
-	default "4" if X86_ELAN || M486 || M386
-	default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODEGX1
+	default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
+	default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
 	default "6" if MK7 || MK8 || MPENTIUMM
 
 config RWSEM_GENERIC_SPINLOCK
@@ -290,12 +296,12 @@ config X86_INTEL_USERCOPY
 
 config X86_USE_PPRO_CHECKSUM
 	bool
-	depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON
+	depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX
 	default y
 
 config X86_USE_3DNOW
 	bool
-	depends on MCYRIXIII || MK7
+	depends on MCYRIXIII || MK7 || MGEODE_LX
 	default y
 
 config X86_OOSTORE
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index e344ef88cfcd..e7697e077f6b 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -161,8 +161,13 @@ static void __init init_amd(struct cpuinfo_x86 *c)
 					set_bit(X86_FEATURE_K6_MTRR, c->x86_capability);
 				break;
 			}
-			break;
 
+			if (c->x86_model == 10) {
+				/* AMD Geode LX is model 10 */
+				/* placeholder for any needed mods */
+				break;
+			}
+			break;
 		case 6: /* An Athlon/Duron */
  
 			/* Bit 15 of Athlon specific MSR 15, needs to be 0
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index ff87cc22b323..75015975d038 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -342,6 +342,31 @@ static void __init init_cyrix(struct cpuinfo_x86 *c)
 	return;
 }
 
+/*
+ * Handle National Semiconductor branded processors
+ */
+static void __devinit init_nsc(struct cpuinfo_x86 *c)
+{
+	/* There may be GX1 processors in the wild that are branded
+	 * NSC and not Cyrix.
+	 *
+	 * This function only handles the GX processor, and kicks every
+	 * thing else to the Cyrix init function above - that should
+	 * cover any processors that might have been branded differently
+	 * after NSC aquired Cyrix.
+	 *
+	 * If this breaks your GX1 horribly, please e-mail
+	 * info-linux@ldcmail.amd.com to tell us.
+	 */
+
+	/* Handle the GX (Formally known as the GX2) */
+
+	if (c->x86 == 5 && c->x86_model == 5)
+		display_cacheinfo(c);
+	else
+		init_cyrix(c);
+}
+
 /*
  * Cyrix CPUs without cpuid or with cpuid not yet enabled can be detected
  * by the fact that they preserve the flags across the division of 5/2.
@@ -422,7 +447,7 @@ int __init cyrix_init_cpu(void)
 static struct cpu_dev nsc_cpu_dev __initdata = {
 	.c_vendor	= "NSC",
 	.c_ident 	= { "Geode by NSC" },
-	.c_init		= init_cyrix,
+	.c_init		= init_nsc,
 	.c_identify	= generic_identify,
 };
 
diff --git a/include/asm-i386/module.h b/include/asm-i386/module.h
index eb7f2b4234aa..424661d25bd3 100644
--- a/include/asm-i386/module.h
+++ b/include/asm-i386/module.h
@@ -52,8 +52,10 @@ struct mod_arch_specific
 #define MODULE_PROC_FAMILY "CYRIXIII "
 #elif defined CONFIG_MVIAC3_2
 #define MODULE_PROC_FAMILY "VIAC3-2 "
-#elif CONFIG_MGEODEGX1
+#elif defined CONFIG_MGEODEGX1
 #define MODULE_PROC_FAMILY "GEODEGX1 "
+#elif defined CONFIG_MGEODE_LX
+#define MODULE_PROC_FAMILY "GEODE "
 #else
 #error unknown processor family
 #endif
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 4f01710485cd..24db7248301a 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -394,6 +394,13 @@
 #define PCI_DEVICE_ID_NS_87410		0xd001
 #define PCI_DEVICE_ID_NS_CS5535_IDE	0x002d
 
+#define PCI_DEVICE_ID_NS_CS5535_HOST_BRIDGE  0x0028
+#define PCI_DEVICE_ID_NS_CS5535_ISA_BRIDGE   0x002b
+#define PCI_DEVICE_ID_NS_CS5535_IDE          0x002d
+#define PCI_DEVICE_ID_NS_CS5535_AUDIO        0x002e
+#define PCI_DEVICE_ID_NS_CS5535_USB          0x002f
+#define PCI_DEVICE_ID_NS_CS5535_VIDEO        0x0030
+
 #define PCI_VENDOR_ID_TSENG		0x100c
 #define PCI_DEVICE_ID_TSENG_W32P_2	0x3202
 #define PCI_DEVICE_ID_TSENG_W32P_b	0x3205
@@ -496,6 +503,9 @@
 
 #define PCI_DEVICE_ID_AMD_CS5536_IDE	0x209A
 
+#define PCI_DEVICE_ID_AMD_LX_VIDEO  0x2081
+#define PCI_DEVICE_ID_AMD_LX_AES    0x2082
+
 #define PCI_VENDOR_ID_TRIDENT		0x1023
 #define PCI_DEVICE_ID_TRIDENT_4DWAVE_DX	0x2000
 #define PCI_DEVICE_ID_TRIDENT_4DWAVE_NX	0x2001
-- 
cgit v1.2.3-71-gd317


From eee45269b0f5979c70bc151c6c2f4e5f4f5ababe Mon Sep 17 00:00:00 2001
From: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Date: Fri, 6 Jan 2006 00:12:21 -0800
Subject: [PATCH] Alpha: convert to generic irq framework (generic part)

Thanks to Christoph for doing most of the work.

This allows automatic SMP IRQ affinity assignment other than default "all
interrupts on all CPUs" which is rather expensive.  This might be useful if
the hardware can be programmed to distribute interrupts among different
CPUs, like Alpha does.

Signed-off-by: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Richard Henderson <rth@twiddle.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h | 11 +++++++++++
 kernel/irq/manage.c |  2 ++
 kernel/irq/proc.c   |  4 +++-
 3 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index f04ba20712a2..60f8bc78a35a 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -221,6 +221,17 @@ extern void note_interrupt(unsigned int irq, irq_desc_t *desc,
 extern int can_request_irq(unsigned int irq, unsigned long irqflags);
 
 extern void init_irq_proc(void);
+
+#ifdef CONFIG_AUTO_IRQ_AFFINITY
+extern int select_smp_affinity(unsigned int irq);
+#else
+static inline int
+select_smp_affinity(unsigned int irq)
+{
+	return 1;
+}
+#endif
+
 #endif
 
 extern hw_irq_controller no_irq_type;  /* needed in every arch ? */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 81c49a4d679e..97d5559997d2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -366,6 +366,8 @@ int request_irq(unsigned int irq,
 	action->next = NULL;
 	action->dev_id = dev_id;
 
+	select_smp_affinity(irq);
+
 	retval = setup_irq(irq, action);
 	if (retval)
 		kfree(action);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index f26e534c6585..8a64a4844cde 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -68,7 +68,9 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
 	 */
 	cpus_and(tmp, new_value, cpu_online_map);
 	if (cpus_empty(tmp))
-		return -EINVAL;
+		/* Special case for empty set - allow the architecture
+		   code to set default SMP affinity. */
+		return select_smp_affinity(irq) ? -EINVAL : full_count;
 
 	proc_set_irq_affinity(irq, new_value);
 
-- 
cgit v1.2.3-71-gd317


From 7088a5c00103ef48782d6c359cd12b13a10666e6 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:13:05 -0800
Subject: [PATCH] swsusp: introduce the swap map structure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces the swap map structure that can be used by swsusp for
keeping tracks of data pages written to the swap.   The structure itself is
described in a comment within the patch.

The overall idea is to reduce the amount of metadata written to the swap and
to write and read the image pages sequentially, in a file-alike way.  This
makes the swap-handling part of swsusp fairly independent of its
snapshot-handling part and will hopefully allow us to completely separate
these two parts in the future.

This patch is needed to remove the suspend image size limit imposed by the
limited size of the swsusp_info structure, which is essential for x86-64
systems with more than 512 MB of RAM.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/suspend.h |   6 +-
 kernel/power/disk.c     |   8 +-
 kernel/power/power.h    |  13 +-
 kernel/power/snapshot.c |  14 +-
 kernel/power/swsusp.c   | 558 ++++++++++++++++++++++++++++++++++--------------
 5 files changed, 418 insertions(+), 181 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index a61c04f804b2..33bbaea23aaf 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -14,11 +14,7 @@
 typedef struct pbe {
 	unsigned long address;		/* address of the copy */
 	unsigned long orig_address;	/* original address of page */
-	swp_entry_t swap_address;	
-
-	struct pbe *next;	/* also used as scratch space at
-				 * end of page (see link, diskpage)
-				 */
+	struct pbe *next;
 } suspend_pagedir_t;
 
 #define for_each_pbe(pbe, pblist) \
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 4d944b281b28..76a5131b0e80 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -25,9 +25,9 @@
 extern suspend_disk_method_t pm_disk_mode;
 
 extern int swsusp_suspend(void);
-extern int swsusp_write(void);
+extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
 extern int swsusp_check(void);
-extern int swsusp_read(void);
+extern int swsusp_read(struct pbe **pblist_ptr);
 extern void swsusp_close(void);
 extern int swsusp_resume(void);
 
@@ -176,7 +176,7 @@ int pm_suspend_disk(void)
 	if (in_suspend) {
 		device_resume();
 		pr_debug("PM: writing image.\n");
-		error = swsusp_write();
+		error = swsusp_write(pagedir_nosave, nr_copy_pages);
 		if (!error)
 			power_down(pm_disk_mode);
 		else {
@@ -247,7 +247,7 @@ static int software_resume(void)
 
 	pr_debug("PM: Reading swsusp image.\n");
 
-	if ((error = swsusp_read())) {
+	if ((error = swsusp_read(&pagedir_nosave))) {
 		swsusp_free();
 		goto Thaw;
 	}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 6c042b5ee14b..977877c6dcfc 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -9,19 +9,14 @@
 #define SUSPEND_CONSOLE	(MAX_NR_CONSOLES-1)
 #endif
 
-#define MAX_PBES	((PAGE_SIZE - sizeof(struct new_utsname) \
-			- 4 - 3*sizeof(unsigned long) - sizeof(int) \
-			- sizeof(void *)) / sizeof(swp_entry_t))
-
 struct swsusp_info {
 	struct new_utsname	uts;
 	u32			version_code;
 	unsigned long		num_physpages;
 	int			cpus;
 	unsigned long		image_pages;
-	unsigned long		pagedir_pages;
-	suspend_pagedir_t	* suspend_pagedir;
-	swp_entry_t		pagedir[MAX_PBES];
+	unsigned long		pages;
+	swp_entry_t		start;
 } __attribute__((aligned(PAGE_SIZE)));
 
 
@@ -67,6 +62,8 @@ extern asmlinkage int swsusp_arch_resume(void);
 
 extern void free_pagedir(struct pbe *pblist);
 extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
-extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages);
 extern void swsusp_free(void);
 extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
+extern unsigned int snapshot_nr_pages(void);
+extern struct pbe *snapshot_pblist(void);
+extern void snapshot_pblist_set(struct pbe *pblist);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 4a6dbcefd378..152d56cdf017 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -33,6 +33,9 @@
 
 #include "power.h"
 
+struct pbe *pagedir_nosave;
+unsigned int nr_copy_pages;
+
 #ifdef CONFIG_HIGHMEM
 struct highmem_page {
 	char *data;
@@ -244,7 +247,7 @@ static inline void fill_pb_page(struct pbe *pbpage)
  *	of memory pages allocated with alloc_pagedir()
  */
 
-void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
+static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
 {
 	struct pbe *pbpage, *p;
 	unsigned int num = PBES_PER_PAGE;
@@ -261,7 +264,6 @@ void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
 			p->next = p + 1;
 		p->next = NULL;
 	}
-	pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
 }
 
 /**
@@ -332,7 +334,8 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
 	if (!pbe) { /* get_zeroed_page() failed */
 		free_pagedir(pblist);
 		pblist = NULL;
-        }
+        } else
+        	create_pbe_list(pblist, nr_pages);
 	return pblist;
 }
 
@@ -395,7 +398,6 @@ static struct pbe *swsusp_alloc(unsigned int nr_pages)
 		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
 		return NULL;
 	}
-	create_pbe_list(pblist, nr_pages);
 
 	if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) {
 		printk(KERN_ERR "suspend: Allocating image pages failed.\n");
@@ -421,10 +423,6 @@ asmlinkage int swsusp_save(void)
 		 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
 		 PAGES_FOR_IO, nr_free_pages());
 
-	/* This is needed because of the fixed size of swsusp_info */
-	if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE)
-		return -ENOSPC;
-
 	if (!enough_free_mem(nr_pages)) {
 		printk(KERN_ERR "swsusp: Not enough free memory\n");
 		return -ENOMEM;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index bd3097c583bf..b09bd7c0998d 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -30,6 +30,9 @@
  * Alex Badea <vampire@go.ro>:
  * Fixed runaway init
  *
+ * Rafael J. Wysocki <rjw@sisk.pl>
+ * Added the swap map data structure and reworked the handling of swap
+ *
  * More state savers are welcome. Especially for the scsi layer...
  *
  * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
@@ -76,18 +79,6 @@ static int restore_highmem(void) { return 0; }
 
 extern char resume_file[];
 
-/* Local variables that should not be affected by save */
-unsigned int nr_copy_pages __nosavedata = 0;
-
-/* Suspend pagedir is allocated before final copy, therefore it
-   must be freed after resume
-
-   Warning: this is even more evil than it seems. Pagedirs this file
-   talks about are completely different from page directories used by
-   MMU hardware.
- */
-suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
-
 #define SWSUSP_SIG	"S1SUSPEND"
 
 static struct swsusp_header {
@@ -238,48 +229,205 @@ static int write_page(unsigned long addr, swp_entry_t *loc)
 }
 
 /**
- *	data_free - Free the swap entries used by the saved image.
+ *	Swap map-handling functions
+ *
+ *	The swap map is a data structure used for keeping track of each page
+ *	written to the swap.  It consists of many swap_map_page structures
+ *	that contain each an array of MAP_PAGE_SIZE swap entries.
+ *	These structures are linked together with the help of either the
+ *	.next (in memory) or the .next_swap (in swap) member.
  *
- *	Walk the list of used swap entries and free each one.
- *	This is only used for cleanup when suspend fails.
+ *	The swap map is created during suspend.  At that time we need to keep
+ *	it in memory, because we have to free all of the allocated swap
+ *	entries if an error occurs.  The memory needed is preallocated
+ *	so that we know in advance if there's enough of it.
+ *
+ *	The first swap_map_page structure is filled with the swap entries that
+ *	correspond to the first MAP_PAGE_SIZE data pages written to swap and
+ *	so on.  After the all of the data pages have been written, the order
+ *	of the swap_map_page structures in the map is reversed so that they
+ *	can be read from swap in the original order.  This causes the data
+ *	pages to be loaded in exactly the same order in which they have been
+ *	saved.
+ *
+ *	During resume we only need to use one swap_map_page structure
+ *	at a time, which means that we only need to use two memory pages for
+ *	reading the image - one for reading the swap_map_page structures
+ *	and the second for reading the data pages from swap.
  */
-static void data_free(void)
+
+#define MAP_PAGE_SIZE	((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \
+			/ sizeof(swp_entry_t))
+
+struct swap_map_page {
+	swp_entry_t		entries[MAP_PAGE_SIZE];
+	swp_entry_t		next_swap;
+	struct swap_map_page	*next;
+};
+
+static inline void free_swap_map(struct swap_map_page *swap_map)
 {
-	swp_entry_t entry;
-	struct pbe *p;
+	struct swap_map_page *swp;
 
-	for_each_pbe (p, pagedir_nosave) {
-		entry = p->swap_address;
-		if (entry.val)
-			swap_free(entry);
-		else
-			break;
+	while (swap_map) {
+		swp = swap_map->next;
+		free_page((unsigned long)swap_map);
+		swap_map = swp;
+	}
+}
+
+static struct swap_map_page *alloc_swap_map(unsigned int nr_pages)
+{
+	struct swap_map_page *swap_map, *swp;
+	unsigned n = 0;
+
+	if (!nr_pages)
+		return NULL;
+
+	pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages);
+	swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
+	swp = swap_map;
+	for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) {
+		swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
+		swp = swp->next;
+		if (!swp) {
+			free_swap_map(swap_map);
+			return NULL;
+		}
 	}
+	return swap_map;
 }
 
 /**
- *	data_write - Write saved image to swap.
- *
- *	Walk the list of pages in the image and sync each one to swap.
+ *	reverse_swap_map - reverse the order of pages in the swap map
+ *	@swap_map
  */
-static int data_write(void)
+
+static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
 {
-	int error = 0, i = 0;
-	unsigned int mod = nr_copy_pages / 100;
-	struct pbe *p;
+	struct swap_map_page *prev, *next;
+
+	prev = NULL;
+	while (swap_map) {
+		next = swap_map->next;
+		swap_map->next = prev;
+		prev = swap_map;
+		swap_map = next;
+	}
+	return prev;
+}
 
-	if (!mod)
-		mod = 1;
+/**
+ *	free_swap_map_entries - free the swap entries allocated to store
+ *	the swap map @swap_map (this is only called in case of an error)
+ */
+static inline void free_swap_map_entries(struct swap_map_page *swap_map)
+{
+	while (swap_map) {
+		if (swap_map->next_swap.val)
+			swap_free(swap_map->next_swap);
+		swap_map = swap_map->next;
+	}
+}
 
-	printk( "Writing data to swap (%d pages)...     ", nr_copy_pages );
-	for_each_pbe (p, pagedir_nosave) {
-		if (!(i%mod))
-			printk( "\b\b\b\b%3d%%", i / mod );
-		if ((error = write_page(p->address, &p->swap_address)))
+/**
+ *	save_swap_map - save the swap map used for tracing the data pages
+ *	stored in the swap
+ */
+
+static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
+{
+	swp_entry_t entry = (swp_entry_t){0};
+	int error;
+
+	while (swap_map) {
+		swap_map->next_swap = entry;
+		if ((error = write_page((unsigned long)swap_map, &entry)))
 			return error;
-		i++;
+		swap_map = swap_map->next;
 	}
-	printk("\b\b\b\bdone\n");
+	*start = entry;
+	return 0;
+}
+
+/**
+ *	free_image_entries - free the swap entries allocated to store
+ *	the image data pages (this is only called in case of an error)
+ */
+
+static inline void free_image_entries(struct swap_map_page *swp)
+{
+	unsigned k;
+
+	while (swp) {
+		for (k = 0; k < MAP_PAGE_SIZE; k++)
+			if (swp->entries[k].val)
+				swap_free(swp->entries[k]);
+		swp = swp->next;
+	}
+}
+
+/**
+ *	The swap_map_handle structure is used for handling the swap map in
+ *	a file-alike way
+ */
+
+struct swap_map_handle {
+	struct swap_map_page *cur;
+	unsigned int k;
+};
+
+static inline void init_swap_map_handle(struct swap_map_handle *handle,
+                                        struct swap_map_page *map)
+{
+	handle->cur = map;
+	handle->k = 0;
+}
+
+static inline int swap_map_write_page(struct swap_map_handle *handle,
+                                      unsigned long addr)
+{
+	int error;
+
+	error = write_page(addr, handle->cur->entries + handle->k);
+	if (error)
+		return error;
+	if (++handle->k >= MAP_PAGE_SIZE) {
+		handle->cur = handle->cur->next;
+		handle->k = 0;
+	}
+	return 0;
+}
+
+/**
+ *	save_image_data - save the data pages pointed to by the PBEs
+ *	from the list @pblist using the swap map handle @handle
+ *	(assume there are @nr_pages data pages to save)
+ */
+
+static int save_image_data(struct pbe *pblist,
+                           struct swap_map_handle *handle,
+                           unsigned int nr_pages)
+{
+	unsigned int m;
+	struct pbe *p;
+	int error = 0;
+
+	printk("Saving image data pages (%u pages) ...     ", nr_pages);
+	m = nr_pages / 100;
+	if (!m)
+		m = 1;
+	nr_pages = 0;
+	for_each_pbe (p, pblist) {
+		error = swap_map_write_page(handle, p->address);
+		if (error)
+			break;
+		if (!(nr_pages % m))
+			printk("\b\b\b\b%3d%%", nr_pages / m);
+		nr_pages++;
+	}
+	if (!error)
+		printk("\b\b\b\bdone\n");
 	return error;
 }
 
@@ -295,19 +443,20 @@ static void dump_info(void)
 	pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
 	pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
 	pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
-	pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages);
+	pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
 }
 
-static void init_header(void)
+static void init_header(unsigned int nr_pages)
 {
 	memset(&swsusp_info, 0, sizeof(swsusp_info));
 	swsusp_info.version_code = LINUX_VERSION_CODE;
 	swsusp_info.num_physpages = num_physpages;
 	memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
 
-	swsusp_info.suspend_pagedir = pagedir_nosave;
 	swsusp_info.cpus = num_online_cpus();
-	swsusp_info.image_pages = nr_copy_pages;
+	swsusp_info.image_pages = nr_pages;
+	swsusp_info.pages = nr_pages +
+		((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT);
 }
 
 static int close_swap(void)
@@ -326,39 +475,53 @@ static int close_swap(void)
 }
 
 /**
- *	free_pagedir_entries - Free pages used by the page directory.
- *
- *	This is used during suspend for error recovery.
+ *	pack_orig_addresses - the .orig_address fields of the PBEs from the
+ *	list starting at @pbe are stored in the array @buf[] (1 page)
  */
 
-static void free_pagedir_entries(void)
+static inline struct pbe *pack_orig_addresses(unsigned long *buf,
+                                              struct pbe *pbe)
 {
-	int i;
+	int j;
 
-	for (i = 0; i < swsusp_info.pagedir_pages; i++)
-		swap_free(swsusp_info.pagedir[i]);
+	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
+		buf[j] = pbe->orig_address;
+		pbe = pbe->next;
+	}
+	if (!pbe)
+		for (; j < PAGE_SIZE / sizeof(long); j++)
+			buf[j] = 0;
+	return pbe;
 }
 
-
 /**
- *	write_pagedir - Write the array of pages holding the page directory.
- *	@last:	Last swap entry we write (needed for header).
+ *	save_image_metadata - save the .orig_address fields of the PBEs
+ *	from the list @pblist using the swap map handle @handle
  */
 
-static int write_pagedir(void)
+static int save_image_metadata(struct pbe *pblist,
+                               struct swap_map_handle *handle)
 {
-	int error = 0;
+	unsigned long *buf;
 	unsigned int n = 0;
-	struct pbe *pbe;
+	struct pbe *p;
+	int error = 0;
 
-	printk( "Writing pagedir...");
-	for_each_pb_page (pbe, pagedir_nosave) {
-		if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++])))
-			return error;
+	printk("Saving image metadata ... ");
+	buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
+	if (!buf)
+		return -ENOMEM;
+	p = pblist;
+	while (p) {
+		p = pack_orig_addresses(buf, p);
+		error = swap_map_write_page(handle, (unsigned long)buf);
+		if (error)
+			break;
+		n++;
 	}
-
-	swsusp_info.pagedir_pages = n;
-	printk("done (%u pages)\n", n);
+	free_page((unsigned long)buf);
+	if (!error)
+		printk("done (%u pages saved)\n", n);
 	return error;
 }
 
@@ -384,33 +547,48 @@ static int enough_swap(unsigned int nr_pages)
 
 /**
  *	write_suspend_image - Write entire image and metadata.
- *
  */
-static int write_suspend_image(void)
+static int write_suspend_image(struct pbe *pblist, unsigned int nr_pages)
 {
+	struct swap_map_page *swap_map;
+	struct swap_map_handle handle;
 	int error;
 
-	if (!enough_swap(nr_copy_pages)) {
+	if (!enough_swap(nr_pages)) {
 		printk(KERN_ERR "swsusp: Not enough free swap\n");
 		return -ENOSPC;
 	}
 
-	init_header();
-	if ((error = data_write()))
-		goto FreeData;
+	init_header(nr_pages);
+	swap_map = alloc_swap_map(swsusp_info.pages);
+	if (!swap_map)
+		return -ENOMEM;
+	init_swap_map_handle(&handle, swap_map);
 
-	if ((error = write_pagedir()))
-		goto FreePagedir;
+	error = save_image_metadata(pblist, &handle);
+	if (!error)
+		error = save_image_data(pblist, &handle, nr_pages);
+	if (error)
+		goto Free_image_entries;
 
-	if ((error = close_swap()))
-		goto FreePagedir;
- Done:
+	swap_map = reverse_swap_map(swap_map);
+	error = save_swap_map(swap_map, &swsusp_info.start);
+	if (error)
+		goto Free_map_entries;
+
+	error = close_swap();
+	if (error)
+		goto Free_map_entries;
+
+Free_swap_map:
+	free_swap_map(swap_map);
 	return error;
- FreePagedir:
-	free_pagedir_entries();
- FreeData:
-	data_free();
-	goto Done;
+
+Free_map_entries:
+	free_swap_map_entries(swap_map);
+Free_image_entries:
+	free_image_entries(swap_map);
+	goto Free_swap_map;
 }
 
 /* It is important _NOT_ to umount filesystems at this point. We want
@@ -418,7 +596,7 @@ static int write_suspend_image(void)
  * filesystem clean: it is not. (And it does not matter, if we resume
  * correctly, we'll mark system clean, anyway.)
  */
-int swsusp_write(void)
+int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
 {
 	int error;
 
@@ -427,14 +605,12 @@ int swsusp_write(void)
 		return error;
 	}
 	lock_swapdevices();
-	error = write_suspend_image();
+	error = write_suspend_image(pblist, nr_pages);
 	/* This will unlock ignored swap devices since writing is finished */
 	lock_swapdevices();
 	return error;
 }
 
-
-
 int swsusp_suspend(void)
 {
 	int error;
@@ -531,7 +707,6 @@ static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
 	/* We assume both lists contain the same number of elements */
 	while (src) {
 		dst->orig_address = src->orig_address;
-		dst->swap_address = src->swap_address;
 		dst = dst->next;
 		src = src->next;
 	}
@@ -611,6 +786,61 @@ static int bio_write_page(pgoff_t page_off, void *page)
 	return submit(WRITE, page_off, page);
 }
 
+/**
+ *	The following functions allow us to read data using a swap map
+ *	in a file-alike way
+ */
+
+static inline void release_swap_map_reader(struct swap_map_handle *handle)
+{
+	if (handle->cur)
+		free_page((unsigned long)handle->cur);
+	handle->cur = NULL;
+}
+
+static inline int get_swap_map_reader(struct swap_map_handle *handle,
+                                      swp_entry_t start)
+{
+	int error;
+
+	if (!swp_offset(start))
+		return -EINVAL;
+	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
+	if (!handle->cur)
+		return -ENOMEM;
+	error = bio_read_page(swp_offset(start), handle->cur);
+	if (error) {
+		release_swap_map_reader(handle);
+		return error;
+	}
+	handle->k = 0;
+	return 0;
+}
+
+static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
+{
+	unsigned long offset;
+	int error;
+
+	if (!handle->cur)
+		return -EINVAL;
+	offset = swp_offset(handle->cur->entries[handle->k]);
+	if (!offset)
+		return -EINVAL;
+	error = bio_read_page(offset, buf);
+	if (error)
+		return error;
+	if (++handle->k >= MAP_PAGE_SIZE) {
+		handle->k = 0;
+		offset = swp_offset(handle->cur->next_swap);
+		if (!offset)
+			release_swap_map_reader(handle);
+		else
+			error = bio_read_page(offset, handle->cur);
+	}
+	return error;
+}
+
 /*
  * Sanity check if this image makes sense with this kernel/swap context
  * I really don't think that it's foolproof but more than nothing..
@@ -639,7 +869,6 @@ static const char *sanity_check(void)
 	return NULL;
 }
 
-
 static int check_header(void)
 {
 	const char *reason = NULL;
@@ -653,7 +882,6 @@ static int check_header(void)
 		printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason);
 		return -EPERM;
 	}
-	nr_copy_pages = swsusp_info.image_pages;
 	return error;
 }
 
@@ -680,75 +908,88 @@ static int check_sig(void)
 }
 
 /**
- *	data_read - Read image pages from swap.
- *
- *	You do not need to check for overlaps, check_pagedir()
- *	already did that.
+ *	load_image_data - load the image data using the swap map handle
+ *	@handle and store them using the page backup list @pblist
+ *	(assume there are @nr_pages pages to load)
  */
 
-static int data_read(struct pbe *pblist)
+static int load_image_data(struct pbe *pblist,
+                           struct swap_map_handle *handle,
+                           unsigned int nr_pages)
 {
+	int error;
+	unsigned int m;
 	struct pbe *p;
-	int error = 0;
-	int i = 0;
-	int mod = swsusp_info.image_pages / 100;
-
-	if (!mod)
-		mod = 1;
-
-	printk("swsusp: Reading image data (%lu pages):     ",
-			swsusp_info.image_pages);
-
-	for_each_pbe (p, pblist) {
-		if (!(i % mod))
-			printk("\b\b\b\b%3d%%", i / mod);
 
-		if ((error = bio_read_page(swp_offset(p->swap_address),
-						(void *)p->address)))
-			return error;
-
-		i++;
+	if (!pblist)
+		return -EINVAL;
+	printk("Loading image data pages (%u pages) ...     ", nr_pages);
+	m = nr_pages / 100;
+	if (!m)
+		m = 1;
+	nr_pages = 0;
+	p = pblist;
+	while (p) {
+		error = swap_map_read_page(handle, (void *)p->address);
+		if (error)
+			break;
+		p = p->next;
+		if (!(nr_pages % m))
+			printk("\b\b\b\b%3d%%", nr_pages / m);
+		nr_pages++;
 	}
-	printk("\b\b\b\bdone\n");
+	if (!error)
+		printk("\b\b\b\bdone\n");
 	return error;
 }
 
 /**
- *	read_pagedir - Read page backup list pages from swap
+ *	unpack_orig_addresses - copy the elements of @buf[] (1 page) to
+ *	the PBEs in the list starting at @pbe
  */
 
-static int read_pagedir(struct pbe *pblist)
+static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
+                                                struct pbe *pbe)
 {
-	struct pbe *pbpage, *p;
-	unsigned int i = 0;
-	int error;
+	int j;
 
-	if (!pblist)
-		return -EFAULT;
+	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
+		pbe->orig_address = buf[j];
+		pbe = pbe->next;
+	}
+	return pbe;
+}
 
-	printk("swsusp: Reading pagedir (%lu pages)\n",
-			swsusp_info.pagedir_pages);
+/**
+ *	load_image_metadata - load the image metadata using the swap map
+ *	handle @handle and put them into the PBEs in the list @pblist
+ */
 
-	for_each_pb_page (pbpage, pblist) {
-		unsigned long offset = swp_offset(swsusp_info.pagedir[i++]);
+static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
+{
+	struct pbe *p;
+	unsigned long *buf;
+	unsigned int n = 0;
+	int error = 0;
 
-		error = -EFAULT;
-		if (offset) {
-			p = (pbpage + PB_PAGE_SKIP)->next;
-			error = bio_read_page(offset, (void *)pbpage);
-			(pbpage + PB_PAGE_SKIP)->next = p;
-		}
+	printk("Loading image metadata ... ");
+	buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
+	if (!buf)
+		return -ENOMEM;
+	p = pblist;
+	while (p) {
+		error = swap_map_read_page(handle, buf);
 		if (error)
 			break;
+		p = unpack_orig_addresses(buf, p);
+		n++;
 	}
-
+	free_page((unsigned long)buf);
 	if (!error)
-		BUG_ON(i != swsusp_info.pagedir_pages);
-
+		printk("done (%u pages loaded)\n", n);
 	return error;
 }
 
-
 static int check_suspend_image(void)
 {
 	int error = 0;
@@ -762,34 +1003,39 @@ static int check_suspend_image(void)
 	return 0;
 }
 
-static int read_suspend_image(void)
+static int read_suspend_image(struct pbe **pblist_ptr)
 {
 	int error = 0;
-	struct pbe *p;
+	struct pbe *p, *pblist;
+	struct swap_map_handle handle;
+	unsigned int nr_pages = swsusp_info.image_pages;
 
-	if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0)))
+	p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
+	if (!p)
 		return -ENOMEM;
-
-	if ((error = read_pagedir(p)))
+	error = get_swap_map_reader(&handle, swsusp_info.start);
+	if (error)
+		/* The PBE list at p will be released by swsusp_free() */
 		return error;
-	create_pbe_list(p, nr_copy_pages);
-	mark_unsafe_pages(p);
-	pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
-	if (pagedir_nosave) {
-		create_pbe_list(pagedir_nosave, nr_copy_pages);
-		copy_page_backup_list(pagedir_nosave, p);
+	error = load_image_metadata(p, &handle);
+	if (!error) {
+		mark_unsafe_pages(p);
+		pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
+		if (pblist)
+			copy_page_backup_list(pblist, p);
+		free_pagedir(p);
+		if (!pblist)
+			error = -ENOMEM;
+
+		/* Allocate memory for the image and read the data from swap */
+		if (!error)
+			error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
+		if (!error)
+			error = load_image_data(pblist, &handle, nr_pages);
+		if (!error)
+			*pblist_ptr = pblist;
 	}
-	free_pagedir(p);
-	if (!pagedir_nosave)
-		return -ENOMEM;
-
-	/* Allocate memory for the image and read the data from swap */
-
-	error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1);
-
-	if (!error)
-		error = data_read(pagedir_nosave);
-
+	release_swap_map_reader(&handle);
 	return error;
 }
 
@@ -821,7 +1067,7 @@ int swsusp_check(void)
  *	swsusp_read - Read saved image from swap.
  */
 
-int swsusp_read(void)
+int swsusp_read(struct pbe **pblist_ptr)
 {
 	int error;
 
@@ -830,7 +1076,7 @@ int swsusp_read(void)
 		return PTR_ERR(resume_bdev);
 	}
 
-	error = read_suspend_image();
+	error = read_suspend_image(pblist_ptr);
 	blkdev_put(resume_bdev);
 
 	if (!error)
-- 
cgit v1.2.3-71-gd317


From 72a97e08394a3b2e75481ff680ec2a0591e3cba4 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:13:46 -0800
Subject: [PATCH] swsusp: improve freeing of memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch makes swsusp free only as much memory as needed to complete the
suspend and not as much as possible.   In the most of cases this should speed
up the suspend and make the system much more responsive after resume,
especially if a GUI (eg.  X Windows) is used.

If needed, the old behavior (ie to free as much memory as possible during
suspend) can be restored by unsetting FAST_FREE in power.h

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/suspend.h |  2 +-
 kernel/power/disk.c     | 30 +++--------------------
 kernel/power/power.h    | 14 ++++++++---
 kernel/power/snapshot.c | 65 +++++++++++++++++++++++++++++++++++++++++++++----
 kernel/power/swsusp.c   | 52 ++++++++++++++++++++++++++++++++++++++-
 5 files changed, 126 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 33bbaea23aaf..5dc94e777fab 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -73,6 +73,6 @@ unsigned long get_safe_page(gfp_t gfp_mask);
  * XXX: We try to keep some more pages free so that I/O operations succeed
  * without paging. Might this be more?
  */
-#define PAGES_FOR_IO	512
+#define PAGES_FOR_IO	1024
 
 #endif /* _LINUX_SWSUSP_H */
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 76a5131b0e80..9e51cdf7b78d 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -24,6 +24,7 @@
 
 extern suspend_disk_method_t pm_disk_mode;
 
+extern int swsusp_shrink_memory(void);
 extern int swsusp_suspend(void);
 extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
 extern int swsusp_check(void);
@@ -73,31 +74,6 @@ static void power_down(suspend_disk_method_t mode)
 static int in_suspend __nosavedata = 0;
 
 
-/**
- *	free_some_memory -  Try to free as much memory as possible
- *
- *	... but do not OOM-kill anyone
- *
- *	Notice: all userland should be stopped at this point, or
- *	livelock is possible.
- */
-
-static void free_some_memory(void)
-{
-	unsigned int i = 0;
-	unsigned int tmp;
-	unsigned long pages = 0;
-	char *p = "-\\|/";
-
-	printk("Freeing memory...  ");
-	while ((tmp = shrink_all_memory(10000))) {
-		pages += tmp;
-		printk("\b%c", p[i++ % 4]);
-	}
-	printk("\bdone (%li pages freed)\n", pages);
-}
-
-
 static inline void platform_finish(void)
 {
 	if (pm_disk_mode == PM_DISK_PLATFORM) {
@@ -127,8 +103,8 @@ static int prepare_processes(void)
 	}
 
 	/* Free memory before shutting down devices. */
-	free_some_memory();
-	return 0;
+	if (!(error = swsusp_shrink_memory()))
+		return 0;
 thaw:
 	thaw_processes();
 	enable_nonboot_cpus();
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 977877c6dcfc..acdc83b3d890 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -49,18 +49,26 @@ extern void thaw_processes(void);
 extern int pm_prepare_console(void);
 extern void pm_restore_console(void);
 
-
 /* References to section boundaries */
 extern const void __nosave_begin, __nosave_end;
 
 extern unsigned int nr_copy_pages;
-extern suspend_pagedir_t *pagedir_nosave;
-extern suspend_pagedir_t *pagedir_save;
+extern struct pbe *pagedir_nosave;
+
+/*
+ * This compilation switch determines the way in which memory will be freed
+ * during suspend.  If defined, only as much memory will be freed as needed
+ * to complete the suspend, which will make it go faster.  Otherwise, the
+ * largest possible amount of memory will be freed.
+ */
+#define FAST_FREE	1
 
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
 
+extern unsigned int count_data_pages(void);
 extern void free_pagedir(struct pbe *pblist);
+extern void release_eaten_pages(void);
 extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
 extern void swsusp_free(void);
 extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 152d56cdf017..e80d282dbf58 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -37,6 +37,31 @@ struct pbe *pagedir_nosave;
 unsigned int nr_copy_pages;
 
 #ifdef CONFIG_HIGHMEM
+unsigned int count_highmem_pages(void)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+	unsigned int n = 0;
+
+	for_each_zone (zone)
+		if (is_highmem(zone)) {
+			mark_free_pages(zone);
+			for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) {
+				struct page *page;
+				unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+				if (!pfn_valid(pfn))
+					continue;
+				page = pfn_to_page(pfn);
+				if (PageReserved(page))
+					continue;
+				if (PageNosaveFree(page))
+					continue;
+				n++;
+			}
+		}
+	return n;
+}
+
 struct highmem_page {
 	char *data;
 	struct page *page;
@@ -152,17 +177,15 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn)
 	BUG_ON(PageReserved(page) && PageNosave(page));
 	if (PageNosave(page))
 		return 0;
-	if (PageReserved(page) && pfn_is_nosave(pfn)) {
-		pr_debug("[nosave pfn 0x%lx]", pfn);
+	if (PageReserved(page) && pfn_is_nosave(pfn))
 		return 0;
-	}
 	if (PageNosaveFree(page))
 		return 0;
 
 	return 1;
 }
 
-static unsigned count_data_pages(void)
+unsigned int count_data_pages(void)
 {
 	struct zone *zone;
 	unsigned long zone_pfn;
@@ -266,6 +289,35 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
 	}
 }
 
+/**
+ *	On resume it is necessary to trace and eventually free the unsafe
+ *	pages that have been allocated, because they are needed for I/O
+ *	(on x86-64 we likely will "eat" these pages once again while
+ *	creating the temporary page translation tables)
+ */
+
+struct eaten_page {
+	struct eaten_page *next;
+	char padding[PAGE_SIZE - sizeof(void *)];
+};
+
+static struct eaten_page *eaten_pages = NULL;
+
+void release_eaten_pages(void)
+{
+	struct eaten_page *p, *q;
+
+	p = eaten_pages;
+	while (p) {
+		q = p->next;
+		/* We don't want swsusp_free() to free this page again */
+		ClearPageNosave(virt_to_page(p));
+		free_page((unsigned long)p);
+		p = q;
+	}
+	eaten_pages = NULL;
+}
+
 /**
  *	@safe_needed - on resume, for storing the PBE list and the image,
  *	we can only use memory pages that do not conflict with the pages
@@ -284,9 +336,12 @@ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
 	if (safe_needed)
 		do {
 			res = (void *)get_zeroed_page(gfp_mask);
-			if (res && PageNosaveFree(virt_to_page(res)))
+			if (res && PageNosaveFree(virt_to_page(res))) {
 				/* This is for swsusp_free() */
 				SetPageNosave(virt_to_page(res));
+				((struct eaten_page *)res)->next = eaten_pages;
+				eaten_pages = res;
+			}
 		} while (res && PageNosaveFree(virt_to_page(res)));
 	else
 		res = (void *)get_zeroed_page(gfp_mask);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index b09bd7c0998d..f77f9397a364 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -70,11 +70,13 @@
 #include "power.h"
 
 #ifdef CONFIG_HIGHMEM
+unsigned int count_highmem_pages(void);
 int save_highmem(void);
 int restore_highmem(void);
 #else
 static int save_highmem(void) { return 0; }
 static int restore_highmem(void) { return 0; }
+static unsigned int count_highmem_pages(void) { return 0; }
 #endif
 
 extern char resume_file[];
@@ -611,6 +613,52 @@ int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
 	return error;
 }
 
+/**
+ *	swsusp_shrink_memory -  Try to free as much memory as needed
+ *
+ *	... but do not OOM-kill anyone
+ *
+ *	Notice: all userland should be stopped before it is called, or
+ *	livelock is possible.
+ */
+
+#define SHRINK_BITE	10000
+
+int swsusp_shrink_memory(void)
+{
+	long tmp;
+	struct zone *zone;
+	unsigned long pages = 0;
+	unsigned int i = 0;
+	char *p = "-\\|/";
+
+	printk("Shrinking memory...  ");
+	do {
+#ifdef FAST_FREE
+		tmp = 2 * count_highmem_pages();
+		tmp += tmp / 50 + count_data_pages();
+		tmp += (tmp + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
+			PAGES_FOR_IO;
+		for_each_zone (zone)
+			if (!is_highmem(zone))
+				tmp -= zone->free_pages;
+		if (tmp > 0) {
+			tmp = shrink_all_memory(SHRINK_BITE);
+			if (!tmp)
+				return -ENOMEM;
+			pages += tmp;
+		}
+#else
+		tmp = shrink_all_memory(SHRINK_BITE);
+		pages += tmp;
+#endif
+		printk("\b%c", p[i++%4]);
+	} while (tmp > 0);
+	printk("\bdone (%lu pages freed)\n", pages);
+
+	return 0;
+}
+
 int swsusp_suspend(void)
 {
 	int error;
@@ -1030,8 +1078,10 @@ static int read_suspend_image(struct pbe **pblist_ptr)
 		/* Allocate memory for the image and read the data from swap */
 		if (!error)
 			error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
-		if (!error)
+		if (!error) {
+			release_eaten_pages();
 			error = load_image_data(pblist, &handle, nr_pages);
+		}
 		if (!error)
 			*pblist_ptr = pblist;
 	}
-- 
cgit v1.2.3-71-gd317


From 3a291a20bd6fcfafb2109031f0760a0d3e92ecd7 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:16:37 -0800
Subject: [PATCH] mm: add a new function (needed for swap suspend)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds the function get_swap_page_of_type() allowing us to specify an index
in swap_info[] and select a swap_info_struct structure to be used for
allocating a swap page.

This function (or another one of similar functionality) will be necessary for
implementing the image-writing part of swsusp in the user space.   It can also
be used for simplifying the current in-kernel implementation of the
image-writing part of swsusp.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h |  1 +
 mm/swapfile.c        | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index bd6641784107..556617bcf7ac 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -209,6 +209,7 @@ extern unsigned int nr_swapfiles;
 extern struct swap_info_struct swap_info[];
 extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(void);
+extern swp_entry_t get_swap_page_of_type(int type);
 extern int swap_duplicate(swp_entry_t);
 extern int valid_swaphandles(swp_entry_t, unsigned long *);
 extern void swap_free(swp_entry_t);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index edafeace301f..6da4b28b896b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -211,6 +211,26 @@ noswap:
 	return (swp_entry_t) {0};
 }
 
+swp_entry_t get_swap_page_of_type(int type)
+{
+	struct swap_info_struct *si;
+	pgoff_t offset;
+
+	spin_lock(&swap_lock);
+	si = swap_info + type;
+	if (si->flags & SWP_WRITEOK) {
+		nr_swap_pages--;
+		offset = scan_swap_map(si);
+		if (offset) {
+			spin_unlock(&swap_lock);
+			return swp_entry(type, offset);
+		}
+		nr_swap_pages++;
+	}
+	spin_unlock(&swap_lock);
+	return (swp_entry_t) {0};
+}
+
 static struct swap_info_struct * swap_info_get(swp_entry_t entry)
 {
 	struct swap_info_struct * p;
-- 
cgit v1.2.3-71-gd317


From 347a8dc3b815f0c0fa62a1df075184ffe4cbdcf1 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:28 -0800
Subject: [PATCH] s390: cleanup Kconfig

Sanitize some s390 Kconfig options.  We have ARCH_S390, ARCH_S390X,
ARCH_S390_31, 64BIT, S390_SUPPORT and COMPAT.  Replace these 6 options by
S390, 64BIT and COMPAT.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/Kconfig                  | 27 +++++++--------------------
 arch/s390/Makefile                 |  6 ++----
 arch/s390/appldata/appldata_base.c |  8 ++++----
 arch/s390/crypto/crypt_s390.h      | 10 +++++-----
 arch/s390/defconfig                |  4 +---
 arch/s390/kernel/Makefile          | 15 +++++----------
 arch/s390/kernel/cpcmd.c           | 16 ++++++++--------
 arch/s390/kernel/entry64.S         | 18 +++++++++---------
 arch/s390/kernel/head.S            |  4 ++--
 arch/s390/kernel/module.c          | 12 ++++++------
 arch/s390/kernel/process.c         | 12 ++++++------
 arch/s390/kernel/ptrace.c          | 24 ++++++++++++------------
 arch/s390/kernel/reipl_diag.c      |  2 +-
 arch/s390/kernel/setup.c           | 14 +++++++-------
 arch/s390/kernel/signal.c          |  2 +-
 arch/s390/kernel/smp.c             |  8 ++++----
 arch/s390/kernel/sys_s390.c        | 12 +++++-------
 arch/s390/kernel/traps.c           | 10 +++++-----
 arch/s390/kernel/vmlinux.lds.S     |  2 +-
 arch/s390/lib/Makefile             |  5 ++---
 arch/s390/lib/spinlock.c           |  2 +-
 arch/s390/mm/extmem.c              |  2 +-
 arch/s390/mm/fault.c               | 18 +++++++++---------
 arch/s390/mm/init.c                |  8 ++++----
 arch/s390/mm/mmap.c                |  2 +-
 block/Kconfig                      |  2 +-
 crypto/Kconfig                     |  8 ++++----
 drivers/char/Kconfig               |  2 +-
 drivers/char/hangcheck-timer.c     |  2 +-
 drivers/char/watchdog/Kconfig      |  2 +-
 drivers/input/evdev.c              |  2 +-
 drivers/net/phy/Kconfig            |  2 +-
 drivers/s390/block/Kconfig         |  8 ++++----
 drivers/s390/block/dasd.c          |  2 +-
 drivers/s390/block/dasd_diag.c     |  2 +-
 drivers/s390/block/dasd_diag.h     |  6 +++---
 drivers/s390/block/dasd_eckd.c     |  2 +-
 drivers/s390/block/dasd_fba.c      |  2 +-
 drivers/s390/block/xpram.c         |  4 ++--
 drivers/s390/char/vmwatchdog.c     |  2 +-
 drivers/s390/cio/cio.c             |  2 +-
 drivers/s390/cio/device_id.c       |  2 +-
 drivers/s390/cio/ioasm.h           |  4 ++--
 drivers/s390/cio/qdio.c            |  2 +-
 drivers/s390/cio/qdio.h            | 34 +++++++++++++++++-----------------
 drivers/s390/crypto/z90hardware.c  |  8 ++++----
 drivers/s390/net/Kconfig           |  2 +-
 drivers/s390/net/claw.c            |  6 +++---
 drivers/s390/s390mach.c            | 10 +++++-----
 drivers/s390/sysinfo.c             |  2 +-
 drivers/scsi/Kconfig               |  2 +-
 fs/partitions/Kconfig              |  2 +-
 fs/proc/array.c                    |  2 +-
 include/asm-s390/unistd.h          |  2 +-
 include/linux/irq.h                |  2 +-
 init/Kconfig                       |  2 +-
 init/do_mounts_rd.c                |  4 ++--
 kernel/panic.c                     |  4 ++--
 kernel/sysctl.c                    |  6 +++---
 lib/Kconfig.debug                  |  2 +-
 60 files changed, 183 insertions(+), 208 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 1846fbfd6bf2..6fe532d82417 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -23,14 +23,14 @@ config GENERIC_BUST_SPINLOCK
 
 mainmenu "Linux Kernel Configuration"
 
-config ARCH_S390
+config S390
 	bool
 	default y
 
 config UID16
 	bool
 	default y
-	depends on ARCH_S390X = 'n'
+	depends on !64BIT
 
 source "init/Kconfig"
 
@@ -38,20 +38,12 @@ menu "Base setup"
 
 comment "Processor type and features"
 
-config ARCH_S390X
+config 64BIT
 	bool "64 bit kernel"
 	help
 	  Select this option if you have a 64 bit IBM zSeries machine
 	  and want to use the 64 bit addressing mode.
 
-config 64BIT
-	def_bool ARCH_S390X
-
-config ARCH_S390_31
-	bool
-	depends on ARCH_S390X = 'n'
-	default y
-
 config SMP
 	bool "Symmetric multi-processing support"
 	---help---
@@ -101,20 +93,15 @@ config MATHEMU
 	  on older S/390 machines. Say Y unless you know your machine doesn't
 	  need this.
 
-config S390_SUPPORT
+config COMPAT
 	bool "Kernel support for 31 bit emulation"
-	depends on ARCH_S390X
+	depends on 64BIT
 	help
 	  Select this option if you want to enable your system kernel to
 	  handle system-calls from ELF binaries for 31 bit ESA.  This option
 	  (and some other stuff like libraries and such) is needed for
 	  executing 31 bit applications.  It is safe to say "Y".
 
-config COMPAT
-	bool
-	depends on S390_SUPPORT
-	default y
-
 config SYSVIPC_COMPAT
 	bool
 	depends on COMPAT && SYSVIPC
@@ -122,7 +109,7 @@ config SYSVIPC_COMPAT
 
 config BINFMT_ELF32
 	tristate "Kernel support for 31 bit ELF binaries"
-	depends on S390_SUPPORT
+	depends on COMPAT
 	help
 	  This allows you to run 32-bit Linux/ELF binaries on your zSeries
 	  in 64 bit mode. Everybody wants this; say Y.
@@ -135,7 +122,7 @@ choice
 
 config MARCH_G5
 	bool "S/390 model G5 and G6"
-	depends on ARCH_S390_31
+	depends on !64BIT
 	help
 	  Select this to build a 31 bit kernel that works
 	  on all S/390 and zSeries machines.
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index 73a09a6ee6c8..6c6b197898d0 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -13,16 +13,14 @@
 # Copyright (C) 1994 by Linus Torvalds
 #
 
-ifdef CONFIG_ARCH_S390_31
+ifndef CONFIG_64BIT
 LDFLAGS		:= -m elf_s390
 CFLAGS		+= -m31
 AFLAGS		+= -m31
 UTS_MACHINE	:= s390
 STACK_SIZE	:= 8192
 CHECKFLAGS	+= -D__s390__
-endif
-
-ifdef CONFIG_ARCH_S390X
+else
 LDFLAGS		:= -m elf64_s390
 MODFLAGS	+= -fpic -D__PIC__
 CFLAGS		+= -m64
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index dee6ab54984d..d06a8d71c71d 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -40,7 +40,7 @@
 
 #define TOD_MICRO	0x01000			/* nr. of TOD clock units
 						   for 1 microsecond */
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 
 #define APPLDATA_START_INTERVAL_REC 0x00   	/* Function codes for */
 #define APPLDATA_STOP_REC	    0x01	/* DIAG 0xDC	  */
@@ -54,13 +54,13 @@
 #define APPLDATA_GEN_EVENT_RECORD   0x82
 #define APPLDATA_START_CONFIG_REC   0x83
 
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 
 /*
  * Parameter list for DIAGNOSE X'DC'
  */
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 struct appldata_parameter_list {
 	u16 diag;		/* The DIAGNOSE code X'00DC'          */
 	u8  function;		/* The function code for the DIAGNOSE */
@@ -82,7 +82,7 @@ struct appldata_parameter_list {
 	u64 product_id_addr;
 	u64 buffer_addr;
 };
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 /*
  * /proc entries (sysctl)
diff --git a/arch/s390/crypto/crypt_s390.h b/arch/s390/crypto/crypt_s390.h
index d6712cfa6def..d1c259a7fe33 100644
--- a/arch/s390/crypto/crypt_s390.h
+++ b/arch/s390/crypto/crypt_s390.h
@@ -112,7 +112,7 @@ struct crypt_s390_query_status {
  * [ret] is the variable to receive the error code
  * [ERR] is the error code value
  */
-#ifndef __s390x__
+#ifndef CONFIG_64BIT
 #define __crypt_s390_fixup \
 	".section .fixup,\"ax\" \n"	\
 	"7:	lhi	%0,%h[e1] \n"	\
@@ -129,7 +129,7 @@ struct crypt_s390_query_status {
 	"	.long	0b,7b \n"	\
 	"	.long	1b,8b \n"	\
 	".previous"
-#else /* __s390x__ */
+#else /* CONFIG_64BIT */
 #define __crypt_s390_fixup \
 	".section .fixup,\"ax\" \n"	\
 	"7:	lhi	%0,%h[e1] \n"	\
@@ -142,7 +142,7 @@ struct crypt_s390_query_status {
 	"	.quad	0b,7b \n"	\
 	"	.quad	1b,8b \n"	\
 	".previous"
-#endif /* __s390x__ */
+#endif /* CONFIG_64BIT */
 
 /*
  * Standard code for setting the result of s390 crypto instructions.
@@ -150,10 +150,10 @@ struct crypt_s390_query_status {
  * [result]: the register containing the result (e.g. second operand length
  * to compute number of processed bytes].
  */
-#ifndef __s390x__
+#ifndef CONFIG_64BIT
 #define __crypt_s390_set_result \
 	"	lr	%0,%[result] \n"
-#else /* __s390x__ */
+#else /* CONFIG_64BIT */
 #define __crypt_s390_set_result \
 	"	lgr	%0,%[result] \n"
 #endif
diff --git a/arch/s390/defconfig b/arch/s390/defconfig
index f195c7ea1d7b..7d23edc6facb 100644
--- a/arch/s390/defconfig
+++ b/arch/s390/defconfig
@@ -6,7 +6,7 @@
 CONFIG_MMU=y
 CONFIG_RWSEM_XCHGADD_ALGORITHM=y
 CONFIG_GENERIC_CALIBRATE_DELAY=y
-CONFIG_ARCH_S390=y
+CONFIG_S390=y
 CONFIG_UID16=y
 
 #
@@ -89,9 +89,7 @@ CONFIG_DEFAULT_IOSCHED="anticipatory"
 #
 # Processor type and features
 #
-# CONFIG_ARCH_S390X is not set
 # CONFIG_64BIT is not set
-CONFIG_ARCH_S390_31=y
 CONFIG_SMP=y
 CONFIG_NR_CPUS=32
 CONFIG_HOTPLUG_CPU=y
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 7434c32bc631..4865e4b49464 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -8,31 +8,26 @@ obj-y	:=  bitmap.o traps.o time.o process.o \
             setup.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o \
             semaphore.o s390_ext.o debug.o profile.o irq.o reipl_diag.o
 
+obj-y	+= $(if $(CONFIG_64BIT),entry64.o,entry.o)
+obj-y	+= $(if $(CONFIG_64BIT),reipl64.o,reipl.o)
+
 extra-y				+= head.o init_task.o vmlinux.lds
 
 obj-$(CONFIG_MODULES)		+= s390_ksyms.o module.o
 obj-$(CONFIG_SMP)		+= smp.o
 
-obj-$(CONFIG_S390_SUPPORT)	+= compat_linux.o compat_signal.o \
+obj-$(CONFIG_COMPAT)		+= compat_linux.o compat_signal.o \
 					compat_ioctl.o compat_wrapper.o \
 					compat_exec_domain.o
 obj-$(CONFIG_BINFMT_ELF32)	+= binfmt_elf32.o
 
-obj-$(CONFIG_ARCH_S390_31)	+= entry.o reipl.o
-obj-$(CONFIG_ARCH_S390X)	+= entry64.o reipl64.o
-
 obj-$(CONFIG_VIRT_TIMER)	+= vtime.o
 
 # Kexec part
 S390_KEXEC_OBJS := machine_kexec.o crash.o
-ifeq ($(CONFIG_ARCH_S390X),y)
-S390_KEXEC_OBJS += relocate_kernel64.o
-else
-S390_KEXEC_OBJS += relocate_kernel.o
-endif
+S390_KEXEC_OBJS += $(if $(CONFIG_64BIT),relocate_kernel64.o,relocate_kernel.o)
 obj-$(CONFIG_KEXEC) += $(S390_KEXEC_OBJS)
 
-
 #
 # This is just to get the dependencies...
 #
diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c
index d47fecb42cc5..4ef44e536b2c 100644
--- a/arch/s390/kernel/cpcmd.c
+++ b/arch/s390/kernel/cpcmd.c
@@ -39,7 +39,7 @@ int  __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 
 	if (response != NULL && rlen > 0) {
 		memset(response, 0, rlen);
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 		asm volatile (	"lra	2,0(%2)\n"
 				"lr	4,%3\n"
 				"o	4,%6\n"
@@ -55,7 +55,7 @@ int  __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 				: "a" (cpcmd_buf), "d" (cmdlen),
 				"a" (response), "d" (rlen), "m" (mask)
 				: "cc", "2", "3", "4", "5" );
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
                 asm volatile (	"lrag	2,0(%2)\n"
 				"lgr	4,%3\n"
 				"o	4,%6\n"
@@ -73,11 +73,11 @@ int  __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 				: "a" (cpcmd_buf), "d" (cmdlen),
 				"a" (response), "d" (rlen), "m" (mask)
 				: "cc", "2", "3", "4", "5" );
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
                 EBCASC(response, rlen);
         } else {
 		return_len = 0;
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
                 asm volatile (	"lra	2,0(%1)\n"
 				"lr	3,%2\n"
 				"diag	2,3,0x8\n"
@@ -85,7 +85,7 @@ int  __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 				: "=d" (return_code)
 				: "a" (cpcmd_buf), "d" (cmdlen)
 				: "2", "3"  );
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
                 asm volatile (	"lrag	2,0(%1)\n"
 				"lgr	3,%2\n"
 				"sam31\n"
@@ -95,7 +95,7 @@ int  __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 				: "=d" (return_code)
 				: "a" (cpcmd_buf), "d" (cmdlen)
 				: "2", "3" );
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
         }
 	spin_unlock_irqrestore(&cpcmd_lock, flags);
 	if (response_code != NULL)
@@ -105,7 +105,7 @@ int  __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 
 EXPORT_SYMBOL(__cpcmd);
 
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 int cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 {
 	char *lowbuf;
@@ -129,4 +129,4 @@ int cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 }
 
 EXPORT_SYMBOL(cpcmd);
-#endif		/* CONFIG_ARCH_S390X */
+#endif		/* CONFIG_64BIT */
diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S
index 4eb71ffcf484..369ab4413ec7 100644
--- a/arch/s390/kernel/entry64.S
+++ b/arch/s390/kernel/entry64.S
@@ -213,7 +213,7 @@ sysc_nr_ok:
 	mvc	SP_ARGS(8,%r15),SP_R7(%r15)
 sysc_do_restart:
 	larl    %r10,sys_call_table
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 	tm	__TI_flags+5(%r9),(_TIF_31BIT>>16)  # running in 31 bit mode ?
 	jno	sysc_noemu
 	larl    %r10,sys_call_table_emu  # use 31 bit emulation system calls
@@ -361,7 +361,7 @@ sys_clone_glue:
         la      %r2,SP_PTREGS(%r15)    # load pt_regs
         jg      sys_clone              # branch to sys_clone
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 sys32_clone_glue: 
         la      %r2,SP_PTREGS(%r15)    # load pt_regs
         jg      sys32_clone            # branch to sys32_clone
@@ -383,7 +383,7 @@ sys_execve_glue:
         bnz     0(%r12)               # it did fail -> store result in gpr2
         b       6(%r12)               # SKIP STG 2,SP_R2(15) in
                                       # system_call/sysc_tracesys
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 sys32_execve_glue:        
         la      %r2,SP_PTREGS(%r15)   # load pt_regs
 	lgr     %r12,%r14             # save return address
@@ -398,7 +398,7 @@ sys_sigreturn_glue:
         la      %r2,SP_PTREGS(%r15)   # load pt_regs as parameter
         jg      sys_sigreturn         # branch to sys_sigreturn
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 sys32_sigreturn_glue:     
         la      %r2,SP_PTREGS(%r15)   # load pt_regs as parameter
         jg      sys32_sigreturn       # branch to sys32_sigreturn
@@ -408,7 +408,7 @@ sys_rt_sigreturn_glue:
         la      %r2,SP_PTREGS(%r15)   # load pt_regs as parameter
         jg      sys_rt_sigreturn      # branch to sys_sigreturn
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 sys32_rt_sigreturn_glue:     
         la      %r2,SP_PTREGS(%r15)   # load pt_regs as parameter
         jg      sys32_rt_sigreturn    # branch to sys32_sigreturn
@@ -429,7 +429,7 @@ sys_sigsuspend_glue:
 	la      %r14,6(%r14)          # skip store of return value
         jg      sys_sigsuspend        # branch to sys_sigsuspend
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 sys32_sigsuspend_glue:    
 	llgfr	%r4,%r4               # unsigned long			
         lgr     %r5,%r4               # move mask back
@@ -449,7 +449,7 @@ sys_rt_sigsuspend_glue:
 	la      %r14,6(%r14)          # skip store of return value
         jg      sys_rt_sigsuspend     # branch to sys_rt_sigsuspend
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 sys32_rt_sigsuspend_glue: 
 	llgfr	%r3,%r3               # size_t			
         lgr     %r4,%r3               # move sigsetsize parameter
@@ -464,7 +464,7 @@ sys_sigaltstack_glue:
         la      %r4,SP_PTREGS(%r15)   # load pt_regs as parameter
         jg      sys_sigaltstack       # branch to sys_sigreturn
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 sys32_sigaltstack_glue:
         la      %r4,SP_PTREGS(%r15)   # load pt_regs as parameter
         jg      sys32_sigaltstack_wrapper # branch to sys_sigreturn
@@ -1009,7 +1009,7 @@ sys_call_table:
 #include "syscalls.S"
 #undef SYSCALL
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 
 #define SYSCALL(esa,esame,emu)	.long emu
 	.globl  sys_call_table_emu
diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S
index d31a97c89f68..ea88d066bf04 100644
--- a/arch/s390/kernel/head.S
+++ b/arch/s390/kernel/head.S
@@ -30,7 +30,7 @@
 #include <asm/thread_info.h>
 #include <asm/page.h>
 
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 #define ARCH_OFFSET	4
 #else
 #define ARCH_OFFSET	0
@@ -539,7 +539,7 @@ ipl_devno:
 	.word 0
 .endm
 
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 #include "head64.S"
 #else
 #include "head31.S"
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index 607d506689c8..c271cdab58e2 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -37,11 +37,11 @@
 #define DEBUGP(fmt , ...)
 #endif
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 #define PLT_ENTRY_SIZE 12
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 #define PLT_ENTRY_SIZE 20
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 void *module_alloc(unsigned long size)
 {
@@ -294,17 +294,17 @@ apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
 			unsigned int *ip;
 			ip = me->module_core + me->arch.plt_offset +
 				info->plt_offset;
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 			ip[0] = 0x0d105810; /* basr 1,0; l 1,6(1); br 1 */
 			ip[1] = 0x100607f1;
 			ip[2] = val;
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 			ip[0] = 0x0d10e310; /* basr 1,0; lg 1,10(1); br 1 */
 			ip[1] = 0x100a0004;
 			ip[2] = 0x07f10000;
 			ip[3] = (unsigned int) (val >> 32);
 			ip[4] = (unsigned int) val;
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 			info->plt_initialized = 1;
 		}
 		if (r_type == R_390_PLTOFF16 ||
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 78b64fe5e7c2..a942bf2d58e9 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -235,7 +235,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp,
 	/* Save access registers to new thread structure. */
 	save_access_regs(&p->thread.acrs[0]);
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
         /*
 	 * save fprs to current->thread.fp_regs to merge them with
 	 * the emulated registers and then copy the result to the child.
@@ -247,7 +247,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp,
 	/* Set a new TLS ?  */
 	if (clone_flags & CLONE_SETTLS)
 		p->thread.acrs[0] = regs->gprs[6];
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 	/* Save the fpu registers to new thread structure. */
 	save_fp_regs(&p->thread.fp_regs);
         p->thread.user_seg = __pa((unsigned long) p->mm->pgd) | _REGION_TABLE;
@@ -260,7 +260,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp,
 			p->thread.acrs[1] = (unsigned int) regs->gprs[6];
 		}
 	}
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 	/* start new process with ar4 pointing to the correct address space */
 	p->thread.mm_segment = get_fs();
         /* Don't copy debug registers */
@@ -339,16 +339,16 @@ out:
  */
 int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs)
 {
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
         /*
 	 * save fprs to current->thread.fp_regs to merge them with
 	 * the emulated registers and then copy the result to the dump.
 	 */
 	save_fp_regs(&current->thread.fp_regs);
 	memcpy(fpregs, &current->thread.fp_regs, sizeof(s390_fp_regs));
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 	save_fp_regs(fpregs);
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 	return 1;
 }
 
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 06afa3103ace..8ecda6d66de4 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -42,7 +42,7 @@
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 #include "compat_ptrace.h"
 #endif
 
@@ -59,7 +59,7 @@ FixPerRegisters(struct task_struct *task)
 	
 	if (per_info->single_step) {
 		per_info->control_regs.bits.starting_addr = 0;
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 		if (test_thread_flag(TIF_31BIT))
 			per_info->control_regs.bits.ending_addr = 0x7fffffffUL;
 		else
@@ -112,7 +112,7 @@ ptrace_disable(struct task_struct *child)
 	clear_single_step(child);
 }
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 # define __ADDR_MASK 3
 #else
 # define __ADDR_MASK 7
@@ -138,7 +138,7 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data)
 	 * an alignment of 4. Programmers from hell...
 	 */
 	mask = __ADDR_MASK;
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 	if (addr >= (addr_t) &dummy->regs.acrs &&
 	    addr < (addr_t) &dummy->regs.orig_gpr2)
 		mask = 3;
@@ -160,7 +160,7 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data)
 		 * access registers are stored in the thread structure
 		 */
 		offset = addr - (addr_t) &dummy->regs.acrs;
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 		/*
 		 * Very special case: old & broken 64 bit gdb reading
 		 * from acrs[15]. Result is a 64 bit value. Read the
@@ -218,7 +218,7 @@ poke_user(struct task_struct *child, addr_t addr, addr_t data)
 	 * an alignment of 4. Programmers from hell indeed...
 	 */
 	mask = __ADDR_MASK;
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 	if (addr >= (addr_t) &dummy->regs.acrs &&
 	    addr < (addr_t) &dummy->regs.orig_gpr2)
 		mask = 3;
@@ -231,13 +231,13 @@ poke_user(struct task_struct *child, addr_t addr, addr_t data)
 		 * psw and gprs are stored on the stack
 		 */
 		if (addr == (addr_t) &dummy->regs.psw.mask &&
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 		    data != PSW_MASK_MERGE(PSW_USER32_BITS, data) &&
 #endif
 		    data != PSW_MASK_MERGE(PSW_USER_BITS, data))
 			/* Invalid psw mask. */
 			return -EINVAL;
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 		if (addr == (addr_t) &dummy->regs.psw.addr)
 			/* I'd like to reject addresses without the
 			   high order bit but older gdb's rely on it */
@@ -250,7 +250,7 @@ poke_user(struct task_struct *child, addr_t addr, addr_t data)
 		 * access registers are stored in the thread structure
 		 */
 		offset = addr - (addr_t) &dummy->regs.acrs;
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 		/*
 		 * Very special case: old & broken 64 bit gdb writing
 		 * to acrs[15] with a 64 bit value. Ignore the lower
@@ -357,7 +357,7 @@ do_ptrace_normal(struct task_struct *child, long request, long addr, long data)
 	return ptrace_request(child, request, addr, data);
 }
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 /*
  * Now the fun part starts... a 31 bit program running in the
  * 31 bit emulation tracing another program. PTRACE_PEEKTEXT,
@@ -629,7 +629,7 @@ do_ptrace(struct task_struct *child, long request, long addr, long data)
 			return peek_user(child, addr, data);
 		if (request == PTRACE_POKEUSR && addr == PT_IEEE_IP)
 			return poke_user(child, addr, data);
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 		if (request == PTRACE_PEEKUSR &&
 		    addr == PT32_IEEE_IP && test_thread_flag(TIF_31BIT))
 			return peek_user_emu31(child, addr, data);
@@ -695,7 +695,7 @@ do_ptrace(struct task_struct *child, long request, long addr, long data)
 
 	/* Do requests that differ for 31/64 bit */
 	default:
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 		if (test_thread_flag(TIF_31BIT))
 			return do_ptrace_emu31(child, request, addr, data);
 #endif
diff --git a/arch/s390/kernel/reipl_diag.c b/arch/s390/kernel/reipl_diag.c
index 83cb42bc0b76..1f33951ba439 100644
--- a/arch/s390/kernel/reipl_diag.c
+++ b/arch/s390/kernel/reipl_diag.c
@@ -26,7 +26,7 @@ void reipl_diag(void)
 		"   st   %%r4,%0\n"
 		"   st   %%r5,%1\n"
                 ".section __ex_table,\"a\"\n"
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
                 "   .align 8\n"
                 "   .quad 0b, 0b\n"
 #else
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 31e7b19348b7..b03847d100d9 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -427,7 +427,7 @@ setup_lowcore(void)
 		__alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0) + PAGE_SIZE;
 	lc->current_task = (unsigned long) init_thread_union.thread_info.task;
 	lc->thread_info = (unsigned long) &init_thread_union;
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 	if (MACHINE_HAS_IEEE) {
 		lc->extended_save_area_addr = (__u32)
 			__alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0);
@@ -562,21 +562,21 @@ setup_arch(char **cmdline_p)
         /*
          * print what head.S has found out about the machine
          */
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 	printk((MACHINE_IS_VM) ?
 	       "We are running under VM (31 bit mode)\n" :
 	       "We are running native (31 bit mode)\n");
 	printk((MACHINE_HAS_IEEE) ?
 	       "This machine has an IEEE fpu\n" :
 	       "This machine has no IEEE fpu\n");
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 	printk((MACHINE_IS_VM) ?
 	       "We are running under VM (64 bit mode)\n" :
 	       "We are running native (64 bit mode)\n");
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
         ROOT_DEV = Root_RAM0;
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 	memory_end = memory_size & ~0x400000UL;  /* align memory end to 4MB */
         /*
          * We need some free virtual space to be able to do vmalloc.
@@ -585,9 +585,9 @@ setup_arch(char **cmdline_p)
          */
         if (memory_end > 1920*1024*1024)
                 memory_end = 1920*1024*1024;
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 	memory_end = memory_size & ~0x200000UL;  /* detected in head.s */
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 	init_mm.start_code = PAGE_OFFSET;
 	init_mm.end_code = (unsigned long) &_etext;
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 13592d00a10f..6ae4a77270b5 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -501,7 +501,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
 
 	if (signr > 0) {
 		/* Whee!  Actually deliver the signal.  */
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 		if (test_thread_flag(TIF_31BIT)) {
 			extern void handle_signal32(unsigned long sig,
 						    struct k_sigaction *ka,
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index bd5b311006be..e10f4ca00499 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -402,7 +402,7 @@ static void smp_ext_bitcall_others(ec_bit_sig sig)
         }
 }
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 /*
  * this function sends a 'purge tlb' signal to another CPU.
  */
@@ -416,7 +416,7 @@ void smp_ptlb_all(void)
         on_each_cpu(smp_ptlb_callback, NULL, 0, 1);
 }
 EXPORT_SYMBOL(smp_ptlb_all);
-#endif /* ! CONFIG_ARCH_S390X */
+#endif /* ! CONFIG_64BIT */
 
 /*
  * this function sends a 'reschedule' IPI to another CPU.
@@ -783,7 +783,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 		if (stack == 0ULL)
 			panic("smp_boot_cpus failed to allocate memory\n");
 		lowcore_ptr[i]->panic_stack = stack + (PAGE_SIZE);
-#ifndef __s390x__
+#ifndef CONFIG_64BIT
 		if (MACHINE_HAS_IEEE) {
 			lowcore_ptr[i]->extended_save_area_addr =
 				(__u32) __get_free_pages(GFP_KERNEL,0);
@@ -793,7 +793,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 		}
 #endif
 	}
-#ifndef __s390x__
+#ifndef CONFIG_64BIT
 	if (MACHINE_HAS_IEEE)
 		ctl_set_bit(14, 29); /* enable extended save area */
 #endif
diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/sys_s390.c
index efe6b83b53f7..6a63553493c5 100644
--- a/arch/s390/kernel/sys_s390.c
+++ b/arch/s390/kernel/sys_s390.c
@@ -26,9 +26,7 @@
 #include <linux/mman.h>
 #include <linux/file.h>
 #include <linux/utsname.h>
-#ifdef CONFIG_ARCH_S390X
 #include <linux/personality.h>
-#endif /* CONFIG_ARCH_S390X */
 
 #include <asm/uaccess.h>
 #include <asm/ipc.h>
@@ -121,7 +119,7 @@ out:
 	return error;
 }
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 struct sel_arg_struct {
 	unsigned long n;
 	fd_set *inp, *outp, *exp;
@@ -138,7 +136,7 @@ asmlinkage long old_select(struct sel_arg_struct __user *arg)
 	return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
 
 }
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 /*
  * sys_ipc() is the de-multiplexer for the SysV IPC calls..
@@ -211,7 +209,7 @@ asmlinkage long sys_ipc(uint call, int first, unsigned long second,
 	return -EINVAL;
 }
 
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 asmlinkage long s390x_newuname(struct new_utsname __user *name)
 {
 	int ret = sys_newuname(name);
@@ -235,12 +233,12 @@ asmlinkage long s390x_personality(unsigned long personality)
 
 	return ret;
 }
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 /*
  * Wrapper function for sys_fadvise64/fadvise64_64
  */
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 
 asmlinkage long
 s390_fadvise64(int fd, u32 offset_high, u32 offset_low, size_t len, int advice)
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index c5bd36fae56b..95d109968619 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -67,13 +67,13 @@ extern pgm_check_handler_t do_monitor_call;
 
 #define stack_pointer ({ void **sp; asm("la %0,0(15)" : "=&d" (sp)); sp; })
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 #define FOURLONG "%08lx %08lx %08lx %08lx\n"
 static int kstack_depth_to_print = 12;
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 #define FOURLONG "%016lx %016lx %016lx %016lx\n"
 static int kstack_depth_to_print = 20;
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 /*
  * For show_trace we have tree different stack to consider:
@@ -702,12 +702,12 @@ void __init trap_init(void)
         pgm_check_table[0x11] = &do_dat_exception;
         pgm_check_table[0x12] = &translation_exception;
         pgm_check_table[0x13] = &special_op_exception;
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
         pgm_check_table[0x38] = &do_dat_exception;
 	pgm_check_table[0x39] = &do_dat_exception;
 	pgm_check_table[0x3A] = &do_dat_exception;
         pgm_check_table[0x3B] = &do_dat_exception;
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
         pgm_check_table[0x15] = &operand_exception;
         pgm_check_table[0x1C] = &space_switch_exception;
         pgm_check_table[0x1D] = &hfp_sqrt_exception;
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 89fdb3808bc0..9289face3027 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -5,7 +5,7 @@
 #include <asm-generic/vmlinux.lds.h>
 #include <linux/config.h>
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390")
 OUTPUT_ARCH(s390)
 ENTRY(_start)
diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index b701efa1f00e..d9b97b3c597f 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -4,6 +4,5 @@
 
 EXTRA_AFLAGS := -traditional
 
-lib-y += delay.o string.o
-lib-$(CONFIG_ARCH_S390_31) += uaccess.o spinlock.o
-lib-$(CONFIG_ARCH_S390X) += uaccess64.o spinlock.o
+lib-y += delay.o string.o spinlock.o
+lib-y += $(if $(CONFIG_64BIT),uaccess64.o,uaccess.o)
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index 2dc14e9c8327..68d79c502081 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -29,7 +29,7 @@ __setup("spin_retry=", spin_retry_setup);
 static inline void
 _diag44(void)
 {
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 	if (MACHINE_HAS_DIAG44)
 #endif
 		asm volatile("diag 0,0,0x44");
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index 506a33b51e4f..a9566bcab682 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -143,7 +143,7 @@ dcss_diag (__u8 func, void *parameter,
 	rx = (unsigned long) parameter;
 	ry = (unsigned long) func;
 	__asm__ __volatile__(
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 		"   sam31\n" // switch to 31 bit
 		"   diag    %0,%1,0x64\n"
 		"   sam64\n" // switch back to 64 bit
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index fb2607c369ed..81ade401b073 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -31,17 +31,17 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 #define __FAIL_ADDR_MASK 0x7ffff000
 #define __FIXUP_MASK 0x7fffffff
 #define __SUBCODE_MASK 0x0200
 #define __PF_RES_FIELD 0ULL
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 #define __FAIL_ADDR_MASK -4096L
 #define __FIXUP_MASK ~0L
 #define __SUBCODE_MASK 0x0600
 #define __PF_RES_FIELD 0x8000000000000000ULL
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 #ifdef CONFIG_SYSCTL
 extern int sysctl_userprocess_debug;
@@ -393,11 +393,11 @@ int pfault_init(void)
 		"2:\n"
 		".section __ex_table,\"a\"\n"
 		"   .align 4\n"
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 		"   .long  0b,1b\n"
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 		"   .quad  0b,1b\n"
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 		".previous"
                 : "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc" );
         __ctl_set_bit(0, 9);
@@ -417,11 +417,11 @@ void pfault_fini(void)
 		"0:\n"
 		".section __ex_table,\"a\"\n"
 		"   .align 4\n"
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 		"   .long  0b,0b\n"
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 		"   .quad  0b,0b\n"
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 		".previous"
 		: : "a" (&refbk), "m" (refbk) : "cc" );
 }
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 6ec5cd981e74..df953383724d 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -44,7 +44,7 @@ void diag10(unsigned long addr)
 {
         if (addr >= 0x7ff00000)
                 return;
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
         asm volatile (
 		"   sam31\n"
 		"   diag %0,%0,0x10\n"
@@ -106,7 +106,7 @@ extern unsigned long __initdata zholes_size[];
  * paging_init() sets up the page tables
  */
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 void __init paging_init(void)
 {
         pgd_t * pg_dir;
@@ -175,7 +175,7 @@ void __init paging_init(void)
         return;
 }
 
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 void __init paging_init(void)
 {
         pgd_t * pg_dir;
@@ -256,7 +256,7 @@ void __init paging_init(void)
 
         return;
 }
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 void __init mem_init(void)
 {
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index fb187e5a54b4..356257c171de 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -50,7 +50,7 @@ static inline unsigned long mmap_base(void)
 
 static inline int mmap_is_legacy(void)
 {
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 	/*
 	 * Force standard allocation for 64 bit programs.
 	 */
diff --git a/block/Kconfig b/block/Kconfig
index eb48edb80c1d..377f6dd20e17 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -5,7 +5,7 @@
 #for instance.
 config LBD
 	bool "Support for Large Block Devices"
-	depends on X86 || (MIPS && 32BIT) || PPC32 || ARCH_S390_31 || SUPERH || UML
+	depends on X86 || (MIPS && 32BIT) || PPC32 || (S390 && !64BIT) || SUPERH || UML
 	help
 	  Say Y here if you want to attach large (bigger than 2TB) discs to
 	  your machine, or if you want to have a raid or loopback device
diff --git a/crypto/Kconfig b/crypto/Kconfig
index c696f7ab729e..52e1d4108a99 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -42,7 +42,7 @@ config CRYPTO_SHA1
 
 config CRYPTO_SHA1_S390
 	tristate "SHA1 digest algorithm (s390)"
-	depends on CRYPTO && ARCH_S390
+	depends on CRYPTO && S390
 	help
 	  This is the s390 hardware accelerated implementation of the
 	  SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2).
@@ -58,7 +58,7 @@ config CRYPTO_SHA256
 
 config CRYPTO_SHA256_S390
 	tristate "SHA256 digest algorithm (s390)"
-	depends on CRYPTO && ARCH_S390
+	depends on CRYPTO && S390
 	help
 	  This is the s390 hardware accelerated implementation of the
 	  SHA256 secure hash standard (DFIPS 180-2).
@@ -111,7 +111,7 @@ config CRYPTO_DES
 
 config CRYPTO_DES_S390
 	tristate "DES and Triple DES cipher algorithms (s390)"
-	depends on CRYPTO && ARCH_S390
+	depends on CRYPTO && S390
 	help
 	  DES cipher algorithm (FIPS 46-2), and Triple DES EDE (FIPS 46-3).
 
@@ -217,7 +217,7 @@ config CRYPTO_AES_X86_64
 
 config CRYPTO_AES_S390
 	tristate "AES cipher algorithms (s390)"
-	depends on CRYPTO && ARCH_S390
+	depends on CRYPTO && S390
 	help
 	  This is the s390 hardware accelerated implementation of the
 	  AES cipher algorithms (FIPS-197). AES uses the Rijndael
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 84e68cdd451b..5ebd06b1b4ca 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -985,7 +985,7 @@ config HPET_MMAP
 
 config HANGCHECK_TIMER
 	tristate "Hangcheck timer"
-	depends on X86 || IA64 || PPC64 || ARCH_S390
+	depends on X86 || IA64 || PPC64 || S390
 	help
 	  The hangcheck-timer module detects when the system has gone
 	  out to lunch past a certain margin.  It can reboot the system
diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c
index 66e53dd450ff..40a67c86420c 100644
--- a/drivers/char/hangcheck-timer.c
+++ b/drivers/char/hangcheck-timer.c
@@ -120,7 +120,7 @@ __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks);
 #if defined(CONFIG_X86)
 # define HAVE_MONOTONIC
 # define TIMER_FREQ 1000000000ULL
-#elif defined(CONFIG_ARCH_S390)
+#elif defined(CONFIG_S390)
 /* FA240000 is 1 Second in the IBM time universe (Page 4-38 Principles of Op for zSeries */
 # define TIMER_FREQ 0xFA240000ULL
 #elif defined(CONFIG_IA64)
diff --git a/drivers/char/watchdog/Kconfig b/drivers/char/watchdog/Kconfig
index 344001b45af9..a6544790af60 100644
--- a/drivers/char/watchdog/Kconfig
+++ b/drivers/char/watchdog/Kconfig
@@ -438,7 +438,7 @@ config INDYDOG
 
 config ZVM_WATCHDOG
 	tristate "z/VM Watchdog Timer"
-	depends on WATCHDOG && ARCH_S390
+	depends on WATCHDOG && S390
 	help
 	  IBM s/390 and zSeries machines running under z/VM 5.1 or later
 	  provide a virtual watchdog timer to their guest that cause a
diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index 9f2352bd8348..a1e660e3531d 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -157,7 +157,7 @@ struct input_event_compat {
 #  define COMPAT_TEST test_thread_flag(TIF_IA32)
 #elif defined(CONFIG_IA64)
 #  define COMPAT_TEST IS_IA32_PROCESS(ia64_task_regs(current))
-#elif defined(CONFIG_ARCH_S390)
+#elif defined(CONFIG_S390)
 #  define COMPAT_TEST test_thread_flag(TIF_31BIT)
 #elif defined(CONFIG_MIPS)
 #  define COMPAT_TEST (current->thread.mflags & MF_32BIT_ADDR)
diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index c782a6329805..fa39b944bc46 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -6,7 +6,7 @@ menu "PHY device support"
 
 config PHYLIB
 	tristate "PHY Device support and infrastructure"
-	depends on NET_ETHERNET && (BROKEN || !ARCH_S390)
+	depends on NET_ETHERNET && (BROKEN || !S390)
 	help
 	  Ethernet controllers are usually attached to PHY
 	  devices.  This option provides infrastructure for
diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig
index 6e7d7b06421d..6f50cc9323d9 100644
--- a/drivers/s390/block/Kconfig
+++ b/drivers/s390/block/Kconfig
@@ -1,11 +1,11 @@
-if ARCH_S390
+if S390
 
 comment "S/390 block device drivers"
-	depends on ARCH_S390
+	depends on S390
 
 config BLK_DEV_XPRAM
 	tristate "XPRAM disk support"
-	depends on ARCH_S390
+	depends on S390
 	help
 	  Select this option if you want to use your expanded storage on S/390
 	  or zSeries as a disk.  This is useful as a _fast_ swap device if you
@@ -49,7 +49,7 @@ config DASD_FBA
 
 config DASD_DIAG
 	tristate "Support for DIAG access to Disks"
-	depends on DASD && ( ARCH_S390X = 'n' || EXPERIMENTAL)
+	depends on DASD && ( 64BIT = 'n' || EXPERIMENTAL)
 	help
 	  Select this option if you want to use Diagnose250 command to access
 	  Disks under VM.  If you are not running under VM or unsure what it is,
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 1141a5963b67..041e1a621885 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -604,7 +604,7 @@ dasd_smalloc_request(char *magic, int cplength, int datasize,
 void
 dasd_kfree_request(struct dasd_ccw_req * cqr, struct dasd_device * device)
 {
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 	struct ccw1 *ccw;
 
 	/* Clear any idals used for the request. */
diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c
index a33d4064b537..ba80fdea7ebf 100644
--- a/drivers/s390/block/dasd_diag.c
+++ b/drivers/s390/block/dasd_diag.c
@@ -75,7 +75,7 @@ dia250(void *iob, int cmd)
 	int rc;
 
 	__asm__ __volatile__(
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 		"	lghi	%0,3\n"
 		"	lgr	0,%3\n"
 		"	diag	0,%2,0x250\n"
diff --git a/drivers/s390/block/dasd_diag.h b/drivers/s390/block/dasd_diag.h
index 37edf6e91715..a4f80bd735f1 100644
--- a/drivers/s390/block/dasd_diag.h
+++ b/drivers/s390/block/dasd_diag.h
@@ -45,7 +45,7 @@ struct dasd_diag_characteristics {
 } __attribute__ ((packed, aligned(4)));
 
 
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 #define DASD_DIAG_FLAGA_DEFAULT		DASD_DIAG_FLAGA_FORMAT_64BIT
 
 typedef u64 blocknum_t;
@@ -86,7 +86,7 @@ struct dasd_diag_rw_io {
 	struct dasd_diag_bio *bio_list;
 	u8  spare4[8];
 } __attribute__ ((packed, aligned(8)));
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 #define DASD_DIAG_FLAGA_DEFAULT		0x0
 
 typedef u32 blocknum_t;
@@ -125,4 +125,4 @@ struct dasd_diag_rw_io {
 	u32 interrupt_params;
 	u8 spare3[20];
 } __attribute__ ((packed, aligned(8)));
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index efc4cf62496e..96eb48258580 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -1041,7 +1041,7 @@ dasd_eckd_build_cp(struct dasd_device * device, struct request *req)
 				/* Eckd can only do full blocks. */
 				return ERR_PTR(-EINVAL);
 			count += bv->bv_len >> (device->s2b_shift + 9);
-#if defined(CONFIG_ARCH_S390X)
+#if defined(CONFIG_64BIT)
 			if (idal_is_needed (page_address(bv->bv_page),
 					    bv->bv_len))
 				cidaw += bv->bv_len >> (device->s2b_shift + 9);
diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c
index 9bac8d87a9cc..8ec75dc08e2c 100644
--- a/drivers/s390/block/dasd_fba.c
+++ b/drivers/s390/block/dasd_fba.c
@@ -271,7 +271,7 @@ dasd_fba_build_cp(struct dasd_device * device, struct request *req)
 				/* Fba can only do full blocks. */
 				return ERR_PTR(-EINVAL);
 			count += bv->bv_len >> (device->s2b_shift + 9);
-#if defined(CONFIG_ARCH_S390X)
+#if defined(CONFIG_64BIT)
 			if (idal_is_needed (page_address(bv->bv_page),
 					    bv->bv_len))
 				cidaw += bv->bv_len / blksize;
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index d428c909b8a0..bf3a67c3cc5e 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -160,7 +160,7 @@ static int xpram_page_in (unsigned long page_addr, unsigned int xpage_index)
                 "0: ipm   %0\n"
 		"   srl   %0,28\n"
 		"1:\n"
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 		".section __ex_table,\"a\"\n"
 		"   .align 4\n"
 		"   .long  0b,1b\n"
@@ -208,7 +208,7 @@ static long xpram_page_out (unsigned long page_addr, unsigned int xpage_index)
                 "0: ipm   %0\n"
 		"   srl   %0,28\n"
 		"1:\n"
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 		".section __ex_table,\"a\"\n"
 		"   .align 4\n"
 		"   .long  0b,1b\n"
diff --git a/drivers/s390/char/vmwatchdog.c b/drivers/s390/char/vmwatchdog.c
index 5473c23fcb52..5acc0ace3d7d 100644
--- a/drivers/s390/char/vmwatchdog.c
+++ b/drivers/s390/char/vmwatchdog.c
@@ -66,7 +66,7 @@ static int __diag288(enum vmwdt_func func, unsigned int timeout,
 	__cmdl = len;
 	err = 0;
 	asm volatile (
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 		       "diag %2,%4,0x288\n"
 		"1:	\n"
 		".section .fixup,\"ax\"\n"
diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c
index 6f274f4f92eb..7376bc87206d 100644
--- a/drivers/s390/cio/cio.c
+++ b/drivers/s390/cio/cio.c
@@ -195,7 +195,7 @@ cio_start_key (struct subchannel *sch,	/* subchannel structure */
 	sch->orb.spnd = sch->options.suspend;
 	sch->orb.ssic = sch->options.suspend && sch->options.inter;
 	sch->orb.lpm = (lpm != 0) ? (lpm & sch->opm) : sch->lpm;
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 	/*
 	 * for 64 bit we always support 64 bit IDAWs with 4k page size only
 	 */
diff --git a/drivers/s390/cio/device_id.c b/drivers/s390/cio/device_id.c
index 3c77c3fd461d..04ceba343db8 100644
--- a/drivers/s390/cio/device_id.c
+++ b/drivers/s390/cio/device_id.c
@@ -27,7 +27,7 @@
 /*
  * diag210 is used under VM to get information about a virtual device
  */
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 int
 diag210(struct diag210 * addr)
 {
diff --git a/drivers/s390/cio/ioasm.h b/drivers/s390/cio/ioasm.h
index 62b0e2ad507f..95a9462f9a91 100644
--- a/drivers/s390/cio/ioasm.h
+++ b/drivers/s390/cio/ioasm.h
@@ -50,7 +50,7 @@ static inline int stsch_err(struct subchannel_id schid,
 		"0:  ipm  %0\n"
 		"    srl  %0,28\n"
 		"1:\n"
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 		".section __ex_table,\"a\"\n"
 		"   .align 8\n"
 		"   .quad 0b,1b\n"
@@ -95,7 +95,7 @@ static inline int msch_err(struct subchannel_id schid,
 		"0:  ipm  %0\n"
 		"    srl  %0,28\n"
 		"1:\n"
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 		".section __ex_table,\"a\"\n"
 		"   .align 8\n"
 		"   .quad 0b,1b\n"
diff --git a/drivers/s390/cio/qdio.c b/drivers/s390/cio/qdio.c
index 035c77af9cd3..30a836ffc31f 100644
--- a/drivers/s390/cio/qdio.c
+++ b/drivers/s390/cio/qdio.c
@@ -2394,7 +2394,7 @@ tiqdio_check_chsc_availability(void)
 	sprintf(dbf_text,"hydrati%1x", hydra_thinints);
 	QDIO_DBF_TEXT0(0,setup,dbf_text);
 
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 	/* Check for QEBSM support in general (bit 58). */
 	is_passthrough = css_general_characteristics.qebsm;
 #endif
diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h
index 43b840af5300..fa385e761fe1 100644
--- a/drivers/s390/cio/qdio.h
+++ b/drivers/s390/cio/qdio.h
@@ -271,7 +271,7 @@ static inline int
 do_sqbs(unsigned long sch, unsigned char state, int queue,
        unsigned int *start, unsigned int *count)
 {
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
        register unsigned long _ccq asm ("0") = *count;
        register unsigned long _sch asm ("1") = sch;
        unsigned long _queuestart = ((unsigned long)queue << 32) | *start;
@@ -295,7 +295,7 @@ static inline int
 do_eqbs(unsigned long sch, unsigned char *state, int queue,
 	unsigned int *start, unsigned int *count)
 {
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 	register unsigned long _ccq asm ("0") = *count;
 	register unsigned long _sch asm ("1") = sch;
 	unsigned long _queuestart = ((unsigned long)queue << 32) | *start;
@@ -323,7 +323,7 @@ do_siga_sync(struct subchannel_id schid, unsigned int mask1, unsigned int mask2)
 {
 	int cc;
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 	asm volatile (
 		"lhi	0,2	\n\t"
 		"lr	1,%1	\n\t"
@@ -336,7 +336,7 @@ do_siga_sync(struct subchannel_id schid, unsigned int mask1, unsigned int mask2)
 		: "d" (schid), "d" (mask1), "d" (mask2)
 		: "cc", "0", "1", "2", "3"
 		);
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 	asm volatile (
 		"lghi	0,2	\n\t"
 		"llgfr	1,%1	\n\t"
@@ -349,7 +349,7 @@ do_siga_sync(struct subchannel_id schid, unsigned int mask1, unsigned int mask2)
 		: "d" (schid), "d" (mask1), "d" (mask2)
 		: "cc", "0", "1", "2", "3"
 		);
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 	return cc;
 }
 
@@ -358,7 +358,7 @@ do_siga_input(struct subchannel_id schid, unsigned int mask)
 {
 	int cc;
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 	asm volatile (
 		"lhi	0,1	\n\t"
 		"lr	1,%1	\n\t"
@@ -370,7 +370,7 @@ do_siga_input(struct subchannel_id schid, unsigned int mask)
 		: "d" (schid), "d" (mask)
 		: "cc", "0", "1", "2", "memory"
 		);
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 	asm volatile (
 		"lghi	0,1	\n\t"
 		"llgfr	1,%1	\n\t"
@@ -382,7 +382,7 @@ do_siga_input(struct subchannel_id schid, unsigned int mask)
 		: "d" (schid), "d" (mask)
 		: "cc", "0", "1", "2", "memory"
 		);
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 	
 	return cc;
 }
@@ -394,7 +394,7 @@ do_siga_output(unsigned long schid, unsigned long mask, __u32 *bb,
 	int cc;
 	__u32 busy_bit;
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 	asm volatile (
 		"lhi	0,0	\n\t"
 		"lr	1,%2	\n\t"
@@ -424,7 +424,7 @@ do_siga_output(unsigned long schid, unsigned long mask, __u32 *bb,
 		"i" (QDIO_SIGA_ERROR_ACCESS_EXCEPTION)
 		: "cc", "0", "1", "2", "memory"
 		);
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 	asm volatile (
         	"llgfr  0,%5    \n\t"
                 "lgr    1,%2    \n\t"
@@ -449,7 +449,7 @@ do_siga_output(unsigned long schid, unsigned long mask, __u32 *bb,
 		"i" (QDIO_SIGA_ERROR_ACCESS_EXCEPTION), "d" (fc)
 		: "cc", "0", "1", "2", "memory"
 		);
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 	
 	(*bb) = busy_bit;
 	return cc;
@@ -461,21 +461,21 @@ do_clear_global_summary(void)
 
 	unsigned long time;
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 	asm volatile (
 		"lhi	1,3	\n\t"
 		".insn	rre,0xb2650000,2,0	\n\t"
 		"lr	%0,3	\n\t"
 		: "=d" (time) : : "cc", "1", "2", "3"
 		);
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 	asm volatile (
 		"lghi	1,3	\n\t"
 		".insn	rre,0xb2650000,2,0	\n\t"
 		"lgr	%0,3	\n\t"
 		: "=d" (time) : : "cc", "1", "2", "3"
 		);
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 	
 	return time;
 }
@@ -542,11 +542,11 @@ struct qdio_perf_stats {
 
 #define MY_MODULE_STRING(x) #x
 
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 #define QDIO_GET_ADDR(x) ((__u32)(unsigned long)x)
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 #define QDIO_GET_ADDR(x) ((__u32)(long)x)
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 struct qdio_q {
 	volatile struct slsb slsb;
diff --git a/drivers/s390/crypto/z90hardware.c b/drivers/s390/crypto/z90hardware.c
index 7c3ed52e03e1..d7f7494a0cbe 100644
--- a/drivers/s390/crypto/z90hardware.c
+++ b/drivers/s390/crypto/z90hardware.c
@@ -785,7 +785,7 @@ testq(int q_nr, int *q_depth, int *dev_type, struct ap_status_word *stat)
 	int ccode;
 
 	asm volatile
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 	("	llgfr	0,%4		\n"
 	 "	slgr	1,1		\n"
 	 "	lgr	2,1		\n"
@@ -855,7 +855,7 @@ resetq(int q_nr, struct ap_status_word *stat_p)
 	int ccode;
 
 	asm volatile
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 	("	llgfr	0,%2		\n"
 	 "	lghi	1,1		\n"
 	 "	sll	1,24		\n"
@@ -921,7 +921,7 @@ sen(int msg_len, unsigned char *msg_ext, struct ap_status_word *stat)
 	int ccode;
 
 	asm volatile
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 	("	lgr	6,%3		\n"
 	 "	llgfr	7,%2		\n"
 	 "	llgt	0,0(6)		\n"
@@ -1000,7 +1000,7 @@ rec(int q_nr, int buff_l, unsigned char *rsp, unsigned char *id,
 	int ccode;
 
 	asm volatile
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 	("	llgfr	0,%2		\n"
 	 "	lgr	3,%4		\n"
 	 "	lgr	6,%3		\n"
diff --git a/drivers/s390/net/Kconfig b/drivers/s390/net/Kconfig
index a7efc394515e..548854754921 100644
--- a/drivers/s390/net/Kconfig
+++ b/drivers/s390/net/Kconfig
@@ -1,5 +1,5 @@
 menu "S/390 network device drivers"
-	depends on NETDEVICES && ARCH_S390
+	depends on NETDEVICES && S390
 
 config LCS
 	tristate "Lan Channel Station Interface"
diff --git a/drivers/s390/net/claw.c b/drivers/s390/net/claw.c
index 6b63d21612ec..e70af7f39946 100644
--- a/drivers/s390/net/claw.c
+++ b/drivers/s390/net/claw.c
@@ -1603,7 +1603,7 @@ dumpit(char* buf, int len)
         __u32      ct, sw, rm, dup;
         char       *ptr, *rptr;
         char       tbuf[82], tdup[82];
-#if (CONFIG_ARCH_S390X)
+#if (CONFIG_64BIT)
         char       addr[22];
 #else
         char       addr[12];
@@ -1619,7 +1619,7 @@ dumpit(char* buf, int len)
         dup = 0;
         for ( ct=0; ct < len; ct++, ptr++, rptr++ )  {
                 if (sw == 0) {
-#if (CONFIG_ARCH_S390X)
+#if (CONFIG_64BIT)
                         sprintf(addr, "%16.16lX",(unsigned long)rptr);
 #else
                         sprintf(addr, "%8.8X",(__u32)rptr);
@@ -1634,7 +1634,7 @@ dumpit(char* buf, int len)
                 if (sw == 8) {
                         strcat(bhex, "  ");
                 }
-#if (CONFIG_ARCH_S390X)
+#if (CONFIG_64BIT)
                 sprintf(tbuf,"%2.2lX", (unsigned long)*ptr);
 #else
                 sprintf(tbuf,"%2.2X", (__u32)*ptr);
diff --git a/drivers/s390/s390mach.c b/drivers/s390/s390mach.c
index 7dad597ff86e..3bf466603512 100644
--- a/drivers/s390/s390mach.c
+++ b/drivers/s390/s390mach.c
@@ -246,7 +246,7 @@ s390_revalidate_registers(struct mci *mci)
 		 */
 		kill_task = 1;
 
-#ifndef __s390x__
+#ifndef CONFIG_64BIT
 	asm volatile("ld 0,0(%0)\n"
 		     "ld 2,8(%0)\n"
 		     "ld 4,16(%0)\n"
@@ -255,7 +255,7 @@ s390_revalidate_registers(struct mci *mci)
 #endif
 
 	if (MACHINE_HAS_IEEE) {
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 		fpt_save_area = &S390_lowcore.floating_pt_save_area;
 		fpt_creg_save_area = &S390_lowcore.fpt_creg_save_area;
 #else
@@ -314,7 +314,7 @@ s390_revalidate_registers(struct mci *mci)
 		 */
 		s390_handle_damage("invalid control registers.");
 	else
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 		asm volatile("lctlg 0,15,0(%0)"
 			     : : "a" (&S390_lowcore.cregs_save_area));
 #else
@@ -327,7 +327,7 @@ s390_revalidate_registers(struct mci *mci)
 	 * can't write something sensible into that register.
 	 */
 
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 	/*
 	 * See if we can revalidate the TOD programmable register with its
 	 * old contents (should be zero) otherwise set it to zero.
@@ -384,7 +384,7 @@ s390_do_machine_check(struct pt_regs *regs)
 		if (mci->b) {
 			/* Processing backup -> verify if we can survive this */
 			u64 z_mcic, o_mcic, t_mcic;
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 			z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<29);
 			o_mcic = (1ULL<<43 | 1ULL<<42 | 1ULL<<41 | 1ULL<<40 |
 				  1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 |
diff --git a/drivers/s390/sysinfo.c b/drivers/s390/sysinfo.c
index 87c2db1bd4f5..66da840c9316 100644
--- a/drivers/s390/sysinfo.c
+++ b/drivers/s390/sysinfo.c
@@ -106,7 +106,7 @@ static inline int stsi (void *sysinfo,
 {
 	int cc, retv;
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 	__asm__ __volatile__ (	"lr\t0,%2\n"
 				"\tlr\t1,%3\n"
 				"\tstsi\t0(%4)\n"
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 4c42065dea88..9e8254f0256c 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -1815,7 +1815,7 @@ config SCSI_SUNESP
 
 config ZFCP
 	tristate "FCP host bus adapter driver for IBM eServer zSeries"
-	depends on ARCH_S390 && QDIO && SCSI
+	depends on S390 && QDIO && SCSI
 	select SCSI_FC_ATTRS
 	help
           If you want to access SCSI devices attached to your IBM eServer
diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig
index 656bc43431b9..e227a04261ab 100644
--- a/fs/partitions/Kconfig
+++ b/fs/partitions/Kconfig
@@ -85,7 +85,7 @@ config ATARI_PARTITION
 
 config IBM_PARTITION
 	bool "IBM disk label and partition support"
-	depends on PARTITION_ADVANCED && ARCH_S390
+	depends on PARTITION_ADVANCED && S390
 	help
 	  Say Y here if you would like to be able to read the hard disk
 	  partition table format used by IBM DASD disks operating under CMS.
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 3e1239e4b303..5e9251f65317 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -308,7 +308,7 @@ int proc_pid_status(struct task_struct *task, char * buffer)
 	buffer = task_sig(task, buffer);
 	buffer = task_cap(task, buffer);
 	buffer = cpuset_task_status_allowed(task, buffer);
-#if defined(CONFIG_ARCH_S390)
+#if defined(CONFIG_S390)
 	buffer = task_show_regs(task, buffer);
 #endif
 	return buffer - orig;
diff --git a/include/asm-s390/unistd.h b/include/asm-s390/unistd.h
index f97d92691f17..2861cdc243ad 100644
--- a/include/asm-s390/unistd.h
+++ b/include/asm-s390/unistd.h
@@ -539,7 +539,7 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4,    \
 #define __ARCH_WANT_SYS_SIGPENDING
 #define __ARCH_WANT_SYS_SIGPROCMASK
 #define __ARCH_WANT_SYS_RT_SIGACTION
-# ifdef CONFIG_ARCH_S390_31
+# ifndef CONFIG_64BIT
 #   define __ARCH_WANT_STAT64
 #   define __ARCH_WANT_SYS_TIME
 # endif
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 60f8bc78a35a..6c5d4c898ccb 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -12,7 +12,7 @@
 #include <linux/config.h>
 #include <linux/smp.h>
 
-#if !defined(CONFIG_ARCH_S390)
+#if !defined(CONFIG_S390)
 
 #include <linux/linkage.h>
 #include <linux/cache.h>
diff --git a/init/Kconfig b/init/Kconfig
index 24e0f7c756c0..ba42f3793a84 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -189,7 +189,7 @@ config AUDIT
 
 config AUDITSYSCALL
 	bool "Enable system-call auditing support"
-	depends on AUDIT && (X86 || PPC || PPC64 || ARCH_S390 || IA64 || UML || SPARC64)
+	depends on AUDIT && (X86 || PPC || PPC64 || S390 || IA64 || UML || SPARC64)
 	default y if SECURITY_SELINUX
 	help
 	  Enable low-overhead system-call auditing infrastructure that
diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c
index c10b08a80982..c2683fcd792d 100644
--- a/init/do_mounts_rd.c
+++ b/init/do_mounts_rd.c
@@ -145,7 +145,7 @@ int __init rd_load_image(char *from)
 	int nblocks, i, disk;
 	char *buf = NULL;
 	unsigned short rotate = 0;
-#if !defined(CONFIG_ARCH_S390) && !defined(CONFIG_PPC_ISERIES)
+#if !defined(CONFIG_S390) && !defined(CONFIG_PPC_ISERIES)
 	char rotator[4] = { '|' , '/' , '-' , '\\' };
 #endif
 
@@ -237,7 +237,7 @@ int __init rd_load_image(char *from)
 		}
 		sys_read(in_fd, buf, BLOCK_SIZE);
 		sys_write(out_fd, buf, BLOCK_SIZE);
-#if !defined(CONFIG_ARCH_S390) && !defined(CONFIG_PPC_ISERIES)
+#if !defined(CONFIG_S390) && !defined(CONFIG_PPC_ISERIES)
 		if (!(i % 16)) {
 			printk("%c\b", rotator[rotate & 0x3]);
 			rotate++;
diff --git a/kernel/panic.c b/kernel/panic.c
index aabc5f86fa3f..c5c4ab255834 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -60,7 +60,7 @@ NORET_TYPE void panic(const char * fmt, ...)
 	long i;
 	static char buf[1024];
 	va_list args;
-#if defined(CONFIG_ARCH_S390)
+#if defined(CONFIG_S390)
         unsigned long caller = (unsigned long) __builtin_return_address(0);
 #endif
 
@@ -125,7 +125,7 @@ NORET_TYPE void panic(const char * fmt, ...)
 		printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n");
 	}
 #endif
-#if defined(CONFIG_ARCH_S390)
+#if defined(CONFIG_S390)
         disabled_wait(caller);
 #endif
 	local_irq_enable();
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 345f4a1d533f..a85047bb5739 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -108,7 +108,7 @@ extern int pwrsw_enabled;
 extern int unaligned_enabled;
 #endif
 
-#ifdef CONFIG_ARCH_S390
+#ifdef CONFIG_S390
 #ifdef CONFIG_MATHEMU
 extern int sysctl_ieee_emulation_warnings;
 #endif
@@ -542,7 +542,7 @@ static ctl_table kern_table[] = {
 		.extra1		= &minolduid,
 		.extra2		= &maxolduid,
 	},
-#ifdef CONFIG_ARCH_S390
+#ifdef CONFIG_S390
 #ifdef CONFIG_MATHEMU
 	{
 		.ctl_name	= KERN_IEEE_EMULATION_WARNINGS,
@@ -644,7 +644,7 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-#if defined(CONFIG_ARCH_S390)
+#if defined(CONFIG_S390)
 	{
 		.ctl_name	= KERN_SPIN_RETRY,
 		.procname	= "spin_retry",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 1cedc2356b78..80598cfd728c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -32,7 +32,7 @@ config MAGIC_SYSRQ
 config LOG_BUF_SHIFT
 	int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" if DEBUG_KERNEL
 	range 12 21
-	default 17 if ARCH_S390
+	default 17 if S390
 	default 16 if X86_NUMAQ || IA64
 	default 15 if SMP
 	default 14
-- 
cgit v1.2.3-71-gd317


From a1a5ea70a6e9db6332b27fe2d96666e17aa1436b Mon Sep 17 00:00:00 2001
From: Markus Lidel <Markus.Lidel@shadowconnect.com>
Date: Fri, 6 Jan 2006 00:19:29 -0800
Subject: [PATCH] I2O: changed I2O API to create I2O messages in kernel memory

Changed the I2O API to create I2O messages first in kernel memory and then
transfer it at once over the PCI bus instead of sending each quad-word over
the PCI bus.

Signed-off-by: Markus Lidel <Markus.Lidel@shadowconnect.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/message/i2o/bus-osm.c    |   21 +-
 drivers/message/i2o/device.c     |   51 +-
 drivers/message/i2o/exec-osm.c   |   93 +-
 drivers/message/i2o/i2o_block.c  |  157 +--
 drivers/message/i2o/i2o_config.c |  169 ++--
 drivers/message/i2o/i2o_scsi.c   |   50 +-
 drivers/message/i2o/iop.c        |  296 +++---
 drivers/message/i2o/pci.c        |    1 +
 include/linux/i2o.h              | 1947 ++++++++++++++++++++------------------
 9 files changed, 1450 insertions(+), 1335 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/message/i2o/bus-osm.c b/drivers/message/i2o/bus-osm.c
index 151b228e1cb3..ce039d322fd0 100644
--- a/drivers/message/i2o/bus-osm.c
+++ b/drivers/message/i2o/bus-osm.c
@@ -39,18 +39,18 @@ static struct i2o_class_id i2o_bus_class_id[] = {
  */
 static int i2o_bus_scan(struct i2o_device *dev)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 
-	m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
 		return -ETIMEDOUT;
 
-	writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_BUS_SCAN << 24 | HOST_TID << 12 | dev->lct_data.tid,
-	       &msg->u.head[1]);
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BUS_SCAN << 24 | HOST_TID << 12 | dev->lct_data.
+			tid);
 
-	return i2o_msg_post_wait(dev->iop, m, 60);
+	return i2o_msg_post_wait(dev->iop, msg, 60);
 };
 
 /**
@@ -59,8 +59,9 @@ static int i2o_bus_scan(struct i2o_device *dev)
  *
  *	Returns count.
  */
-static ssize_t i2o_bus_store_scan(struct device *d, struct device_attribute *attr, const char *buf,
-				  size_t count)
+static ssize_t i2o_bus_store_scan(struct device *d,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
 {
 	struct i2o_device *i2o_dev = to_i2o_device(d);
 	int rc;
diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
index 8eb50cdb8ae1..002ae0ed8966 100644
--- a/drivers/message/i2o/device.c
+++ b/drivers/message/i2o/device.c
@@ -35,18 +35,18 @@
 static inline int i2o_device_issue_claim(struct i2o_device *dev, u32 cmd,
 					 u32 type)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 
-	m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(cmd << 24 | HOST_TID << 12 | dev->lct_data.tid, &msg->u.head[1]);
-	writel(type, &msg->body[0]);
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+		cpu_to_le32(cmd << 24 | HOST_TID << 12 | dev->lct_data.tid);
+	msg->body[0] = cpu_to_le32(type);
 
-	return i2o_msg_post_wait(dev->iop, m, 60);
+	return i2o_msg_post_wait(dev->iop, msg, 60);
 }
 
 /**
@@ -419,10 +419,9 @@ int i2o_device_parse_lct(struct i2o_controller *c)
  *	ResultCount, ErrorInfoSize, BlockStatus and BlockSize.
  */
 int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
-			  int oplen, void *reslist, int reslen)
+		   int oplen, void *reslist, int reslen)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	u32 *res32 = (u32 *) reslist;
 	u32 *restmp = (u32 *) reslist;
 	int len = 0;
@@ -437,26 +436,28 @@ int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
 	if (i2o_dma_alloc(dev, &res, reslen, GFP_KERNEL))
 		return -ENOMEM;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY) {
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg)) {
 		i2o_dma_free(dev, &res);
-		return -ETIMEDOUT;
+		return PTR_ERR(msg);
 	}
 
 	i = 0;
-	writel(cmd << 24 | HOST_TID << 12 | i2o_dev->lct_data.tid,
-	       &msg->u.head[1]);
-	writel(0, &msg->body[i++]);
-	writel(0x4C000000 | oplen, &msg->body[i++]);	/* OperationList */
-	memcpy_toio(&msg->body[i], oplist, oplen);
+	msg->u.head[1] =
+	    cpu_to_le32(cmd << 24 | HOST_TID << 12 | i2o_dev->lct_data.tid);
+	msg->body[i++] = cpu_to_le32(0x00000000);
+	msg->body[i++] = cpu_to_le32(0x4C000000 | oplen);	/* OperationList */
+	memcpy(&msg->body[i], oplist, oplen);
+
 	i += (oplen / 4 + (oplen % 4 ? 1 : 0));
-	writel(0xD0000000 | res.len, &msg->body[i++]);	/* ResultList */
-	writel(res.phys, &msg->body[i++]);
+	msg->body[i++] = cpu_to_le32(0xD0000000 | res.len);	/* ResultList */
+	msg->body[i++] = cpu_to_le32(res.phys);
 
-	writel(I2O_MESSAGE_SIZE(i + sizeof(struct i2o_message) / 4) |
-	       SGL_OFFSET_5, &msg->u.head[0]);
+	msg->u.head[0] =
+	    cpu_to_le32(I2O_MESSAGE_SIZE(i + sizeof(struct i2o_message) / 4) |
+			SGL_OFFSET_5);
 
-	rc = i2o_msg_post_wait_mem(c, m, 10, &res);
+	rc = i2o_msg_post_wait_mem(c, msg, 10, &res);
 
 	/* This only looks like a memory leak - don't "fix" it. */
 	if (rc == -ETIMEDOUT)
diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c
index 9c339a2505b0..71a09332e7c0 100644
--- a/drivers/message/i2o/exec-osm.c
+++ b/drivers/message/i2o/exec-osm.c
@@ -114,13 +114,12 @@ static void i2o_exec_wait_free(struct i2o_exec_wait *wait)
  *	Returns 0 on success, negative error code on timeout or positive error
  *	code from reply.
  */
-int i2o_msg_post_wait_mem(struct i2o_controller *c, u32 m, unsigned long
-			  timeout, struct i2o_dma *dma)
+int i2o_msg_post_wait_mem(struct i2o_controller *c, struct i2o_message *msg,
+			  unsigned long timeout, struct i2o_dma *dma)
 {
 	DECLARE_WAIT_QUEUE_HEAD(wq);
 	struct i2o_exec_wait *wait;
 	static u32 tcntxt = 0x80000000;
-	struct i2o_message __iomem *msg = i2o_msg_in_to_virt(c, m);
 	int rc = 0;
 
 	wait = i2o_exec_wait_alloc();
@@ -138,15 +137,15 @@ int i2o_msg_post_wait_mem(struct i2o_controller *c, u32 m, unsigned long
 	 * We will only use transaction contexts >= 0x80000000 for POST WAIT,
 	 * so we could find a POST WAIT reply easier in the reply handler.
 	 */
-	writel(i2o_exec_driver.context, &msg->u.s.icntxt);
+	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
 	wait->tcntxt = tcntxt++;
-	writel(wait->tcntxt, &msg->u.s.tcntxt);
+	msg->u.s.tcntxt = cpu_to_le32(wait->tcntxt);
 
 	/*
 	 * Post the message to the controller. At some point later it will
 	 * return. If we time out before it returns then complete will be zero.
 	 */
-	i2o_msg_post(c, m);
+	i2o_msg_post(c, msg);
 
 	if (!wait->complete) {
 		wait->wq = &wq;
@@ -266,7 +265,8 @@ static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
  *
  *	Returns number of bytes printed into buffer.
  */
-static ssize_t i2o_exec_show_vendor_id(struct device *d, struct device_attribute *attr, char *buf)
+static ssize_t i2o_exec_show_vendor_id(struct device *d,
+				       struct device_attribute *attr, char *buf)
 {
 	struct i2o_device *dev = to_i2o_device(d);
 	u16 id;
@@ -286,7 +286,9 @@ static ssize_t i2o_exec_show_vendor_id(struct device *d, struct device_attribute
  *
  *	Returns number of bytes printed into buffer.
  */
-static ssize_t i2o_exec_show_product_id(struct device *d, struct device_attribute *attr, char *buf)
+static ssize_t i2o_exec_show_product_id(struct device *d,
+					struct device_attribute *attr,
+					char *buf)
 {
 	struct i2o_device *dev = to_i2o_device(d);
 	u16 id;
@@ -385,23 +387,22 @@ static int i2o_exec_reply(struct i2o_controller *c, u32 m,
 	u32 context;
 
 	if (le32_to_cpu(msg->u.head[0]) & MSG_FAIL) {
+		struct i2o_message __iomem *pmsg;
+		u32 pm;
+
 		/*
 		 * If Fail bit is set we must take the transaction context of
 		 * the preserved message to find the right request again.
 		 */
-		struct i2o_message __iomem *pmsg;
-		u32 pm;
 
 		pm = le32_to_cpu(msg->body[3]);
-
 		pmsg = i2o_msg_in_to_virt(c, pm);
+		context = readl(&pmsg->u.s.tcntxt);
 
 		i2o_report_status(KERN_INFO, "i2o_core", msg);
 
-		context = readl(&pmsg->u.s.tcntxt);
-
 		/* Release the preserved msg */
-		i2o_msg_nop(c, pm);
+		i2o_msg_nop_mfa(c, pm);
 	} else
 		context = le32_to_cpu(msg->u.s.tcntxt);
 
@@ -462,25 +463,26 @@ static void i2o_exec_event(struct i2o_event *evt)
  */
 int i2o_exec_lct_get(struct i2o_controller *c)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	int i = 0;
 	int rc = -EAGAIN;
 
 	for (i = 1; i <= I2O_LCT_GET_TRIES; i++) {
-		m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-		if (m == I2O_QUEUE_EMPTY)
-			return -ETIMEDOUT;
-
-		writel(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6, &msg->u.head[0]);
-		writel(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 | ADAPTER_TID,
-		       &msg->u.head[1]);
-		writel(0xffffffff, &msg->body[0]);
-		writel(0x00000000, &msg->body[1]);
-		writel(0xd0000000 | c->dlct.len, &msg->body[2]);
-		writel(c->dlct.phys, &msg->body[3]);
-
-		rc = i2o_msg_post_wait(c, m, I2O_TIMEOUT_LCT_GET);
+		msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+		if (IS_ERR(msg))
+			return PTR_ERR(msg);
+
+		msg->u.head[0] =
+		    cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6);
+		msg->u.head[1] =
+		    cpu_to_le32(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 |
+				ADAPTER_TID);
+		msg->body[0] = cpu_to_le32(0xffffffff);
+		msg->body[1] = cpu_to_le32(0x00000000);
+		msg->body[2] = cpu_to_le32(0xd0000000 | c->dlct.len);
+		msg->body[3] = cpu_to_le32(c->dlct.phys);
+
+		rc = i2o_msg_post_wait(c, msg, I2O_TIMEOUT_LCT_GET);
 		if (rc < 0)
 			break;
 
@@ -506,29 +508,28 @@ static int i2o_exec_lct_notify(struct i2o_controller *c, u32 change_ind)
 {
 	i2o_status_block *sb = c->status_block.virt;
 	struct device *dev;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 
 	dev = &c->pdev->dev;
 
 	if (i2o_dma_realloc(dev, &c->dlct, sb->expected_lct_size, GFP_KERNEL))
 		return -ENOMEM;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
-
-	writel(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6, &msg->u.head[0]);
-	writel(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
-	writel(i2o_exec_driver.context, &msg->u.s.icntxt);
-	writel(0, &msg->u.s.tcntxt);	/* FIXME */
-	writel(0xffffffff, &msg->body[0]);
-	writel(change_ind, &msg->body[1]);
-	writel(0xd0000000 | c->dlct.len, &msg->body[2]);
-	writel(c->dlct.phys, &msg->body[3]);
-
-	i2o_msg_post(c, m);
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6);
+	msg->u.head[1] = cpu_to_le32(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 |
+				     ADAPTER_TID);
+	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(0x00000000);
+	msg->body[0] = cpu_to_le32(0xffffffff);
+	msg->body[1] = cpu_to_le32(change_ind);
+	msg->body[2] = cpu_to_le32(0xd0000000 | c->dlct.len);
+	msg->body[3] = cpu_to_le32(c->dlct.phys);
+
+	i2o_msg_post(c, msg);
 
 	return 0;
 };
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index f283b5bafdd3..2bd15c70773b 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -130,20 +130,20 @@ static int i2o_block_remove(struct device *dev)
  */
 static int i2o_block_device_flush(struct i2o_device *dev)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 
-	m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_BLOCK_CFLUSH << 24 | HOST_TID << 12 | dev->lct_data.tid,
-	       &msg->u.head[1]);
-	writel(60 << 16, &msg->body[0]);
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BLOCK_CFLUSH << 24 | HOST_TID << 12 | dev->
+			lct_data.tid);
+	msg->body[0] = cpu_to_le32(60 << 16);
 	osm_debug("Flushing...\n");
 
-	return i2o_msg_post_wait(dev->iop, m, 60);
+	return i2o_msg_post_wait(dev->iop, msg, 60);
 };
 
 /**
@@ -181,21 +181,21 @@ static int i2o_block_issue_flush(request_queue_t * queue, struct gendisk *disk,
  */
 static int i2o_block_device_mount(struct i2o_device *dev, u32 media_id)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
-
-	m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
-
-	writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_BLOCK_MMOUNT << 24 | HOST_TID << 12 | dev->lct_data.tid,
-	       &msg->u.head[1]);
-	writel(-1, &msg->body[0]);
-	writel(0, &msg->body[1]);
+	struct i2o_message *msg;
+
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BLOCK_MMOUNT << 24 | HOST_TID << 12 | dev->
+			lct_data.tid);
+	msg->body[0] = cpu_to_le32(-1);
+	msg->body[1] = cpu_to_le32(0x00000000);
 	osm_debug("Mounting...\n");
 
-	return i2o_msg_post_wait(dev->iop, m, 2);
+	return i2o_msg_post_wait(dev->iop, msg, 2);
 };
 
 /**
@@ -210,20 +210,20 @@ static int i2o_block_device_mount(struct i2o_device *dev, u32 media_id)
  */
 static int i2o_block_device_lock(struct i2o_device *dev, u32 media_id)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 
-	m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg) == I2O_QUEUE_EMPTY)
+		return PTR_ERR(msg);
 
-	writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_BLOCK_MLOCK << 24 | HOST_TID << 12 | dev->lct_data.tid,
-	       &msg->u.head[1]);
-	writel(-1, &msg->body[0]);
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BLOCK_MLOCK << 24 | HOST_TID << 12 | dev->
+			lct_data.tid);
+	msg->body[0] = cpu_to_le32(-1);
 	osm_debug("Locking...\n");
 
-	return i2o_msg_post_wait(dev->iop, m, 2);
+	return i2o_msg_post_wait(dev->iop, msg, 2);
 };
 
 /**
@@ -238,20 +238,20 @@ static int i2o_block_device_lock(struct i2o_device *dev, u32 media_id)
  */
 static int i2o_block_device_unlock(struct i2o_device *dev, u32 media_id)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 
-	m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_BLOCK_MUNLOCK << 24 | HOST_TID << 12 | dev->lct_data.tid,
-	       &msg->u.head[1]);
-	writel(media_id, &msg->body[0]);
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BLOCK_MUNLOCK << 24 | HOST_TID << 12 | dev->
+			lct_data.tid);
+	msg->body[0] = cpu_to_le32(media_id);
 	osm_debug("Unlocking...\n");
 
-	return i2o_msg_post_wait(dev->iop, m, 2);
+	return i2o_msg_post_wait(dev->iop, msg, 2);
 };
 
 /**
@@ -267,21 +267,21 @@ static int i2o_block_device_power(struct i2o_block_device *dev, u8 op)
 {
 	struct i2o_device *i2o_dev = dev->i2o_dev;
 	struct i2o_controller *c = i2o_dev->iop;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	int rc;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_BLOCK_POWER << 24 | HOST_TID << 12 | i2o_dev->lct_data.
-	       tid, &msg->u.head[1]);
-	writel(op << 24, &msg->body[0]);
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BLOCK_POWER << 24 | HOST_TID << 12 | i2o_dev->
+			lct_data.tid);
+	msg->body[0] = cpu_to_le32(op << 24);
 	osm_debug("Power...\n");
 
-	rc = i2o_msg_post_wait(c, m, 60);
+	rc = i2o_msg_post_wait(c, msg, 60);
 	if (!rc)
 		dev->power = op;
 
@@ -331,7 +331,7 @@ static inline void i2o_block_request_free(struct i2o_block_request *ireq)
  */
 static inline int i2o_block_sglist_alloc(struct i2o_controller *c,
 					 struct i2o_block_request *ireq,
-					 u32 __iomem ** mptr)
+					 u32 ** mptr)
 {
 	int nents;
 	enum dma_data_direction direction;
@@ -745,10 +745,9 @@ static int i2o_block_transfer(struct request *req)
 	struct i2o_block_device *dev = req->rq_disk->private_data;
 	struct i2o_controller *c;
 	int tid = dev->i2o_dev->lct_data.tid;
-	struct i2o_message __iomem *msg;
-	u32 __iomem *mptr;
+	struct i2o_message *msg;
+	u32 *mptr;
 	struct i2o_block_request *ireq = req->special;
-	u32 m;
 	u32 tcntxt;
 	u32 sgl_offset = SGL_OFFSET_8;
 	u32 ctl_flags = 0x00000000;
@@ -763,9 +762,9 @@ static int i2o_block_transfer(struct request *req)
 
 	c = dev->i2o_dev->iop;
 
-	m = i2o_msg_get(c, &msg);
-	if (m == I2O_QUEUE_EMPTY) {
-		rc = -EBUSY;
+	msg = i2o_msg_get(c);
+	if (IS_ERR(msg)) {
+		rc = PTR_ERR(msg);
 		goto exit;
 	}
 
@@ -775,8 +774,8 @@ static int i2o_block_transfer(struct request *req)
 		goto nop_msg;
 	}
 
-	writel(i2o_block_driver.context, &msg->u.s.icntxt);
-	writel(tcntxt, &msg->u.s.tcntxt);
+	msg->u.s.icntxt = cpu_to_le32(i2o_block_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(tcntxt);
 
 	mptr = &msg->body[0];
 
@@ -834,11 +833,11 @@ static int i2o_block_transfer(struct request *req)
 
 		sgl_offset = SGL_OFFSET_12;
 
-		writel(I2O_CMD_PRIVATE << 24 | HOST_TID << 12 | tid,
-		       &msg->u.head[1]);
+		msg->u.head[1] =
+		    cpu_to_le32(I2O_CMD_PRIVATE << 24 | HOST_TID << 12 | tid);
 
-		writel(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC, mptr++);
-		writel(tid, mptr++);
+		*mptr++ = cpu_to_le32(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC);
+		*mptr++ = cpu_to_le32(tid);
 
 		/*
 		 * ENABLE_DISCONNECT
@@ -853,22 +852,24 @@ static int i2o_block_transfer(struct request *req)
 			scsi_flags = 0xa0a0000a;
 		}
 
-		writel(scsi_flags, mptr++);
+		*mptr++ = cpu_to_le32(scsi_flags);
 
 		*((u32 *) & cmd[2]) = cpu_to_be32(req->sector * hwsec);
 		*((u16 *) & cmd[7]) = cpu_to_be16(req->nr_sectors * hwsec);
 
-		memcpy_toio(mptr, cmd, 10);
+		memcpy(mptr, cmd, 10);
 		mptr += 4;
-		writel(req->nr_sectors << KERNEL_SECTOR_SHIFT, mptr++);
+		*mptr++ = cpu_to_le32(req->nr_sectors << KERNEL_SECTOR_SHIFT);
 	} else
 #endif
 	{
-		writel(cmd | HOST_TID << 12 | tid, &msg->u.head[1]);
-		writel(ctl_flags, mptr++);
-		writel(req->nr_sectors << KERNEL_SECTOR_SHIFT, mptr++);
-		writel((u32) (req->sector << KERNEL_SECTOR_SHIFT), mptr++);
-		writel(req->sector >> (32 - KERNEL_SECTOR_SHIFT), mptr++);
+		msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid);
+		*mptr++ = cpu_to_le32(ctl_flags);
+		*mptr++ = cpu_to_le32(req->nr_sectors << KERNEL_SECTOR_SHIFT);
+		*mptr++ =
+		    cpu_to_le32((u32) (req->sector << KERNEL_SECTOR_SHIFT));
+		*mptr++ =
+		    cpu_to_le32(req->sector >> (32 - KERNEL_SECTOR_SHIFT));
 	}
 
 	if (!i2o_block_sglist_alloc(c, ireq, &mptr)) {
@@ -876,13 +877,13 @@ static int i2o_block_transfer(struct request *req)
 		goto context_remove;
 	}
 
-	writel(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) |
-	       sgl_offset, &msg->u.head[0]);
+	msg->u.head[0] =
+	    cpu_to_le32(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset);
 
 	list_add_tail(&ireq->queue, &dev->open_queue);
 	dev->open_queue_depth++;
 
-	i2o_msg_post(c, m);
+	i2o_msg_post(c, msg);
 
 	return 0;
 
@@ -890,7 +891,7 @@ static int i2o_block_transfer(struct request *req)
 	i2o_cntxt_list_remove(c, req);
 
       nop_msg:
-	i2o_msg_nop(c, m);
+	i2o_msg_nop(c, msg);
 
       exit:
 	return rc;
diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c
index 3c3a7abebb1b..4fe73d628c5b 100644
--- a/drivers/message/i2o/i2o_config.c
+++ b/drivers/message/i2o/i2o_config.c
@@ -230,8 +230,7 @@ static int i2o_cfg_swdl(unsigned long arg)
 	struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg;
 	unsigned char maxfrag = 0, curfrag = 1;
 	struct i2o_dma buffer;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	unsigned int status = 0, swlen = 0, fragsize = 8192;
 	struct i2o_controller *c;
 
@@ -257,31 +256,34 @@ static int i2o_cfg_swdl(unsigned long arg)
 	if (!c)
 		return -ENXIO;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -EBUSY;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
 	if (i2o_dma_alloc(&c->pdev->dev, &buffer, fragsize, GFP_KERNEL)) {
-		i2o_msg_nop(c, m);
+		i2o_msg_nop(c, msg);
 		return -ENOMEM;
 	}
 
 	__copy_from_user(buffer.virt, kxfer.buf, fragsize);
 
-	writel(NINE_WORD_MSG_SIZE | SGL_OFFSET_7, &msg->u.head[0]);
-	writel(I2O_CMD_SW_DOWNLOAD << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
-	writel(i2o_config_driver.context, &msg->u.head[2]);
-	writel(0, &msg->u.head[3]);
-	writel((((u32) kxfer.flags) << 24) | (((u32) kxfer.sw_type) << 16) |
-	       (((u32) maxfrag) << 8) | (((u32) curfrag)), &msg->body[0]);
-	writel(swlen, &msg->body[1]);
-	writel(kxfer.sw_id, &msg->body[2]);
-	writel(0xD0000000 | fragsize, &msg->body[3]);
-	writel(buffer.phys, &msg->body[4]);
+	msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_7);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SW_DOWNLOAD << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
+	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
+	msg->u.head[3] = cpu_to_le32(0);
+	msg->body[0] =
+	    cpu_to_le32((((u32) kxfer.flags) << 24) | (((u32) kxfer.
+							sw_type) << 16) |
+			(((u32) maxfrag) << 8) | (((u32) curfrag)));
+	msg->body[1] = cpu_to_le32(swlen);
+	msg->body[2] = cpu_to_le32(kxfer.sw_id);
+	msg->body[3] = cpu_to_le32(0xD0000000 | fragsize);
+	msg->body[4] = cpu_to_le32(buffer.phys);
 
 	osm_debug("swdl frag %d/%d (size %d)\n", curfrag, maxfrag, fragsize);
-	status = i2o_msg_post_wait_mem(c, m, 60, &buffer);
+	status = i2o_msg_post_wait_mem(c, msg, 60, &buffer);
 
 	if (status != -ETIMEDOUT)
 		i2o_dma_free(&c->pdev->dev, &buffer);
@@ -302,8 +304,7 @@ static int i2o_cfg_swul(unsigned long arg)
 	struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg;
 	unsigned char maxfrag = 0, curfrag = 1;
 	struct i2o_dma buffer;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	unsigned int status = 0, swlen = 0, fragsize = 8192;
 	struct i2o_controller *c;
 	int ret = 0;
@@ -330,30 +331,30 @@ static int i2o_cfg_swul(unsigned long arg)
 	if (!c)
 		return -ENXIO;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -EBUSY;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
 	if (i2o_dma_alloc(&c->pdev->dev, &buffer, fragsize, GFP_KERNEL)) {
-		i2o_msg_nop(c, m);
+		i2o_msg_nop(c, msg);
 		return -ENOMEM;
 	}
 
-	writel(NINE_WORD_MSG_SIZE | SGL_OFFSET_7, &msg->u.head[0]);
-	writel(I2O_CMD_SW_UPLOAD << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
-	writel(i2o_config_driver.context, &msg->u.head[2]);
-	writel(0, &msg->u.head[3]);
-	writel((u32) kxfer.flags << 24 | (u32) kxfer.
-	       sw_type << 16 | (u32) maxfrag << 8 | (u32) curfrag,
-	       &msg->body[0]);
-	writel(swlen, &msg->body[1]);
-	writel(kxfer.sw_id, &msg->body[2]);
-	writel(0xD0000000 | fragsize, &msg->body[3]);
-	writel(buffer.phys, &msg->body[4]);
+	msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_7);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SW_UPLOAD << 24 | HOST_TID << 12 | ADAPTER_TID);
+	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
+	msg->u.head[3] = cpu_to_le32(0);
+	msg->body[0] =
+	    cpu_to_le32((u32) kxfer.flags << 24 | (u32) kxfer.
+			sw_type << 16 | (u32) maxfrag << 8 | (u32) curfrag);
+	msg->body[1] = cpu_to_le32(swlen);
+	msg->body[2] = cpu_to_le32(kxfer.sw_id);
+	msg->body[3] = cpu_to_le32(0xD0000000 | fragsize);
+	msg->body[4] = cpu_to_le32(buffer.phys);
 
 	osm_debug("swul frag %d/%d (size %d)\n", curfrag, maxfrag, fragsize);
-	status = i2o_msg_post_wait_mem(c, m, 60, &buffer);
+	status = i2o_msg_post_wait_mem(c, msg, 60, &buffer);
 
 	if (status != I2O_POST_WAIT_OK) {
 		if (status != -ETIMEDOUT)
@@ -380,8 +381,7 @@ static int i2o_cfg_swdel(unsigned long arg)
 	struct i2o_controller *c;
 	struct i2o_sw_xfer kxfer;
 	struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	unsigned int swlen;
 	int token;
 
@@ -395,21 +395,21 @@ static int i2o_cfg_swdel(unsigned long arg)
 	if (!c)
 		return -ENXIO;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -EBUSY;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(SEVEN_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_SW_REMOVE << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
-	writel(i2o_config_driver.context, &msg->u.head[2]);
-	writel(0, &msg->u.head[3]);
-	writel((u32) kxfer.flags << 24 | (u32) kxfer.sw_type << 16,
-	       &msg->body[0]);
-	writel(swlen, &msg->body[1]);
-	writel(kxfer.sw_id, &msg->body[2]);
+	msg->u.head[0] = cpu_to_le32(SEVEN_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SW_REMOVE << 24 | HOST_TID << 12 | ADAPTER_TID);
+	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
+	msg->u.head[3] = cpu_to_le32(0);
+	msg->body[0] =
+	    cpu_to_le32((u32) kxfer.flags << 24 | (u32) kxfer.sw_type << 16);
+	msg->body[1] = cpu_to_le32(swlen);
+	msg->body[2] = cpu_to_le32(kxfer.sw_id);
 
-	token = i2o_msg_post_wait(c, m, 10);
+	token = i2o_msg_post_wait(c, msg, 10);
 
 	if (token != I2O_POST_WAIT_OK) {
 		osm_info("swdel failed, DetailedStatus = %d\n", token);
@@ -423,25 +423,24 @@ static int i2o_cfg_validate(unsigned long arg)
 {
 	int token;
 	int iop = (int)arg;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	struct i2o_controller *c;
 
 	c = i2o_find_iop(iop);
 	if (!c)
 		return -ENXIO;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -EBUSY;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_CONFIG_VALIDATE << 24 | HOST_TID << 12 | iop,
-	       &msg->u.head[1]);
-	writel(i2o_config_driver.context, &msg->u.head[2]);
-	writel(0, &msg->u.head[3]);
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_CONFIG_VALIDATE << 24 | HOST_TID << 12 | iop);
+	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
+	msg->u.head[3] = cpu_to_le32(0);
 
-	token = i2o_msg_post_wait(c, m, 10);
+	token = i2o_msg_post_wait(c, msg, 10);
 
 	if (token != I2O_POST_WAIT_OK) {
 		osm_info("Can't validate configuration, ErrorStatus = %d\n",
@@ -454,8 +453,7 @@ static int i2o_cfg_validate(unsigned long arg)
 
 static int i2o_cfg_evt_reg(unsigned long arg, struct file *fp)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	struct i2o_evt_id __user *pdesc = (struct i2o_evt_id __user *)arg;
 	struct i2o_evt_id kdesc;
 	struct i2o_controller *c;
@@ -474,18 +472,19 @@ static int i2o_cfg_evt_reg(unsigned long arg, struct file *fp)
 	if (!d)
 		return -ENODEV;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -EBUSY;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 | kdesc.tid,
-	       &msg->u.head[1]);
-	writel(i2o_config_driver.context, &msg->u.head[2]);
-	writel(i2o_cntxt_list_add(c, fp->private_data), &msg->u.head[3]);
-	writel(kdesc.evt_mask, &msg->body[0]);
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 |
+			kdesc.tid);
+	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
+	msg->u.head[3] = cpu_to_le32(i2o_cntxt_list_add(c, fp->private_data));
+	msg->body[0] = cpu_to_le32(kdesc.evt_mask);
 
-	i2o_msg_post(c, m);
+	i2o_msg_post(c, msg);
 
 	return 0;
 }
@@ -537,7 +536,6 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 	u32 sg_index = 0;
 	i2o_status_block *sb;
 	struct i2o_message *msg;
-	u32 m;
 	unsigned int iop;
 
 	cmd = (struct i2o_cmd_passthru32 __user *)arg;
@@ -553,7 +551,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 		return -ENXIO;
 	}
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
 
 	sb = c->status_block.virt;
 
@@ -595,8 +593,8 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 
 	sg_offset = (msg->u.head[0] >> 4) & 0x0f;
 
-	writel(i2o_config_driver.context, &msg->u.s.icntxt);
-	writel(i2o_cntxt_list_add(c, reply), &msg->u.s.tcntxt);
+	msg->u.s.icntxt = cpu_to_le32(i2o_config_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(i2o_cntxt_list_add(c, reply));
 
 	memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE);
 	if (sg_offset) {
@@ -662,7 +660,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 		}
 	}
 
-	rcode = i2o_msg_post_wait(c, m, 60);
+	rcode = i2o_msg_post_wait(c, msg, 60);
 	if (rcode)
 		goto sg_list_cleanup;
 
@@ -780,8 +778,7 @@ static int i2o_cfg_passthru(unsigned long arg)
 	u32 i = 0;
 	void *p = NULL;
 	i2o_status_block *sb;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	unsigned int iop;
 
 	if (get_user(iop, &cmd->iop) || get_user(user_msg, &cmd->msg))
@@ -793,7 +790,7 @@ static int i2o_cfg_passthru(unsigned long arg)
 		return -ENXIO;
 	}
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
 
 	sb = c->status_block.virt;
 
@@ -830,8 +827,8 @@ static int i2o_cfg_passthru(unsigned long arg)
 
 	sg_offset = (msg->u.head[0] >> 4) & 0x0f;
 
-	writel(i2o_config_driver.context, &msg->u.s.icntxt);
-	writel(i2o_cntxt_list_add(c, reply), &msg->u.s.tcntxt);
+	msg->u.s.icntxt = cpu_to_le32(i2o_config_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(i2o_cntxt_list_add(c, reply));
 
 	memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE);
 	if (sg_offset) {
@@ -894,7 +891,7 @@ static int i2o_cfg_passthru(unsigned long arg)
 		}
 	}
 
-	rcode = i2o_msg_post_wait(c, m, 60);
+	rcode = i2o_msg_post_wait(c, msg, 60);
 	if (rcode)
 		goto sg_list_cleanup;
 
diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c
index 9f1744c3933b..7a784fd60804 100644
--- a/drivers/message/i2o/i2o_scsi.c
+++ b/drivers/message/i2o/i2o_scsi.c
@@ -510,8 +510,7 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	struct i2o_controller *c;
 	struct i2o_device *i2o_dev;
 	int tid;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	/*
 	 * ENABLE_DISCONNECT
 	 * SIMPLE_TAG
@@ -519,7 +518,7 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	 */
 	u32 scsi_flags = 0x20a00000;
 	u32 sgl_offset;
-	u32 __iomem *mptr;
+	u32 *mptr;
 	u32 cmd = I2O_CMD_SCSI_EXEC << 24;
 	int rc = 0;
 
@@ -576,8 +575,8 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	 *      throw it back to the scsi layer
 	 */
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY) {
+	msg = i2o_msg_get(c);
+	if (IS_ERR(msg)) {
 		rc = SCSI_MLQUEUE_HOST_BUSY;
 		goto exit;
 	}
@@ -617,16 +616,16 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 		if (sgl_offset == SGL_OFFSET_10)
 			sgl_offset = SGL_OFFSET_12;
 		cmd = I2O_CMD_PRIVATE << 24;
-		writel(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC, mptr++);
-		writel(adpt_flags | tid, mptr++);
+		*mptr++ = cpu_to_le32(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC);
+		*mptr++ = cpu_to_le32(adpt_flags | tid);
 	}
 #endif
 
-	writel(cmd | HOST_TID << 12 | tid, &msg->u.head[1]);
-	writel(i2o_scsi_driver.context, &msg->u.s.icntxt);
+	msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid);
+	msg->u.s.icntxt = cpu_to_le32(i2o_scsi_driver.context);
 
 	/* We want the SCSI control block back */
-	writel(i2o_cntxt_list_add(c, SCpnt), &msg->u.s.tcntxt);
+	msg->u.s.tcntxt = cpu_to_le32(i2o_cntxt_list_add(c, SCpnt));
 
 	/* LSI_920_PCI_QUIRK
 	 *
@@ -649,15 +648,15 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	   }
 	 */
 
-	writel(scsi_flags | SCpnt->cmd_len, mptr++);
+	*mptr++ = cpu_to_le32(scsi_flags | SCpnt->cmd_len);
 
 	/* Write SCSI command into the message - always 16 byte block */
-	memcpy_toio(mptr, SCpnt->cmnd, 16);
+	memcpy(mptr, SCpnt->cmnd, 16);
 	mptr += 4;
 
 	if (sgl_offset != SGL_OFFSET_0) {
 		/* write size of data addressed by SGL */
-		writel(SCpnt->request_bufflen, mptr++);
+		*mptr++ = cpu_to_le32(SCpnt->request_bufflen);
 
 		/* Now fill in the SGList and command */
 		if (SCpnt->use_sg) {
@@ -676,11 +675,11 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	}
 
 	/* Stick the headers on */
-	writel(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset,
-	       &msg->u.head[0]);
+	msg->u.head[0] =
+	    cpu_to_le32(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset);
 
 	/* Queue the message */
-	i2o_msg_post(c, m);
+	i2o_msg_post(c, msg);
 
 	osm_debug("Issued %ld\n", SCpnt->serial_number);
 
@@ -688,7 +687,7 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 
       nomem:
 	rc = -ENOMEM;
-	i2o_msg_nop(c, m);
+	i2o_msg_nop(c, msg);
 
       exit:
 	return rc;
@@ -709,8 +708,7 @@ static int i2o_scsi_abort(struct scsi_cmnd *SCpnt)
 {
 	struct i2o_device *i2o_dev;
 	struct i2o_controller *c;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	int tid;
 	int status = FAILED;
 
@@ -720,16 +718,16 @@ static int i2o_scsi_abort(struct scsi_cmnd *SCpnt)
 	c = i2o_dev->iop;
 	tid = i2o_dev->lct_data.tid;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
 		return SCSI_MLQUEUE_HOST_BUSY;
 
-	writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_SCSI_ABORT << 24 | HOST_TID << 12 | tid,
-	       &msg->u.head[1]);
-	writel(i2o_cntxt_list_get_ptr(c, SCpnt), &msg->body[0]);
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SCSI_ABORT << 24 | HOST_TID << 12 | tid);
+	msg->body[0] = cpu_to_le32(i2o_cntxt_list_get_ptr(c, SCpnt));
 
-	if (i2o_msg_post_wait(c, m, I2O_TIMEOUT_SCSI_SCB_ABORT))
+	if (i2o_msg_post_wait(c, msg, I2O_TIMEOUT_SCSI_SCB_ABORT))
 		status = SUCCESS;
 
 	return status;
diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c
index 4eb53258842e..f86abb42bf89 100644
--- a/drivers/message/i2o/iop.c
+++ b/drivers/message/i2o/iop.c
@@ -46,27 +46,6 @@ static struct i2o_dma i2o_systab;
 
 static int i2o_hrt_get(struct i2o_controller *c);
 
-/**
- *	i2o_msg_nop - Returns a message which is not used
- *	@c: I2O controller from which the message was created
- *	@m: message which should be returned
- *
- *	If you fetch a message via i2o_msg_get, and can't use it, you must
- *	return the message with this function. Otherwise the message frame
- *	is lost.
- */
-void i2o_msg_nop(struct i2o_controller *c, u32 m)
-{
-	struct i2o_message __iomem *msg = i2o_msg_in_to_virt(c, m);
-
-	writel(THREE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_UTIL_NOP << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
-	writel(0, &msg->u.head[2]);
-	writel(0, &msg->u.head[3]);
-	i2o_msg_post(c, m);
-};
-
 /**
  *	i2o_msg_get_wait - obtain an I2O message from the IOP
  *	@c: I2O controller
@@ -81,22 +60,21 @@ void i2o_msg_nop(struct i2o_controller *c, u32 m)
  *	address from the read port (see the i2o spec). If no message is
  *	available returns I2O_QUEUE_EMPTY and msg is leaved untouched.
  */
-u32 i2o_msg_get_wait(struct i2o_controller *c,
-		     struct i2o_message __iomem ** msg, int wait)
+struct i2o_message *i2o_msg_get_wait(struct i2o_controller *c, int wait)
 {
 	unsigned long timeout = jiffies + wait * HZ;
-	u32 m;
+	struct i2o_message *msg;
 
-	while ((m = i2o_msg_get(c, msg)) == I2O_QUEUE_EMPTY) {
+	while (IS_ERR(msg = i2o_msg_get(c))) {
 		if (time_after(jiffies, timeout)) {
 			osm_debug("%s: Timeout waiting for message frame.\n",
 				  c->name);
-			return I2O_QUEUE_EMPTY;
+			return ERR_PTR(-ETIMEDOUT);
 		}
 		schedule_timeout_uninterruptible(1);
 	}
 
-	return m;
+	return msg;
 };
 
 #if BITS_PER_LONG == 64
@@ -301,8 +279,7 @@ struct i2o_device *i2o_iop_find_device(struct i2o_controller *c, u16 tid)
  */
 static int i2o_iop_quiesce(struct i2o_controller *c)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	i2o_status_block *sb = c->status_block.virt;
 	int rc;
 
@@ -313,16 +290,17 @@ static int i2o_iop_quiesce(struct i2o_controller *c)
 	    (sb->iop_state != ADAPTER_STATE_OPERATIONAL))
 		return 0;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_SYS_QUIESCE << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SYS_QUIESCE << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
 
 	/* Long timeout needed for quiesce if lots of devices */
-	if ((rc = i2o_msg_post_wait(c, m, 240)))
+	if ((rc = i2o_msg_post_wait(c, msg, 240)))
 		osm_info("%s: Unable to quiesce (status=%#x).\n", c->name, -rc);
 	else
 		osm_debug("%s: Quiesced.\n", c->name);
@@ -342,8 +320,7 @@ static int i2o_iop_quiesce(struct i2o_controller *c)
  */
 static int i2o_iop_enable(struct i2o_controller *c)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	i2o_status_block *sb = c->status_block.virt;
 	int rc;
 
@@ -353,16 +330,17 @@ static int i2o_iop_enable(struct i2o_controller *c)
 	if (sb->iop_state != ADAPTER_STATE_READY)
 		return -EINVAL;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_SYS_ENABLE << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SYS_ENABLE << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
 
 	/* How long of a timeout do we need? */
-	if ((rc = i2o_msg_post_wait(c, m, 240)))
+	if ((rc = i2o_msg_post_wait(c, msg, 240)))
 		osm_err("%s: Could not enable (status=%#x).\n", c->name, -rc);
 	else
 		osm_debug("%s: Enabled.\n", c->name);
@@ -413,22 +391,22 @@ static inline void i2o_iop_enable_all(void)
  */
 static int i2o_iop_clear(struct i2o_controller *c)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	int rc;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
 	/* Quiesce all IOPs first */
 	i2o_iop_quiesce_all();
 
-	writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_ADAPTER_CLEAR << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_ADAPTER_CLEAR << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
 
-	if ((rc = i2o_msg_post_wait(c, m, 30)))
+	if ((rc = i2o_msg_post_wait(c, msg, 30)))
 		osm_info("%s: Unable to clear (status=%#x).\n", c->name, -rc);
 	else
 		osm_debug("%s: Cleared.\n", c->name);
@@ -446,13 +424,13 @@ static int i2o_iop_clear(struct i2o_controller *c)
  *	Clear and (re)initialize IOP's outbound queue and post the message
  *	frames to the IOP.
  *
- *	Returns 0 on success or a negative errno code on failure.
+ *	Returns 0 on success or negative error code on failure.
  */
 static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
 {
-	volatile u8 *status = c->status.virt;
 	u32 m;
-	struct i2o_message __iomem *msg;
+	volatile u8 *status = c->status.virt;
+	struct i2o_message *msg;
 	ulong timeout;
 	int i;
 
@@ -460,23 +438,24 @@ static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
 
 	memset(c->status.virt, 0, 4);
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
-
-	writel(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6, &msg->u.head[0]);
-	writel(I2O_CMD_OUTBOUND_INIT << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
-	writel(i2o_exec_driver.context, &msg->u.s.icntxt);
-	writel(0x00000000, &msg->u.s.tcntxt);
-	writel(PAGE_SIZE, &msg->body[0]);
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_OUTBOUND_INIT << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
+	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(0x00000000);
+	msg->body[0] = cpu_to_le32(PAGE_SIZE);
 	/* Outbound msg frame size in words and Initcode */
-	writel(I2O_OUTBOUND_MSG_FRAME_SIZE << 16 | 0x80, &msg->body[1]);
-	writel(0xd0000004, &msg->body[2]);
-	writel(i2o_dma_low(c->status.phys), &msg->body[3]);
-	writel(i2o_dma_high(c->status.phys), &msg->body[4]);
+	msg->body[1] = cpu_to_le32(I2O_OUTBOUND_MSG_FRAME_SIZE << 16 | 0x80);
+	msg->body[2] = cpu_to_le32(0xd0000004);
+	msg->body[3] = cpu_to_le32(i2o_dma_low(c->status.phys));
+	msg->body[4] = cpu_to_le32(i2o_dma_high(c->status.phys));
 
-	i2o_msg_post(c, m);
+	i2o_msg_post(c, msg);
 
 	timeout = jiffies + I2O_TIMEOUT_INIT_OUTBOUND_QUEUE * HZ;
 	while (*status <= I2O_CMD_IN_PROGRESS) {
@@ -511,34 +490,34 @@ static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
 static int i2o_iop_reset(struct i2o_controller *c)
 {
 	volatile u8 *status = c->status.virt;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	unsigned long timeout;
 	i2o_status_block *sb = c->status_block.virt;
 	int rc = 0;
 
 	osm_debug("%s: Resetting controller\n", c->name);
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
 	memset(c->status_block.virt, 0, 8);
 
 	/* Quiesce all IOPs first */
 	i2o_iop_quiesce_all();
 
-	writel(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_ADAPTER_RESET << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
-	writel(i2o_exec_driver.context, &msg->u.s.icntxt);
-	writel(0, &msg->u.s.tcntxt);	//FIXME: use reasonable transaction context
-	writel(0, &msg->body[0]);
-	writel(0, &msg->body[1]);
-	writel(i2o_dma_low(c->status.phys), &msg->body[2]);
-	writel(i2o_dma_high(c->status.phys), &msg->body[3]);
+	msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_ADAPTER_RESET << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
+	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(0x00000000);
+	msg->body[0] = cpu_to_le32(0x00000000);
+	msg->body[1] = cpu_to_le32(0x00000000);
+	msg->body[2] = cpu_to_le32(i2o_dma_low(c->status.phys));
+	msg->body[3] = cpu_to_le32(i2o_dma_high(c->status.phys));
 
-	i2o_msg_post(c, m);
+	i2o_msg_post(c, msg);
 
 	/* Wait for a reply */
 	timeout = jiffies + I2O_TIMEOUT_RESET * HZ;
@@ -567,18 +546,15 @@ static int i2o_iop_reset(struct i2o_controller *c)
 		osm_debug("%s: Reset in progress, waiting for reboot...\n",
 			  c->name);
 
-		m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_RESET);
-		while (m == I2O_QUEUE_EMPTY) {
+		while (IS_ERR(msg = i2o_msg_get_wait(c, I2O_TIMEOUT_RESET))) {
 			if (time_after(jiffies, timeout)) {
 				osm_err("%s: IOP reset timeout.\n", c->name);
-				rc = -ETIMEDOUT;
+				rc = PTR_ERR(msg);
 				goto exit;
 			}
 			schedule_timeout_uninterruptible(1);
-
-			m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_RESET);
 		}
-		i2o_msg_nop(c, m);
+		i2o_msg_nop(c, msg);
 
 		/* from here all quiesce commands are safe */
 		c->no_quiesce = 0;
@@ -686,8 +662,7 @@ static int i2o_iop_activate(struct i2o_controller *c)
  */
 static int i2o_iop_systab_set(struct i2o_controller *c)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	i2o_status_block *sb = c->status_block.virt;
 	struct device *dev = &c->pdev->dev;
 	struct resource *root;
@@ -735,20 +710,21 @@ static int i2o_iop_systab_set(struct i2o_controller *c)
 		}
 	}
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
 	i2o_systab.phys = dma_map_single(dev, i2o_systab.virt, i2o_systab.len,
 					 PCI_DMA_TODEVICE);
 	if (!i2o_systab.phys) {
-		i2o_msg_nop(c, m);
+		i2o_msg_nop(c, msg);
 		return -ENOMEM;
 	}
 
-	writel(I2O_MESSAGE_SIZE(12) | SGL_OFFSET_6, &msg->u.head[0]);
-	writel(I2O_CMD_SYS_TAB_SET << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
+	msg->u.head[0] = cpu_to_le32(I2O_MESSAGE_SIZE(12) | SGL_OFFSET_6);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SYS_TAB_SET << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
 
 	/*
 	 * Provide three SGL-elements:
@@ -760,16 +736,16 @@ static int i2o_iop_systab_set(struct i2o_controller *c)
 	 * same table to everyone. We have to go remap it for them all
 	 */
 
-	writel(c->unit + 2, &msg->body[0]);
-	writel(0, &msg->body[1]);
-	writel(0x54000000 | i2o_systab.len, &msg->body[2]);
-	writel(i2o_systab.phys, &msg->body[3]);
-	writel(0x54000000 | sb->current_mem_size, &msg->body[4]);
-	writel(sb->current_mem_base, &msg->body[5]);
-	writel(0xd4000000 | sb->current_io_size, &msg->body[6]);
-	writel(sb->current_io_base, &msg->body[6]);
+	msg->body[0] = cpu_to_le32(c->unit + 2);
+	msg->body[1] = cpu_to_le32(0x00000000);
+	msg->body[2] = cpu_to_le32(0x54000000 | i2o_systab.len);
+	msg->body[3] = cpu_to_le32(i2o_systab.phys);
+	msg->body[4] = cpu_to_le32(0x54000000 | sb->current_mem_size);
+	msg->body[5] = cpu_to_le32(sb->current_mem_base);
+	msg->body[6] = cpu_to_le32(0xd4000000 | sb->current_io_size);
+	msg->body[6] = cpu_to_le32(sb->current_io_base);
 
-	rc = i2o_msg_post_wait(c, m, 120);
+	rc = i2o_msg_post_wait(c, msg, 120);
 
 	dma_unmap_single(dev, i2o_systab.phys, i2o_systab.len,
 			 PCI_DMA_TODEVICE);
@@ -952,30 +928,30 @@ static int i2o_parse_hrt(struct i2o_controller *c)
  */
 int i2o_status_get(struct i2o_controller *c)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	volatile u8 *status_block;
 	unsigned long timeout;
 
 	status_block = (u8 *) c->status_block.virt;
 	memset(c->status_block.virt, 0, sizeof(i2o_status_block));
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(NINE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_STATUS_GET << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
-	writel(i2o_exec_driver.context, &msg->u.s.icntxt);
-	writel(0, &msg->u.s.tcntxt);	// FIXME: use resonable transaction context
-	writel(0, &msg->body[0]);
-	writel(0, &msg->body[1]);
-	writel(i2o_dma_low(c->status_block.phys), &msg->body[2]);
-	writel(i2o_dma_high(c->status_block.phys), &msg->body[3]);
-	writel(sizeof(i2o_status_block), &msg->body[4]);	/* always 88 bytes */
+	msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_STATUS_GET << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
+	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(0x00000000);
+	msg->body[0] = cpu_to_le32(0x00000000);
+	msg->body[1] = cpu_to_le32(0x00000000);
+	msg->body[2] = cpu_to_le32(i2o_dma_low(c->status_block.phys));
+	msg->body[3] = cpu_to_le32(i2o_dma_high(c->status_block.phys));
+	msg->body[4] = cpu_to_le32(sizeof(i2o_status_block));	/* always 88 bytes */
 
-	i2o_msg_post(c, m);
+	i2o_msg_post(c, msg);
 
 	/* Wait for a reply */
 	timeout = jiffies + I2O_TIMEOUT_STATUS_GET * HZ;
@@ -1013,20 +989,20 @@ static int i2o_hrt_get(struct i2o_controller *c)
 	struct device *dev = &c->pdev->dev;
 
 	for (i = 0; i < I2O_HRT_GET_TRIES; i++) {
-		struct i2o_message __iomem *msg;
-		u32 m;
+		struct i2o_message *msg;
 
-		m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-		if (m == I2O_QUEUE_EMPTY)
-			return -ETIMEDOUT;
+		msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+		if (IS_ERR(msg))
+			return PTR_ERR(msg);
 
-		writel(SIX_WORD_MSG_SIZE | SGL_OFFSET_4, &msg->u.head[0]);
-		writel(I2O_CMD_HRT_GET << 24 | HOST_TID << 12 | ADAPTER_TID,
-		       &msg->u.head[1]);
-		writel(0xd0000000 | c->hrt.len, &msg->body[0]);
-		writel(c->hrt.phys, &msg->body[1]);
+		msg->u.head[0] = cpu_to_le32(SIX_WORD_MSG_SIZE | SGL_OFFSET_4);
+		msg->u.head[1] =
+		    cpu_to_le32(I2O_CMD_HRT_GET << 24 | HOST_TID << 12 |
+				ADAPTER_TID);
+		msg->body[0] = cpu_to_le32(0xd0000000 | c->hrt.len);
+		msg->body[1] = cpu_to_le32(c->hrt.phys);
 
-		rc = i2o_msg_post_wait_mem(c, m, 20, &c->hrt);
+		rc = i2o_msg_post_wait_mem(c, msg, 20, &c->hrt);
 
 		if (rc < 0) {
 			osm_err("%s: Unable to get HRT (status=%#x)\n", c->name,
@@ -1056,6 +1032,7 @@ static int i2o_hrt_get(struct i2o_controller *c)
  */
 void i2o_iop_free(struct i2o_controller *c)
 {
+	i2o_pool_free(&c->in_msg);
 	kfree(c);
 };
 
@@ -1080,7 +1057,7 @@ static struct class *i2o_controller_class;
  *	i2o_iop_alloc - Allocate and initialize a i2o_controller struct
  *
  *	Allocate the necessary memory for a i2o_controller struct and
- *	initialize the lists.
+ *	initialize the lists and message mempool.
  *
  *	Returns a pointer to the I2O controller or a negative error code on
  *	failure.
@@ -1089,6 +1066,7 @@ struct i2o_controller *i2o_iop_alloc(void)
 {
 	static int unit = 0;	/* 0 and 1 are NULL IOP and Local Host */
 	struct i2o_controller *c;
+	char poolname[32];
 
 	c = kmalloc(sizeof(*c), GFP_KERNEL);
 	if (!c) {
@@ -1098,11 +1076,20 @@ struct i2o_controller *i2o_iop_alloc(void)
 	}
 	memset(c, 0, sizeof(*c));
 
+	c->unit = unit++;
+	sprintf(c->name, "iop%d", c->unit);
+
+	snprintf(poolname, sizeof(poolname), "i2o_%s_msg_inpool", c->name);
+	if (i2o_pool_alloc
+	    (&c->in_msg, poolname, I2O_INBOUND_MSG_FRAME_SIZE * 4,
+	     I2O_MSG_INPOOL_MIN)) {
+		kfree(c);
+		return ERR_PTR(-ENOMEM);
+	};
+
 	INIT_LIST_HEAD(&c->devices);
 	spin_lock_init(&c->lock);
 	init_MUTEX(&c->lct_lock);
-	c->unit = unit++;
-	sprintf(c->name, "iop%d", c->unit);
 
 	device_initialize(&c->device);
 
@@ -1199,28 +1186,27 @@ int i2o_iop_add(struct i2o_controller *c)
  *	is waited for, or expected. If you do not want further notifications,
  *	call the i2o_event_register again with a evt_mask of 0.
  *
- *	Returns 0 on success or -ETIMEDOUT if no message could be fetched for
- *	sending the request.
+ *	Returns 0 on success or negative error code on failure.
  */
 int i2o_event_register(struct i2o_device *dev, struct i2o_driver *drv,
 		       int tcntxt, u32 evt_mask)
 {
 	struct i2o_controller *c = dev->iop;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 | dev->lct_data.
-	       tid, &msg->u.head[1]);
-	writel(drv->context, &msg->u.s.icntxt);
-	writel(tcntxt, &msg->u.s.tcntxt);
-	writel(evt_mask, &msg->body[0]);
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 | dev->
+			lct_data.tid);
+	msg->u.s.icntxt = cpu_to_le32(drv->context);
+	msg->u.s.tcntxt = cpu_to_le32(tcntxt);
+	msg->body[0] = cpu_to_le32(evt_mask);
 
-	i2o_msg_post(c, m);
+	i2o_msg_post(c, msg);
 
 	return 0;
 };
diff --git a/drivers/message/i2o/pci.c b/drivers/message/i2o/pci.c
index ee7075fa1ec3..329d482eee81 100644
--- a/drivers/message/i2o/pci.c
+++ b/drivers/message/i2o/pci.c
@@ -483,4 +483,5 @@ void __exit i2o_pci_exit(void)
 {
 	pci_unregister_driver(&i2o_pci_driver);
 };
+
 MODULE_DEVICE_TABLE(pci, i2o_pci_ids);
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index d79c8a4bc4f8..9e359a981221 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -30,6 +30,7 @@
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>	/* work_struct */
+#include <linux/mempool.h>
 
 #include <asm/io.h>
 #include <asm/semaphore.h>	/* Needed for MUTEX init macros */
@@ -38,1091 +39,1219 @@
 #define I2O_QUEUE_EMPTY		0xffffffff
 
 /*
- *	Message structures
+ *	Cache strategies
  */
-struct i2o_message {
-	union {
-		struct {
-			u8 version_offset;
-			u8 flags;
-			u16 size;
-			u32 target_tid:12;
-			u32 init_tid:12;
-			u32 function:8;
-			u32 icntxt;	/* initiator context */
-			u32 tcntxt;	/* transaction context */
-		} s;
-		u32 head[4];
-	} u;
-	/* List follows */
-	u32 body[0];
-};
 
-/*
- *	Each I2O device entity has one of these. There is one per device.
+/*	The NULL strategy leaves everything up to the controller. This tends to be a
+ *	pessimal but functional choice.
  */
-struct i2o_device {
-	i2o_lct_entry lct_data;	/* Device LCT information */
-
-	struct i2o_controller *iop;	/* Controlling IOP */
-	struct list_head list;	/* node in IOP devices list */
-
-	struct device device;
-
-	struct semaphore lock;	/* device lock */
-};
+#define CACHE_NULL		0
+/*	Prefetch data when reading. We continually attempt to load the next 32 sectors
+ *	into the controller cache.
+ */
+#define CACHE_PREFETCH		1
+/*	Prefetch data when reading. We sometimes attempt to load the next 32 sectors
+ *	into the controller cache. When an I/O is less <= 8K we assume its probably
+ *	not sequential and don't prefetch (default)
+ */
+#define CACHE_SMARTFETCH	2
+/*	Data is written to the cache and then out on to the disk. The I/O must be
+ *	physically on the medium before the write is acknowledged (default without
+ *	NVRAM)
+ */
+#define CACHE_WRITETHROUGH	17
+/*	Data is written to the cache and then out on to the disk. The controller
+ *	is permitted to write back the cache any way it wants. (default if battery
+ *	backed NVRAM is present). It can be useful to set this for swap regardless of
+ *	battery state.
+ */
+#define CACHE_WRITEBACK		18
+/*	Optimise for under powered controllers, especially on RAID1 and RAID0. We
+ *	write large I/O's directly to disk bypassing the cache to avoid the extra
+ *	memory copy hits. Small writes are writeback cached
+ */
+#define CACHE_SMARTBACK		19
+/*	Optimise for under powered controllers, especially on RAID1 and RAID0. We
+ *	write large I/O's directly to disk bypassing the cache to avoid the extra
+ *	memory copy hits. Small writes are writethrough cached. Suitable for devices
+ *	lacking battery backup
+ */
+#define CACHE_SMARTTHROUGH	20
 
 /*
- *	Event structure provided to the event handling function
+ *	Ioctl structures
  */
-struct i2o_event {
-	struct work_struct work;
-	struct i2o_device *i2o_dev;	/* I2O device pointer from which the
-					   event reply was initiated */
-	u16 size;		/* Size of data in 32-bit words */
-	u32 tcntxt;		/* Transaction context used at
-				   registration */
-	u32 event_indicator;	/* Event indicator from reply */
-	u32 data[0];		/* Event data from reply */
-};
+
+#define 	BLKI2OGRSTRAT	_IOR('2', 1, int)
+#define 	BLKI2OGWSTRAT	_IOR('2', 2, int)
+#define 	BLKI2OSRSTRAT	_IOW('2', 3, int)
+#define 	BLKI2OSWSTRAT	_IOW('2', 4, int)
 
 /*
- *	I2O classes which could be handled by the OSM
+ *	I2O Function codes
  */
-struct i2o_class_id {
-	u16 class_id:12;
-};
 
 /*
- *	I2O driver structure for OSMs
+ *	Executive Class
  */
-struct i2o_driver {
-	char *name;		/* OSM name */
-	int context;		/* Low 8 bits of the transaction info */
-	struct i2o_class_id *classes;	/* I2O classes that this OSM handles */
-
-	/* Message reply handler */
-	int (*reply) (struct i2o_controller *, u32, struct i2o_message *);
-
-	/* Event handler */
-	void (*event) (struct i2o_event *);
-
-	struct workqueue_struct *event_queue;	/* Event queue */
-
-	struct device_driver driver;
-
-	/* notification of changes */
-	void (*notify_controller_add) (struct i2o_controller *);
-	void (*notify_controller_remove) (struct i2o_controller *);
-	void (*notify_device_add) (struct i2o_device *);
-	void (*notify_device_remove) (struct i2o_device *);
-
-	struct semaphore lock;
-};
+#define	I2O_CMD_ADAPTER_ASSIGN		0xB3
+#define	I2O_CMD_ADAPTER_READ		0xB2
+#define	I2O_CMD_ADAPTER_RELEASE		0xB5
+#define	I2O_CMD_BIOS_INFO_SET		0xA5
+#define	I2O_CMD_BOOT_DEVICE_SET		0xA7
+#define	I2O_CMD_CONFIG_VALIDATE		0xBB
+#define	I2O_CMD_CONN_SETUP		0xCA
+#define	I2O_CMD_DDM_DESTROY		0xB1
+#define	I2O_CMD_DDM_ENABLE		0xD5
+#define	I2O_CMD_DDM_QUIESCE		0xC7
+#define	I2O_CMD_DDM_RESET		0xD9
+#define	I2O_CMD_DDM_SUSPEND		0xAF
+#define	I2O_CMD_DEVICE_ASSIGN		0xB7
+#define	I2O_CMD_DEVICE_RELEASE		0xB9
+#define	I2O_CMD_HRT_GET			0xA8
+#define	I2O_CMD_ADAPTER_CLEAR		0xBE
+#define	I2O_CMD_ADAPTER_CONNECT		0xC9
+#define	I2O_CMD_ADAPTER_RESET		0xBD
+#define	I2O_CMD_LCT_NOTIFY		0xA2
+#define	I2O_CMD_OUTBOUND_INIT		0xA1
+#define	I2O_CMD_PATH_ENABLE		0xD3
+#define	I2O_CMD_PATH_QUIESCE		0xC5
+#define	I2O_CMD_PATH_RESET		0xD7
+#define	I2O_CMD_STATIC_MF_CREATE	0xDD
+#define	I2O_CMD_STATIC_MF_RELEASE	0xDF
+#define	I2O_CMD_STATUS_GET		0xA0
+#define	I2O_CMD_SW_DOWNLOAD		0xA9
+#define	I2O_CMD_SW_UPLOAD		0xAB
+#define	I2O_CMD_SW_REMOVE		0xAD
+#define	I2O_CMD_SYS_ENABLE		0xD1
+#define	I2O_CMD_SYS_MODIFY		0xC1
+#define	I2O_CMD_SYS_QUIESCE		0xC3
+#define	I2O_CMD_SYS_TAB_SET		0xA3
 
 /*
- *	Contains DMA mapped address information
+ * Utility Class
  */
-struct i2o_dma {
-	void *virt;
-	dma_addr_t phys;
-	size_t len;
-};
+#define I2O_CMD_UTIL_NOP		0x00
+#define I2O_CMD_UTIL_ABORT		0x01
+#define I2O_CMD_UTIL_CLAIM		0x09
+#define I2O_CMD_UTIL_RELEASE		0x0B
+#define I2O_CMD_UTIL_PARAMS_GET		0x06
+#define I2O_CMD_UTIL_PARAMS_SET		0x05
+#define I2O_CMD_UTIL_EVT_REGISTER	0x13
+#define I2O_CMD_UTIL_EVT_ACK		0x14
+#define I2O_CMD_UTIL_CONFIG_DIALOG	0x10
+#define I2O_CMD_UTIL_DEVICE_RESERVE	0x0D
+#define I2O_CMD_UTIL_DEVICE_RELEASE	0x0F
+#define I2O_CMD_UTIL_LOCK		0x17
+#define I2O_CMD_UTIL_LOCK_RELEASE	0x19
+#define I2O_CMD_UTIL_REPLY_FAULT_NOTIFY	0x15
 
 /*
- *	Contains IO mapped address information
+ * SCSI Host Bus Adapter Class
  */
-struct i2o_io {
-	void __iomem *virt;
-	unsigned long phys;
-	unsigned long len;
-};
+#define I2O_CMD_SCSI_EXEC		0x81
+#define I2O_CMD_SCSI_ABORT		0x83
+#define I2O_CMD_SCSI_BUSRESET		0x27
 
 /*
- *	Context queue entry, used for 32-bit context on 64-bit systems
+ * Bus Adapter Class
  */
-struct i2o_context_list_element {
-	struct list_head list;
-	u32 context;
-	void *ptr;
-	unsigned long timestamp;
-};
+#define I2O_CMD_BUS_ADAPTER_RESET	0x85
+#define I2O_CMD_BUS_RESET		0x87
+#define I2O_CMD_BUS_SCAN		0x89
+#define I2O_CMD_BUS_QUIESCE		0x8b
 
 /*
- * Each I2O controller has one of these objects
+ * Random Block Storage Class
  */
-struct i2o_controller {
-	char name[16];
-	int unit;
-	int type;
+#define I2O_CMD_BLOCK_READ		0x30
+#define I2O_CMD_BLOCK_WRITE		0x31
+#define I2O_CMD_BLOCK_CFLUSH		0x37
+#define I2O_CMD_BLOCK_MLOCK		0x49
+#define I2O_CMD_BLOCK_MUNLOCK		0x4B
+#define I2O_CMD_BLOCK_MMOUNT		0x41
+#define I2O_CMD_BLOCK_MEJECT		0x43
+#define I2O_CMD_BLOCK_POWER		0x70
 
-	struct pci_dev *pdev;	/* PCI device */
+#define I2O_CMD_PRIVATE			0xFF
 
-	unsigned int promise:1;	/* Promise controller */
-	unsigned int adaptec:1;	/* DPT / Adaptec controller */
-	unsigned int raptor:1;	/* split bar */
-	unsigned int no_quiesce:1;	/* dont quiesce before reset */
-	unsigned int short_req:1;	/* use small block sizes */
-	unsigned int limit_sectors:1;	/* limit number of sectors / request */
-	unsigned int pae_support:1;	/* controller has 64-bit SGL support */
+/* Command status values  */
 
-	struct list_head devices;	/* list of I2O devices */
-	struct list_head list;	/* Controller list */
+#define I2O_CMD_IN_PROGRESS	0x01
+#define I2O_CMD_REJECTED	0x02
+#define I2O_CMD_FAILED		0x03
+#define I2O_CMD_COMPLETED	0x04
 
-	void __iomem *in_port;	/* Inbout port address */
-	void __iomem *out_port;	/* Outbound port address */
-	void __iomem *irq_status;	/* Interrupt status register address */
-	void __iomem *irq_mask;	/* Interrupt mask register address */
+/* I2O API function return values */
 
-	/* Dynamic LCT related data */
+#define I2O_RTN_NO_ERROR			0
+#define I2O_RTN_NOT_INIT			1
+#define I2O_RTN_FREE_Q_EMPTY			2
+#define I2O_RTN_TCB_ERROR			3
+#define I2O_RTN_TRANSACTION_ERROR		4
+#define I2O_RTN_ADAPTER_ALREADY_INIT		5
+#define I2O_RTN_MALLOC_ERROR			6
+#define I2O_RTN_ADPTR_NOT_REGISTERED		7
+#define I2O_RTN_MSG_REPLY_TIMEOUT		8
+#define I2O_RTN_NO_STATUS			9
+#define I2O_RTN_NO_FIRM_VER			10
+#define	I2O_RTN_NO_LINK_SPEED			11
 
-	struct i2o_dma status;	/* IOP status block */
+/* Reply message status defines for all messages */
 
-	struct i2o_dma hrt;	/* HW Resource Table */
-	i2o_lct *lct;		/* Logical Config Table */
-	struct i2o_dma dlct;	/* Temp LCT */
-	struct semaphore lct_lock;	/* Lock for LCT updates */
-	struct i2o_dma status_block;	/* IOP status block */
+#define I2O_REPLY_STATUS_SUCCESS                    	0x00
+#define I2O_REPLY_STATUS_ABORT_DIRTY                	0x01
+#define I2O_REPLY_STATUS_ABORT_NO_DATA_TRANSFER     	0x02
+#define	I2O_REPLY_STATUS_ABORT_PARTIAL_TRANSFER		0x03
+#define	I2O_REPLY_STATUS_ERROR_DIRTY			0x04
+#define	I2O_REPLY_STATUS_ERROR_NO_DATA_TRANSFER		0x05
+#define	I2O_REPLY_STATUS_ERROR_PARTIAL_TRANSFER		0x06
+#define	I2O_REPLY_STATUS_PROCESS_ABORT_DIRTY		0x08
+#define	I2O_REPLY_STATUS_PROCESS_ABORT_NO_DATA_TRANSFER	0x09
+#define	I2O_REPLY_STATUS_PROCESS_ABORT_PARTIAL_TRANSFER	0x0A
+#define	I2O_REPLY_STATUS_TRANSACTION_ERROR		0x0B
+#define	I2O_REPLY_STATUS_PROGRESS_REPORT		0x80
 
-	struct i2o_io base;	/* controller messaging unit */
-	struct i2o_io in_queue;	/* inbound message queue Host->IOP */
-	struct i2o_dma out_queue;	/* outbound message queue IOP->Host */
+/* Status codes and Error Information for Parameter functions */
 
-	unsigned int battery:1;	/* Has a battery backup */
-	unsigned int io_alloc:1;	/* An I/O resource was allocated */
-	unsigned int mem_alloc:1;	/* A memory resource was allocated */
+#define I2O_PARAMS_STATUS_SUCCESS		0x00
+#define I2O_PARAMS_STATUS_BAD_KEY_ABORT		0x01
+#define I2O_PARAMS_STATUS_BAD_KEY_CONTINUE   	0x02
+#define I2O_PARAMS_STATUS_BUFFER_FULL		0x03
+#define I2O_PARAMS_STATUS_BUFFER_TOO_SMALL	0x04
+#define I2O_PARAMS_STATUS_FIELD_UNREADABLE	0x05
+#define I2O_PARAMS_STATUS_FIELD_UNWRITEABLE	0x06
+#define I2O_PARAMS_STATUS_INSUFFICIENT_FIELDS	0x07
+#define I2O_PARAMS_STATUS_INVALID_GROUP_ID	0x08
+#define I2O_PARAMS_STATUS_INVALID_OPERATION	0x09
+#define I2O_PARAMS_STATUS_NO_KEY_FIELD		0x0A
+#define I2O_PARAMS_STATUS_NO_SUCH_FIELD		0x0B
+#define I2O_PARAMS_STATUS_NON_DYNAMIC_GROUP	0x0C
+#define I2O_PARAMS_STATUS_OPERATION_ERROR	0x0D
+#define I2O_PARAMS_STATUS_SCALAR_ERROR		0x0E
+#define I2O_PARAMS_STATUS_TABLE_ERROR		0x0F
+#define I2O_PARAMS_STATUS_WRONG_GROUP_TYPE	0x10
 
-	struct resource io_resource;	/* I/O resource allocated to the IOP */
-	struct resource mem_resource;	/* Mem resource allocated to the IOP */
+/* DetailedStatusCode defines for Executive, DDM, Util and Transaction error
+ * messages: Table 3-2 Detailed Status Codes.*/
 
-	struct device device;
-	struct class_device *classdev;	/* I2O controller class device */
-	struct i2o_device *exec;	/* Executive */
-#if BITS_PER_LONG == 64
-	spinlock_t context_list_lock;	/* lock for context_list */
-	atomic_t context_list_counter;	/* needed for unique contexts */
-	struct list_head context_list;	/* list of context id's
-					   and pointers */
-#endif
-	spinlock_t lock;	/* lock for controller
-				   configuration */
+#define I2O_DSC_SUCCESS                        0x0000
+#define I2O_DSC_BAD_KEY                        0x0002
+#define I2O_DSC_TCL_ERROR                      0x0003
+#define I2O_DSC_REPLY_BUFFER_FULL              0x0004
+#define I2O_DSC_NO_SUCH_PAGE                   0x0005
+#define I2O_DSC_INSUFFICIENT_RESOURCE_SOFT     0x0006
+#define I2O_DSC_INSUFFICIENT_RESOURCE_HARD     0x0007
+#define I2O_DSC_CHAIN_BUFFER_TOO_LARGE         0x0009
+#define I2O_DSC_UNSUPPORTED_FUNCTION           0x000A
+#define I2O_DSC_DEVICE_LOCKED                  0x000B
+#define I2O_DSC_DEVICE_RESET                   0x000C
+#define I2O_DSC_INAPPROPRIATE_FUNCTION         0x000D
+#define I2O_DSC_INVALID_INITIATOR_ADDRESS      0x000E
+#define I2O_DSC_INVALID_MESSAGE_FLAGS          0x000F
+#define I2O_DSC_INVALID_OFFSET                 0x0010
+#define I2O_DSC_INVALID_PARAMETER              0x0011
+#define I2O_DSC_INVALID_REQUEST                0x0012
+#define I2O_DSC_INVALID_TARGET_ADDRESS         0x0013
+#define I2O_DSC_MESSAGE_TOO_LARGE              0x0014
+#define I2O_DSC_MESSAGE_TOO_SMALL              0x0015
+#define I2O_DSC_MISSING_PARAMETER              0x0016
+#define I2O_DSC_TIMEOUT                        0x0017
+#define I2O_DSC_UNKNOWN_ERROR                  0x0018
+#define I2O_DSC_UNKNOWN_FUNCTION               0x0019
+#define I2O_DSC_UNSUPPORTED_VERSION            0x001A
+#define I2O_DSC_DEVICE_BUSY                    0x001B
+#define I2O_DSC_DEVICE_NOT_AVAILABLE           0x001C
 
-	void *driver_data[I2O_MAX_DRIVERS];	/* storage for drivers */
-};
+/* DetailedStatusCode defines for Block Storage Operation: Table 6-7 Detailed
+   Status Codes.*/
 
-/*
- * I2O System table entry
- *
- * The system table contains information about all the IOPs in the
- * system.  It is sent to all IOPs so that they can create peer2peer
- * connections between them.
- */
-struct i2o_sys_tbl_entry {
-	u16 org_id;
-	u16 reserved1;
-	u32 iop_id:12;
-	u32 reserved2:20;
-	u16 seg_num:12;
-	u16 i2o_version:4;
-	u8 iop_state;
-	u8 msg_type;
-	u16 frame_size;
-	u16 reserved3;
-	u32 last_changed;
-	u32 iop_capabilities;
-	u32 inbound_low;
-	u32 inbound_high;
-};
+#define I2O_BSA_DSC_SUCCESS               0x0000
+#define I2O_BSA_DSC_MEDIA_ERROR           0x0001
+#define I2O_BSA_DSC_ACCESS_ERROR          0x0002
+#define I2O_BSA_DSC_DEVICE_FAILURE        0x0003
+#define I2O_BSA_DSC_DEVICE_NOT_READY      0x0004
+#define I2O_BSA_DSC_MEDIA_NOT_PRESENT     0x0005
+#define I2O_BSA_DSC_MEDIA_LOCKED          0x0006
+#define I2O_BSA_DSC_MEDIA_FAILURE         0x0007
+#define I2O_BSA_DSC_PROTOCOL_FAILURE      0x0008
+#define I2O_BSA_DSC_BUS_FAILURE           0x0009
+#define I2O_BSA_DSC_ACCESS_VIOLATION      0x000A
+#define I2O_BSA_DSC_WRITE_PROTECTED       0x000B
+#define I2O_BSA_DSC_DEVICE_RESET          0x000C
+#define I2O_BSA_DSC_VOLUME_CHANGED        0x000D
+#define I2O_BSA_DSC_TIMEOUT               0x000E
 
-struct i2o_sys_tbl {
-	u8 num_entries;
-	u8 version;
-	u16 reserved1;
-	u32 change_ind;
-	u32 reserved2;
-	u32 reserved3;
-	struct i2o_sys_tbl_entry iops[0];
-};
+/* FailureStatusCodes, Table 3-3 Message Failure Codes */
 
-extern struct list_head i2o_controllers;
+#define I2O_FSC_TRANSPORT_SERVICE_SUSPENDED             0x81
+#define I2O_FSC_TRANSPORT_SERVICE_TERMINATED            0x82
+#define I2O_FSC_TRANSPORT_CONGESTION                    0x83
+#define I2O_FSC_TRANSPORT_FAILURE                       0x84
+#define I2O_FSC_TRANSPORT_STATE_ERROR                   0x85
+#define I2O_FSC_TRANSPORT_TIME_OUT                      0x86
+#define I2O_FSC_TRANSPORT_ROUTING_FAILURE               0x87
+#define I2O_FSC_TRANSPORT_INVALID_VERSION               0x88
+#define I2O_FSC_TRANSPORT_INVALID_OFFSET                0x89
+#define I2O_FSC_TRANSPORT_INVALID_MSG_FLAGS             0x8A
+#define I2O_FSC_TRANSPORT_FRAME_TOO_SMALL               0x8B
+#define I2O_FSC_TRANSPORT_FRAME_TOO_LARGE               0x8C
+#define I2O_FSC_TRANSPORT_INVALID_TARGET_ID             0x8D
+#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_ID          0x8E
+#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_CONTEXT     0x8F
+#define I2O_FSC_TRANSPORT_UNKNOWN_FAILURE               0xFF
 
-/* Message functions */
-static inline u32 i2o_msg_get(struct i2o_controller *,
-			      struct i2o_message __iomem **);
-extern u32 i2o_msg_get_wait(struct i2o_controller *,
-			    struct i2o_message __iomem **, int);
-static inline void i2o_msg_post(struct i2o_controller *, u32);
-static inline int i2o_msg_post_wait(struct i2o_controller *, u32,
-				    unsigned long);
-extern int i2o_msg_post_wait_mem(struct i2o_controller *, u32, unsigned long,
-				 struct i2o_dma *);
-extern void i2o_msg_nop(struct i2o_controller *, u32);
-static inline void i2o_flush_reply(struct i2o_controller *, u32);
+/* Device Claim Types */
+#define	I2O_CLAIM_PRIMARY					0x01000000
+#define	I2O_CLAIM_MANAGEMENT					0x02000000
+#define	I2O_CLAIM_AUTHORIZED					0x03000000
+#define	I2O_CLAIM_SECONDARY					0x04000000
 
-/* IOP functions */
-extern int i2o_status_get(struct i2o_controller *);
+/* Message header defines for VersionOffset */
+#define I2OVER15	0x0001
+#define I2OVER20	0x0002
 
-extern int i2o_event_register(struct i2o_device *, struct i2o_driver *, int,
-			      u32);
-extern struct i2o_device *i2o_iop_find_device(struct i2o_controller *, u16);
-extern struct i2o_controller *i2o_find_iop(int);
+/* Default is 1.5 */
+#define I2OVERSION	I2OVER15
 
-/* Functions needed for handling 64-bit pointers in 32-bit context */
-#if BITS_PER_LONG == 64
-extern u32 i2o_cntxt_list_add(struct i2o_controller *, void *);
-extern void *i2o_cntxt_list_get(struct i2o_controller *, u32);
-extern u32 i2o_cntxt_list_remove(struct i2o_controller *, void *);
-extern u32 i2o_cntxt_list_get_ptr(struct i2o_controller *, void *);
+#define SGL_OFFSET_0    I2OVERSION
+#define SGL_OFFSET_4    (0x0040 | I2OVERSION)
+#define SGL_OFFSET_5    (0x0050 | I2OVERSION)
+#define SGL_OFFSET_6    (0x0060 | I2OVERSION)
+#define SGL_OFFSET_7    (0x0070 | I2OVERSION)
+#define SGL_OFFSET_8    (0x0080 | I2OVERSION)
+#define SGL_OFFSET_9    (0x0090 | I2OVERSION)
+#define SGL_OFFSET_10   (0x00A0 | I2OVERSION)
+#define SGL_OFFSET_11   (0x00B0 | I2OVERSION)
+#define SGL_OFFSET_12   (0x00C0 | I2OVERSION)
+#define SGL_OFFSET(x)   (((x)<<4) | I2OVERSION)
 
-static inline u32 i2o_ptr_low(void *ptr)
-{
-	return (u32) (u64) ptr;
-};
+/* Transaction Reply Lists (TRL) Control Word structure */
+#define TRL_SINGLE_FIXED_LENGTH		0x00
+#define TRL_SINGLE_VARIABLE_LENGTH	0x40
+#define TRL_MULTIPLE_FIXED_LENGTH	0x80
 
-static inline u32 i2o_ptr_high(void *ptr)
-{
-	return (u32) ((u64) ptr >> 32);
-};
+ /* msg header defines for MsgFlags */
+#define MSG_STATIC	0x0100
+#define MSG_64BIT_CNTXT	0x0200
+#define MSG_MULTI_TRANS	0x1000
+#define MSG_FAIL	0x2000
+#define MSG_FINAL	0x4000
+#define MSG_REPLY	0x8000
 
-static inline u32 i2o_dma_low(dma_addr_t dma_addr)
-{
-	return (u32) (u64) dma_addr;
-};
+ /* minimum size msg */
+#define THREE_WORD_MSG_SIZE	0x00030000
+#define FOUR_WORD_MSG_SIZE	0x00040000
+#define FIVE_WORD_MSG_SIZE	0x00050000
+#define SIX_WORD_MSG_SIZE	0x00060000
+#define SEVEN_WORD_MSG_SIZE	0x00070000
+#define EIGHT_WORD_MSG_SIZE	0x00080000
+#define NINE_WORD_MSG_SIZE	0x00090000
+#define TEN_WORD_MSG_SIZE	0x000A0000
+#define ELEVEN_WORD_MSG_SIZE	0x000B0000
+#define I2O_MESSAGE_SIZE(x)	((x)<<16)
 
-static inline u32 i2o_dma_high(dma_addr_t dma_addr)
-{
-	return (u32) ((u64) dma_addr >> 32);
-};
-#else
-static inline u32 i2o_cntxt_list_add(struct i2o_controller *c, void *ptr)
-{
-	return (u32) ptr;
-};
+/* special TID assignments */
+#define ADAPTER_TID		0
+#define HOST_TID		1
 
-static inline void *i2o_cntxt_list_get(struct i2o_controller *c, u32 context)
-{
-	return (void *)context;
-};
+/* outbound queue defines */
+#define I2O_MAX_OUTBOUND_MSG_FRAMES	128
+#define I2O_OUTBOUND_MSG_FRAME_SIZE	128	/* in 32-bit words */
 
-static inline u32 i2o_cntxt_list_remove(struct i2o_controller *c, void *ptr)
-{
-	return (u32) ptr;
-};
+/* inbound queue definitions */
+#define I2O_MSG_INPOOL_MIN		32
+#define I2O_INBOUND_MSG_FRAME_SIZE	128	/* in 32-bit words */
 
-static inline u32 i2o_cntxt_list_get_ptr(struct i2o_controller *c, void *ptr)
-{
-	return (u32) ptr;
-};
+#define I2O_POST_WAIT_OK	0
+#define I2O_POST_WAIT_TIMEOUT	-ETIMEDOUT
 
-static inline u32 i2o_ptr_low(void *ptr)
-{
-	return (u32) ptr;
-};
+#define I2O_CONTEXT_LIST_MIN_LENGTH	15
+#define I2O_CONTEXT_LIST_USED		0x01
+#define I2O_CONTEXT_LIST_DELETED	0x02
 
-static inline u32 i2o_ptr_high(void *ptr)
-{
-	return 0;
-};
+/* timeouts */
+#define I2O_TIMEOUT_INIT_OUTBOUND_QUEUE	15
+#define I2O_TIMEOUT_MESSAGE_GET		5
+#define I2O_TIMEOUT_RESET		30
+#define I2O_TIMEOUT_STATUS_GET		5
+#define I2O_TIMEOUT_LCT_GET		360
+#define I2O_TIMEOUT_SCSI_SCB_ABORT	240
 
-static inline u32 i2o_dma_low(dma_addr_t dma_addr)
-{
-	return (u32) dma_addr;
-};
+/* retries */
+#define I2O_HRT_GET_TRIES		3
+#define I2O_LCT_GET_TRIES		3
 
-static inline u32 i2o_dma_high(dma_addr_t dma_addr)
-{
-	return 0;
-};
-#endif
+/* defines for max_sectors and max_phys_segments */
+#define I2O_MAX_SECTORS			1024
+#define I2O_MAX_SECTORS_LIMITED		256
+#define I2O_MAX_PHYS_SEGMENTS		MAX_PHYS_SEGMENTS
 
-/**
- *	i2o_sg_tablesize - Calculate the maximum number of elements in a SGL
- *	@c: I2O controller for which the calculation should be done
- *	@body_size: maximum body size used for message in 32-bit words.
- *
- *	Return the maximum number of SG elements in a SG list.
+/*
+ *	Message structures
  */
-static inline u16 i2o_sg_tablesize(struct i2o_controller *c, u16 body_size)
-{
-	i2o_status_block *sb = c->status_block.virt;
-	u16 sg_count =
-	    (sb->inbound_frame_size - sizeof(struct i2o_message) / 4) -
-	    body_size;
-
-	if (c->pae_support) {
-		/*
-		 * for 64-bit a SG attribute element must be added and each
-		 * SG element needs 12 bytes instead of 8.
-		 */
-		sg_count -= 2;
-		sg_count /= 3;
-	} else
-		sg_count /= 2;
-
-	if (c->short_req && (sg_count > 8))
-		sg_count = 8;
+struct i2o_message {
+	union {
+		struct {
+			u8 version_offset;
+			u8 flags;
+			u16 size;
+			u32 target_tid:12;
+			u32 init_tid:12;
+			u32 function:8;
+			u32 icntxt;	/* initiator context */
+			u32 tcntxt;	/* transaction context */
+		} s;
+		u32 head[4];
+	} u;
+	/* List follows */
+	u32 body[0];
+};
 
-	return sg_count;
+/* MFA and I2O message used by mempool */
+struct i2o_msg_mfa {
+	u32 mfa;		/* MFA returned by the controller */
+	struct i2o_message msg;	/* I2O message */
 };
 
-/**
- *	i2o_dma_map_single - Map pointer to controller and fill in I2O message.
- *	@c: I2O controller
- *	@ptr: pointer to the data which should be mapped
- *	@size: size of data in bytes
- *	@direction: DMA_TO_DEVICE / DMA_FROM_DEVICE
- *	@sg_ptr: pointer to the SG list inside the I2O message
- *
- *	This function does all necessary DMA handling and also writes the I2O
- *	SGL elements into the I2O message. For details on DMA handling see also
- *	dma_map_single(). The pointer sg_ptr will only be set to the end of the
- *	SG list if the allocation was successful.
- *
- *	Returns DMA address which must be checked for failures using
- *	dma_mapping_error().
+/*
+ *	Each I2O device entity has one of these. There is one per device.
  */
-static inline dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr,
-					    size_t size,
-					    enum dma_data_direction direction,
-					    u32 __iomem ** sg_ptr)
-{
-	u32 sg_flags;
-	u32 __iomem *mptr = *sg_ptr;
-	dma_addr_t dma_addr;
+struct i2o_device {
+	i2o_lct_entry lct_data;	/* Device LCT information */
 
-	switch (direction) {
-	case DMA_TO_DEVICE:
-		sg_flags = 0xd4000000;
-		break;
-	case DMA_FROM_DEVICE:
-		sg_flags = 0xd0000000;
-		break;
-	default:
-		return 0;
-	}
+	struct i2o_controller *iop;	/* Controlling IOP */
+	struct list_head list;	/* node in IOP devices list */
 
-	dma_addr = dma_map_single(&c->pdev->dev, ptr, size, direction);
-	if (!dma_mapping_error(dma_addr)) {
-#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
-		if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
-			writel(0x7C020002, mptr++);
-			writel(PAGE_SIZE, mptr++);
-		}
-#endif
+	struct device device;
 
-		writel(sg_flags | size, mptr++);
-		writel(i2o_dma_low(dma_addr), mptr++);
-#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
-		if ((sizeof(dma_addr_t) > 4) && c->pae_support)
-			writel(i2o_dma_high(dma_addr), mptr++);
-#endif
-		*sg_ptr = mptr;
-	}
-	return dma_addr;
+	struct semaphore lock;	/* device lock */
 };
 
-/**
- *	i2o_dma_map_sg - Map a SG List to controller and fill in I2O message.
- *	@c: I2O controller
- *	@sg: SG list to be mapped
- *	@sg_count: number of elements in the SG list
- *	@direction: DMA_TO_DEVICE / DMA_FROM_DEVICE
- *	@sg_ptr: pointer to the SG list inside the I2O message
- *
- *	This function does all necessary DMA handling and also writes the I2O
- *	SGL elements into the I2O message. For details on DMA handling see also
- *	dma_map_sg(). The pointer sg_ptr will only be set to the end of the SG
- *	list if the allocation was successful.
- *
- *	Returns 0 on failure or 1 on success.
+/*
+ *	Event structure provided to the event handling function
  */
-static inline int i2o_dma_map_sg(struct i2o_controller *c,
-				 struct scatterlist *sg, int sg_count,
-				 enum dma_data_direction direction,
-				 u32 __iomem ** sg_ptr)
-{
-	u32 sg_flags;
-	u32 __iomem *mptr = *sg_ptr;
-
-	switch (direction) {
-	case DMA_TO_DEVICE:
-		sg_flags = 0x14000000;
-		break;
-	case DMA_FROM_DEVICE:
-		sg_flags = 0x10000000;
-		break;
-	default:
-		return 0;
-	}
-
-	sg_count = dma_map_sg(&c->pdev->dev, sg, sg_count, direction);
-	if (!sg_count)
-		return 0;
-
-#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
-	if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
-		writel(0x7C020002, mptr++);
-		writel(PAGE_SIZE, mptr++);
-	}
-#endif
-
-	while (sg_count-- > 0) {
-		if (!sg_count)
-			sg_flags |= 0xC0000000;
-		writel(sg_flags | sg_dma_len(sg), mptr++);
-		writel(i2o_dma_low(sg_dma_address(sg)), mptr++);
-#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
-		if ((sizeof(dma_addr_t) > 4) && c->pae_support)
-			writel(i2o_dma_high(sg_dma_address(sg)), mptr++);
-#endif
-		sg++;
-	}
-	*sg_ptr = mptr;
+struct i2o_event {
+	struct work_struct work;
+	struct i2o_device *i2o_dev;	/* I2O device pointer from which the
+					   event reply was initiated */
+	u16 size;		/* Size of data in 32-bit words */
+	u32 tcntxt;		/* Transaction context used at
+				   registration */
+	u32 event_indicator;	/* Event indicator from reply */
+	u32 data[0];		/* Event data from reply */
+};
 
-	return 1;
+/*
+ *	I2O classes which could be handled by the OSM
+ */
+struct i2o_class_id {
+	u16 class_id:12;
 };
 
-/**
- *	i2o_dma_alloc - Allocate DMA memory
- *	@dev: struct device pointer to the PCI device of the I2O controller
- *	@addr: i2o_dma struct which should get the DMA buffer
- *	@len: length of the new DMA memory
- *	@gfp_mask: GFP mask
- *
- *	Allocate a coherent DMA memory and write the pointers into addr.
- *
- *	Returns 0 on success or -ENOMEM on failure.
+/*
+ *	I2O driver structure for OSMs
  */
-static inline int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr,
-				size_t len, gfp_t gfp_mask)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	int dma_64 = 0;
+struct i2o_driver {
+	char *name;		/* OSM name */
+	int context;		/* Low 8 bits of the transaction info */
+	struct i2o_class_id *classes;	/* I2O classes that this OSM handles */
 
-	if ((sizeof(dma_addr_t) > 4) && (pdev->dma_mask == DMA_64BIT_MASK)) {
-		dma_64 = 1;
-		if (pci_set_dma_mask(pdev, DMA_32BIT_MASK))
-			return -ENOMEM;
-	}
+	/* Message reply handler */
+	int (*reply) (struct i2o_controller *, u32, struct i2o_message *);
 
-	addr->virt = dma_alloc_coherent(dev, len, &addr->phys, gfp_mask);
+	/* Event handler */
+	void (*event) (struct i2o_event *);
 
-	if ((sizeof(dma_addr_t) > 4) && dma_64)
-		if (pci_set_dma_mask(pdev, DMA_64BIT_MASK))
-			printk(KERN_WARNING "i2o: unable to set 64-bit DMA");
+	struct workqueue_struct *event_queue;	/* Event queue */
 
-	if (!addr->virt)
-		return -ENOMEM;
+	struct device_driver driver;
 
-	memset(addr->virt, 0, len);
-	addr->len = len;
+	/* notification of changes */
+	void (*notify_controller_add) (struct i2o_controller *);
+	void (*notify_controller_remove) (struct i2o_controller *);
+	void (*notify_device_add) (struct i2o_device *);
+	void (*notify_device_remove) (struct i2o_device *);
 
-	return 0;
+	struct semaphore lock;
 };
 
-/**
- *	i2o_dma_free - Free DMA memory
- *	@dev: struct device pointer to the PCI device of the I2O controller
- *	@addr: i2o_dma struct which contains the DMA buffer
- *
- *	Free a coherent DMA memory and set virtual address of addr to NULL.
+/*
+ *	Contains DMA mapped address information
  */
-static inline void i2o_dma_free(struct device *dev, struct i2o_dma *addr)
-{
-	if (addr->virt) {
-		if (addr->phys)
-			dma_free_coherent(dev, addr->len, addr->virt,
-					  addr->phys);
-		else
-			kfree(addr->virt);
-		addr->virt = NULL;
-	}
+struct i2o_dma {
+	void *virt;
+	dma_addr_t phys;
+	size_t len;
 };
 
-/**
- *	i2o_dma_realloc - Realloc DMA memory
- *	@dev: struct device pointer to the PCI device of the I2O controller
- *	@addr: pointer to a i2o_dma struct DMA buffer
- *	@len: new length of memory
- *	@gfp_mask: GFP mask
- *
- *	If there was something allocated in the addr, free it first. If len > 0
- *	than try to allocate it and write the addresses back to the addr
- *	structure. If len == 0 set the virtual address to NULL.
- *
- *	Returns the 0 on success or negative error code on failure.
+/*
+ *	Contains slab cache and mempool information
  */
-static inline int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr,
-				  size_t len, gfp_t gfp_mask)
-{
-	i2o_dma_free(dev, addr);
-
-	if (len)
-		return i2o_dma_alloc(dev, addr, len, gfp_mask);
-
-	return 0;
+struct i2o_pool {
+	char *name;
+	kmem_cache_t *slab;
+	mempool_t *mempool;
 };
 
-/* I2O driver (OSM) functions */
-extern int i2o_driver_register(struct i2o_driver *);
-extern void i2o_driver_unregister(struct i2o_driver *);
-
-/**
- *	i2o_driver_notify_controller_add - Send notification of added controller
- *					   to a single I2O driver
- *
- *	Send notification of added controller to a single registered driver.
+/*
+ *	Contains IO mapped address information
  */
-static inline void i2o_driver_notify_controller_add(struct i2o_driver *drv,
-						    struct i2o_controller *c)
-{
-	if (drv->notify_controller_add)
-		drv->notify_controller_add(c);
+struct i2o_io {
+	void __iomem *virt;
+	unsigned long phys;
+	unsigned long len;
 };
 
-/**
- *	i2o_driver_notify_controller_remove - Send notification of removed
- *					      controller to a single I2O driver
- *
- *	Send notification of removed controller to a single registered driver.
+/*
+ *	Context queue entry, used for 32-bit context on 64-bit systems
  */
-static inline void i2o_driver_notify_controller_remove(struct i2o_driver *drv,
-						       struct i2o_controller *c)
-{
-	if (drv->notify_controller_remove)
-		drv->notify_controller_remove(c);
+struct i2o_context_list_element {
+	struct list_head list;
+	u32 context;
+	void *ptr;
+	unsigned long timestamp;
 };
 
-/**
- *	i2o_driver_notify_device_add - Send notification of added device to a
- *				       single I2O driver
- *
- *	Send notification of added device to a single registered driver.
+/*
+ * Each I2O controller has one of these objects
  */
-static inline void i2o_driver_notify_device_add(struct i2o_driver *drv,
-						struct i2o_device *i2o_dev)
-{
-	if (drv->notify_device_add)
-		drv->notify_device_add(i2o_dev);
+struct i2o_controller {
+	char name[16];
+	int unit;
+	int type;
+
+	struct pci_dev *pdev;	/* PCI device */
+
+	unsigned int promise:1;	/* Promise controller */
+	unsigned int adaptec:1;	/* DPT / Adaptec controller */
+	unsigned int raptor:1;	/* split bar */
+	unsigned int no_quiesce:1;	/* dont quiesce before reset */
+	unsigned int short_req:1;	/* use small block sizes */
+	unsigned int limit_sectors:1;	/* limit number of sectors / request */
+	unsigned int pae_support:1;	/* controller has 64-bit SGL support */
+
+	struct list_head devices;	/* list of I2O devices */
+	struct list_head list;	/* Controller list */
+
+	void __iomem *in_port;	/* Inbout port address */
+	void __iomem *out_port;	/* Outbound port address */
+	void __iomem *irq_status;	/* Interrupt status register address */
+	void __iomem *irq_mask;	/* Interrupt mask register address */
+
+	struct i2o_dma status;	/* IOP status block */
+
+	struct i2o_dma hrt;	/* HW Resource Table */
+	i2o_lct *lct;		/* Logical Config Table */
+	struct i2o_dma dlct;	/* Temp LCT */
+	struct semaphore lct_lock;	/* Lock for LCT updates */
+	struct i2o_dma status_block;	/* IOP status block */
+
+	struct i2o_io base;	/* controller messaging unit */
+	struct i2o_io in_queue;	/* inbound message queue Host->IOP */
+	struct i2o_dma out_queue;	/* outbound message queue IOP->Host */
+
+	struct i2o_pool in_msg;	/* mempool for inbound messages */
+
+	unsigned int battery:1;	/* Has a battery backup */
+	unsigned int io_alloc:1;	/* An I/O resource was allocated */
+	unsigned int mem_alloc:1;	/* A memory resource was allocated */
+
+	struct resource io_resource;	/* I/O resource allocated to the IOP */
+	struct resource mem_resource;	/* Mem resource allocated to the IOP */
+
+	struct device device;
+	struct class_device *classdev;	/* I2O controller class device */
+	struct i2o_device *exec;	/* Executive */
+#if BITS_PER_LONG == 64
+	spinlock_t context_list_lock;	/* lock for context_list */
+	atomic_t context_list_counter;	/* needed for unique contexts */
+	struct list_head context_list;	/* list of context id's
+					   and pointers */
+#endif
+	spinlock_t lock;	/* lock for controller
+				   configuration */
+
+	void *driver_data[I2O_MAX_DRIVERS];	/* storage for drivers */
 };
 
-/**
- *	i2o_driver_notify_device_remove - Send notification of removed device
- *					  to a single I2O driver
+/*
+ * I2O System table entry
  *
- *	Send notification of removed device to a single registered driver.
+ * The system table contains information about all the IOPs in the
+ * system.  It is sent to all IOPs so that they can create peer2peer
+ * connections between them.
  */
-static inline void i2o_driver_notify_device_remove(struct i2o_driver *drv,
-						   struct i2o_device *i2o_dev)
-{
-	if (drv->notify_device_remove)
-		drv->notify_device_remove(i2o_dev);
+struct i2o_sys_tbl_entry {
+	u16 org_id;
+	u16 reserved1;
+	u32 iop_id:12;
+	u32 reserved2:20;
+	u16 seg_num:12;
+	u16 i2o_version:4;
+	u8 iop_state;
+	u8 msg_type;
+	u16 frame_size;
+	u16 reserved3;
+	u32 last_changed;
+	u32 iop_capabilities;
+	u32 inbound_low;
+	u32 inbound_high;
 };
 
-extern void i2o_driver_notify_controller_add_all(struct i2o_controller *);
-extern void i2o_driver_notify_controller_remove_all(struct i2o_controller *);
-extern void i2o_driver_notify_device_add_all(struct i2o_device *);
-extern void i2o_driver_notify_device_remove_all(struct i2o_device *);
+struct i2o_sys_tbl {
+	u8 num_entries;
+	u8 version;
+	u16 reserved1;
+	u32 change_ind;
+	u32 reserved2;
+	u32 reserved3;
+	struct i2o_sys_tbl_entry iops[0];
+};
 
-/* I2O device functions */
-extern int i2o_device_claim(struct i2o_device *);
-extern int i2o_device_claim_release(struct i2o_device *);
+extern struct list_head i2o_controllers;
 
-/* Exec OSM functions */
-extern int i2o_exec_lct_get(struct i2o_controller *);
+/* Message functions */
+static inline struct i2o_message *i2o_msg_get(struct i2o_controller *);
+extern struct i2o_message *i2o_msg_get_wait(struct i2o_controller *, int);
+static inline void i2o_msg_post(struct i2o_controller *, struct i2o_message *);
+static inline int i2o_msg_post_wait(struct i2o_controller *,
+				    struct i2o_message *, unsigned long);
+extern int i2o_msg_post_wait_mem(struct i2o_controller *, struct i2o_message *,
+				 unsigned long, struct i2o_dma *);
+static inline void i2o_flush_reply(struct i2o_controller *, u32);
 
-/* device / driver / kobject conversion functions */
-#define to_i2o_driver(drv) container_of(drv,struct i2o_driver, driver)
-#define to_i2o_device(dev) container_of(dev, struct i2o_device, device)
-#define to_i2o_controller(dev) container_of(dev, struct i2o_controller, device)
-#define kobj_to_i2o_device(kobj) to_i2o_device(container_of(kobj, struct device, kobj))
+/* IOP functions */
+extern int i2o_status_get(struct i2o_controller *);
 
-/**
- *	i2o_msg_get - obtain an I2O message from the IOP
- *	@c: I2O controller
- *	@msg: pointer to a I2O message pointer
- *
- *	This function tries to get a message slot. If no message slot is
- *	available do not wait until one is availabe (see also i2o_msg_get_wait).
- *
- *	On a success the message is returned and the pointer to the message is
- *	set in msg. The returned message is the physical page frame offset
- *	address from the read port (see the i2o spec). If no message is
- *	available returns I2O_QUEUE_EMPTY and msg is leaved untouched.
- */
-static inline u32 i2o_msg_get(struct i2o_controller *c,
-			      struct i2o_message __iomem ** msg)
-{
-	u32 m = readl(c->in_port);
+extern int i2o_event_register(struct i2o_device *, struct i2o_driver *, int,
+			      u32);
+extern struct i2o_device *i2o_iop_find_device(struct i2o_controller *, u16);
+extern struct i2o_controller *i2o_find_iop(int);
 
-	if (m != I2O_QUEUE_EMPTY)
-		*msg = c->in_queue.virt + m;
+/* Functions needed for handling 64-bit pointers in 32-bit context */
+#if BITS_PER_LONG == 64
+extern u32 i2o_cntxt_list_add(struct i2o_controller *, void *);
+extern void *i2o_cntxt_list_get(struct i2o_controller *, u32);
+extern u32 i2o_cntxt_list_remove(struct i2o_controller *, void *);
+extern u32 i2o_cntxt_list_get_ptr(struct i2o_controller *, void *);
 
-	return m;
+static inline u32 i2o_ptr_low(void *ptr)
+{
+	return (u32) (u64) ptr;
 };
 
-/**
- *	i2o_msg_post - Post I2O message to I2O controller
- *	@c: I2O controller to which the message should be send
- *	@m: the message identifier
- *
- *	Post the message to the I2O controller.
- */
-static inline void i2o_msg_post(struct i2o_controller *c, u32 m)
+static inline u32 i2o_ptr_high(void *ptr)
 {
-	writel(m, c->in_port);
+	return (u32) ((u64) ptr >> 32);
 };
 
-/**
- * 	i2o_msg_post_wait - Post and wait a message and wait until return
- *	@c: controller
- *	@m: message to post
- *	@timeout: time in seconds to wait
- *
- * 	This API allows an OSM to post a message and then be told whether or
- *	not the system received a successful reply. If the message times out
- *	then the value '-ETIMEDOUT' is returned.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static inline int i2o_msg_post_wait(struct i2o_controller *c, u32 m,
-				    unsigned long timeout)
+static inline u32 i2o_dma_low(dma_addr_t dma_addr)
 {
-	return i2o_msg_post_wait_mem(c, m, timeout, NULL);
+	return (u32) (u64) dma_addr;
 };
 
-/**
- *	i2o_flush_reply - Flush reply from I2O controller
- *	@c: I2O controller
- *	@m: the message identifier
- *
- *	The I2O controller must be informed that the reply message is not needed
- *	anymore. If you forget to flush the reply, the message frame can't be
- *	used by the controller anymore and is therefore lost.
- */
-static inline void i2o_flush_reply(struct i2o_controller *c, u32 m)
+static inline u32 i2o_dma_high(dma_addr_t dma_addr)
 {
-	writel(m, c->out_port);
+	return (u32) ((u64) dma_addr >> 32);
+};
+#else
+static inline u32 i2o_cntxt_list_add(struct i2o_controller *c, void *ptr)
+{
+	return (u32) ptr;
 };
 
-/**
- *	i2o_out_to_virt - Turn an I2O message to a virtual address
- *	@c: controller
- *	@m: message engine value
- *
- *	Turn a receive message from an I2O controller bus address into
- *	a Linux virtual address. The shared page frame is a linear block
- *	so we simply have to shift the offset. This function does not
- *	work for sender side messages as they are ioremap objects
- *	provided by the I2O controller.
- */
-static inline struct i2o_message *i2o_msg_out_to_virt(struct i2o_controller *c,
-						      u32 m)
+static inline void *i2o_cntxt_list_get(struct i2o_controller *c, u32 context)
 {
-	BUG_ON(m < c->out_queue.phys
-	       || m >= c->out_queue.phys + c->out_queue.len);
+	return (void *)context;
+};
 
-	return c->out_queue.virt + (m - c->out_queue.phys);
+static inline u32 i2o_cntxt_list_remove(struct i2o_controller *c, void *ptr)
+{
+	return (u32) ptr;
 };
 
-/**
- *	i2o_msg_in_to_virt - Turn an I2O message to a virtual address
- *	@c: controller
- *	@m: message engine value
+static inline u32 i2o_cntxt_list_get_ptr(struct i2o_controller *c, void *ptr)
+{
+	return (u32) ptr;
+};
+
+static inline u32 i2o_ptr_low(void *ptr)
+{
+	return (u32) ptr;
+};
+
+static inline u32 i2o_ptr_high(void *ptr)
+{
+	return 0;
+};
+
+static inline u32 i2o_dma_low(dma_addr_t dma_addr)
+{
+	return (u32) dma_addr;
+};
+
+static inline u32 i2o_dma_high(dma_addr_t dma_addr)
+{
+	return 0;
+};
+#endif
+
+/**
+ *	i2o_sg_tablesize - Calculate the maximum number of elements in a SGL
+ *	@c: I2O controller for which the calculation should be done
+ *	@body_size: maximum body size used for message in 32-bit words.
  *
- *	Turn a send message from an I2O controller bus address into
- *	a Linux virtual address. The shared page frame is a linear block
- *	so we simply have to shift the offset. This function does not
- *	work for receive side messages as they are kmalloc objects
- *	in a different pool.
+ *	Return the maximum number of SG elements in a SG list.
  */
-static inline struct i2o_message __iomem *i2o_msg_in_to_virt(struct
-							     i2o_controller *c,
-							     u32 m)
+static inline u16 i2o_sg_tablesize(struct i2o_controller *c, u16 body_size)
 {
-	return c->in_queue.virt + m;
+	i2o_status_block *sb = c->status_block.virt;
+	u16 sg_count =
+	    (sb->inbound_frame_size - sizeof(struct i2o_message) / 4) -
+	    body_size;
+
+	if (c->pae_support) {
+		/*
+		 * for 64-bit a SG attribute element must be added and each
+		 * SG element needs 12 bytes instead of 8.
+		 */
+		sg_count -= 2;
+		sg_count /= 3;
+	} else
+		sg_count /= 2;
+
+	if (c->short_req && (sg_count > 8))
+		sg_count = 8;
+
+	return sg_count;
 };
 
-/*
- *	Endian handling wrapped into the macro - keeps the core code
- *	cleaner.
+/**
+ *	i2o_dma_map_single - Map pointer to controller and fill in I2O message.
+ *	@c: I2O controller
+ *	@ptr: pointer to the data which should be mapped
+ *	@size: size of data in bytes
+ *	@direction: DMA_TO_DEVICE / DMA_FROM_DEVICE
+ *	@sg_ptr: pointer to the SG list inside the I2O message
+ *
+ *	This function does all necessary DMA handling and also writes the I2O
+ *	SGL elements into the I2O message. For details on DMA handling see also
+ *	dma_map_single(). The pointer sg_ptr will only be set to the end of the
+ *	SG list if the allocation was successful.
+ *
+ *	Returns DMA address which must be checked for failures using
+ *	dma_mapping_error().
  */
+static inline dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr,
+					    size_t size,
+					    enum dma_data_direction direction,
+					    u32 ** sg_ptr)
+{
+	u32 sg_flags;
+	u32 *mptr = *sg_ptr;
+	dma_addr_t dma_addr;
 
-#define i2o_raw_writel(val, mem)	__raw_writel(cpu_to_le32(val), mem)
+	switch (direction) {
+	case DMA_TO_DEVICE:
+		sg_flags = 0xd4000000;
+		break;
+	case DMA_FROM_DEVICE:
+		sg_flags = 0xd0000000;
+		break;
+	default:
+		return 0;
+	}
 
-extern int i2o_parm_field_get(struct i2o_device *, int, int, void *, int);
-extern int i2o_parm_table_get(struct i2o_device *, int, int, int, void *, int,
-			      void *, int);
+	dma_addr = dma_map_single(&c->pdev->dev, ptr, size, direction);
+	if (!dma_mapping_error(dma_addr)) {
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+		if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
+			*mptr++ = cpu_to_le32(0x7C020002);
+			*mptr++ = cpu_to_le32(PAGE_SIZE);
+		}
+#endif
 
-/* debugging and troubleshooting/diagnostic helpers. */
-#define osm_printk(level, format, arg...)  \
-	printk(level "%s: " format, OSM_NAME , ## arg)
+		*mptr++ = cpu_to_le32(sg_flags | size);
+		*mptr++ = cpu_to_le32(i2o_dma_low(dma_addr));
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+		if ((sizeof(dma_addr_t) > 4) && c->pae_support)
+			*mptr++ = cpu_to_le32(i2o_dma_high(dma_addr));
+#endif
+		*sg_ptr = mptr;
+	}
+	return dma_addr;
+};
 
-#ifdef DEBUG
-#define osm_debug(format, arg...) \
-	osm_printk(KERN_DEBUG, format , ## arg)
-#else
-#define osm_debug(format, arg...) \
-        do { } while (0)
+/**
+ *	i2o_dma_map_sg - Map a SG List to controller and fill in I2O message.
+ *	@c: I2O controller
+ *	@sg: SG list to be mapped
+ *	@sg_count: number of elements in the SG list
+ *	@direction: DMA_TO_DEVICE / DMA_FROM_DEVICE
+ *	@sg_ptr: pointer to the SG list inside the I2O message
+ *
+ *	This function does all necessary DMA handling and also writes the I2O
+ *	SGL elements into the I2O message. For details on DMA handling see also
+ *	dma_map_sg(). The pointer sg_ptr will only be set to the end of the SG
+ *	list if the allocation was successful.
+ *
+ *	Returns 0 on failure or 1 on success.
+ */
+static inline int i2o_dma_map_sg(struct i2o_controller *c,
+				 struct scatterlist *sg, int sg_count,
+				 enum dma_data_direction direction,
+				 u32 ** sg_ptr)
+{
+	u32 sg_flags;
+	u32 *mptr = *sg_ptr;
+
+	switch (direction) {
+	case DMA_TO_DEVICE:
+		sg_flags = 0x14000000;
+		break;
+	case DMA_FROM_DEVICE:
+		sg_flags = 0x10000000;
+		break;
+	default:
+		return 0;
+	}
+
+	sg_count = dma_map_sg(&c->pdev->dev, sg, sg_count, direction);
+	if (!sg_count)
+		return 0;
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+	if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
+		*mptr++ = cpu_to_le32(0x7C020002);
+		*mptr++ = cpu_to_le32(PAGE_SIZE);
+	}
 #endif
 
-#define osm_err(format, arg...)		\
-	osm_printk(KERN_ERR, format , ## arg)
-#define osm_info(format, arg...)		\
-	osm_printk(KERN_INFO, format , ## arg)
-#define osm_warn(format, arg...)		\
-	osm_printk(KERN_WARNING, format , ## arg)
+	while (sg_count-- > 0) {
+		if (!sg_count)
+			sg_flags |= 0xC0000000;
+		*mptr++ = cpu_to_le32(sg_flags | sg_dma_len(sg));
+		*mptr++ = cpu_to_le32(i2o_dma_low(sg_dma_address(sg)));
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+		if ((sizeof(dma_addr_t) > 4) && c->pae_support)
+			*mptr++ = cpu_to_le32(i2o_dma_high(sg_dma_address(sg)));
+#endif
+		sg++;
+	}
+	*sg_ptr = mptr;
 
-/* debugging functions */
-extern void i2o_report_status(const char *, const char *, struct i2o_message *);
-extern void i2o_dump_message(struct i2o_message *);
-extern void i2o_dump_hrt(struct i2o_controller *c);
-extern void i2o_debug_state(struct i2o_controller *c);
+	return 1;
+};
 
-/*
- *	Cache strategies
+/**
+ *	i2o_dma_alloc - Allocate DMA memory
+ *	@dev: struct device pointer to the PCI device of the I2O controller
+ *	@addr: i2o_dma struct which should get the DMA buffer
+ *	@len: length of the new DMA memory
+ *	@gfp_mask: GFP mask
+ *
+ *	Allocate a coherent DMA memory and write the pointers into addr.
+ *
+ *	Returns 0 on success or -ENOMEM on failure.
  */
+static inline int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr,
+				size_t len, gfp_t gfp_mask)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	int dma_64 = 0;
 
-/*	The NULL strategy leaves everything up to the controller. This tends to be a
- *	pessimal but functional choice.
- */
-#define CACHE_NULL		0
-/*	Prefetch data when reading. We continually attempt to load the next 32 sectors
- *	into the controller cache.
- */
-#define CACHE_PREFETCH		1
-/*	Prefetch data when reading. We sometimes attempt to load the next 32 sectors
- *	into the controller cache. When an I/O is less <= 8K we assume its probably
- *	not sequential and don't prefetch (default)
- */
-#define CACHE_SMARTFETCH	2
-/*	Data is written to the cache and then out on to the disk. The I/O must be
- *	physically on the medium before the write is acknowledged (default without
- *	NVRAM)
- */
-#define CACHE_WRITETHROUGH	17
-/*	Data is written to the cache and then out on to the disk. The controller
- *	is permitted to write back the cache any way it wants. (default if battery
- *	backed NVRAM is present). It can be useful to set this for swap regardless of
- *	battery state.
- */
-#define CACHE_WRITEBACK		18
-/*	Optimise for under powered controllers, especially on RAID1 and RAID0. We
- *	write large I/O's directly to disk bypassing the cache to avoid the extra
- *	memory copy hits. Small writes are writeback cached
- */
-#define CACHE_SMARTBACK		19
-/*	Optimise for under powered controllers, especially on RAID1 and RAID0. We
- *	write large I/O's directly to disk bypassing the cache to avoid the extra
- *	memory copy hits. Small writes are writethrough cached. Suitable for devices
- *	lacking battery backup
- */
-#define CACHE_SMARTTHROUGH	20
+	if ((sizeof(dma_addr_t) > 4) && (pdev->dma_mask == DMA_64BIT_MASK)) {
+		dma_64 = 1;
+		if (pci_set_dma_mask(pdev, DMA_32BIT_MASK))
+			return -ENOMEM;
+	}
 
-/*
- *	Ioctl structures
- */
+	addr->virt = dma_alloc_coherent(dev, len, &addr->phys, gfp_mask);
 
-#define 	BLKI2OGRSTRAT	_IOR('2', 1, int)
-#define 	BLKI2OGWSTRAT	_IOR('2', 2, int)
-#define 	BLKI2OSRSTRAT	_IOW('2', 3, int)
-#define 	BLKI2OSWSTRAT	_IOW('2', 4, int)
+	if ((sizeof(dma_addr_t) > 4) && dma_64)
+		if (pci_set_dma_mask(pdev, DMA_64BIT_MASK))
+			printk(KERN_WARNING "i2o: unable to set 64-bit DMA");
 
-/*
- *	I2O Function codes
- */
+	if (!addr->virt)
+		return -ENOMEM;
 
-/*
- *	Executive Class
- */
-#define	I2O_CMD_ADAPTER_ASSIGN		0xB3
-#define	I2O_CMD_ADAPTER_READ		0xB2
-#define	I2O_CMD_ADAPTER_RELEASE		0xB5
-#define	I2O_CMD_BIOS_INFO_SET		0xA5
-#define	I2O_CMD_BOOT_DEVICE_SET		0xA7
-#define	I2O_CMD_CONFIG_VALIDATE		0xBB
-#define	I2O_CMD_CONN_SETUP		0xCA
-#define	I2O_CMD_DDM_DESTROY		0xB1
-#define	I2O_CMD_DDM_ENABLE		0xD5
-#define	I2O_CMD_DDM_QUIESCE		0xC7
-#define	I2O_CMD_DDM_RESET		0xD9
-#define	I2O_CMD_DDM_SUSPEND		0xAF
-#define	I2O_CMD_DEVICE_ASSIGN		0xB7
-#define	I2O_CMD_DEVICE_RELEASE		0xB9
-#define	I2O_CMD_HRT_GET			0xA8
-#define	I2O_CMD_ADAPTER_CLEAR		0xBE
-#define	I2O_CMD_ADAPTER_CONNECT		0xC9
-#define	I2O_CMD_ADAPTER_RESET		0xBD
-#define	I2O_CMD_LCT_NOTIFY		0xA2
-#define	I2O_CMD_OUTBOUND_INIT		0xA1
-#define	I2O_CMD_PATH_ENABLE		0xD3
-#define	I2O_CMD_PATH_QUIESCE		0xC5
-#define	I2O_CMD_PATH_RESET		0xD7
-#define	I2O_CMD_STATIC_MF_CREATE	0xDD
-#define	I2O_CMD_STATIC_MF_RELEASE	0xDF
-#define	I2O_CMD_STATUS_GET		0xA0
-#define	I2O_CMD_SW_DOWNLOAD		0xA9
-#define	I2O_CMD_SW_UPLOAD		0xAB
-#define	I2O_CMD_SW_REMOVE		0xAD
-#define	I2O_CMD_SYS_ENABLE		0xD1
-#define	I2O_CMD_SYS_MODIFY		0xC1
-#define	I2O_CMD_SYS_QUIESCE		0xC3
-#define	I2O_CMD_SYS_TAB_SET		0xA3
+	memset(addr->virt, 0, len);
+	addr->len = len;
 
-/*
- * Utility Class
+	return 0;
+};
+
+/**
+ *	i2o_dma_free - Free DMA memory
+ *	@dev: struct device pointer to the PCI device of the I2O controller
+ *	@addr: i2o_dma struct which contains the DMA buffer
+ *
+ *	Free a coherent DMA memory and set virtual address of addr to NULL.
  */
-#define I2O_CMD_UTIL_NOP		0x00
-#define I2O_CMD_UTIL_ABORT		0x01
-#define I2O_CMD_UTIL_CLAIM		0x09
-#define I2O_CMD_UTIL_RELEASE		0x0B
-#define I2O_CMD_UTIL_PARAMS_GET		0x06
-#define I2O_CMD_UTIL_PARAMS_SET		0x05
-#define I2O_CMD_UTIL_EVT_REGISTER	0x13
-#define I2O_CMD_UTIL_EVT_ACK		0x14
-#define I2O_CMD_UTIL_CONFIG_DIALOG	0x10
-#define I2O_CMD_UTIL_DEVICE_RESERVE	0x0D
-#define I2O_CMD_UTIL_DEVICE_RELEASE	0x0F
-#define I2O_CMD_UTIL_LOCK		0x17
-#define I2O_CMD_UTIL_LOCK_RELEASE	0x19
-#define I2O_CMD_UTIL_REPLY_FAULT_NOTIFY	0x15
+static inline void i2o_dma_free(struct device *dev, struct i2o_dma *addr)
+{
+	if (addr->virt) {
+		if (addr->phys)
+			dma_free_coherent(dev, addr->len, addr->virt,
+					  addr->phys);
+		else
+			kfree(addr->virt);
+		addr->virt = NULL;
+	}
+};
 
-/*
- * SCSI Host Bus Adapter Class
+/**
+ *	i2o_dma_realloc - Realloc DMA memory
+ *	@dev: struct device pointer to the PCI device of the I2O controller
+ *	@addr: pointer to a i2o_dma struct DMA buffer
+ *	@len: new length of memory
+ *	@gfp_mask: GFP mask
+ *
+ *	If there was something allocated in the addr, free it first. If len > 0
+ *	than try to allocate it and write the addresses back to the addr
+ *	structure. If len == 0 set the virtual address to NULL.
+ *
+ *	Returns the 0 on success or negative error code on failure.
  */
-#define I2O_CMD_SCSI_EXEC		0x81
-#define I2O_CMD_SCSI_ABORT		0x83
-#define I2O_CMD_SCSI_BUSRESET		0x27
+static inline int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr,
+				  size_t len, gfp_t gfp_mask)
+{
+	i2o_dma_free(dev, addr);
+
+	if (len)
+		return i2o_dma_alloc(dev, addr, len, gfp_mask);
+
+	return 0;
+};
 
 /*
- * Bus Adapter Class
+ *	i2o_pool_alloc - Allocate an slab cache and mempool
+ *	@mempool: pointer to struct i2o_pool to write data into.
+ *	@name: name which is used to identify cache
+ *	@size: size of each object
+ *	@min_nr: minimum number of objects
+ *
+ *	First allocates a slab cache with name and size. Then allocates a
+ *	mempool which uses the slab cache for allocation and freeing.
+ *
+ *	Returns 0 on success or negative error code on failure.
  */
-#define I2O_CMD_BUS_ADAPTER_RESET	0x85
-#define I2O_CMD_BUS_RESET		0x87
-#define I2O_CMD_BUS_SCAN		0x89
-#define I2O_CMD_BUS_QUIESCE		0x8b
+static inline int i2o_pool_alloc(struct i2o_pool *pool, const char *name,
+				 size_t size, int min_nr)
+{
+	pool->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
+	if (!pool->name)
+		goto exit;
+	strcpy(pool->name, name);
+
+	pool->slab =
+	    kmem_cache_create(pool->name, size, 0, SLAB_HWCACHE_ALIGN, NULL,
+			      NULL);
+	if (!pool->slab)
+		goto free_name;
+
+	pool->mempool =
+	    mempool_create(min_nr, mempool_alloc_slab, mempool_free_slab,
+			   pool->slab);
+	if (!pool->mempool)
+		goto free_slab;
+
+	return 0;
+
+      free_slab:
+	kmem_cache_destroy(pool->slab);
+
+      free_name:
+	kfree(pool->name);
+
+      exit:
+	return -ENOMEM;
+};
 
 /*
- * Random Block Storage Class
+ *	i2o_pool_free - Free slab cache and mempool again
+ *	@mempool: pointer to struct i2o_pool which should be freed
+ *
+ *	Note that you have to return all objects to the mempool again before
+ *	calling i2o_pool_free().
  */
-#define I2O_CMD_BLOCK_READ		0x30
-#define I2O_CMD_BLOCK_WRITE		0x31
-#define I2O_CMD_BLOCK_CFLUSH		0x37
-#define I2O_CMD_BLOCK_MLOCK		0x49
-#define I2O_CMD_BLOCK_MUNLOCK		0x4B
-#define I2O_CMD_BLOCK_MMOUNT		0x41
-#define I2O_CMD_BLOCK_MEJECT		0x43
-#define I2O_CMD_BLOCK_POWER		0x70
-
-#define I2O_CMD_PRIVATE			0xFF
+static inline void i2o_pool_free(struct i2o_pool *pool)
+{
+	mempool_destroy(pool->mempool);
+	kmem_cache_destroy(pool->slab);
+	kfree(pool->name);
+};
 
-/* Command status values  */
+/* I2O driver (OSM) functions */
+extern int i2o_driver_register(struct i2o_driver *);
+extern void i2o_driver_unregister(struct i2o_driver *);
 
-#define I2O_CMD_IN_PROGRESS	0x01
-#define I2O_CMD_REJECTED	0x02
-#define I2O_CMD_FAILED		0x03
-#define I2O_CMD_COMPLETED	0x04
+/**
+ *	i2o_driver_notify_controller_add - Send notification of added controller
+ *					   to a single I2O driver
+ *
+ *	Send notification of added controller to a single registered driver.
+ */
+static inline void i2o_driver_notify_controller_add(struct i2o_driver *drv,
+						    struct i2o_controller *c)
+{
+	if (drv->notify_controller_add)
+		drv->notify_controller_add(c);
+};
 
-/* I2O API function return values */
+/**
+ *	i2o_driver_notify_controller_remove - Send notification of removed
+ *					      controller to a single I2O driver
+ *
+ *	Send notification of removed controller to a single registered driver.
+ */
+static inline void i2o_driver_notify_controller_remove(struct i2o_driver *drv,
+						       struct i2o_controller *c)
+{
+	if (drv->notify_controller_remove)
+		drv->notify_controller_remove(c);
+};
 
-#define I2O_RTN_NO_ERROR			0
-#define I2O_RTN_NOT_INIT			1
-#define I2O_RTN_FREE_Q_EMPTY			2
-#define I2O_RTN_TCB_ERROR			3
-#define I2O_RTN_TRANSACTION_ERROR		4
-#define I2O_RTN_ADAPTER_ALREADY_INIT		5
-#define I2O_RTN_MALLOC_ERROR			6
-#define I2O_RTN_ADPTR_NOT_REGISTERED		7
-#define I2O_RTN_MSG_REPLY_TIMEOUT		8
-#define I2O_RTN_NO_STATUS			9
-#define I2O_RTN_NO_FIRM_VER			10
-#define	I2O_RTN_NO_LINK_SPEED			11
+/**
+ *	i2o_driver_notify_device_add - Send notification of added device to a
+ *				       single I2O driver
+ *
+ *	Send notification of added device to a single registered driver.
+ */
+static inline void i2o_driver_notify_device_add(struct i2o_driver *drv,
+						struct i2o_device *i2o_dev)
+{
+	if (drv->notify_device_add)
+		drv->notify_device_add(i2o_dev);
+};
 
-/* Reply message status defines for all messages */
+/**
+ *	i2o_driver_notify_device_remove - Send notification of removed device
+ *					  to a single I2O driver
+ *
+ *	Send notification of removed device to a single registered driver.
+ */
+static inline void i2o_driver_notify_device_remove(struct i2o_driver *drv,
+						   struct i2o_device *i2o_dev)
+{
+	if (drv->notify_device_remove)
+		drv->notify_device_remove(i2o_dev);
+};
 
-#define I2O_REPLY_STATUS_SUCCESS                    	0x00
-#define I2O_REPLY_STATUS_ABORT_DIRTY                	0x01
-#define I2O_REPLY_STATUS_ABORT_NO_DATA_TRANSFER     	0x02
-#define	I2O_REPLY_STATUS_ABORT_PARTIAL_TRANSFER		0x03
-#define	I2O_REPLY_STATUS_ERROR_DIRTY			0x04
-#define	I2O_REPLY_STATUS_ERROR_NO_DATA_TRANSFER		0x05
-#define	I2O_REPLY_STATUS_ERROR_PARTIAL_TRANSFER		0x06
-#define	I2O_REPLY_STATUS_PROCESS_ABORT_DIRTY		0x08
-#define	I2O_REPLY_STATUS_PROCESS_ABORT_NO_DATA_TRANSFER	0x09
-#define	I2O_REPLY_STATUS_PROCESS_ABORT_PARTIAL_TRANSFER	0x0A
-#define	I2O_REPLY_STATUS_TRANSACTION_ERROR		0x0B
-#define	I2O_REPLY_STATUS_PROGRESS_REPORT		0x80
+extern void i2o_driver_notify_controller_add_all(struct i2o_controller *);
+extern void i2o_driver_notify_controller_remove_all(struct i2o_controller *);
+extern void i2o_driver_notify_device_add_all(struct i2o_device *);
+extern void i2o_driver_notify_device_remove_all(struct i2o_device *);
 
-/* Status codes and Error Information for Parameter functions */
+/* I2O device functions */
+extern int i2o_device_claim(struct i2o_device *);
+extern int i2o_device_claim_release(struct i2o_device *);
 
-#define I2O_PARAMS_STATUS_SUCCESS		0x00
-#define I2O_PARAMS_STATUS_BAD_KEY_ABORT		0x01
-#define I2O_PARAMS_STATUS_BAD_KEY_CONTINUE   	0x02
-#define I2O_PARAMS_STATUS_BUFFER_FULL		0x03
-#define I2O_PARAMS_STATUS_BUFFER_TOO_SMALL	0x04
-#define I2O_PARAMS_STATUS_FIELD_UNREADABLE	0x05
-#define I2O_PARAMS_STATUS_FIELD_UNWRITEABLE	0x06
-#define I2O_PARAMS_STATUS_INSUFFICIENT_FIELDS	0x07
-#define I2O_PARAMS_STATUS_INVALID_GROUP_ID	0x08
-#define I2O_PARAMS_STATUS_INVALID_OPERATION	0x09
-#define I2O_PARAMS_STATUS_NO_KEY_FIELD		0x0A
-#define I2O_PARAMS_STATUS_NO_SUCH_FIELD		0x0B
-#define I2O_PARAMS_STATUS_NON_DYNAMIC_GROUP	0x0C
-#define I2O_PARAMS_STATUS_OPERATION_ERROR	0x0D
-#define I2O_PARAMS_STATUS_SCALAR_ERROR		0x0E
-#define I2O_PARAMS_STATUS_TABLE_ERROR		0x0F
-#define I2O_PARAMS_STATUS_WRONG_GROUP_TYPE	0x10
+/* Exec OSM functions */
+extern int i2o_exec_lct_get(struct i2o_controller *);
 
-/* DetailedStatusCode defines for Executive, DDM, Util and Transaction error
- * messages: Table 3-2 Detailed Status Codes.*/
+/* device / driver / kobject conversion functions */
+#define to_i2o_driver(drv) container_of(drv,struct i2o_driver, driver)
+#define to_i2o_device(dev) container_of(dev, struct i2o_device, device)
+#define to_i2o_controller(dev) container_of(dev, struct i2o_controller, device)
+#define kobj_to_i2o_device(kobj) to_i2o_device(container_of(kobj, struct device, kobj))
 
-#define I2O_DSC_SUCCESS                        0x0000
-#define I2O_DSC_BAD_KEY                        0x0002
-#define I2O_DSC_TCL_ERROR                      0x0003
-#define I2O_DSC_REPLY_BUFFER_FULL              0x0004
-#define I2O_DSC_NO_SUCH_PAGE                   0x0005
-#define I2O_DSC_INSUFFICIENT_RESOURCE_SOFT     0x0006
-#define I2O_DSC_INSUFFICIENT_RESOURCE_HARD     0x0007
-#define I2O_DSC_CHAIN_BUFFER_TOO_LARGE         0x0009
-#define I2O_DSC_UNSUPPORTED_FUNCTION           0x000A
-#define I2O_DSC_DEVICE_LOCKED                  0x000B
-#define I2O_DSC_DEVICE_RESET                   0x000C
-#define I2O_DSC_INAPPROPRIATE_FUNCTION         0x000D
-#define I2O_DSC_INVALID_INITIATOR_ADDRESS      0x000E
-#define I2O_DSC_INVALID_MESSAGE_FLAGS          0x000F
-#define I2O_DSC_INVALID_OFFSET                 0x0010
-#define I2O_DSC_INVALID_PARAMETER              0x0011
-#define I2O_DSC_INVALID_REQUEST                0x0012
-#define I2O_DSC_INVALID_TARGET_ADDRESS         0x0013
-#define I2O_DSC_MESSAGE_TOO_LARGE              0x0014
-#define I2O_DSC_MESSAGE_TOO_SMALL              0x0015
-#define I2O_DSC_MISSING_PARAMETER              0x0016
-#define I2O_DSC_TIMEOUT                        0x0017
-#define I2O_DSC_UNKNOWN_ERROR                  0x0018
-#define I2O_DSC_UNKNOWN_FUNCTION               0x0019
-#define I2O_DSC_UNSUPPORTED_VERSION            0x001A
-#define I2O_DSC_DEVICE_BUSY                    0x001B
-#define I2O_DSC_DEVICE_NOT_AVAILABLE           0x001C
+/**
+ *	i2o_out_to_virt - Turn an I2O message to a virtual address
+ *	@c: controller
+ *	@m: message engine value
+ *
+ *	Turn a receive message from an I2O controller bus address into
+ *	a Linux virtual address. The shared page frame is a linear block
+ *	so we simply have to shift the offset. This function does not
+ *	work for sender side messages as they are ioremap objects
+ *	provided by the I2O controller.
+ */
+static inline struct i2o_message *i2o_msg_out_to_virt(struct i2o_controller *c,
+						      u32 m)
+{
+	BUG_ON(m < c->out_queue.phys
+	       || m >= c->out_queue.phys + c->out_queue.len);
 
-/* DetailedStatusCode defines for Block Storage Operation: Table 6-7 Detailed
-   Status Codes.*/
+	return c->out_queue.virt + (m - c->out_queue.phys);
+};
 
-#define I2O_BSA_DSC_SUCCESS               0x0000
-#define I2O_BSA_DSC_MEDIA_ERROR           0x0001
-#define I2O_BSA_DSC_ACCESS_ERROR          0x0002
-#define I2O_BSA_DSC_DEVICE_FAILURE        0x0003
-#define I2O_BSA_DSC_DEVICE_NOT_READY      0x0004
-#define I2O_BSA_DSC_MEDIA_NOT_PRESENT     0x0005
-#define I2O_BSA_DSC_MEDIA_LOCKED          0x0006
-#define I2O_BSA_DSC_MEDIA_FAILURE         0x0007
-#define I2O_BSA_DSC_PROTOCOL_FAILURE      0x0008
-#define I2O_BSA_DSC_BUS_FAILURE           0x0009
-#define I2O_BSA_DSC_ACCESS_VIOLATION      0x000A
-#define I2O_BSA_DSC_WRITE_PROTECTED       0x000B
-#define I2O_BSA_DSC_DEVICE_RESET          0x000C
-#define I2O_BSA_DSC_VOLUME_CHANGED        0x000D
-#define I2O_BSA_DSC_TIMEOUT               0x000E
+/**
+ *	i2o_msg_in_to_virt - Turn an I2O message to a virtual address
+ *	@c: controller
+ *	@m: message engine value
+ *
+ *	Turn a send message from an I2O controller bus address into
+ *	a Linux virtual address. The shared page frame is a linear block
+ *	so we simply have to shift the offset. This function does not
+ *	work for receive side messages as they are kmalloc objects
+ *	in a different pool.
+ */
+static inline struct i2o_message __iomem *i2o_msg_in_to_virt(struct
+							     i2o_controller *c,
+							     u32 m)
+{
+	return c->in_queue.virt + m;
+};
 
-/* FailureStatusCodes, Table 3-3 Message Failure Codes */
+/**
+ *	i2o_msg_get - obtain an I2O message from the IOP
+ *	@c: I2O controller
+ *
+ *	This function tries to get a message frame. If no message frame is
+ *	available do not wait until one is availabe (see also i2o_msg_get_wait).
+ *	The returned pointer to the message frame is not in I/O memory, it is
+ *	allocated from a mempool. But because a MFA is allocated from the
+ *	controller too it is guaranteed that i2o_msg_post() will never fail.
+ *
+ *	On a success a pointer to the message frame is returned. If the message
+ *	queue is empty -EBUSY is returned and if no memory is available -ENOMEM
+ *	is returned.
+ */
+static inline struct i2o_message *i2o_msg_get(struct i2o_controller *c)
+{
+	struct i2o_msg_mfa *mmsg = mempool_alloc(c->in_msg.mempool, GFP_ATOMIC);
+	if (!mmsg)
+		return ERR_PTR(-ENOMEM);
+
+	mmsg->mfa = readl(c->in_port);
+	if (mmsg->mfa == I2O_QUEUE_EMPTY) {
+		mempool_free(mmsg, c->in_msg.mempool);
+		return ERR_PTR(-EBUSY);
+	}
 
-#define I2O_FSC_TRANSPORT_SERVICE_SUSPENDED             0x81
-#define I2O_FSC_TRANSPORT_SERVICE_TERMINATED            0x82
-#define I2O_FSC_TRANSPORT_CONGESTION                    0x83
-#define I2O_FSC_TRANSPORT_FAILURE                       0x84
-#define I2O_FSC_TRANSPORT_STATE_ERROR                   0x85
-#define I2O_FSC_TRANSPORT_TIME_OUT                      0x86
-#define I2O_FSC_TRANSPORT_ROUTING_FAILURE               0x87
-#define I2O_FSC_TRANSPORT_INVALID_VERSION               0x88
-#define I2O_FSC_TRANSPORT_INVALID_OFFSET                0x89
-#define I2O_FSC_TRANSPORT_INVALID_MSG_FLAGS             0x8A
-#define I2O_FSC_TRANSPORT_FRAME_TOO_SMALL               0x8B
-#define I2O_FSC_TRANSPORT_FRAME_TOO_LARGE               0x8C
-#define I2O_FSC_TRANSPORT_INVALID_TARGET_ID             0x8D
-#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_ID          0x8E
-#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_CONTEXT     0x8F
-#define I2O_FSC_TRANSPORT_UNKNOWN_FAILURE               0xFF
+	return &mmsg->msg;
+};
 
-/* Device Claim Types */
-#define	I2O_CLAIM_PRIMARY					0x01000000
-#define	I2O_CLAIM_MANAGEMENT					0x02000000
-#define	I2O_CLAIM_AUTHORIZED					0x03000000
-#define	I2O_CLAIM_SECONDARY					0x04000000
+/**
+ *	i2o_msg_post - Post I2O message to I2O controller
+ *	@c: I2O controller to which the message should be send
+ *	@msg: message returned by i2o_msg_get()
+ *
+ *	Post the message to the I2O controller and return immediately.
+ */
+static inline void i2o_msg_post(struct i2o_controller *c,
+				struct i2o_message *msg)
+{
+	struct i2o_msg_mfa *mmsg;
 
-/* Message header defines for VersionOffset */
-#define I2OVER15	0x0001
-#define I2OVER20	0x0002
+	mmsg = container_of(msg, struct i2o_msg_mfa, msg);
+	memcpy_toio(i2o_msg_in_to_virt(c, mmsg->mfa), msg,
+		    (le32_to_cpu(msg->u.head[0]) >> 16) << 2);
+	writel(mmsg->mfa, c->in_port);
+	mempool_free(mmsg, c->in_msg.mempool);
+};
 
-/* Default is 1.5 */
-#define I2OVERSION	I2OVER15
+/**
+ * 	i2o_msg_post_wait - Post and wait a message and wait until return
+ *	@c: controller
+ *	@m: message to post
+ *	@timeout: time in seconds to wait
+ *
+ * 	This API allows an OSM to post a message and then be told whether or
+ *	not the system received a successful reply. If the message times out
+ *	then the value '-ETIMEDOUT' is returned.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static inline int i2o_msg_post_wait(struct i2o_controller *c,
+				    struct i2o_message *msg,
+				    unsigned long timeout)
+{
+	return i2o_msg_post_wait_mem(c, msg, timeout, NULL);
+};
 
-#define SGL_OFFSET_0    I2OVERSION
-#define SGL_OFFSET_4    (0x0040 | I2OVERSION)
-#define SGL_OFFSET_5    (0x0050 | I2OVERSION)
-#define SGL_OFFSET_6    (0x0060 | I2OVERSION)
-#define SGL_OFFSET_7    (0x0070 | I2OVERSION)
-#define SGL_OFFSET_8    (0x0080 | I2OVERSION)
-#define SGL_OFFSET_9    (0x0090 | I2OVERSION)
-#define SGL_OFFSET_10   (0x00A0 | I2OVERSION)
-#define SGL_OFFSET_11   (0x00B0 | I2OVERSION)
-#define SGL_OFFSET_12   (0x00C0 | I2OVERSION)
-#define SGL_OFFSET(x)   (((x)<<4) | I2OVERSION)
+/**
+ *	i2o_msg_nop_mfa - Returns a fetched MFA back to the controller
+ *	@c: I2O controller from which the MFA was fetched
+ *	@mfa: MFA which should be returned
+ *
+ *	This function must be used for preserved messages, because i2o_msg_nop()
+ *	also returns the allocated memory back to the msg_pool mempool.
+ */
+static inline void i2o_msg_nop_mfa(struct i2o_controller *c, u32 mfa)
+{
+	struct i2o_message __iomem *msg;
+	u32 nop[3] = {
+		THREE_WORD_MSG_SIZE | SGL_OFFSET_0,
+		I2O_CMD_UTIL_NOP << 24 | HOST_TID << 12 | ADAPTER_TID,
+		0x00000000
+	};
+
+	msg = i2o_msg_in_to_virt(c, mfa);
+	memcpy_toio(msg, nop, sizeof(nop));
+	writel(mfa, c->in_port);
+};
 
-/* Transaction Reply Lists (TRL) Control Word structure */
-#define TRL_SINGLE_FIXED_LENGTH		0x00
-#define TRL_SINGLE_VARIABLE_LENGTH	0x40
-#define TRL_MULTIPLE_FIXED_LENGTH	0x80
+/**
+ *	i2o_msg_nop - Returns a message which is not used
+ *	@c: I2O controller from which the message was created
+ *	@msg: message which should be returned
+ *
+ *	If you fetch a message via i2o_msg_get, and can't use it, you must
+ *	return the message with this function. Otherwise the MFA is lost as well
+ *	as the allocated memory from the mempool.
+ */
+static inline void i2o_msg_nop(struct i2o_controller *c,
+			       struct i2o_message *msg)
+{
+	struct i2o_msg_mfa *mmsg;
+	mmsg = container_of(msg, struct i2o_msg_mfa, msg);
 
- /* msg header defines for MsgFlags */
-#define MSG_STATIC	0x0100
-#define MSG_64BIT_CNTXT	0x0200
-#define MSG_MULTI_TRANS	0x1000
-#define MSG_FAIL	0x2000
-#define MSG_FINAL	0x4000
-#define MSG_REPLY	0x8000
+	i2o_msg_nop_mfa(c, mmsg->mfa);
+	mempool_free(mmsg, c->in_msg.mempool);
+};
 
- /* minimum size msg */
-#define THREE_WORD_MSG_SIZE	0x00030000
-#define FOUR_WORD_MSG_SIZE	0x00040000
-#define FIVE_WORD_MSG_SIZE	0x00050000
-#define SIX_WORD_MSG_SIZE	0x00060000
-#define SEVEN_WORD_MSG_SIZE	0x00070000
-#define EIGHT_WORD_MSG_SIZE	0x00080000
-#define NINE_WORD_MSG_SIZE	0x00090000
-#define TEN_WORD_MSG_SIZE	0x000A0000
-#define ELEVEN_WORD_MSG_SIZE	0x000B0000
-#define I2O_MESSAGE_SIZE(x)	((x)<<16)
+/**
+ *	i2o_flush_reply - Flush reply from I2O controller
+ *	@c: I2O controller
+ *	@m: the message identifier
+ *
+ *	The I2O controller must be informed that the reply message is not needed
+ *	anymore. If you forget to flush the reply, the message frame can't be
+ *	used by the controller anymore and is therefore lost.
+ */
+static inline void i2o_flush_reply(struct i2o_controller *c, u32 m)
+{
+	writel(m, c->out_port);
+};
 
-/* special TID assignments */
-#define ADAPTER_TID		0
-#define HOST_TID		1
+/*
+ *	Endian handling wrapped into the macro - keeps the core code
+ *	cleaner.
+ */
 
-/* outbound queue defines */
-#define I2O_MAX_OUTBOUND_MSG_FRAMES	128
-#define I2O_OUTBOUND_MSG_FRAME_SIZE	128	/* in 32-bit words */
+#define i2o_raw_writel(val, mem)	__raw_writel(cpu_to_le32(val), mem)
 
-#define I2O_POST_WAIT_OK	0
-#define I2O_POST_WAIT_TIMEOUT	-ETIMEDOUT
+extern int i2o_parm_field_get(struct i2o_device *, int, int, void *, int);
+extern int i2o_parm_table_get(struct i2o_device *, int, int, int, void *, int,
+			      void *, int);
 
-#define I2O_CONTEXT_LIST_MIN_LENGTH	15
-#define I2O_CONTEXT_LIST_USED		0x01
-#define I2O_CONTEXT_LIST_DELETED	0x02
+/* debugging and troubleshooting/diagnostic helpers. */
+#define osm_printk(level, format, arg...)  \
+	printk(level "%s: " format, OSM_NAME , ## arg)
 
-/* timeouts */
-#define I2O_TIMEOUT_INIT_OUTBOUND_QUEUE	15
-#define I2O_TIMEOUT_MESSAGE_GET		5
-#define I2O_TIMEOUT_RESET		30
-#define I2O_TIMEOUT_STATUS_GET		5
-#define I2O_TIMEOUT_LCT_GET		360
-#define I2O_TIMEOUT_SCSI_SCB_ABORT	240
+#ifdef DEBUG
+#define osm_debug(format, arg...) \
+	osm_printk(KERN_DEBUG, format , ## arg)
+#else
+#define osm_debug(format, arg...) \
+        do { } while (0)
+#endif
 
-/* retries */
-#define I2O_HRT_GET_TRIES		3
-#define I2O_LCT_GET_TRIES		3
+#define osm_err(format, arg...)		\
+	osm_printk(KERN_ERR, format , ## arg)
+#define osm_info(format, arg...)		\
+	osm_printk(KERN_INFO, format , ## arg)
+#define osm_warn(format, arg...)		\
+	osm_printk(KERN_WARNING, format , ## arg)
 
-/* defines for max_sectors and max_phys_segments */
-#define I2O_MAX_SECTORS			1024
-#define I2O_MAX_SECTORS_LIMITED		256
-#define I2O_MAX_PHYS_SEGMENTS		MAX_PHYS_SEGMENTS
+/* debugging functions */
+extern void i2o_report_status(const char *, const char *, struct i2o_message *);
+extern void i2o_dump_message(struct i2o_message *);
+extern void i2o_dump_hrt(struct i2o_controller *c);
+extern void i2o_debug_state(struct i2o_controller *c);
 
 #endif				/* __KERNEL__ */
 #endif				/* _I2O_H */
-- 
cgit v1.2.3-71-gd317


From 24791bd48f643194d806654b587251b0f92233e8 Mon Sep 17 00:00:00 2001
From: Markus Lidel <Markus.Lidel@shadowconnect.com>
Date: Fri, 6 Jan 2006 00:19:31 -0800
Subject: [PATCH] I2O: Remove wrong I2O device class

Removed wrong I2O device class, which was only needed to add sysfs attributes.

Signed-off-by: Markus Lidel <Markus.Lidel@shadowconnect.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/message/i2o/core.h   |   2 +
 drivers/message/i2o/device.c | 144 ++++++++++++++++++-------------------------
 drivers/message/i2o/driver.c |   2 -
 drivers/message/i2o/iop.c    |  34 ++--------
 include/linux/i2o.h          |   1 -
 5 files changed, 68 insertions(+), 115 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/message/i2o/core.h b/drivers/message/i2o/core.h
index 9eefedb16211..9aa9b91170b2 100644
--- a/drivers/message/i2o/core.h
+++ b/drivers/message/i2o/core.h
@@ -33,6 +33,8 @@ extern int __init i2o_pci_init(void);
 extern void __exit i2o_pci_exit(void);
 
 /* device */
+extern struct device_attribute i2o_device_attrs[];
+
 extern void i2o_device_remove(struct i2o_device *);
 extern int i2o_device_parse_lct(struct i2o_controller *);
 
diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
index 1db26215937f..a5e260b7a3b6 100644
--- a/drivers/message/i2o/device.c
+++ b/drivers/message/i2o/device.c
@@ -142,8 +142,9 @@ static void i2o_device_release(struct device *dev)
 
 
 /**
- *	i2o_device_class_show_class_id - Displays class id of I2O device
- *	@cd: class device of which the class id should be displayed
+ *	i2o_device_show_class_id - Displays class id of I2O device
+ *	@dev: device of which the class id should be displayed
+ *	@attr: pointer to device attribute
  *	@buf: buffer into which the class id should be printed
  *
  *	Returns the number of bytes which are printed into the buffer.
@@ -159,15 +160,15 @@ static ssize_t i2o_device_show_class_id(struct device *dev,
 }
 
 /**
- *	i2o_device_class_show_tid - Displays TID of I2O device
- *	@cd: class device of which the TID should be displayed
- *	@buf: buffer into which the class id should be printed
+ *	i2o_device_show_tid - Displays TID of I2O device
+ *	@dev: device of which the TID should be displayed
+ *	@attr: pointer to device attribute
+ *	@buf: buffer into which the TID should be printed
  *
  *	Returns the number of bytes which are printed into the buffer.
  */
 static ssize_t i2o_device_show_tid(struct device *dev,
-				   struct device_attribute *attr,
-				   char *buf)
+				   struct device_attribute *attr, char *buf)
 {
 	struct i2o_device *i2o_dev = to_i2o_device(dev);
 
@@ -208,66 +209,6 @@ static struct i2o_device *i2o_device_alloc(void)
 	return dev;
 }
 
-/**
- *	i2o_setup_sysfs_links - Adds attributes to the I2O device
- *	@cd: I2O class device which is added to the I2O device class
- *
- *	This function get called when a I2O device is added to the class. It
- *	creates the attributes for each device and creates user/parent symlink
- *	if necessary.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static void i2o_setup_sysfs_links(struct i2o_device *i2o_dev)
-{
-	struct i2o_controller *c = i2o_dev->iop;
-	struct i2o_device *tmp;
-
-	/* create user entries for this device */
-	tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.user_tid);
-	if (tmp && tmp != i2o_dev)
-		sysfs_create_link(&i2o_dev->device.kobj,
-				  &tmp->device.kobj, "user");
-
-	/* create user entries refering to this device */
-	list_for_each_entry(tmp, &c->devices, list)
-		if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid &&
-		    tmp != i2o_dev)
-			sysfs_create_link(&tmp->device.kobj,
-					  &i2o_dev->device.kobj, "user");
-
-	/* create parent entries for this device */
-	tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.parent_tid);
-	if (tmp && tmp != i2o_dev)
-		sysfs_create_link(&i2o_dev->device.kobj,
-				  &tmp->device.kobj, "parent");
-
-	/* create parent entries refering to this device */
-	list_for_each_entry(tmp, &c->devices, list)
-		if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid &&
-		    tmp != i2o_dev)
-		sysfs_create_link(&tmp->device.kobj,
-				  &i2o_dev->device.kobj, "parent");
-}
-
-static void i2o_remove_sysfs_links(struct i2o_device *i2o_dev)
-{
-	struct i2o_controller *c = i2o_dev->iop;
-	struct i2o_device *tmp;
-
-	sysfs_remove_link(&i2o_dev->device.kobj, "parent");
-	sysfs_remove_link(&i2o_dev->device.kobj, "user");
-
-	list_for_each_entry(tmp, &c->devices, list) {
-		if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
-			sysfs_remove_link(&tmp->device.kobj, "parent");
-		if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
-			sysfs_remove_link(&tmp->device.kobj, "user");
-	}
-}
-
-
-
 /**
  *	i2o_device_add - allocate a new I2O device and add it to the IOP
  *	@iop: I2O controller where the device is on
@@ -282,33 +223,57 @@ static void i2o_remove_sysfs_links(struct i2o_device *i2o_dev)
 static struct i2o_device *i2o_device_add(struct i2o_controller *c,
 					 i2o_lct_entry * entry)
 {
-	struct i2o_device *dev;
+	struct i2o_device *i2o_dev, *tmp;
 
-	dev = i2o_device_alloc();
-	if (IS_ERR(dev)) {
+	i2o_dev = i2o_device_alloc();
+	if (IS_ERR(i2o_dev)) {
 		printk(KERN_ERR "i2o: unable to allocate i2o device\n");
-		return dev;
+		return i2o_dev;
 	}
 
-	dev->lct_data = *entry;
-	dev->iop = c;
+	i2o_dev->lct_data = *entry;
 
-	snprintf(dev->device.bus_id, BUS_ID_SIZE, "%d:%03x", c->unit,
-		 dev->lct_data.tid);
+	snprintf(i2o_dev->device.bus_id, BUS_ID_SIZE, "%d:%03x", c->unit,
+		 i2o_dev->lct_data.tid);
 
-	dev->device.parent = &c->device;
+	i2o_dev->iop = c;
+	i2o_dev->device.parent = &c->device;
 
-	device_register(&dev->device);
+	device_register(&i2o_dev->device);
 
-	list_add_tail(&dev->list, &c->devices);
+	list_add_tail(&i2o_dev->list, &c->devices);
 
-	i2o_setup_sysfs_links(dev);
+	/* create user entries for this device */
+	tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.user_tid);
+	if (tmp && (tmp != i2o_dev))
+		sysfs_create_link(&i2o_dev->device.kobj, &tmp->device.kobj,
+				  "user");
 
-	i2o_driver_notify_device_add_all(dev);
+	/* create user entries refering to this device */
+	list_for_each_entry(tmp, &c->devices, list)
+		if ((tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
+		    && (tmp != i2o_dev))
+		    sysfs_create_link(&tmp->device.kobj,
+				      &i2o_dev->device.kobj, "user");
 
-	pr_debug("i2o: device %s added\n", dev->device.bus_id);
+	/* create parent entries for this device */
+	tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.parent_tid);
+	if (tmp && (tmp != i2o_dev))
+		sysfs_create_link(&i2o_dev->device.kobj, &tmp->device.kobj,
+				  "parent");
 
-	return dev;
+	/* create parent entries refering to this device */
+	list_for_each_entry(tmp, &c->devices, list)
+		if ((tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
+		    && (tmp != i2o_dev))
+			sysfs_create_link(&tmp->device.kobj,
+					  &i2o_dev->device.kobj, "parent");
+
+	i2o_driver_notify_device_add_all(i2o_dev);
+
+	pr_debug("i2o: device %s added\n", i2o_dev->device.bus_id);
+
+	return i2o_dev;
 }
 
 /**
@@ -321,9 +286,22 @@ static struct i2o_device *i2o_device_add(struct i2o_controller *c,
  */
 void i2o_device_remove(struct i2o_device *i2o_dev)
 {
+	struct i2o_device *tmp;
+	struct i2o_controller *c = i2o_dev->iop;
+
 	i2o_driver_notify_device_remove_all(i2o_dev);
-	i2o_remove_sysfs_links(i2o_dev);
+
+	sysfs_remove_link(&i2o_dev->device.kobj, "parent");
+	sysfs_remove_link(&i2o_dev->device.kobj, "user");
+
+	list_for_each_entry(tmp, &c->devices, list) {
+		if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
+			sysfs_remove_link(&tmp->device.kobj, "parent");
+		if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
+			sysfs_remove_link(&tmp->device.kobj, "user");
+	}
 	list_del(&i2o_dev->list);
+
 	device_unregister(&i2o_dev->device);
 }
 
diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c
index 0fb9c4e2ad4c..25292b36e2d9 100644
--- a/drivers/message/i2o/driver.c
+++ b/drivers/message/i2o/driver.c
@@ -61,8 +61,6 @@ static int i2o_bus_match(struct device *dev, struct device_driver *drv)
 };
 
 /* I2O bus type */
-extern struct device_attribute i2o_device_attrs[];
-
 struct bus_type i2o_bus_type = {
 	.name = "i2o",
 	.match = i2o_bus_match,
diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c
index f86abb42bf89..7411a0504bf2 100644
--- a/drivers/message/i2o/iop.c
+++ b/drivers/message/i2o/iop.c
@@ -806,7 +806,6 @@ void i2o_iop_remove(struct i2o_controller *c)
 	list_for_each_entry_safe(dev, tmp, &c->devices, list)
 	    i2o_device_remove(dev);
 
-	class_device_unregister(c->classdev);
 	device_del(&c->device);
 
 	/* Ask the IOP to switch to RESET state */
@@ -1050,9 +1049,6 @@ static void i2o_iop_release(struct device *dev)
 	i2o_iop_free(c);
 };
 
-/* I2O controller class */
-static struct class *i2o_controller_class;
-
 /**
  *	i2o_iop_alloc - Allocate and initialize a i2o_controller struct
  *
@@ -1124,36 +1120,29 @@ int i2o_iop_add(struct i2o_controller *c)
 		goto iop_reset;
 	}
 
-	c->classdev = class_device_create(i2o_controller_class, NULL, MKDEV(0,0),
-			&c->device, "iop%d", c->unit);
-	if (IS_ERR(c->classdev)) {
-		osm_err("%s: could not add controller class\n", c->name);
-		goto device_del;
-	}
-
 	osm_info("%s: Activating I2O controller...\n", c->name);
 	osm_info("%s: This may take a few minutes if there are many devices\n",
 		 c->name);
 
 	if ((rc = i2o_iop_activate(c))) {
 		osm_err("%s: could not activate controller\n", c->name);
-		goto class_del;
+		goto device_del;
 	}
 
 	osm_debug("%s: building sys table...\n", c->name);
 
 	if ((rc = i2o_systab_build()))
-		goto class_del;
+		goto device_del;
 
 	osm_debug("%s: online controller...\n", c->name);
 
 	if ((rc = i2o_iop_online(c)))
-		goto class_del;
+		goto device_del;
 
 	osm_debug("%s: getting LCT...\n", c->name);
 
 	if ((rc = i2o_exec_lct_get(c)))
-		goto class_del;
+		goto device_del;
 
 	list_add(&c->list, &i2o_controllers);
 
@@ -1163,9 +1152,6 @@ int i2o_iop_add(struct i2o_controller *c)
 
 	return 0;
 
-      class_del:
-	class_device_unregister(c->classdev);
-
       device_del:
 	device_del(&c->device);
 
@@ -1225,14 +1211,8 @@ static int __init i2o_iop_init(void)
 
 	printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
 
-	i2o_controller_class = class_create(THIS_MODULE, "i2o_controller");
-	if (IS_ERR(i2o_controller_class)) {
-		osm_err("can't register class i2o_controller\n");
-		goto exit;
-	}
-
 	if ((rc = i2o_driver_init()))
-		goto class_exit;
+		goto exit;
 
 	if ((rc = i2o_exec_init()))
 		goto driver_exit;
@@ -1248,9 +1228,6 @@ static int __init i2o_iop_init(void)
       driver_exit:
 	i2o_driver_exit();
 
-      class_exit:
-	class_destroy(i2o_controller_class);
-
       exit:
 	return rc;
 }
@@ -1265,7 +1242,6 @@ static void __exit i2o_iop_exit(void)
 	i2o_pci_exit();
 	i2o_exec_exit();
 	i2o_driver_exit();
-	class_destroy(i2o_controller_class);
 };
 
 module_init(i2o_iop_init);
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 9e359a981221..4c18b7711bd9 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -561,7 +561,6 @@ struct i2o_controller {
 	struct resource mem_resource;	/* Mem resource allocated to the IOP */
 
 	struct device device;
-	struct class_device *classdev;	/* I2O controller class device */
 	struct i2o_device *exec;	/* Executive */
 #if BITS_PER_LONG == 64
 	spinlock_t context_list_lock;	/* lock for context_list */
-- 
cgit v1.2.3-71-gd317


From dcceafe25a5f47cf69e5b46b4da6f15186ec8386 Mon Sep 17 00:00:00 2001
From: Markus Lidel <Markus.Lidel@shadowconnect.com>
Date: Fri, 6 Jan 2006 00:19:32 -0800
Subject: [PATCH] I2O: Bugfixes

- Removed some kmalloc's with __GFP_ZERO and replace it with memset()
  because it didn't work properly.

- Fixed returned message frame in i2o_cfg_passthru() which caused raidutils
  to display wrong error message in case a disk was missing.

- Fixed size of printk() in i2o_scsi.c.

- Fixed get_device() and put_device() in probing of the I2O controller.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/message/i2o/driver.c     |  5 +++--
 drivers/message/i2o/i2o_config.c | 29 ++++++++++++++---------------
 drivers/message/i2o/i2o_scsi.c   |  4 ++--
 drivers/message/i2o/pci.c        |  6 +-----
 include/linux/i2o.h              |  2 +-
 5 files changed, 21 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c
index 25292b36e2d9..9c631c873dc6 100644
--- a/drivers/message/i2o/driver.c
+++ b/drivers/message/i2o/driver.c
@@ -217,14 +217,15 @@ int i2o_driver_dispatch(struct i2o_controller *c, u32 m)
 		/* cut of header from message size (in 32-bit words) */
 		size = (le32_to_cpu(msg->u.head[0]) >> 16) - 5;
 
-		evt = kmalloc(size * 4 + sizeof(*evt), GFP_ATOMIC | __GFP_ZERO);
+		evt = kmalloc(size * 4 + sizeof(*evt), GFP_ATOMIC);
 		if (!evt)
 			return -ENOMEM;
+		memset(evt, 0, size * 4 + sizeof(*evt));
 
 		evt->size = size;
 		evt->tcntxt = le32_to_cpu(msg->u.s.tcntxt);
 		evt->event_indicator = le32_to_cpu(msg->body[0]);
-		memcpy(&evt->tcntxt, &msg->u.s.tcntxt, size * 4);
+		memcpy(&evt->data, &msg->body[1], size * 4);
 
 		list_for_each_entry_safe(dev, tmp, &c->devices, list)
 		    if (dev->lct_data.tid == tid) {
diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c
index 4fe73d628c5b..286fef3240c4 100644
--- a/drivers/message/i2o/i2o_config.c
+++ b/drivers/message/i2o/i2o_config.c
@@ -36,12 +36,12 @@
 
 #include <asm/uaccess.h>
 
-#include "core.h"
-
 #define SG_TABLESIZE		30
 
-static int i2o_cfg_ioctl(struct inode *inode, struct file *fp, unsigned int cmd,
-			 unsigned long arg);
+extern int i2o_parm_issue(struct i2o_device *, int, void *, int, void *, int);
+
+static int i2o_cfg_ioctl(struct inode *, struct file *, unsigned int,
+			 unsigned long);
 
 static spinlock_t i2o_config_lock;
 
@@ -593,9 +593,6 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 
 	sg_offset = (msg->u.head[0] >> 4) & 0x0f;
 
-	msg->u.s.icntxt = cpu_to_le32(i2o_config_driver.context);
-	msg->u.s.tcntxt = cpu_to_le32(i2o_cntxt_list_add(c, reply));
-
 	memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE);
 	if (sg_offset) {
 		struct sg_simple_element *sg;
@@ -629,7 +626,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 				goto cleanup;
 			}
 			sg_size = sg[i].flag_count & 0xffffff;
-			p = &(sg_list[sg_index++]);
+			p = &(sg_list[sg_index]);
 			/* Allocate memory for the transfer */
 			if (i2o_dma_alloc
 			    (&c->pdev->dev, p, sg_size,
@@ -640,6 +637,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 				rcode = -ENOMEM;
 				goto sg_list_cleanup;
 			}
+			sg_index++;
 			/* Copy in the user's SG buffer if necessary */
 			if (sg[i].
 			    flag_count & 0x04000000 /*I2O_SGL_FLAGS_DIR */ ) {
@@ -661,8 +659,10 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 	}
 
 	rcode = i2o_msg_post_wait(c, msg, 60);
-	if (rcode)
+	if (rcode) {
+		reply[4] = ((u32) rcode) << 24;
 		goto sg_list_cleanup;
+	}
 
 	if (sg_offset) {
 		u32 msg[I2O_OUTBOUND_MSG_FRAME_SIZE];
@@ -712,6 +712,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 		}
 	}
 
+      sg_list_cleanup:
 	/* Copy back the reply to user space */
 	if (reply_size) {
 		// we wrote our own values for context - now restore the user supplied ones
@@ -729,7 +730,6 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 		}
 	}
 
-      sg_list_cleanup:
 	for (i = 0; i < sg_index; i++)
 		i2o_dma_free(&c->pdev->dev, &sg_list[i]);
 
@@ -827,9 +827,6 @@ static int i2o_cfg_passthru(unsigned long arg)
 
 	sg_offset = (msg->u.head[0] >> 4) & 0x0f;
 
-	msg->u.s.icntxt = cpu_to_le32(i2o_config_driver.context);
-	msg->u.s.tcntxt = cpu_to_le32(i2o_cntxt_list_add(c, reply));
-
 	memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE);
 	if (sg_offset) {
 		struct sg_simple_element *sg;
@@ -892,8 +889,10 @@ static int i2o_cfg_passthru(unsigned long arg)
 	}
 
 	rcode = i2o_msg_post_wait(c, msg, 60);
-	if (rcode)
+	if (rcode) {
+		reply[4] = ((u32) rcode) << 24;
 		goto sg_list_cleanup;
+	}
 
 	if (sg_offset) {
 		u32 msg[128];
@@ -943,6 +942,7 @@ static int i2o_cfg_passthru(unsigned long arg)
 		}
 	}
 
+      sg_list_cleanup:
 	/* Copy back the reply to user space */
 	if (reply_size) {
 		// we wrote our own values for context - now restore the user supplied ones
@@ -959,7 +959,6 @@ static int i2o_cfg_passthru(unsigned long arg)
 		}
 	}
 
-      sg_list_cleanup:
 	for (i = 0; i < sg_index; i++)
 		kfree(sg_list[i]);
 
diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c
index 24061dfd46e4..76b9516b1934 100644
--- a/drivers/message/i2o/i2o_scsi.c
+++ b/drivers/message/i2o/i2o_scsi.c
@@ -309,9 +309,9 @@ static int i2o_scsi_probe(struct device *dev)
 	sysfs_create_link(&i2o_dev->device.kobj, &scsi_dev->sdev_gendev.kobj,
 			  "scsi");
 
-	osm_info("device added (TID: %03x) channel: %d, id: %d, lun: %d\n",
+	osm_info("device added (TID: %03x) channel: %d, id: %d, lun: %ld\n",
 		 i2o_dev->lct_data.tid, channel, le32_to_cpu(id),
-		 (unsigned int)le64_to_cpu(lun));
+		 (long unsigned int)le64_to_cpu(lun));
 
 	return 0;
 };
diff --git a/drivers/message/i2o/pci.c b/drivers/message/i2o/pci.c
index 329d482eee81..c5b656cdea7c 100644
--- a/drivers/message/i2o/pci.c
+++ b/drivers/message/i2o/pci.c
@@ -339,7 +339,7 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
 		       pci_name(pdev));
 
 	c->pdev = pdev;
-	c->device.parent = get_device(&pdev->dev);
+	c->device.parent = &pdev->dev;
 
 	/* Cards that fall apart if you hit them with large I/O loads... */
 	if (pdev->vendor == PCI_VENDOR_ID_NCR && pdev->device == 0x0630) {
@@ -410,8 +410,6 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
 	if ((rc = i2o_iop_add(c)))
 		goto uninstall;
 
-	get_device(&c->device);
-
 	if (i960)
 		pci_write_config_word(i960, 0x42, 0x03ff);
 
@@ -424,7 +422,6 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
 	i2o_pci_free(c);
 
       free_controller:
-	put_device(c->device.parent);
 	i2o_iop_free(c);
 
       disable:
@@ -454,7 +451,6 @@ static void __devexit i2o_pci_remove(struct pci_dev *pdev)
 
 	printk(KERN_INFO "%s: Controller removed.\n", c->name);
 
-	put_device(c->device.parent);
 	put_device(&c->device);
 };
 
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 4c18b7711bd9..9ba806796667 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -384,7 +384,7 @@
 
 /* defines for max_sectors and max_phys_segments */
 #define I2O_MAX_SECTORS			1024
-#define I2O_MAX_SECTORS_LIMITED		256
+#define I2O_MAX_SECTORS_LIMITED		128
 #define I2O_MAX_PHYS_SEGMENTS		MAX_PHYS_SEGMENTS
 
 /*
-- 
cgit v1.2.3-71-gd317


From 45714d65618407bce1fd0271bc58303ce14b0785 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 6 Jan 2006 00:19:36 -0800
Subject: [PATCH] fuse: bump interface version

Change interface version to 7.4.

Following changes will need backward compatibility support, so store the minor
version returned by userspace.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dev.c        | 2 ++
 fs/fuse/fuse_i.h     | 3 +++
 include/linux/fuse.h | 2 +-
 3 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8f873e621f41..e5bc3f8eebd0 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -178,6 +178,8 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 		if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION)
 			fc->conn_error = 1;
 
+		fc->minor = req->misc.init_in_out.minor;
+
 		/* After INIT reply is received other requests can go
 		   out.  So do (FUSE_MAX_OUTSTANDING - 1) number of
 		   up()s on outstanding_sem.  The last up() is done in
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 0ea5301f86be..2d4835e54c90 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -272,6 +272,9 @@ struct fuse_conn {
 	/** Is create not implemented by fs? */
 	unsigned no_create : 1;
 
+	/** Negotiated minor version */
+	unsigned minor;
+
 	/** Backing dev info */
 	struct backing_dev_info bdi;
 };
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index b76b558b03d4..3c85f1a422cc 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -14,7 +14,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 3
+#define FUSE_KERNEL_MINOR_VERSION 4
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
-- 
cgit v1.2.3-71-gd317


From de5f12025572ef8fcffa4be5453061725acfb754 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 6 Jan 2006 00:19:37 -0800
Subject: [PATCH] fuse: add frsize to statfs reply

Add 'frsize' member to the statfs reply.

I'm not sure if sending f_fsid will ever be needed, but just in case leave
some space at the end of the structure, so less compatibility mess would be
required.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/inode.c      | 5 ++++-
 include/linux/fuse.h | 5 +++++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e69a546844d0..3b928a02af04 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -218,6 +218,7 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr
 {
 	stbuf->f_type    = FUSE_SUPER_MAGIC;
 	stbuf->f_bsize   = attr->bsize;
+	stbuf->f_frsize  = attr->frsize;
 	stbuf->f_blocks  = attr->blocks;
 	stbuf->f_bfree   = attr->bfree;
 	stbuf->f_bavail  = attr->bavail;
@@ -238,10 +239,12 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
 	if (!req)
 		return -EINTR;
 
+	memset(&outarg, 0, sizeof(outarg));
 	req->in.numargs = 0;
 	req->in.h.opcode = FUSE_STATFS;
 	req->out.numargs = 1;
-	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].size =
+		fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
 	req->out.args[0].value = &outarg;
 	request_send(fc, req);
 	err = req->out.h.error;
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 3c85f1a422cc..9d5177c356cc 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -53,6 +53,9 @@ struct fuse_kstatfs {
 	__u64	ffree;
 	__u32	bsize;
 	__u32	namelen;
+	__u32	frsize;
+	__u32	padding;
+	__u32	spare[6];
 };
 
 #define FATTR_MODE	(1 << 0)
@@ -213,6 +216,8 @@ struct fuse_write_out {
 	__u32	padding;
 };
 
+#define FUSE_COMPAT_STATFS_SIZE 48
+
 struct fuse_statfs_out {
 	struct fuse_kstatfs st;
 };
-- 
cgit v1.2.3-71-gd317


From 1d3d752b471d2a3a1d5e4fe177e5e7d52abb4e4c Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 6 Jan 2006 00:19:40 -0800
Subject: [PATCH] fuse: clean up request size limit checking

Change the way a too large request is handled.  Until now in this case the
device read returned -EINVAL and the operation returned -EIO.

Make it more flexibible by not returning -EINVAL from the read, but restarting
it instead.

Also remove the fixed limit on setxattr data and let the filesystem provide as
large a read buffer as it needs to handle the extended attribute data.

The symbolic link length is already checked by VFS to be less than PATH_MAX,
so the extra check against FUSE_SYMLINK_MAX is not needed.

The check in fuse_create_open() against FUSE_NAME_MAX is not needed, since the
dentry has already been looked up, and hence the name already checked.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dev.c        | 26 ++++++++++++++++----------
 fs/fuse/dir.c        | 14 +-------------
 fs/fuse/fuse_i.h     |  9 ++++++---
 fs/fuse/inode.c      |  2 +-
 include/linux/fuse.h |  8 ++------
 5 files changed, 26 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e5bc3f8eebd0..1afdffdf80db 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -617,6 +617,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	struct fuse_copy_state cs;
 	unsigned reqsize;
 
+ restart:
 	spin_lock(&fuse_lock);
 	fc = file->private_data;
 	err = -EPERM;
@@ -632,20 +633,25 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 
 	req = list_entry(fc->pending.next, struct fuse_req, list);
 	list_del_init(&req->list);
-	spin_unlock(&fuse_lock);
 
 	in = &req->in;
-	reqsize = req->in.h.len;
-	fuse_copy_init(&cs, 1, req, iov, nr_segs);
-	err = -EINVAL;
-	if (iov_length(iov, nr_segs) >= reqsize) {
-		err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
-		if (!err)
-			err = fuse_copy_args(&cs, in->numargs, in->argpages,
-					     (struct fuse_arg *) in->args, 0);
+	reqsize = in->h.len;
+	/* If request is too large, reply with an error and restart the read */
+	if (iov_length(iov, nr_segs) < reqsize) {
+		req->out.h.error = -EIO;
+		/* SETXATTR is special, since it may contain too large data */
+		if (in->h.opcode == FUSE_SETXATTR)
+			req->out.h.error = -E2BIG;
+		request_end(fc, req);
+		goto restart;
 	}
+	spin_unlock(&fuse_lock);
+	fuse_copy_init(&cs, 1, req, iov, nr_segs);
+	err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
+	if (!err)
+		err = fuse_copy_args(&cs, in->numargs, in->argpages,
+				     (struct fuse_arg *) in->args, 0);
 	fuse_copy_finish(&cs);
-
 	spin_lock(&fuse_lock);
 	req->locked = 0;
 	if (!err && req->interrupted)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 9a6075de961f..f156392d019e 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -236,10 +236,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	if (fc->no_create)
 		goto out;
 
-	err = -ENAMETOOLONG;
-	if (entry->d_name.len > FUSE_NAME_MAX)
-		goto out;
-
 	err = -EINTR;
 	req = fuse_get_request(fc);
 	if (!req)
@@ -413,12 +409,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
 {
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	unsigned len = strlen(link) + 1;
-	struct fuse_req *req;
-
-	if (len > FUSE_SYMLINK_MAX)
-		return -ENAMETOOLONG;
-
-	req = fuse_get_request(fc);
+	struct fuse_req *req = fuse_get_request(fc);
 	if (!req)
 		return -EINTR;
 
@@ -988,9 +979,6 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
 	struct fuse_setxattr_in inarg;
 	int err;
 
-	if (size > FUSE_XATTR_SIZE_MAX)
-		return -E2BIG;
-
 	if (fc->no_setxattr)
 		return -EOPNOTSUPP;
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 2d4835e54c90..17fd368559cd 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -21,6 +21,12 @@
 /** If more requests are outstanding, then the operation will block */
 #define FUSE_MAX_OUTSTANDING 10
 
+/** Maximum size of data in a write request */
+#define FUSE_MAX_WRITE 4096
+
+/** It could be as large as PATH_MAX, but would that have any uses? */
+#define FUSE_NAME_MAX 1024
+
 /** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
     module will check permissions based on the file mode.  Otherwise no
     permission checking is done in the kernel */
@@ -108,9 +114,6 @@ struct fuse_out {
 	struct fuse_arg args[3];
 };
 
-struct fuse_req;
-struct fuse_conn;
-
 /**
  * A request to the client
  */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3b928a02af04..3580b9e12345 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -485,7 +485,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	fc->max_read = d.max_read;
 	if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages)
 		fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE;
-	fc->max_write = FUSE_MAX_IN / 2;
+	fc->max_write = FUSE_MAX_WRITE;
 
 	err = -ENOMEM;
 	root = get_root_inode(sb, d.rootmode);
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 9d5177c356cc..8f64cc2205b0 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -108,12 +108,8 @@ enum fuse_opcode {
 	FUSE_CREATE        = 35
 };
 
-/* Conservative buffer size for the client */
-#define FUSE_MAX_IN 8192
-
-#define FUSE_NAME_MAX 1024
-#define FUSE_SYMLINK_MAX 4096
-#define FUSE_XATTR_SIZE_MAX 4096
+/* The read buffer is required to be at least 8k, but may be much larger */
+#define FUSE_MIN_READ_BUFFER 8192
 
 struct fuse_entry_out {
 	__u64	nodeid;		/* Inode ID */
-- 
cgit v1.2.3-71-gd317


From 3ec870d524c9150add120475c8ddcfa50574f98e Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 6 Jan 2006 00:19:41 -0800
Subject: [PATCH] fuse: make maximum write data configurable

Make the maximum size of write data configurable by the filesystem.  The
previous fixed 4096 limit only worked on architectures where the page size is
less or equal to this.  This change make writing work on other architectures
too, and also lets the filesystem receive bigger write requests in direct_io
mode.

Normal writes which go through the page cache are still limited to a page
sized chunk per request.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dev.c        | 48 ++++++++++++++++++++++++++++++------------------
 fs/fuse/fuse_i.h     |  6 ++----
 fs/fuse/inode.c      |  1 -
 include/linux/fuse.h | 11 +++++++++--
 4 files changed, 41 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1afdffdf80db..e08ab4702d97 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -148,6 +148,26 @@ void fuse_release_background(struct fuse_req *req)
 	spin_unlock(&fuse_lock);
 }
 
+static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
+{
+	int i;
+	struct fuse_init_out *arg = &req->misc.init_out;
+
+	if (arg->major != FUSE_KERNEL_VERSION)
+		fc->conn_error = 1;
+	else {
+		fc->minor = arg->minor;
+		fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
+	}
+
+	/* After INIT reply is received other requests can go
+	   out.  So do (FUSE_MAX_OUTSTANDING - 1) number of
+	   up()s on outstanding_sem.  The last up() is done in
+	   fuse_putback_request() */
+	for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
+		up(&fc->outstanding_sem);
+}
+
 /*
  * This function is called when a request is finished.  Either a reply
  * has arrived or it was interrupted (and not yet sent) or some error
@@ -172,21 +192,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 		up_read(&fc->sbput_sem);
 	}
 	wake_up(&req->waitq);
-	if (req->in.h.opcode == FUSE_INIT) {
-		int i;
-
-		if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION)
-			fc->conn_error = 1;
-
-		fc->minor = req->misc.init_in_out.minor;
-
-		/* After INIT reply is received other requests can go
-		   out.  So do (FUSE_MAX_OUTSTANDING - 1) number of
-		   up()s on outstanding_sem.  The last up() is done in
-		   fuse_putback_request() */
-		for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
-			up(&fc->outstanding_sem);
-	} else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) {
+	if (req->in.h.opcode == FUSE_INIT)
+		process_init_reply(fc, req);
+	else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) {
 		/* Special case for failed iget in CREATE */
 		u64 nodeid = req->in.h.nodeid;
 		__fuse_get_request(req);
@@ -359,7 +367,7 @@ void fuse_send_init(struct fuse_conn *fc)
 	/* This is called from fuse_read_super() so there's guaranteed
 	   to be a request available */
 	struct fuse_req *req = do_get_request(fc);
-	struct fuse_init_in_out *arg = &req->misc.init_in_out;
+	struct fuse_init_in *arg = &req->misc.init_in;
 	arg->major = FUSE_KERNEL_VERSION;
 	arg->minor = FUSE_KERNEL_MINOR_VERSION;
 	req->in.h.opcode = FUSE_INIT;
@@ -367,8 +375,12 @@ void fuse_send_init(struct fuse_conn *fc)
 	req->in.args[0].size = sizeof(*arg);
 	req->in.args[0].value = arg;
 	req->out.numargs = 1;
-	req->out.args[0].size = sizeof(*arg);
-	req->out.args[0].value = arg;
+	/* Variable length arguement used for backward compatibility
+	   with interface version < 7.5.  Rest of init_out is zeroed
+	   by do_get_request(), so a short reply is not a problem */
+	req->out.argvar = 1;
+	req->out.args[0].size = sizeof(struct fuse_init_out);
+	req->out.args[0].value = &req->misc.init_out;
 	request_send_background(fc, req);
 }
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 17fd368559cd..74c8d098a14a 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -21,9 +21,6 @@
 /** If more requests are outstanding, then the operation will block */
 #define FUSE_MAX_OUTSTANDING 10
 
-/** Maximum size of data in a write request */
-#define FUSE_MAX_WRITE 4096
-
 /** It could be as large as PATH_MAX, but would that have any uses? */
 #define FUSE_NAME_MAX 1024
 
@@ -162,7 +159,8 @@ struct fuse_req {
 	union {
 		struct fuse_forget_in forget_in;
 		struct fuse_release_in release_in;
-		struct fuse_init_in_out init_in_out;
+		struct fuse_init_in init_in;
+		struct fuse_init_out init_out;
 	} misc;
 
 	/** page vector */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3580b9e12345..e4541869831e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -485,7 +485,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	fc->max_read = d.max_read;
 	if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages)
 		fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE;
-	fc->max_write = FUSE_MAX_WRITE;
 
 	err = -ENOMEM;
 	root = get_root_inode(sb, d.rootmode);
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 8f64cc2205b0..528959c52f1b 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -14,7 +14,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 4
+#define FUSE_KERNEL_MINOR_VERSION 5
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
@@ -244,11 +244,18 @@ struct fuse_access_in {
 	__u32	padding;
 };
 
-struct fuse_init_in_out {
+struct fuse_init_in {
 	__u32	major;
 	__u32	minor;
 };
 
+struct fuse_init_out {
+	__u32	major;
+	__u32	minor;
+	__u32	unused[3];
+	__u32	max_write;
+};
+
 struct fuse_in_header {
 	__u32	len;
 	__u32	opcode;
-- 
cgit v1.2.3-71-gd317


From 742ec650e9b63ea61891455bb6f76bac37025c78 Mon Sep 17 00:00:00 2001
From: Marko Kohtala <marko.kohtala@gmail.com>
Date: Fri, 6 Jan 2006 00:19:44 -0800
Subject: [PATCH] parport: phase fixes

Did not move the parport interface properly into IEEE1284_PH_REV_IDLE phase at
end of data due to comparing bytes with nibbles.  Internal phase
IEEE1284_PH_HBUSY_DNA became unused, so remove it.

Signed-off-by: Marko Kohtala <marko.kohtala@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/media/video/cpia_pp.c  | 30 +++++++++-----------
 drivers/parport/ieee1284_ops.c | 62 ++++++++++++++++++++----------------------
 include/linux/parport.h        |  1 -
 3 files changed, 42 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/cpia_pp.c b/drivers/media/video/cpia_pp.c
index ddf184f95d80..6861d408f1b3 100644
--- a/drivers/media/video/cpia_pp.c
+++ b/drivers/media/video/cpia_pp.c
@@ -170,16 +170,9 @@ static size_t cpia_read_nibble (struct parport *port,
 		/* Does the error line indicate end of data? */
 		if (((i /*& 1*/) == 0) &&
 		    (parport_read_status(port) & PARPORT_STATUS_ERROR)) {
-			port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DNA;
-				DBG("%s: No more nibble data (%d bytes)\n",
-				port->name, i/2);
-
-			/* Go to reverse idle phase. */
-			parport_frob_control (port,
-					      PARPORT_CONTROL_AUTOFD,
-					      PARPORT_CONTROL_AUTOFD);
-			port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
-			break;
+			DBG("%s: No more nibble data (%d bytes)\n",
+			    port->name, i/2);
+			goto end_of_data;
 		}
 
 		/* Event 7: Set nAutoFd low. */
@@ -227,18 +220,21 @@ static size_t cpia_read_nibble (struct parport *port,
 			byte = nibble;
 	}
 
-	i /= 2; /* i is now in bytes */
-
 	if (i == len) {
 		/* Read the last nibble without checking data avail. */
-		port = port->physport;
-		if (parport_read_status (port) & PARPORT_STATUS_ERROR)
-			port->ieee1284.phase = IEEE1284_PH_HBUSY_DNA;
+		if (parport_read_status (port) & PARPORT_STATUS_ERROR) {
+		end_of_data:
+			/* Go to reverse idle phase. */
+			parport_frob_control (port,
+					      PARPORT_CONTROL_AUTOFD,
+					      PARPORT_CONTROL_AUTOFD);
+			port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
+		}
 		else
-			port->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL;
+			port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL;
 	}
 
-	return i;
+	return i/2;
 }
 
 /* CPiA nonstandard "Nibble Stream" mode (2 nibbles per cycle, instead of 1)
diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c
index ce1e2aad8b10..d6c77658231e 100644
--- a/drivers/parport/ieee1284_ops.c
+++ b/drivers/parport/ieee1284_ops.c
@@ -165,17 +165,7 @@ size_t parport_ieee1284_read_nibble (struct parport *port,
 		/* Does the error line indicate end of data? */
 		if (((i & 1) == 0) &&
 		    (parport_read_status(port) & PARPORT_STATUS_ERROR)) {
-			port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DNA;
-			DPRINTK (KERN_DEBUG
-				"%s: No more nibble data (%d bytes)\n",
-				port->name, i/2);
-
-			/* Go to reverse idle phase. */
-			parport_frob_control (port,
-					      PARPORT_CONTROL_AUTOFD,
-					      PARPORT_CONTROL_AUTOFD);
-			port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
-			break;
+			goto end_of_data;
 		}
 
 		/* Event 7: Set nAutoFd low. */
@@ -225,18 +215,25 @@ size_t parport_ieee1284_read_nibble (struct parport *port,
 			byte = nibble;
 	}
 
-	i /= 2; /* i is now in bytes */
-
 	if (i == len) {
 		/* Read the last nibble without checking data avail. */
-		port = port->physport;
-		if (parport_read_status (port) & PARPORT_STATUS_ERROR)
-			port->ieee1284.phase = IEEE1284_PH_HBUSY_DNA;
+		if (parport_read_status (port) & PARPORT_STATUS_ERROR) {
+		end_of_data:
+			DPRINTK (KERN_DEBUG
+				"%s: No more nibble data (%d bytes)\n",
+				port->name, i/2);
+
+			/* Go to reverse idle phase. */
+			parport_frob_control (port,
+					      PARPORT_CONTROL_AUTOFD,
+					      PARPORT_CONTROL_AUTOFD);
+			port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
+		}
 		else
-			port->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL;
+			port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL;
 	}
 
-	return i;
+	return i/2;
 #endif /* IEEE1284 support */
 }
 
@@ -256,17 +253,7 @@ size_t parport_ieee1284_read_byte (struct parport *port,
 
 		/* Data available? */
 		if (parport_read_status (port) & PARPORT_STATUS_ERROR) {
-			port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DNA;
-			DPRINTK (KERN_DEBUG
-				 "%s: No more byte data (%Zd bytes)\n",
-				 port->name, count);
-
-			/* Go to reverse idle phase. */
-			parport_frob_control (port,
-					      PARPORT_CONTROL_AUTOFD,
-					      PARPORT_CONTROL_AUTOFD);
-			port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
-			break;
+			goto end_of_data;
 		}
 
 		/* Event 14: Place data bus in high impedance state. */
@@ -318,11 +305,20 @@ size_t parport_ieee1284_read_byte (struct parport *port,
 
 	if (count == len) {
 		/* Read the last byte without checking data avail. */
-		port = port->physport;
-		if (parport_read_status (port) & PARPORT_STATUS_ERROR)
-			port->ieee1284.phase = IEEE1284_PH_HBUSY_DNA;
+		if (parport_read_status (port) & PARPORT_STATUS_ERROR) {
+		end_of_data:
+			DPRINTK (KERN_DEBUG
+				 "%s: No more byte data (%Zd bytes)\n",
+				 port->name, count);
+
+			/* Go to reverse idle phase. */
+			parport_frob_control (port,
+					      PARPORT_CONTROL_AUTOFD,
+					      PARPORT_CONTROL_AUTOFD);
+			port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
+		}
 		else
-			port->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL;
+			port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL;
 	}
 
 	return count;
diff --git a/include/linux/parport.h b/include/linux/parport.h
index d2a4d9e1e6d1..f7ff0b0c4031 100644
--- a/include/linux/parport.h
+++ b/include/linux/parport.h
@@ -242,7 +242,6 @@ enum ieee1284_phase {
 	IEEE1284_PH_FWD_IDLE,
 	IEEE1284_PH_TERMINATE,
 	IEEE1284_PH_NEGOTIATION,
-	IEEE1284_PH_HBUSY_DNA,
 	IEEE1284_PH_REV_IDLE,
 	IEEE1284_PH_HBUSY_DAVAIL,
 	IEEE1284_PH_REV_DATA,
-- 
cgit v1.2.3-71-gd317


From 110bee75d2e03d3b4bcc74743dee5a21fe7b43bd Mon Sep 17 00:00:00 2001
From: Marko Kohtala <marko.kohtala@gmail.com>
Date: Fri, 6 Jan 2006 00:19:49 -0800
Subject: [PATCH] parport: DEBUG_PARPORT build fix

Add missing "struct" keyword preventing compilation with DEBUG_PARPORT
defined.  Also add some "const".

Signed-off-by: Marko Kohtala <marko.kohtala@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/parport_pc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/parport_pc.h b/include/linux/parport_pc.h
index c6f762470879..7e62b3429cdd 100644
--- a/include/linux/parport_pc.h
+++ b/include/linux/parport_pc.h
@@ -85,7 +85,7 @@ extern __inline__ void dump_parport_state (char *str, struct parport *p)
 	unsigned char ecr = inb (ECONTROL (p));
 	unsigned char dcr = inb (CONTROL (p));
 	unsigned char dsr = inb (STATUS (p));
-	static char *ecr_modes[] = {"SPP", "PS2", "PPFIFO", "ECP", "xXx", "yYy", "TST", "CFG"};
+	static const char *const ecr_modes[] = {"SPP", "PS2", "PPFIFO", "ECP", "xXx", "yYy", "TST", "CFG"};
 	const struct parport_pc_private *priv = p->physport->private_data;
 	int i;
 
-- 
cgit v1.2.3-71-gd317


From 81684ee645e15601ec935461d9069a3086179c06 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 6 Jan 2006 00:19:53 -0800
Subject: [PATCH] include/linux/parport_pc.h: "extern inline" -> "static
 inline"

"extern inline" doesn't make much sense.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/parport_pc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/parport_pc.h b/include/linux/parport_pc.h
index 7e62b3429cdd..1cc0f6b1a49a 100644
--- a/include/linux/parport_pc.h
+++ b/include/linux/parport_pc.h
@@ -79,7 +79,7 @@ static __inline__ unsigned char parport_pc_read_data(struct parport *p)
 }
 
 #ifdef DEBUG_PARPORT
-extern __inline__ void dump_parport_state (char *str, struct parport *p)
+static inline void dump_parport_state (char *str, struct parport *p)
 {
 	/* here's hoping that reading these ports won't side-effect anything underneath */
 	unsigned char ecr = inb (ECONTROL (p));
-- 
cgit v1.2.3-71-gd317


From f93ea411b73594f7d144855fd34278bcf34a9afc Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 6 Jan 2006 00:19:55 -0800
Subject: [PATCH] jbd: split checkpoint lists

Split the checkpoint list of the transaction into two lists.  In the first
list we keep the buffers that need to be submitted for IO.  In the second
list are kept buffers that were already submitted and we just have to wait
for the IO to complete.  This should simplify a handling of checkpoint
lists a bit and can eventually be also a performance gain.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jbd/checkpoint.c | 418 ++++++++++++++++++++++++++++++----------------------
 include/linux/jbd.h |   8 +-
 2 files changed, 248 insertions(+), 178 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 014a51fd00d7..cb3cef525c3b 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -24,29 +24,75 @@
 #include <linux/slab.h>
 
 /*
- * Unlink a buffer from a transaction. 
+ * Unlink a buffer from a transaction checkpoint list.
  *
  * Called with j_list_lock held.
  */
 
-static inline void __buffer_unlink(struct journal_head *jh)
+static void __buffer_unlink_first(struct journal_head *jh)
 {
 	transaction_t *transaction;
 
 	transaction = jh->b_cp_transaction;
-	jh->b_cp_transaction = NULL;
 
 	jh->b_cpnext->b_cpprev = jh->b_cpprev;
 	jh->b_cpprev->b_cpnext = jh->b_cpnext;
-	if (transaction->t_checkpoint_list == jh)
+	if (transaction->t_checkpoint_list == jh) {
 		transaction->t_checkpoint_list = jh->b_cpnext;
-	if (transaction->t_checkpoint_list == jh)
-		transaction->t_checkpoint_list = NULL;
+		if (transaction->t_checkpoint_list == jh)
+			transaction->t_checkpoint_list = NULL;
+	}
+}
+
+/*
+ * Unlink a buffer from a transaction checkpoint(io) list.
+ *
+ * Called with j_list_lock held.
+ */
+
+static inline void __buffer_unlink(struct journal_head *jh)
+{
+	transaction_t *transaction;
+
+	transaction = jh->b_cp_transaction;
+
+	__buffer_unlink_first(jh);
+	if (transaction->t_checkpoint_io_list == jh) {
+		transaction->t_checkpoint_io_list = jh->b_cpnext;
+		if (transaction->t_checkpoint_io_list == jh)
+			transaction->t_checkpoint_io_list = NULL;
+	}
+}
+
+/*
+ * Move a buffer from the checkpoint list to the checkpoint io list
+ *
+ * Called with j_list_lock held
+ */
+
+static inline void __buffer_relink_io(struct journal_head *jh)
+{
+	transaction_t *transaction;
+
+	transaction = jh->b_cp_transaction;
+	__buffer_unlink_first(jh);
+
+	if (!transaction->t_checkpoint_io_list) {
+		jh->b_cpnext = jh->b_cpprev = jh;
+	} else {
+		jh->b_cpnext = transaction->t_checkpoint_io_list;
+		jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
+		jh->b_cpprev->b_cpnext = jh;
+		jh->b_cpnext->b_cpprev = jh;
+	}
+	transaction->t_checkpoint_io_list = jh;
 }
 
 /*
  * Try to release a checkpointed buffer from its transaction.
- * Returns 1 if we released it.
+ * Returns 1 if we released it and 2 if we also released the
+ * whole transaction.
+ *
  * Requires j_list_lock
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
@@ -57,12 +103,11 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 
 	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
-		__journal_remove_checkpoint(jh);
+		ret = __journal_remove_checkpoint(jh) + 1;
 		jbd_unlock_bh_state(bh);
 		journal_remove_journal_head(bh);
 		BUFFER_TRACE(bh, "release");
 		__brelse(bh);
-		ret = 1;
 	} else {
 		jbd_unlock_bh_state(bh);
 	}
@@ -117,83 +162,53 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
 }
 
 /*
- * Clean up a transaction's checkpoint list.  
- *
- * We wait for any pending IO to complete and make sure any clean
- * buffers are removed from the transaction. 
- *
- * Return 1 if we performed any actions which might have destroyed the
- * checkpoint.  (journal_remove_checkpoint() deletes the transaction when
- * the last checkpoint buffer is cleansed)
+ * Clean up transaction's list of buffers submitted for io.
+ * We wait for any pending IO to complete and remove any clean
+ * buffers. Note that we take the buffers in the opposite ordering
+ * from the one in which they were submitted for IO.
  *
  * Called with j_list_lock held.
  */
-static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
+
+static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
 {
-	struct journal_head *jh, *next_jh, *last_jh;
+	struct journal_head *jh;
 	struct buffer_head *bh;
-	int ret = 0;
-
-	assert_spin_locked(&journal->j_list_lock);
-	jh = transaction->t_checkpoint_list;
-	if (!jh)
-		return 0;
-
-	last_jh = jh->b_cpprev;
-	next_jh = jh;
-	do {
-		jh = next_jh;
+	tid_t this_tid;
+	int released = 0;
+
+	this_tid = transaction->t_tid;
+restart:
+	/* Didn't somebody clean up the transaction in the meanwhile */
+	if (journal->j_checkpoint_transactions != transaction ||
+		transaction->t_tid != this_tid)
+		return;
+	while (!released && transaction->t_checkpoint_io_list) {
+		jh = transaction->t_checkpoint_io_list;
 		bh = jh2bh(jh);
+		if (!jbd_trylock_bh_state(bh)) {
+			jbd_sync_bh(journal, bh);
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
 		if (buffer_locked(bh)) {
 			atomic_inc(&bh->b_count);
 			spin_unlock(&journal->j_list_lock);
+			jbd_unlock_bh_state(bh);
 			wait_on_buffer(bh);
 			/* the journal_head may have gone by now */
 			BUFFER_TRACE(bh, "brelse");
 			__brelse(bh);
-			goto out_return_1;
-		}
-
-		/*
-		 * This is foul
-		 */
-		if (!jbd_trylock_bh_state(bh)) {
-			jbd_sync_bh(journal, bh);
-			goto out_return_1;
+			spin_lock(&journal->j_list_lock);
+			goto restart;
 		}
-
-		if (jh->b_transaction != NULL) {
-			transaction_t *t = jh->b_transaction;
-			tid_t tid = t->t_tid;
-
-			spin_unlock(&journal->j_list_lock);
-			jbd_unlock_bh_state(bh);
-			log_start_commit(journal, tid);
-			log_wait_commit(journal, tid);
-			goto out_return_1;
-		}
-
 		/*
-		 * AKPM: I think the buffer_jbddirty test is redundant - it
-		 * shouldn't have NULL b_transaction?
+		 * Now in whatever state the buffer currently is, we know that
+		 * it has been written out and so we can drop it from the list
 		 */
-		next_jh = jh->b_cpnext;
-		if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) {
-			BUFFER_TRACE(bh, "remove from checkpoint");
-			__journal_remove_checkpoint(jh);
-			jbd_unlock_bh_state(bh);
-			journal_remove_journal_head(bh);
-			__brelse(bh);
-			ret = 1;
-		} else {
-			jbd_unlock_bh_state(bh);
-		}
-	} while (jh != last_jh);
-
-	return ret;
-out_return_1:
-	spin_lock(&journal->j_list_lock);
-	return 1;
+		released = __journal_remove_checkpoint(jh);
+		jbd_unlock_bh_state(bh);
+	}
 }
 
 #define NR_BATCH	64
@@ -203,9 +218,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
 {
 	int i;
 
-	spin_unlock(&journal->j_list_lock);
 	ll_rw_block(SWRITE, *batch_count, bhs);
-	spin_lock(&journal->j_list_lock);
 	for (i = 0; i < *batch_count; i++) {
 		struct buffer_head *bh = bhs[i];
 		clear_buffer_jwrite(bh);
@@ -221,19 +234,46 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Return 1 if something happened which requires us to abort the current
  * scan of the checkpoint list.  
  *
- * Called with j_list_lock held.
+ * Called with j_list_lock held and drops it if 1 is returned
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
-static int __flush_buffer(journal_t *journal, struct journal_head *jh,
-			struct buffer_head **bhs, int *batch_count,
-			int *drop_count)
+static int __process_buffer(journal_t *journal, struct journal_head *jh,
+			struct buffer_head **bhs, int *batch_count)
 {
 	struct buffer_head *bh = jh2bh(jh);
 	int ret = 0;
 
-	if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
-		J_ASSERT_JH(jh, jh->b_transaction == NULL);
+	if (buffer_locked(bh)) {
+		get_bh(bh);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		wait_on_buffer(bh);
+		/* the journal_head may have gone by now */
+		BUFFER_TRACE(bh, "brelse");
+		put_bh(bh);
+		ret = 1;
+	}
+	else if (jh->b_transaction != NULL) {
+		transaction_t *t = jh->b_transaction;
+		tid_t tid = t->t_tid;
 
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		log_start_commit(journal, tid);
+		log_wait_commit(journal, tid);
+		ret = 1;
+	}
+	else if (!buffer_dirty(bh)) {
+		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
+		BUFFER_TRACE(bh, "remove from checkpoint");
+		__journal_remove_checkpoint(jh);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		journal_remove_journal_head(bh);
+		put_bh(bh);
+		ret = 1;
+	}
+	else {
 		/*
 		 * Important: we are about to write the buffer, and
 		 * possibly block, while still holding the journal lock.
@@ -246,45 +286,30 @@ static int __flush_buffer(journal_t *journal, struct journal_head *jh,
 		J_ASSERT_BH(bh, !buffer_jwrite(bh));
 		set_buffer_jwrite(bh);
 		bhs[*batch_count] = bh;
+		__buffer_relink_io(jh);
 		jbd_unlock_bh_state(bh);
 		(*batch_count)++;
 		if (*batch_count == NR_BATCH) {
+			spin_unlock(&journal->j_list_lock);
 			__flush_batch(journal, bhs, batch_count);
 			ret = 1;
 		}
-	} else {
-		int last_buffer = 0;
-		if (jh->b_cpnext == jh) {
-			/* We may be about to drop the transaction.  Tell the
-			 * caller that the lists have changed.
-			 */
-			last_buffer = 1;
-		}
-		if (__try_to_free_cp_buf(jh)) {
-			(*drop_count)++;
-			ret = last_buffer;
-		}
 	}
 	return ret;
 }
 
 /*
- * Perform an actual checkpoint.  We don't write out only enough to
- * satisfy the current blocked requests: rather we submit a reasonably
- * sized chunk of the outstanding data to disk at once for
- * efficiency.  __log_wait_for_space() will retry if we didn't free enough.
+ * Perform an actual checkpoint. We take the first transaction on the
+ * list of transactions to be checkpointed and send all its buffers
+ * to disk. We submit larger chunks of data at once.
  * 
- * However, we _do_ take into account the amount requested so that once
- * the IO has been queued, we can return as soon as enough of it has
- * completed to disk.  
- *
  * The journal should be locked before calling this function.
  */
 int log_do_checkpoint(journal_t *journal)
 {
+	transaction_t *transaction;
+	tid_t this_tid;
 	int result;
-	int batch_count = 0;
-	struct buffer_head *bhs[NR_BATCH];
 
 	jbd_debug(1, "Start checkpoint\n");
 
@@ -299,79 +324,70 @@ int log_do_checkpoint(journal_t *journal)
 		return result;
 
 	/*
-	 * OK, we need to start writing disk blocks.  Try to free up a
-	 * quarter of the log in a single checkpoint if we can.
+	 * OK, we need to start writing disk blocks.  Take one transaction
+	 * and write it.
 	 */
+	spin_lock(&journal->j_list_lock);
+	if (!journal->j_checkpoint_transactions)
+		goto out;
+	transaction = journal->j_checkpoint_transactions;
+	this_tid = transaction->t_tid;
+restart:
 	/*
-	 * AKPM: check this code.  I had a feeling a while back that it
-	 * degenerates into a busy loop at unmount time.
+	 * If someone cleaned up this transaction while we slept, we're
+	 * done (maybe it's a new transaction, but it fell at the same
+	 * address).
 	 */
-	spin_lock(&journal->j_list_lock);
-	while (journal->j_checkpoint_transactions) {
-		transaction_t *transaction;
-		struct journal_head *jh, *last_jh, *next_jh;
-		int drop_count = 0;
-		int cleanup_ret, retry = 0;
-		tid_t this_tid;
-
-		transaction = journal->j_checkpoint_transactions;
-		this_tid = transaction->t_tid;
-		jh = transaction->t_checkpoint_list;
-		last_jh = jh->b_cpprev;
-		next_jh = jh;
-		do {
+ 	if (journal->j_checkpoint_transactions == transaction ||
+			transaction->t_tid == this_tid) {
+		int batch_count = 0;
+		struct buffer_head *bhs[NR_BATCH];
+		struct journal_head *jh;
+		int retry = 0;
+
+		while (!retry && transaction->t_checkpoint_list) {
 			struct buffer_head *bh;
 
-			jh = next_jh;
-			next_jh = jh->b_cpnext;
+			jh = transaction->t_checkpoint_list;
 			bh = jh2bh(jh);
 			if (!jbd_trylock_bh_state(bh)) {
 				jbd_sync_bh(journal, bh);
-				spin_lock(&journal->j_list_lock);
 				retry = 1;
 				break;
 			}
-			retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count);
-			if (cond_resched_lock(&journal->j_list_lock)) {
+			retry = __process_buffer(journal, jh, bhs,
+						&batch_count);
+			if (!retry &&
+			    lock_need_resched(&journal->j_list_lock)) {
+				spin_unlock(&journal->j_list_lock);
 				retry = 1;
 				break;
 			}
-		} while (jh != last_jh && !retry);
+		}
 
 		if (batch_count) {
+			if (!retry) {
+				spin_unlock(&journal->j_list_lock);
+				retry = 1;
+			}
 			__flush_batch(journal, bhs, &batch_count);
-			retry = 1;
 		}
 
+		if (retry) {
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
 		/*
-		 * If someone cleaned up this transaction while we slept, we're
-		 * done
-		 */
-		if (journal->j_checkpoint_transactions != transaction)
-			break;
-		if (retry)
-			continue;
-		/*
-		 * Maybe it's a new transaction, but it fell at the same
-		 * address
-		 */
-		if (transaction->t_tid != this_tid)
-			continue;
-		/*
-		 * We have walked the whole transaction list without
-		 * finding anything to write to disk.  We had better be
-		 * able to make some progress or we are in trouble. 
+		 * Now we have cleaned up the first transaction's checkpoint
+		 * list.  Let's clean up the second one.
 		 */
-		cleanup_ret = __cleanup_transaction(journal, transaction);
-		J_ASSERT(drop_count != 0 || cleanup_ret != 0);
-		if (journal->j_checkpoint_transactions != transaction)
-			break;
+		__wait_cp_io(journal, transaction);
 	}
+out:
 	spin_unlock(&journal->j_list_lock);
 	result = cleanup_journal_tail(journal);
 	if (result < 0)
 		return result;
-
 	return 0;
 }
 
@@ -455,6 +471,53 @@ int cleanup_journal_tail(journal_t *journal)
 
 /* Checkpoint list management */
 
+/*
+ * journal_clean_one_cp_list
+ *
+ * Find all the written-back checkpoint buffers in the given list and release them.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ * Returns number of bufers reaped (for debug)
+ */
+
+static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
+{
+	struct journal_head *last_jh;
+	struct journal_head *next_jh = jh;
+	int ret, freed = 0;
+
+	*released = 0;
+	if (!jh)
+		return 0;
+
+ 	last_jh = jh->b_cpprev;
+	do {
+		jh = next_jh;
+		next_jh = jh->b_cpnext;
+		/* Use trylock because of the ranking */
+		if (jbd_trylock_bh_state(jh2bh(jh))) {
+			ret = __try_to_free_cp_buf(jh);
+			if (ret) {
+				freed++;
+				if (ret == 2) {
+					*released = 1;
+					return freed;
+				}
+			}
+		}
+		/*
+		 * This function only frees up some memory if possible so we
+		 * dont have an obligation to finish processing. Bail out if
+		 * preemption requested:
+		 */
+		if (need_resched())
+			return freed;
+	} while (jh != last_jh);
+
+	return freed;
+}
+
 /*
  * journal_clean_checkpoint_list
  *
@@ -462,46 +525,38 @@ int cleanup_journal_tail(journal_t *journal)
  *
  * Called with the journal locked.
  * Called with j_list_lock held.
- * Returns number of bufers reaped (for debug)
+ * Returns number of buffers reaped (for debug)
  */
 
 int __journal_clean_checkpoint_list(journal_t *journal)
 {
 	transaction_t *transaction, *last_transaction, *next_transaction;
-	int ret = 0;
+	int ret = 0, released;
 
 	transaction = journal->j_checkpoint_transactions;
-	if (transaction == 0)
+	if (!transaction)
 		goto out;
 
 	last_transaction = transaction->t_cpprev;
 	next_transaction = transaction;
 	do {
-		struct journal_head *jh;
-
 		transaction = next_transaction;
 		next_transaction = transaction->t_cpnext;
-		jh = transaction->t_checkpoint_list;
-		if (jh) {
-			struct journal_head *last_jh = jh->b_cpprev;
-			struct journal_head *next_jh = jh;
-
-			do {
-				jh = next_jh;
-				next_jh = jh->b_cpnext;
-				/* Use trylock because of the ranknig */
-				if (jbd_trylock_bh_state(jh2bh(jh)))
-					ret += __try_to_free_cp_buf(jh);
-				/*
-				 * This function only frees up some memory
-				 * if possible so we dont have an obligation
-				 * to finish processing. Bail out if preemption
-				 * requested:
-				 */
-				if (need_resched())
-					goto out;
-			} while (jh != last_jh);
-		}
+		ret += journal_clean_one_cp_list(transaction->
+				t_checkpoint_list, &released);
+		if (need_resched())
+			goto out;
+		if (released)
+			continue;
+		/*
+		 * It is essential that we are as careful as in the case of
+		 * t_checkpoint_list with removing the buffer from the list as
+		 * we can possibly see not yet submitted buffers on io_list
+		 */
+		ret += journal_clean_one_cp_list(transaction->
+				t_checkpoint_io_list, &released);
+		if (need_resched())
+			goto out;
 	} while (transaction != last_transaction);
 out:
 	return ret;
@@ -516,18 +571,22 @@ out:
  * buffer updates committed in that transaction have safely been stored
  * elsewhere on disk.  To achieve this, all of the buffers in a
  * transaction need to be maintained on the transaction's checkpoint
- * list until they have been rewritten, at which point this function is
+ * lists until they have been rewritten, at which point this function is
  * called to remove the buffer from the existing transaction's
- * checkpoint list.  
+ * checkpoint lists.
+ *
+ * The function returns 1 if it frees the transaction, 0 otherwise.
  *
  * This function is called with the journal locked.
  * This function is called with j_list_lock held.
+ * This function is called with jbd_lock_bh_state(jh2bh(jh))
  */
 
-void __journal_remove_checkpoint(struct journal_head *jh)
+int __journal_remove_checkpoint(struct journal_head *jh)
 {
 	transaction_t *transaction;
 	journal_t *journal;
+	int ret = 0;
 
 	JBUFFER_TRACE(jh, "entry");
 
@@ -538,8 +597,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
 	journal = transaction->t_journal;
 
 	__buffer_unlink(jh);
+	jh->b_cp_transaction = NULL;
 
-	if (transaction->t_checkpoint_list != NULL)
+	if (transaction->t_checkpoint_list != NULL ||
+	    transaction->t_checkpoint_io_list != NULL)
 		goto out;
 	JBUFFER_TRACE(jh, "transaction has no more buffers");
 
@@ -565,8 +626,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
 	/* Just in case anybody was waiting for more transactions to be
            checkpointed... */
 	wake_up(&journal->j_wait_logspace);
+	ret = 1;
 out:
 	JBUFFER_TRACE(jh, "exit");
+	return ret;
 }
 
 /*
@@ -628,6 +691,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
 	J_ASSERT(transaction->t_shadow_list == NULL);
 	J_ASSERT(transaction->t_log_list == NULL);
 	J_ASSERT(transaction->t_checkpoint_list == NULL);
+	J_ASSERT(transaction->t_checkpoint_io_list == NULL);
 	J_ASSERT(transaction->t_updates == 0);
 	J_ASSERT(journal->j_committing_transaction != transaction);
 	J_ASSERT(journal->j_running_transaction != transaction);
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index dcde7adfdce5..558cb4c26ec9 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -497,6 +497,12 @@ struct transaction_s
 	 */
 	struct journal_head	*t_checkpoint_list;
 
+	/*
+	 * Doubly-linked circular list of all buffers submitted for IO while
+	 * checkpointing. [j_list_lock]
+	 */
+	struct journal_head	*t_checkpoint_io_list;
+
 	/*
 	 * Doubly-linked circular list of temporary buffers currently undergoing
 	 * IO in the log [j_list_lock]
@@ -843,7 +849,7 @@ extern void journal_commit_transaction(journal_t *);
 
 /* Checkpoint list management */
 int __journal_clean_checkpoint_list(journal_t *journal);
-void __journal_remove_checkpoint(struct journal_head *);
+int __journal_remove_checkpoint(struct journal_head *);
 void __journal_insert_checkpoint(struct journal_head *, transaction_t *);
 
 /* Buffer IO */
-- 
cgit v1.2.3-71-gd317


From a334de28665b14f0a33df82699fa9a78cfeedf31 Mon Sep 17 00:00:00 2001
From: David Shaw <dshaw@jabberwocky.com>
Date: Fri, 6 Jan 2006 00:19:58 -0800
Subject: [PATCH] knfsd: check error status from vfs_getattr and i_op->fsync

Both vfs_getattr and i_op->fsync return error statuses which nfsd was
largely ignoring.  This as noticed when exporting directories using fuse.

This patch cleans up most of the offences, which involves moving the call
to vfs_getattr out of the xdr encoding routines (where it is too late to
report an error) into the main NFS procedure handling routines.

There is still a called to vfs_gettattr (related to the ACL code) where the
status is ignored, and called to nfsd_sync_dir don't check return status
either.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs3proc.c        | 11 +++++++++--
 fs/nfsd/nfs3xdr.c         | 47 ++++++++++++++++++++++++----------------------
 fs/nfsd/nfsxdr.c          | 48 +++++++++++++++++++++++------------------------
 fs/nfsd/vfs.c             | 20 +++++++++++++-------
 include/linux/nfsd/xdr.h  |  3 +++
 include/linux/nfsd/xdr3.h |  1 +
 6 files changed, 75 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 041380fe667b..6d2dfed1de08 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -56,13 +56,20 @@ static int
 nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
 					   struct nfsd3_attrstat *resp)
 {
-	int	nfserr;
+	int	err, nfserr;
 
 	dprintk("nfsd: GETATTR(3)  %s\n",
-				SVCFH_fmt(&argp->fh));
+		SVCFH_fmt(&argp->fh));
 
 	fh_copy(&resp->fh, &argp->fh);
 	nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+	if (nfserr)
+		RETURN_STATUS(nfserr);
+
+	err = vfs_getattr(resp->fh.fh_export->ex_mnt,
+			  resp->fh.fh_dentry, &resp->stat);
+	nfserr = nfserrno(err);
+
 	RETURN_STATUS(nfserr);
 }
 
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 9147b8524d05..243d94b9653a 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -154,37 +154,34 @@ decode_sattr3(u32 *p, struct iattr *iap)
 }
 
 static inline u32 *
-encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
+	      struct kstat *stat)
 {
-	struct vfsmount *mnt = fhp->fh_export->ex_mnt;
 	struct dentry	*dentry = fhp->fh_dentry;
-	struct kstat stat;
 	struct timespec time;
 
-	vfs_getattr(mnt, dentry, &stat);
-
-	*p++ = htonl(nfs3_ftypes[(stat.mode & S_IFMT) >> 12]);
-	*p++ = htonl((u32) stat.mode);
-	*p++ = htonl((u32) stat.nlink);
-	*p++ = htonl((u32) nfsd_ruid(rqstp, stat.uid));
-	*p++ = htonl((u32) nfsd_rgid(rqstp, stat.gid));
-	if (S_ISLNK(stat.mode) && stat.size > NFS3_MAXPATHLEN) {
+	*p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
+	*p++ = htonl((u32) stat->mode);
+	*p++ = htonl((u32) stat->nlink);
+	*p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid));
+	*p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid));
+	if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) {
 		p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN);
 	} else {
-		p = xdr_encode_hyper(p, (u64) stat.size);
+		p = xdr_encode_hyper(p, (u64) stat->size);
 	}
-	p = xdr_encode_hyper(p, ((u64)stat.blocks) << 9);
-	*p++ = htonl((u32) MAJOR(stat.rdev));
-	*p++ = htonl((u32) MINOR(stat.rdev));
+	p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9);
+	*p++ = htonl((u32) MAJOR(stat->rdev));
+	*p++ = htonl((u32) MINOR(stat->rdev));
 	if (is_fsid(fhp, rqstp->rq_reffh))
 		p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid);
 	else
-		p = xdr_encode_hyper(p, (u64) huge_encode_dev(stat.dev));
-	p = xdr_encode_hyper(p, (u64) stat.ino);
-	p = encode_time3(p, &stat.atime);
+		p = xdr_encode_hyper(p, (u64) huge_encode_dev(stat->dev));
+	p = xdr_encode_hyper(p, (u64) stat->ino);
+	p = encode_time3(p, &stat->atime);
 	lease_get_mtime(dentry->d_inode, &time); 
 	p = encode_time3(p, &time);
-	p = encode_time3(p, &stat.ctime);
+	p = encode_time3(p, &stat->ctime);
 
 	return p;
 }
@@ -232,8 +229,14 @@ encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 {
 	struct dentry *dentry = fhp->fh_dentry;
 	if (dentry && dentry->d_inode != NULL) {
-		*p++ = xdr_one;		/* attributes follow */
-		return encode_fattr3(rqstp, p, fhp);
+	        int err;
+		struct kstat stat;
+
+		err = vfs_getattr(fhp->fh_export->ex_mnt, dentry, &stat);
+		if (!err) {
+			*p++ = xdr_one;		/* attributes follow */
+			return encode_fattr3(rqstp, p, fhp, &stat);
+		}
 	}
 	*p++ = xdr_zero;
 	return p;
@@ -616,7 +619,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
 					struct nfsd3_attrstat *resp)
 {
 	if (resp->status == 0)
-		p = encode_fattr3(rqstp, p, &resp->fh);
+		p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat);
 	return xdr_ressize_check(rqstp, p);
 }
 
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index b45999ff33e6..aa7bb41b293d 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -152,46 +152,44 @@ decode_sattr(u32 *p, struct iattr *iap)
 }
 
 static inline u32 *
-encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
+	     struct kstat *stat)
 {
-	struct vfsmount *mnt = fhp->fh_export->ex_mnt;
 	struct dentry	*dentry = fhp->fh_dentry;
-	struct kstat stat;
 	int type;
 	struct timespec time;
 
-	vfs_getattr(mnt, dentry, &stat);
-	type = (stat.mode & S_IFMT);
+	type = (stat->mode & S_IFMT);
 
 	*p++ = htonl(nfs_ftypes[type >> 12]);
-	*p++ = htonl((u32) stat.mode);
-	*p++ = htonl((u32) stat.nlink);
-	*p++ = htonl((u32) nfsd_ruid(rqstp, stat.uid));
-	*p++ = htonl((u32) nfsd_rgid(rqstp, stat.gid));
+	*p++ = htonl((u32) stat->mode);
+	*p++ = htonl((u32) stat->nlink);
+	*p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid));
+	*p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid));
 
-	if (S_ISLNK(type) && stat.size > NFS_MAXPATHLEN) {
+	if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) {
 		*p++ = htonl(NFS_MAXPATHLEN);
 	} else {
-		*p++ = htonl((u32) stat.size);
+		*p++ = htonl((u32) stat->size);
 	}
-	*p++ = htonl((u32) stat.blksize);
+	*p++ = htonl((u32) stat->blksize);
 	if (S_ISCHR(type) || S_ISBLK(type))
-		*p++ = htonl(new_encode_dev(stat.rdev));
+		*p++ = htonl(new_encode_dev(stat->rdev));
 	else
 		*p++ = htonl(0xffffffff);
-	*p++ = htonl((u32) stat.blocks);
+	*p++ = htonl((u32) stat->blocks);
 	if (is_fsid(fhp, rqstp->rq_reffh))
 		*p++ = htonl((u32) fhp->fh_export->ex_fsid);
 	else
-		*p++ = htonl(new_encode_dev(stat.dev));
-	*p++ = htonl((u32) stat.ino);
-	*p++ = htonl((u32) stat.atime.tv_sec);
-	*p++ = htonl(stat.atime.tv_nsec ? stat.atime.tv_nsec / 1000 : 0);
+		*p++ = htonl(new_encode_dev(stat->dev));
+	*p++ = htonl((u32) stat->ino);
+	*p++ = htonl((u32) stat->atime.tv_sec);
+	*p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0);
 	lease_get_mtime(dentry->d_inode, &time); 
 	*p++ = htonl((u32) time.tv_sec);
 	*p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0); 
-	*p++ = htonl((u32) stat.ctime.tv_sec);
-	*p++ = htonl(stat.ctime.tv_nsec ? stat.ctime.tv_nsec / 1000 : 0);
+	*p++ = htonl((u32) stat->ctime.tv_sec);
+	*p++ = htonl(stat->ctime.tv_nsec ? stat->ctime.tv_nsec / 1000 : 0);
 
 	return p;
 }
@@ -199,7 +197,9 @@ encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 /* Helper function for NFSv2 ACL code */
 u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 {
-	return encode_fattr(rqstp, p, fhp);
+	struct kstat stat;
+	vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, &stat);
+	return encode_fattr(rqstp, p, fhp, &stat);
 }
 
 /*
@@ -394,7 +394,7 @@ int
 nfssvc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
 					struct nfsd_attrstat *resp)
 {
-	p = encode_fattr(rqstp, p, &resp->fh);
+	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
 	return xdr_ressize_check(rqstp, p);
 }
 
@@ -403,7 +403,7 @@ nfssvc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
 					struct nfsd_diropres *resp)
 {
 	p = encode_fh(p, &resp->fh);
-	p = encode_fattr(rqstp, p, &resp->fh);
+	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
 	return xdr_ressize_check(rqstp, p);
 }
 
@@ -428,7 +428,7 @@ int
 nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
 					struct nfsd_readres *resp)
 {
-	p = encode_fattr(rqstp, p, &resp->fh);
+	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
 	*p++ = htonl(resp->count);
 	xdr_ressize_check(rqstp, p);
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index af7c3c3074b0..f83ab4cf4265 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -717,27 +717,33 @@ nfsd_close(struct file *filp)
  * As this calls fsync (not fdatasync) there is no need for a write_inode
  * after it.
  */
-static inline void nfsd_dosync(struct file *filp, struct dentry *dp,
-			       struct file_operations *fop)
+static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
+			      struct file_operations *fop)
 {
 	struct inode *inode = dp->d_inode;
 	int (*fsync) (struct file *, struct dentry *, int);
+	int err = nfs_ok;
 
 	filemap_fdatawrite(inode->i_mapping);
 	if (fop && (fsync = fop->fsync))
-		fsync(filp, dp, 0);
+		err=fsync(filp, dp, 0);
 	filemap_fdatawait(inode->i_mapping);
+
+	return nfserrno(err);
 }
 	
 
-static void
+static int
 nfsd_sync(struct file *filp)
 {
+        int err;
 	struct inode *inode = filp->f_dentry->d_inode;
 	dprintk("nfsd: sync file %s\n", filp->f_dentry->d_name.name);
 	down(&inode->i_sem);
-	nfsd_dosync(filp, filp->f_dentry, filp->f_op);
+	err=nfsd_dosync(filp, filp->f_dentry, filp->f_op);
 	up(&inode->i_sem);
+
+	return err;
 }
 
 void
@@ -962,7 +968,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 
 			if (inode->i_state & I_DIRTY) {
 				dprintk("nfsd: write sync %d\n", current->pid);
-				nfsd_sync(file);
+				err=nfsd_sync(file);
 			}
 #if 0
 			wake_up(&inode->i_wait);
@@ -1066,7 +1072,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		return err;
 	if (EX_ISSYNC(fhp->fh_export)) {
 		if (file->f_op && file->f_op->fsync) {
-			nfsd_sync(file);
+			err = nfsd_sync(file);
 		} else {
 			err = nfserr_notsupp;
 		}
diff --git a/include/linux/nfsd/xdr.h b/include/linux/nfsd/xdr.h
index 130d4f588a37..3f4f7142bbe3 100644
--- a/include/linux/nfsd/xdr.h
+++ b/include/linux/nfsd/xdr.h
@@ -88,10 +88,12 @@ struct nfsd_readdirargs {
 
 struct nfsd_attrstat {
 	struct svc_fh		fh;
+	struct kstat		stat;
 };
 
 struct nfsd_diropres  {
 	struct svc_fh		fh;
+	struct kstat		stat;
 };
 
 struct nfsd_readlinkres {
@@ -101,6 +103,7 @@ struct nfsd_readlinkres {
 struct nfsd_readres {
 	struct svc_fh		fh;
 	unsigned long		count;
+	struct kstat		stat;
 };
 
 struct nfsd_readdirres {
diff --git a/include/linux/nfsd/xdr3.h b/include/linux/nfsd/xdr3.h
index 3c2a71b43bac..a4322741f8b9 100644
--- a/include/linux/nfsd/xdr3.h
+++ b/include/linux/nfsd/xdr3.h
@@ -126,6 +126,7 @@ struct nfsd3_setaclargs {
 struct nfsd3_attrstat {
 	__u32			status;
 	struct svc_fh		fh;
+	struct kstat            stat;
 };
 
 /* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */
-- 
cgit v1.2.3-71-gd317


From 6da487dcc0c6f4c827779687a20016efeffc4d60 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Fri, 6 Jan 2006 00:20:07 -0800
Subject: [PATCH] device-mapper ioctl: add skip lock_fs flag

Add ioctl DM_SKIP_LOCKFS_FLAG for userspace to request that lock_fs is
bypassed when suspending a device.

There's no change to the behaviour of existing code that doesn't know about
the new flag.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm-ioctl.c    | 11 +++++++++--
 include/linux/dm-ioctl.h | 11 ++++++++---
 2 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index dbc07afd4462..561bda5011e0 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -693,14 +693,18 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
 static int do_suspend(struct dm_ioctl *param)
 {
 	int r = 0;
+	int do_lockfs = 1;
 	struct mapped_device *md;
 
 	md = find_device(param);
 	if (!md)
 		return -ENXIO;
 
+	if (param->flags & DM_SKIP_LOCKFS_FLAG)
+		do_lockfs = 0;
+
 	if (!dm_suspended(md))
-		r = dm_suspend(md, 1);
+		r = dm_suspend(md, do_lockfs);
 
 	if (!r)
 		r = __dev_status(md, param);
@@ -712,6 +716,7 @@ static int do_suspend(struct dm_ioctl *param)
 static int do_resume(struct dm_ioctl *param)
 {
 	int r = 0;
+	int do_lockfs = 1;
 	struct hash_cell *hc;
 	struct mapped_device *md;
 	struct dm_table *new_map;
@@ -737,8 +742,10 @@ static int do_resume(struct dm_ioctl *param)
 	/* Do we need to load a new map ? */
 	if (new_map) {
 		/* Suspend if it isn't already suspended */
+		if (param->flags & DM_SKIP_LOCKFS_FLAG)
+			do_lockfs = 0;
 		if (!dm_suspended(md))
-			dm_suspend(md, 1);
+			dm_suspend(md, do_lockfs);
 
 		r = dm_swap_table(md, new_map);
 		if (r) {
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index f5eb6b6cd109..fa75ba0d635e 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -272,9 +272,9 @@ typedef char ioctl_struct[308];
 #define DM_TARGET_MSG	 _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	4
+#define DM_VERSION_MINOR	5
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2005-01-12)"
+#define DM_VERSION_EXTRA	"-ioctl (2005-10-04)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
@@ -301,8 +301,13 @@ typedef char ioctl_struct[308];
 #define DM_BUFFER_FULL_FLAG	(1 << 8) /* Out */
 
 /*
- * Set this to improve performance when you aren't going to use open_count
+ * Set this to improve performance when you aren't going to use open_count.
  */
 #define DM_SKIP_BDGET_FLAG	(1 << 9) /* In */
 
+/*
+ * Set this to avoid attempting to freeze any filesystem when suspending.
+ */
+#define DM_SKIP_LOCKFS_FLAG	(1 << 10) /* In */
+
 #endif				/* _LINUX_DM_IOCTL_H */
-- 
cgit v1.2.3-71-gd317


From 17999be4aa408e7ff3b9d32c735649676567a3cd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:12 -0800
Subject: [PATCH] md: improve raid1 "IO Barrier" concept

raid1 needs to put up a barrier to new requests while it does resync or other
background recovery.  The code for this is currently open-coded, slighty
obscure by its use of two waitqueues, and not documented.

This patch gathers all the related code into 4 functions, and includes a
comment which (hopefully) explains what is happening.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid1.c         | 167 ++++++++++++++++++++++++---------------------
 include/linux/raid/raid1.h |   4 +-
 2 files changed, 91 insertions(+), 80 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 229d7b204297..f5204149ab65 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -51,6 +51,8 @@ static mdk_personality_t raid1_personality;
 
 static void unplug_slaves(mddev_t *mddev);
 
+static void allow_barrier(conf_t *conf);
+static void lower_barrier(conf_t *conf);
 
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
@@ -160,20 +162,13 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
 
 static inline void free_r1bio(r1bio_t *r1_bio)
 {
-	unsigned long flags;
-
 	conf_t *conf = mddev_to_conf(r1_bio->mddev);
 
 	/*
 	 * Wake up any possible resync thread that waits for the device
 	 * to go idle.
 	 */
-	spin_lock_irqsave(&conf->resync_lock, flags);
-	if (!--conf->nr_pending) {
-		wake_up(&conf->wait_idle);
-		wake_up(&conf->wait_resume);
-	}
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	allow_barrier(conf);
 
 	put_all_bios(conf, r1_bio);
 	mempool_free(r1_bio, conf->r1bio_pool);
@@ -182,22 +177,10 @@ static inline void free_r1bio(r1bio_t *r1_bio)
 static inline void put_buf(r1bio_t *r1_bio)
 {
 	conf_t *conf = mddev_to_conf(r1_bio->mddev);
-	unsigned long flags;
 
 	mempool_free(r1_bio, conf->r1buf_pool);
 
-	spin_lock_irqsave(&conf->resync_lock, flags);
-	if (!conf->barrier)
-		BUG();
-	--conf->barrier;
-	wake_up(&conf->wait_resume);
-	wake_up(&conf->wait_idle);
-
-	if (!--conf->nr_pending) {
-		wake_up(&conf->wait_idle);
-		wake_up(&conf->wait_resume);
-	}
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	lower_barrier(conf);
 }
 
 static void reschedule_retry(r1bio_t *r1_bio)
@@ -210,6 +193,7 @@ static void reschedule_retry(r1bio_t *r1_bio)
 	list_add(&r1_bio->retry_list, &conf->retry_list);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
+	wake_up(&conf->wait_barrier);
 	md_wakeup_thread(mddev->thread);
 }
 
@@ -593,30 +577,83 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
 	return ret;
 }
 
-/*
- * Throttle resync depth, so that we can both get proper overlapping of
- * requests, but are still able to handle normal requests quickly.
+/* Barriers....
+ * Sometimes we need to suspend IO while we do something else,
+ * either some resync/recovery, or reconfigure the array.
+ * To do this we raise a 'barrier'.
+ * The 'barrier' is a counter that can be raised multiple times
+ * to count how many activities are happening which preclude
+ * normal IO.
+ * We can only raise the barrier if there is no pending IO.
+ * i.e. if nr_pending == 0.
+ * We choose only to raise the barrier if no-one is waiting for the
+ * barrier to go down.  This means that as soon as an IO request
+ * is ready, no other operations which require a barrier will start
+ * until the IO request has had a chance.
+ *
+ * So: regular IO calls 'wait_barrier'.  When that returns there
+ *    is no backgroup IO happening,  It must arrange to call
+ *    allow_barrier when it has finished its IO.
+ * backgroup IO calls must call raise_barrier.  Once that returns
+ *    there is no normal IO happeing.  It must arrange to call
+ *    lower_barrier when the particular background IO completes.
  */
 #define RESYNC_DEPTH 32
 
-static void device_barrier(conf_t *conf, sector_t sect)
+static void raise_barrier(conf_t *conf)
 {
 	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
-			    conf->resync_lock, raid1_unplug(conf->mddev->queue));
-	
-	if (!conf->barrier++) {
-		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
-				    conf->resync_lock, raid1_unplug(conf->mddev->queue));
-		if (conf->nr_pending)
-			BUG();
+
+	/* Wait until no block IO is waiting */
+	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+			    conf->resync_lock,
+			    raid1_unplug(conf->mddev->queue));
+
+	/* block any new IO from starting */
+	conf->barrier++;
+
+	/* No wait for all pending IO to complete */
+	wait_event_lock_irq(conf->wait_barrier,
+			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+			    conf->resync_lock,
+			    raid1_unplug(conf->mddev->queue));
+
+	spin_unlock_irq(&conf->resync_lock);
+}
+
+static void lower_barrier(conf_t *conf)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&conf->resync_lock, flags);
+	conf->barrier--;
+	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	wake_up(&conf->wait_barrier);
+}
+
+static void wait_barrier(conf_t *conf)
+{
+	spin_lock_irq(&conf->resync_lock);
+	if (conf->barrier) {
+		conf->nr_waiting++;
+		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
+				    conf->resync_lock,
+				    raid1_unplug(conf->mddev->queue));
+		conf->nr_waiting--;
 	}
-	wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock, raid1_unplug(conf->mddev->queue));
-	conf->next_resync = sect;
+	conf->nr_pending++;
 	spin_unlock_irq(&conf->resync_lock);
 }
 
+static void allow_barrier(conf_t *conf)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&conf->resync_lock, flags);
+	conf->nr_pending--;
+	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	wake_up(&conf->wait_barrier);
+}
+
+
 /* duplicate the data pages for behind I/O */
 static struct page **alloc_behind_pages(struct bio *bio)
 {
@@ -678,10 +715,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	 */
 	md_write_start(mddev, bio); /* wait on superblock update early */
 
-	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
-	conf->nr_pending++;
-	spin_unlock_irq(&conf->resync_lock);
+	wait_barrier(conf);
 
 	disk_stat_inc(mddev->gendisk, ios[rw]);
 	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
@@ -909,13 +943,8 @@ static void print_conf(conf_t *conf)
 
 static void close_sync(conf_t *conf)
 {
-	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_resume, !conf->barrier,
-			    conf->resync_lock, 	raid1_unplug(conf->mddev->queue));
-	spin_unlock_irq(&conf->resync_lock);
-
-	if (conf->barrier) BUG();
-	if (waitqueue_active(&conf->wait_idle)) BUG();
+	wait_barrier(conf);
+	allow_barrier(conf);
 
 	mempool_destroy(conf->r1buf_pool);
 	conf->r1buf_pool = NULL;
@@ -1317,12 +1346,16 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		return sync_blocks;
 	}
 	/*
-	 * If there is non-resync activity waiting for us then
-	 * put in a delay to throttle resync.
+	 * If there is non-resync activity waiting for a turn,
+	 * and resync is going fast enough,
+	 * then let it though before starting on this new sync request.
 	 */
-	if (!go_faster && waitqueue_active(&conf->wait_resume))
+	if (!go_faster && conf->nr_waiting)
 		msleep_interruptible(1000);
-	device_barrier(conf, sector_nr + RESYNC_SECTORS);
+
+	raise_barrier(conf);
+
+	conf->next_resync = sector_nr;
 
 	/*
 	 * If reconstructing, and >1 working disc,
@@ -1355,10 +1388,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 
 	r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
 
-	spin_lock_irq(&conf->resync_lock);
-	conf->nr_pending++;
-	spin_unlock_irq(&conf->resync_lock);
-
 	r1_bio->mddev = mddev;
 	r1_bio->sector = sector_nr;
 	r1_bio->state = 0;
@@ -1542,8 +1571,7 @@ static int run(mddev_t *mddev)
 		mddev->recovery_cp = MaxSector;
 
 	spin_lock_init(&conf->resync_lock);
-	init_waitqueue_head(&conf->wait_idle);
-	init_waitqueue_head(&conf->wait_resume);
+	init_waitqueue_head(&conf->wait_barrier);
 
 	bio_list_init(&conf->pending_bio_list);
 	bio_list_init(&conf->flushing_bio_list);
@@ -1714,11 +1742,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
 	}
 	memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks);
 
-	spin_lock_irq(&conf->resync_lock);
-	conf->barrier++;
-	wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
-			    conf->resync_lock, raid1_unplug(mddev->queue));
-	spin_unlock_irq(&conf->resync_lock);
+	raise_barrier(conf);
 
 	/* ok, everything is stopped */
 	oldpool = conf->r1bio_pool;
@@ -1738,12 +1762,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
 	conf->raid_disks = mddev->raid_disks = raid_disks;
 
 	conf->last_used = 0; /* just make sure it is in-range */
-	spin_lock_irq(&conf->resync_lock);
-	conf->barrier--;
-	spin_unlock_irq(&conf->resync_lock);
-	wake_up(&conf->wait_resume);
-	wake_up(&conf->wait_idle);
-
+	lower_barrier(conf);
 
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
@@ -1758,18 +1777,10 @@ static void raid1_quiesce(mddev_t *mddev, int state)
 
 	switch(state) {
 	case 1:
-		spin_lock_irq(&conf->resync_lock);
-		conf->barrier++;
-		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
-				    conf->resync_lock, raid1_unplug(mddev->queue));
-		spin_unlock_irq(&conf->resync_lock);
+		raise_barrier(conf);
 		break;
 	case 0:
-		spin_lock_irq(&conf->resync_lock);
-		conf->barrier--;
-		spin_unlock_irq(&conf->resync_lock);
-		wake_up(&conf->wait_resume);
-		wake_up(&conf->wait_idle);
+		lower_barrier(conf);
 		break;
 	}
 	if (mddev->thread) {
diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h
index 292b98f2b408..c55674252533 100644
--- a/include/linux/raid/raid1.h
+++ b/include/linux/raid/raid1.h
@@ -45,6 +45,7 @@ struct r1_private_data_s {
 
 	spinlock_t		resync_lock;
 	int			nr_pending;
+	int			nr_waiting;
 	int			barrier;
 	sector_t		next_resync;
 	int			fullsync;  /* set to 1 if a full sync is needed,
@@ -52,8 +53,7 @@ struct r1_private_data_s {
 					    * Cleared when a sync completes.
 					    */
 
-	wait_queue_head_t	wait_idle;
-	wait_queue_head_t	wait_resume;
+	wait_queue_head_t	wait_barrier;
 
 	struct pool_info	*poolinfo;
 
-- 
cgit v1.2.3-71-gd317


From 0a27ec96b6fb1abf867e36d7b0b681d67588767a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:13 -0800
Subject: [PATCH] md: improve raid10 "IO Barrier" concept

raid10 needs to put up a barrier to new requests while it does resync or other
background recovery.  The code for this is currently open-coded, slighty
obscure by its use of two waitqueues, and not documented.

This patch gathers all the related code into 4 functions, and includes a
comment which (hopefully) explains what is happening.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid10.c         | 135 ++++++++++++++++++++++++++------------------
 include/linux/raid/raid10.h |   4 +-
 2 files changed, 81 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 713dc9c2c730..50bd7b152f28 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -47,6 +47,9 @@
 
 static void unplug_slaves(mddev_t *mddev);
 
+static void allow_barrier(conf_t *conf);
+static void lower_barrier(conf_t *conf);
+
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	conf_t *conf = data;
@@ -175,20 +178,13 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
 
 static inline void free_r10bio(r10bio_t *r10_bio)
 {
-	unsigned long flags;
-
 	conf_t *conf = mddev_to_conf(r10_bio->mddev);
 
 	/*
 	 * Wake up any possible resync thread that waits for the device
 	 * to go idle.
 	 */
-	spin_lock_irqsave(&conf->resync_lock, flags);
-	if (!--conf->nr_pending) {
-		wake_up(&conf->wait_idle);
-		wake_up(&conf->wait_resume);
-	}
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	allow_barrier(conf);
 
 	put_all_bios(conf, r10_bio);
 	mempool_free(r10_bio, conf->r10bio_pool);
@@ -197,22 +193,10 @@ static inline void free_r10bio(r10bio_t *r10_bio)
 static inline void put_buf(r10bio_t *r10_bio)
 {
 	conf_t *conf = mddev_to_conf(r10_bio->mddev);
-	unsigned long flags;
 
 	mempool_free(r10_bio, conf->r10buf_pool);
 
-	spin_lock_irqsave(&conf->resync_lock, flags);
-	if (!conf->barrier)
-		BUG();
-	--conf->barrier;
-	wake_up(&conf->wait_resume);
-	wake_up(&conf->wait_idle);
-
-	if (!--conf->nr_pending) {
-		wake_up(&conf->wait_idle);
-		wake_up(&conf->wait_resume);
-	}
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	lower_barrier(conf);
 }
 
 static void reschedule_retry(r10bio_t *r10_bio)
@@ -640,30 +624,82 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
 	return ret;
 }
 
-/*
- * Throttle resync depth, so that we can both get proper overlapping of
- * requests, but are still able to handle normal requests quickly.
+/* Barriers....
+ * Sometimes we need to suspend IO while we do something else,
+ * either some resync/recovery, or reconfigure the array.
+ * To do this we raise a 'barrier'.
+ * The 'barrier' is a counter that can be raised multiple times
+ * to count how many activities are happening which preclude
+ * normal IO.
+ * We can only raise the barrier if there is no pending IO.
+ * i.e. if nr_pending == 0.
+ * We choose only to raise the barrier if no-one is waiting for the
+ * barrier to go down.  This means that as soon as an IO request
+ * is ready, no other operations which require a barrier will start
+ * until the IO request has had a chance.
+ *
+ * So: regular IO calls 'wait_barrier'.  When that returns there
+ *    is no backgroup IO happening,  It must arrange to call
+ *    allow_barrier when it has finished its IO.
+ * backgroup IO calls must call raise_barrier.  Once that returns
+ *    there is no normal IO happeing.  It must arrange to call
+ *    lower_barrier when the particular background IO completes.
  */
 #define RESYNC_DEPTH 32
 
-static void device_barrier(conf_t *conf, sector_t sect)
+static void raise_barrier(conf_t *conf)
 {
 	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
-			    conf->resync_lock, unplug_slaves(conf->mddev));
-
-	if (!conf->barrier++) {
-		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
-				    conf->resync_lock, unplug_slaves(conf->mddev));
-		if (conf->nr_pending)
-			BUG();
+
+	/* Wait until no block IO is waiting */
+	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+			    conf->resync_lock,
+			    raid10_unplug(conf->mddev->queue));
+
+	/* block any new IO from starting */
+	conf->barrier++;
+
+	/* No wait for all pending IO to complete */
+	wait_event_lock_irq(conf->wait_barrier,
+			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+			    conf->resync_lock,
+			    raid10_unplug(conf->mddev->queue));
+
+	spin_unlock_irq(&conf->resync_lock);
+}
+
+static void lower_barrier(conf_t *conf)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&conf->resync_lock, flags);
+	conf->barrier--;
+	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	wake_up(&conf->wait_barrier);
+}
+
+static void wait_barrier(conf_t *conf)
+{
+	spin_lock_irq(&conf->resync_lock);
+	if (conf->barrier) {
+		conf->nr_waiting++;
+		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
+				    conf->resync_lock,
+				    raid10_unplug(conf->mddev->queue));
+		conf->nr_waiting--;
 	}
-	wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock, unplug_slaves(conf->mddev));
-	conf->next_resync = sect;
+	conf->nr_pending++;
 	spin_unlock_irq(&conf->resync_lock);
 }
 
+static void allow_barrier(conf_t *conf)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&conf->resync_lock, flags);
+	conf->nr_pending--;
+	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	wake_up(&conf->wait_barrier);
+}
+
 static int make_request(request_queue_t *q, struct bio * bio)
 {
 	mddev_t *mddev = q->queuedata;
@@ -719,10 +755,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
-	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
-	conf->nr_pending++;
-	spin_unlock_irq(&conf->resync_lock);
+	wait_barrier(conf);
 
 	disk_stat_inc(mddev->gendisk, ios[rw]);
 	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
@@ -897,13 +930,8 @@ static void print_conf(conf_t *conf)
 
 static void close_sync(conf_t *conf)
 {
-	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_resume, !conf->barrier,
-			    conf->resync_lock, 	unplug_slaves(conf->mddev));
-	spin_unlock_irq(&conf->resync_lock);
-
-	if (conf->barrier) BUG();
-	if (waitqueue_active(&conf->wait_idle)) BUG();
+	wait_barrier(conf);
+	allow_barrier(conf);
 
 	mempool_destroy(conf->r10buf_pool);
 	conf->r10buf_pool = NULL;
@@ -1395,9 +1423,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	 * If there is non-resync activity waiting for us then
 	 * put in a delay to throttle resync.
 	 */
-	if (!go_faster && waitqueue_active(&conf->wait_resume))
+	if (!go_faster && conf->nr_waiting)
 		msleep_interruptible(1000);
-	device_barrier(conf, sector_nr + RESYNC_SECTORS);
+	raise_barrier(conf);
+	conf->next_resync = sector_nr;
 
 	/* Again, very different code for resync and recovery.
 	 * Both must result in an r10bio with a list of bios that
@@ -1427,7 +1456,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 
 				r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
 				spin_lock_irq(&conf->resync_lock);
-				conf->nr_pending++;
 				if (rb2) conf->barrier++;
 				spin_unlock_irq(&conf->resync_lock);
 				atomic_set(&r10_bio->remaining, 0);
@@ -1500,10 +1528,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		int count = 0;
 		r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
 
-		spin_lock_irq(&conf->resync_lock);
-		conf->nr_pending++;
-		spin_unlock_irq(&conf->resync_lock);
-
 		r10_bio->mddev = mddev;
 		atomic_set(&r10_bio->remaining, 0);
 
@@ -1713,8 +1737,7 @@ static int run(mddev_t *mddev)
 	INIT_LIST_HEAD(&conf->retry_list);
 
 	spin_lock_init(&conf->resync_lock);
-	init_waitqueue_head(&conf->wait_idle);
-	init_waitqueue_head(&conf->wait_resume);
+	init_waitqueue_head(&conf->wait_barrier);
 
 	/* need to check that every block has at least one working mirror */
 	if (!enough(conf)) {
diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h
index 60708789c8f9..08317b77802b 100644
--- a/include/linux/raid/raid10.h
+++ b/include/linux/raid/raid10.h
@@ -39,11 +39,11 @@ struct r10_private_data_s {
 
 	spinlock_t		resync_lock;
 	int nr_pending;
+	int nr_waiting;
 	int barrier;
 	sector_t		next_resync;
 
-	wait_queue_head_t	wait_idle;
-	wait_queue_head_t	wait_resume;
+	wait_queue_head_t	wait_barrier;
 
 	mempool_t *r10bio_pool;
 	mempool_t *r10buf_pool;
-- 
cgit v1.2.3-71-gd317


From 6ff8d8ec06690f4011a6c3ad9e0759b9094f0601 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:15 -0800
Subject: [PATCH] md: allow dirty raid[456] arrays to be started at boot

See patch to md.txt for more details

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt      | 24 ++++++++++++++++++++++++
 drivers/md/md.c           |  4 ++++
 drivers/md/raid5.c        | 15 +++++++++++----
 drivers/md/raid6main.c    | 13 +++++++++----
 include/linux/raid/md_k.h |  1 +
 5 files changed, 49 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 23e6cce40f9c..1dd0fb6021cf 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -51,6 +51,30 @@ superblock can be autodetected and run at boot time.
 The kernel parameter "raid=partitionable" (or "raid=part") means
 that all auto-detected arrays are assembled as partitionable.
 
+Boot time assembly of degraded/dirty arrays
+-------------------------------------------
+
+If a raid5 or raid6 array is both dirty and degraded, it could have
+undetectable data corruption.  This is because the fact that it is
+'dirty' means that the parity cannot be trusted, and the fact that it
+is degraded means that some datablocks are missing and cannot reliably
+be reconstructed (due to no parity).
+
+For this reason, md will normally refuse to start such an array.  This
+requires the sysadmin to take action to explicitly start the array
+desipite possible corruption.  This is normally done with
+   mdadm --assemble --force ....
+
+This option is not really available if the array has the root
+filesystem on it.  In order to support this booting from such an
+array, md supports a module parameter "start_dirty_degraded" which,
+when set to 1, bypassed the checks and will allows dirty degraded
+arrays to be started.
+
+So, to boot with a root filesystem of a dirty degraded raid[56], use
+
+   md-mod.start_dirty_degraded=1
+
 
 Superblock formats
 ------------------
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8175a2a222da..b4fb7247b3ed 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1937,6 +1937,7 @@ static void md_safemode_timeout(unsigned long data)
 	md_wakeup_thread(mddev->thread);
 }
 
+static int start_dirty_degraded;
 
 static int do_md_run(mddev_t * mddev)
 {
@@ -2048,6 +2049,7 @@ static int do_md_run(mddev_t * mddev)
 	mddev->recovery = 0;
 	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
 	mddev->barriers_work = 1;
+	mddev->ok_start_degraded = start_dirty_degraded;
 
 	if (start_readonly)
 		mddev->ro = 2; /* read-only, but switch on first write */
@@ -4509,6 +4511,8 @@ static int set_ro(const char *val, struct kernel_param *kp)
 }
 
 module_param_call(start_ro, set_ro, get_ro, NULL, 0600);
+module_param(start_dirty_degraded, int, 0644);
+
 
 EXPORT_SYMBOL(register_md_personality);
 EXPORT_SYMBOL(unregister_md_personality);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 334ff7a07283..53a0f2ce76c8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1904,10 +1904,17 @@ static int run(mddev_t *mddev)
 
 	if (mddev->degraded == 1 &&
 	    mddev->recovery_cp != MaxSector) {
-		printk(KERN_ERR 
-			"raid5: cannot start dirty degraded array for %s\n",
-			mdname(mddev));
-		goto abort;
+		if (mddev->ok_start_degraded)
+			printk(KERN_WARNING
+			       "raid5: starting dirty degraded array: %s"
+			       "- data corruption possible.\n",
+			       mdname(mddev));
+		else {
+			printk(KERN_ERR
+			       "raid5: cannot start dirty degraded array for %s\n",
+			       mdname(mddev));
+			goto abort;
+		}
 	}
 
 	{
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 0000d162d198..9ac6dcd55127 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1929,13 +1929,18 @@ static int run(mddev_t *mddev)
 		goto abort;
 	}
 
-#if 0				/* FIX: For now */
 	if (mddev->degraded > 0 &&
 	    mddev->recovery_cp != MaxSector) {
-		printk(KERN_ERR "raid6: cannot start dirty degraded array for %s\n", mdname(mddev));
-		goto abort;
+		if (mddev->ok_start_degraded)
+			printk(KERN_WARNING "raid6: starting dirty degraded array:%s"
+			       "- data corruption possible.\n",
+			       mdname(mddev));
+		else {
+			printk(KERN_ERR "raid6: cannot start dirty degraded array"
+			       " for %s\n", mdname(mddev));
+			goto abort;
+		}
 	}
-#endif
 
 	{
 		mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6");
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 46629a275ba9..1dd587b5975a 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -183,6 +183,7 @@ struct mddev_s
 	sector_t			resync_mismatches; /* count of sectors where
 							    * parity/replica mismatch found
 							    */
+	int				ok_start_degraded;
 	/* recovery/resync flags 
 	 * NEEDED:   we might need to start a resync/recover
 	 * RUNNING:  a thread is running, or about to be started
-- 
cgit v1.2.3-71-gd317


From 6cce3b23f6f8e974c00af7a9b88f1d413ba368a8 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:16 -0800
Subject: [PATCH] md: write intent bitmap support for raid10

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c             |  10 ++-
 drivers/md/raid10.c         | 178 ++++++++++++++++++++++++++++++++++++++------
 include/linux/raid/raid10.h |   9 ++-
 3 files changed, 171 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index ee199d462520..64e7da3701a5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -714,9 +714,10 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 
 		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
 		    mddev->bitmap_file == NULL) {
-			if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) {
+			if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
+			    && mddev->level != 10) {
 				/* FIXME use a better test */
-				printk(KERN_WARNING "md: bitmaps only support for raid1\n");
+				printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
 				return -EINVAL;
 			}
 			mddev->bitmap_offset = mddev->default_bitmap_offset;
@@ -1037,8 +1038,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 
 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
 		    mddev->bitmap_file == NULL ) {
-			if (mddev->level != 1) {
-				printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
+			if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
+			    && mddev->level != 10) {
+				printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
 				return -EINVAL;
 			}
 			mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 50bd7b152f28..8f58a447d9f0 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -18,7 +18,9 @@
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#include "dm-bio-list.h"
 #include <linux/raid/raid10.h>
+#include <linux/raid/bitmap.h>
 
 /*
  * RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -306,9 +308,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
-	if (!uptodate)
+	if (!uptodate) {
 		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
-	else
+		/* an I/O failed, we can't clear the bitmap */
+		set_bit(R10BIO_Degraded, &r10_bio->state);
+	} else
 		/*
 		 * Set R10BIO_Uptodate in our master bio, so that
 		 * we will return a good error code for to the higher
@@ -328,6 +332,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
 	 * already.
 	 */
 	if (atomic_dec_and_test(&r10_bio->remaining)) {
+		/* clear the bitmap if all writes complete successfully */
+		bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
+				r10_bio->sectors,
+				!test_bit(R10BIO_Degraded, &r10_bio->state),
+				0);
 		md_write_end(r10_bio->mddev);
 		raid_end_bio_io(r10_bio);
 	}
@@ -486,8 +495,9 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 	rcu_read_lock();
 	/*
 	 * Check if we can balance. We can balance on the whole
-	 * device if no resync is going on, or below the resync window.
-	 * We take the first readable disk when above the resync window.
+	 * device if no resync is going on (recovery is ok), or below
+	 * the resync window. We take the first readable disk when
+	 * above the resync window.
 	 */
 	if (conf->mddev->recovery_cp < MaxSector
 	    && (this_sector + sectors >= conf->next_resync)) {
@@ -591,7 +601,10 @@ static void unplug_slaves(mddev_t *mddev)
 
 static void raid10_unplug(request_queue_t *q)
 {
+	mddev_t *mddev = q->queuedata;
+
 	unplug_slaves(q->queuedata);
+	md_wakeup_thread(mddev->thread);
 }
 
 static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
@@ -647,12 +660,13 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
  */
 #define RESYNC_DEPTH 32
 
-static void raise_barrier(conf_t *conf)
+static void raise_barrier(conf_t *conf, int force)
 {
+	BUG_ON(force && !conf->barrier);
 	spin_lock_irq(&conf->resync_lock);
 
-	/* Wait until no block IO is waiting */
-	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+	/* Wait until no block IO is waiting (unless 'force') */
+	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
 			    conf->resync_lock,
 			    raid10_unplug(conf->mddev->queue));
 
@@ -710,6 +724,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	int i;
 	int chunk_sects = conf->chunk_mask + 1;
 	const int rw = bio_data_dir(bio);
+	struct bio_list bl;
+	unsigned long flags;
 
 	if (unlikely(bio_barrier(bio))) {
 		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
@@ -767,6 +783,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 
 	r10_bio->mddev = mddev;
 	r10_bio->sector = bio->bi_sector;
+	r10_bio->state = 0;
 
 	if (rw == READ) {
 		/*
@@ -811,13 +828,16 @@ static int make_request(request_queue_t *q, struct bio * bio)
 		    !test_bit(Faulty, &rdev->flags)) {
 			atomic_inc(&rdev->nr_pending);
 			r10_bio->devs[i].bio = bio;
-		} else
+		} else {
 			r10_bio->devs[i].bio = NULL;
+			set_bit(R10BIO_Degraded, &r10_bio->state);
+		}
 	}
 	rcu_read_unlock();
 
-	atomic_set(&r10_bio->remaining, 1);
+	atomic_set(&r10_bio->remaining, 0);
 
+	bio_list_init(&bl);
 	for (i = 0; i < conf->copies; i++) {
 		struct bio *mbio;
 		int d = r10_bio->devs[i].devnum;
@@ -835,13 +855,14 @@ static int make_request(request_queue_t *q, struct bio * bio)
 		mbio->bi_private = r10_bio;
 
 		atomic_inc(&r10_bio->remaining);
-		generic_make_request(mbio);
+		bio_list_add(&bl, mbio);
 	}
 
-	if (atomic_dec_and_test(&r10_bio->remaining)) {
-		md_write_end(mddev);
-		raid_end_bio_io(r10_bio);
-	}
+	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
+	spin_lock_irqsave(&conf->device_lock, flags);
+	bio_list_merge(&conf->pending_bio_list, &bl);
+	blk_plug_device(mddev->queue);
+	spin_unlock_irqrestore(&conf->device_lock, flags);
 
 	return 0;
 }
@@ -999,7 +1020,12 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (!enough(conf))
 		return 0;
 
-	for (mirror=0; mirror < mddev->raid_disks; mirror++)
+	if (rdev->saved_raid_disk >= 0 &&
+	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
+		mirror = rdev->saved_raid_disk;
+	else
+		mirror = 0;
+	for ( ; mirror < mddev->raid_disks; mirror++)
 		if ( !(p=conf->mirrors+mirror)->rdev) {
 
 			blk_queue_stack_limits(mddev->queue,
@@ -1015,6 +1041,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
 			found = 1;
+			if (rdev->saved_raid_disk != mirror)
+				conf->fullsync = 1;
 			rcu_assign_pointer(p->rdev, rdev);
 			break;
 		}
@@ -1282,6 +1310,26 @@ static void raid10d(mddev_t *mddev)
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 		spin_lock_irqsave(&conf->device_lock, flags);
+
+		if (conf->pending_bio_list.head) {
+			bio = bio_list_get(&conf->pending_bio_list);
+			blk_remove_plug(mddev->queue);
+			spin_unlock_irqrestore(&conf->device_lock, flags);
+			/* flush any pending bitmap writes to disk before proceeding w/ I/O */
+			if (bitmap_unplug(mddev->bitmap) != 0)
+				printk("%s: bitmap file write failed!\n", mdname(mddev));
+
+			while (bio) { /* submit pending writes */
+				struct bio *next = bio->bi_next;
+				bio->bi_next = NULL;
+				generic_make_request(bio);
+				bio = next;
+			}
+			unplug = 1;
+
+			continue;
+		}
+
 		if (list_empty(head))
 			break;
 		r10_bio = list_entry(head->prev, r10bio_t, retry_list);
@@ -1388,6 +1436,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	sector_t max_sector, nr_sectors;
 	int disk;
 	int i;
+	int max_sync;
+	int sync_blocks;
 
 	sector_t sectors_skipped = 0;
 	int chunks_skipped = 0;
@@ -1401,6 +1451,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 		max_sector = mddev->resync_max_sectors;
 	if (sector_nr >= max_sector) {
+		/* If we aborted, we need to abort the
+		 * sync on the 'current' bitmap chucks (there can
+		 * be several when recovering multiple devices).
+		 * as we may have started syncing it but not finished.
+		 * We can find the current address in
+		 * mddev->curr_resync, but for recovery,
+		 * we need to convert that to several
+		 * virtual addresses.
+		 */
+		if (mddev->curr_resync < max_sector) { /* aborted */
+			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+				bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
+						&sync_blocks, 1);
+			else for (i=0; i<conf->raid_disks; i++) {
+				sector_t sect =
+					raid10_find_virt(conf, mddev->curr_resync, i);
+				bitmap_end_sync(mddev->bitmap, sect,
+						&sync_blocks, 1);
+			}
+		} else /* completed sync */
+			conf->fullsync = 0;
+
+		bitmap_close_sync(mddev->bitmap);
 		close_sync(conf);
 		*skipped = 1;
 		return sectors_skipped;
@@ -1425,8 +1498,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	 */
 	if (!go_faster && conf->nr_waiting)
 		msleep_interruptible(1000);
-	raise_barrier(conf);
-	conf->next_resync = sector_nr;
 
 	/* Again, very different code for resync and recovery.
 	 * Both must result in an r10bio with a list of bios that
@@ -1443,6 +1514,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	 * end_sync_write if we will want to write.
 	 */
 
+	max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
 	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 		/* recovery... the complicated one */
 		int i, j, k;
@@ -1451,13 +1523,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		for (i=0 ; i<conf->raid_disks; i++)
 			if (conf->mirrors[i].rdev &&
 			    !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
+				int still_degraded = 0;
 				/* want to reconstruct this device */
 				r10bio_t *rb2 = r10_bio;
+				sector_t sect = raid10_find_virt(conf, sector_nr, i);
+				int must_sync;
+				/* Unless we are doing a full sync, we only need
+				 * to recover the block if it is set in the bitmap
+				 */
+				must_sync = bitmap_start_sync(mddev->bitmap, sect,
+							      &sync_blocks, 1);
+				if (sync_blocks < max_sync)
+					max_sync = sync_blocks;
+				if (!must_sync &&
+				    !conf->fullsync) {
+					/* yep, skip the sync_blocks here, but don't assume
+					 * that there will never be anything to do here
+					 */
+					chunks_skipped = -1;
+					continue;
+				}
 
 				r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
-				spin_lock_irq(&conf->resync_lock);
-				if (rb2) conf->barrier++;
-				spin_unlock_irq(&conf->resync_lock);
+				raise_barrier(conf, rb2 != NULL);
 				atomic_set(&r10_bio->remaining, 0);
 
 				r10_bio->master_bio = (struct bio*)rb2;
@@ -1465,8 +1553,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 					atomic_inc(&rb2->remaining);
 				r10_bio->mddev = mddev;
 				set_bit(R10BIO_IsRecover, &r10_bio->state);
-				r10_bio->sector = raid10_find_virt(conf, sector_nr, i);
+				r10_bio->sector = sect;
+
 				raid10_find_phys(conf, r10_bio);
+				/* Need to check if this section will still be
+				 * degraded
+				 */
+				for (j=0; j<conf->copies;j++) {
+					int d = r10_bio->devs[j].devnum;
+					if (conf->mirrors[d].rdev == NULL ||
+					    test_bit(Faulty, &conf->mirrors[d].rdev->flags))
+						still_degraded = 1;
+				}
+				must_sync = bitmap_start_sync(mddev->bitmap, sect,
+							      &sync_blocks, still_degraded);
+
 				for (j=0; j<conf->copies;j++) {
 					int d = r10_bio->devs[j].devnum;
 					if (conf->mirrors[d].rdev &&
@@ -1526,10 +1627,22 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	} else {
 		/* resync. Schedule a read for every block at this virt offset */
 		int count = 0;
+
+		if (!bitmap_start_sync(mddev->bitmap, sector_nr,
+				       &sync_blocks, mddev->degraded) &&
+		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+			/* We can skip this block */
+			*skipped = 1;
+			return sync_blocks + sectors_skipped;
+		}
+		if (sync_blocks < max_sync)
+			max_sync = sync_blocks;
 		r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
 
 		r10_bio->mddev = mddev;
 		atomic_set(&r10_bio->remaining, 0);
+		raise_barrier(conf, 0);
+		conf->next_resync = sector_nr;
 
 		r10_bio->master_bio = NULL;
 		r10_bio->sector = sector_nr;
@@ -1582,6 +1695,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	}
 
 	nr_sectors = 0;
+	if (sector_nr + max_sync < max_sector)
+		max_sector = sector_nr + max_sync;
 	do {
 		struct page *page;
 		int len = PAGE_SIZE;
@@ -1821,6 +1936,26 @@ static int stop(mddev_t *mddev)
 	return 0;
 }
 
+static void raid10_quiesce(mddev_t *mddev, int state)
+{
+	conf_t *conf = mddev_to_conf(mddev);
+
+	switch(state) {
+	case 1:
+		raise_barrier(conf, 0);
+		break;
+	case 0:
+		lower_barrier(conf);
+		break;
+	}
+	if (mddev->thread) {
+		if (mddev->bitmap)
+			mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+		else
+			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+		md_wakeup_thread(mddev->thread);
+	}
+}
 
 static mdk_personality_t raid10_personality =
 {
@@ -1835,6 +1970,7 @@ static mdk_personality_t raid10_personality =
 	.hot_remove_disk= raid10_remove_disk,
 	.spare_active	= raid10_spare_active,
 	.sync_request	= sync_request,
+	.quiesce	= raid10_quiesce,
 };
 
 static int __init raid_init(void)
diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h
index 08317b77802b..b660cbf628d8 100644
--- a/include/linux/raid/raid10.h
+++ b/include/linux/raid/raid10.h
@@ -35,13 +35,19 @@ struct r10_private_data_s {
 	sector_t chunk_mask;
 
 	struct list_head	retry_list;
-	/* for use when syncing mirrors: */
+	/* queue pending writes and submit them on unplug */
+	struct bio_list		pending_bio_list;
+
 
 	spinlock_t		resync_lock;
 	int nr_pending;
 	int nr_waiting;
 	int barrier;
 	sector_t		next_resync;
+	int			fullsync;  /* set to 1 if a full sync is needed,
+					    * (fresh device added).
+					    * Cleared when a sync completes.
+					    */
 
 	wait_queue_head_t	wait_barrier;
 
@@ -100,4 +106,5 @@ struct r10bio_s {
 #define	R10BIO_Uptodate	0
 #define	R10BIO_IsSync	1
 #define	R10BIO_IsRecover 2
+#define	R10BIO_Degraded 3
 #endif
-- 
cgit v1.2.3-71-gd317


From ca65b73bd9c301d243df93780f7b26579e6c9204 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:17 -0800
Subject: [PATCH] md: fix raid6 resync check/repair code

raid6 currently does not check the P/Q syndromes when doing a resync, it just
calculates the correct value and writes it.  Doing the check can reduce writes
(often to 0) for a resync, and it is needed to properly implement the

  echo check > sync_action

operation.

This patch implements the appropriate checks and tidies up some related code.

It also allows raid6 user-requested resync to bypass the intent bitmap.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid6main.c     | 182 ++++++++++++++++++++++++++-------------------
 include/linux/raid/raid5.h |   2 +
 2 files changed, 108 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 304455d236f9..52e8796bb8ac 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -805,7 +805,7 @@ static void compute_parity(struct stripe_head *sh, int method)
 }
 
 /* Compute one missing block */
-static void compute_block_1(struct stripe_head *sh, int dd_idx)
+static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 {
 	raid6_conf_t *conf = sh->raid_conf;
 	int i, count, disks = conf->raid_disks;
@@ -821,7 +821,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx)
 		compute_parity(sh, UPDATE_PARITY);
 	} else {
 		ptr[0] = page_address(sh->dev[dd_idx].page);
-		memset(ptr[0], 0, STRIPE_SIZE);
+		if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
 		count = 1;
 		for (i = disks ; i--; ) {
 			if (i == dd_idx || i == qd_idx)
@@ -838,7 +838,8 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx)
 		}
 		if (count != 1)
 			xor_block(count, STRIPE_SIZE, ptr);
-		set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
+		if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
+		else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
 	}
 }
 
@@ -871,7 +872,7 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
 			return;
 		} else {
 			/* We're missing D+Q; recompute D from P */
-			compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1);
+			compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
 			compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */
 			return;
 		}
@@ -982,6 +983,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 }
 
 
+static int page_is_zero(struct page *p)
+{
+	char *a = page_address(p);
+	return ((*(u32*)a) == 0 &&
+		memcmp(a, a+4, STRIPE_SIZE-4)==0);
+}
 /*
  * handle_stripe - do things to a stripe.
  *
@@ -1000,7 +1007,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
  *
  */
 
-static void handle_stripe(struct stripe_head *sh)
+static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 {
 	raid6_conf_t *conf = sh->raid_conf;
 	int disks = conf->raid_disks;
@@ -1228,7 +1235,7 @@ static void handle_stripe(struct stripe_head *sh)
 				if (uptodate == disks-1) {
 					PRINTK("Computing stripe %llu block %d\n",
 					       (unsigned long long)sh->sector, i);
-					compute_block_1(sh, i);
+					compute_block_1(sh, i, 0);
 					uptodate++;
 				} else if ( uptodate == disks-2 && failed >= 2 ) {
 					/* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
@@ -1323,7 +1330,7 @@ static void handle_stripe(struct stripe_head *sh)
 				/* We have failed blocks and need to compute them */
 				switch ( failed ) {
 				case 0:	BUG();
-				case 1: compute_block_1(sh, failed_num[0]); break;
+				case 1: compute_block_1(sh, failed_num[0], 0); break;
 				case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
 				default: BUG();	/* This request should have been failed? */
 				}
@@ -1338,12 +1345,10 @@ static void handle_stripe(struct stripe_head *sh)
 					       (unsigned long long)sh->sector, i);
 					locked++;
 					set_bit(R5_Wantwrite, &sh->dev[i].flags);
-#if 0 /**** FIX: I don't understand the logic here... ****/
-					if (!test_bit(R5_Insync, &sh->dev[i].flags)
-					    || ((i==pd_idx || i==qd_idx) && failed == 0)) /* FIX? */
-						set_bit(STRIPE_INSYNC, &sh->state);
-#endif
 				}
+			/* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
+			set_bit(STRIPE_INSYNC, &sh->state);
+
 			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
 				atomic_dec(&conf->preread_active_stripes);
 				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
@@ -1356,79 +1361,97 @@ static void handle_stripe(struct stripe_head *sh)
 	 * Any reads will already have been scheduled, so we just see if enough data
 	 * is available
 	 */
-	if (syncing && locked == 0 &&
-	    !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 2) {
-		set_bit(STRIPE_HANDLE, &sh->state);
-#if 0 /* RAID-6: Don't support CHECK PARITY yet */
-		if (failed == 0) {
-			char *pagea;
-			if (uptodate != disks)
-				BUG();
-			compute_parity(sh, CHECK_PARITY);
-			uptodate--;
-			pagea = page_address(sh->dev[pd_idx].page);
-			if ((*(u32*)pagea) == 0 &&
-			    !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
-				/* parity is correct (on disc, not in buffer any more) */
-				set_bit(STRIPE_INSYNC, &sh->state);
-			}
-		}
-#endif
-		if (!test_bit(STRIPE_INSYNC, &sh->state)) {
-			int failed_needupdate[2];
-			struct r5dev *adev, *bdev;
-
-			if ( failed < 1 )
-				failed_num[0] = pd_idx;
-			if ( failed < 2 )
-				failed_num[1] = (failed_num[0] == qd_idx) ? pd_idx : qd_idx;
+	if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
+		int update_p = 0, update_q = 0;
+		struct r5dev *dev;
 
-			failed_needupdate[0] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[0]].flags);
-			failed_needupdate[1] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[1]].flags);
+		set_bit(STRIPE_HANDLE, &sh->state);
 
-			PRINTK("sync: failed=%d num=%d,%d fnu=%u%u\n",
-			       failed, failed_num[0], failed_num[1], failed_needupdate[0], failed_needupdate[1]);
+		BUG_ON(failed>2);
+		BUG_ON(uptodate < disks);
+		/* Want to check and possibly repair P and Q.
+		 * However there could be one 'failed' device, in which
+		 * case we can only check one of them, possibly using the
+		 * other to generate missing data
+		 */
 
-#if 0  /* RAID-6: This code seems to require that CHECK_PARITY destroys the uptodateness of the parity */
-			/* should be able to compute the missing block(s) and write to spare */
-			if ( failed_needupdate[0] ^ failed_needupdate[1] ) {
-				if (uptodate+1 != disks)
-					BUG();
-				compute_block_1(sh, failed_needupdate[0] ? failed_num[0] : failed_num[1]);
-				uptodate++;
-			} else if ( failed_needupdate[0] & failed_needupdate[1] ) {
-				if (uptodate+2 != disks)
-					BUG();
-				compute_block_2(sh, failed_num[0], failed_num[1]);
-				uptodate += 2;
+		/* If !tmp_page, we cannot do the calculations,
+		 * but as we have set STRIPE_HANDLE, we will soon be called
+		 * by stripe_handle with a tmp_page - just wait until then.
+		 */
+		if (tmp_page) {
+			if (failed == q_failed) {
+				/* The only possible failed device holds 'Q', so it makes
+				 * sense to check P (If anything else were failed, we would
+				 * have used P to recreate it).
+				 */
+				compute_block_1(sh, pd_idx, 1);
+				if (!page_is_zero(sh->dev[pd_idx].page)) {
+					compute_block_1(sh,pd_idx,0);
+					update_p = 1;
+				}
+			}
+			if (!q_failed && failed < 2) {
+				/* q is not failed, and we didn't use it to generate
+				 * anything, so it makes sense to check it
+				 */
+				memcpy(page_address(tmp_page),
+				       page_address(sh->dev[qd_idx].page),
+				       STRIPE_SIZE);
+				compute_parity(sh, UPDATE_PARITY);
+				if (memcmp(page_address(tmp_page),
+					   page_address(sh->dev[qd_idx].page),
+					   STRIPE_SIZE)!= 0) {
+					clear_bit(STRIPE_INSYNC, &sh->state);
+					update_q = 1;
+				}
+			}
+			if (update_p || update_q) {
+				conf->mddev->resync_mismatches += STRIPE_SECTORS;
+				if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+					/* don't try to repair!! */
+					update_p = update_q = 0;
 			}
-#else
-			compute_block_2(sh, failed_num[0], failed_num[1]);
-			uptodate += failed_needupdate[0] + failed_needupdate[1];
-#endif
 
-			if (uptodate != disks)
-				BUG();
+			/* now write out any block on a failed drive,
+			 * or P or Q if they need it
+			 */
 
-			PRINTK("Marking for sync stripe %llu blocks %d,%d\n",
-			       (unsigned long long)sh->sector, failed_num[0], failed_num[1]);
+			if (failed == 2) {
+				dev = &sh->dev[failed_num[1]];
+				locked++;
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantwrite, &dev->flags);
+				set_bit(R5_Syncio, &dev->flags);
+			}
+			if (failed >= 1) {
+				dev = &sh->dev[failed_num[0]];
+				locked++;
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantwrite, &dev->flags);
+				set_bit(R5_Syncio, &dev->flags);
+			}
 
-			/**** FIX: Should we really do both of these unconditionally? ****/
-			adev = &sh->dev[failed_num[0]];
-			locked += !test_bit(R5_LOCKED, &adev->flags);
-			set_bit(R5_LOCKED, &adev->flags);
-			set_bit(R5_Wantwrite, &adev->flags);
-			bdev = &sh->dev[failed_num[1]];
-			locked += !test_bit(R5_LOCKED, &bdev->flags);
-			set_bit(R5_LOCKED, &bdev->flags);
+			if (update_p) {
+				dev = &sh->dev[pd_idx];
+				locked ++;
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantwrite, &dev->flags);
+				set_bit(R5_Syncio, &dev->flags);
+			}
+			if (update_q) {
+				dev = &sh->dev[qd_idx];
+				locked++;
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantwrite, &dev->flags);
+				set_bit(R5_Syncio, &dev->flags);
+			}
 			clear_bit(STRIPE_DEGRADED, &sh->state);
-			set_bit(R5_Wantwrite, &bdev->flags);
 
 			set_bit(STRIPE_INSYNC, &sh->state);
-			set_bit(R5_Syncio, &adev->flags);
-			set_bit(R5_Syncio, &bdev->flags);
 		}
 	}
+
 	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
 		clear_bit(STRIPE_SYNCING, &sh->state);
@@ -1664,7 +1687,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
 			}
 			finish_wait(&conf->wait_for_overlap, &w);
 			raid6_plug_device(conf);
-			handle_stripe(sh);
+			handle_stripe(sh, NULL);
 			release_stripe(sh);
 		} else {
 			/* cannot get stripe for read-ahead, just give-up */
@@ -1728,6 +1751,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		return rv;
 	}
 	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
 	    !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
 		/* we can skip this block, and probably more */
 		sync_blocks /= STRIPE_SECTORS;
@@ -1765,7 +1789,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	clear_bit(STRIPE_INSYNC, &sh->state);
 	spin_unlock(&sh->lock);
 
-	handle_stripe(sh);
+	handle_stripe(sh, NULL);
 	release_stripe(sh);
 
 	return STRIPE_SECTORS;
@@ -1821,7 +1845,7 @@ static void raid6d (mddev_t *mddev)
 		spin_unlock_irq(&conf->device_lock);
 
 		handled++;
-		handle_stripe(sh);
+		handle_stripe(sh, conf->spare_page);
 		release_stripe(sh);
 
 		spin_lock_irq(&conf->device_lock);
@@ -1860,6 +1884,10 @@ static int run(mddev_t *mddev)
 		goto abort;
 	memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
 
+	conf->spare_page = alloc_page(GFP_KERNEL);
+	if (!conf->spare_page)
+		goto abort;
+
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
@@ -1996,6 +2024,8 @@ static int run(mddev_t *mddev)
 abort:
 	if (conf) {
 		print_raid6_conf(conf);
+		if (conf->spare_page)
+			page_cache_release(conf->spare_page);
 		if (conf->stripe_hashtbl)
 			free_pages((unsigned long) conf->stripe_hashtbl,
 							HASH_PAGES_ORDER);
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index f025ba6fb14c..e9c1c0d4f90b 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -228,6 +228,8 @@ struct raid5_private_data {
 					    * Cleared when a sync completes.
 					    */
 
+	struct page 		*spare_page; /* Used when checking P/Q in raid6 */
+
 	/*
 	 * Free stripes pool
 	 */
-- 
cgit v1.2.3-71-gd317


From ddaf22abaa831763e75775e6d4c7693504237997 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:19 -0800
Subject: [PATCH] md: attempt to auto-correct read errors in raid1

On a read-error we suspend the array, then synchronously read the block from
other arrays until we find one where we can read it.  Then we try writing the
good data back everywhere and make sure it works.  If any write or subsequent
read fails, only then do we fail the device out of the array.

To be able to suspend the array, we need to also keep track of how many
requests are queued for handling by raid1d.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c            |   1 +
 drivers/md/raid1.c         | 115 +++++++++++++++++++++++++++++++++++++++++----
 include/linux/raid/raid1.h |   3 ++
 3 files changed, 109 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 64e7da3701a5..1364a1c97e6f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -461,6 +461,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 	bio_put(bio);
 	return ret;
 }
+EXPORT_SYMBOL(sync_page_io);
 
 static int read_disk_sb(mdk_rdev_t * rdev, int size)
 {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c618015f07f6..b3856db8d6c2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -191,6 +191,7 @@ static void reschedule_retry(r1bio_t *r1_bio)
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&r1_bio->retry_list, &conf->retry_list);
+	conf->nr_queued ++;
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
 	wake_up(&conf->wait_barrier);
@@ -245,9 +246,9 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
-	if (!uptodate)
-		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
-	else
+	update_head_pos(mirror, r1_bio);
+
+	if (uptodate || conf->working_disks <= 1) {
 		/*
 		 * Set R1BIO_Uptodate in our master bio, so that
 		 * we will return a good error code for to the higher
@@ -259,14 +260,8 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
 		 */
 		set_bit(R1BIO_Uptodate, &r1_bio->state);
 
-	update_head_pos(mirror, r1_bio);
-
-	/*
-	 * we have only one bio on the read side
-	 */
-	if (uptodate)
 		raid_end_bio_io(r1_bio);
-	else {
+	} else {
 		/*
 		 * oops, read error:
 		 */
@@ -653,6 +648,32 @@ static void allow_barrier(conf_t *conf)
 	wake_up(&conf->wait_barrier);
 }
 
+static void freeze_array(conf_t *conf)
+{
+	/* stop syncio and normal IO and wait for everything to
+	 * go quite.
+	 * We increment barrier and nr_waiting, and then
+	 * wait until barrier+nr_pending match nr_queued+2
+	 */
+	spin_lock_irq(&conf->resync_lock);
+	conf->barrier++;
+	conf->nr_waiting++;
+	wait_event_lock_irq(conf->wait_barrier,
+			    conf->barrier+conf->nr_pending == conf->nr_queued+2,
+			    conf->resync_lock,
+			    raid1_unplug(conf->mddev->queue));
+	spin_unlock_irq(&conf->resync_lock);
+}
+static void unfreeze_array(conf_t *conf)
+{
+	/* reverse the effect of the freeze */
+	spin_lock_irq(&conf->resync_lock);
+	conf->barrier--;
+	conf->nr_waiting--;
+	wake_up(&conf->wait_barrier);
+	spin_unlock_irq(&conf->resync_lock);
+}
+
 
 /* duplicate the data pages for behind I/O */
 static struct page **alloc_behind_pages(struct bio *bio)
@@ -1196,6 +1217,7 @@ static void raid1d(mddev_t *mddev)
 			break;
 		r1_bio = list_entry(head->prev, r1bio_t, retry_list);
 		list_del(head->prev);
+		conf->nr_queued--;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 
 		mddev = r1_bio->mddev;
@@ -1235,6 +1257,74 @@ static void raid1d(mddev_t *mddev)
 				}
 		} else {
 			int disk;
+
+			/* we got a read error. Maybe the drive is bad.  Maybe just
+			 * the block and we can fix it.
+			 * We freeze all other IO, and try reading the block from
+			 * other devices.  When we find one, we re-write
+			 * and check it that fixes the read error.
+			 * This is all done synchronously while the array is
+			 * frozen
+			 */
+			sector_t sect = r1_bio->sector;
+			int sectors = r1_bio->sectors;
+			freeze_array(conf);
+			while(sectors) {
+				int s = sectors;
+				int d = r1_bio->read_disk;
+				int success = 0;
+
+				if (s > (PAGE_SIZE>>9))
+					s = PAGE_SIZE >> 9;
+
+				do {
+					rdev = conf->mirrors[d].rdev;
+					if (rdev &&
+					    test_bit(In_sync, &rdev->flags) &&
+					    sync_page_io(rdev->bdev,
+							 sect + rdev->data_offset,
+							 s<<9,
+							 conf->tmppage, READ))
+						success = 1;
+					else {
+						d++;
+						if (d == conf->raid_disks)
+							d = 0;
+					}
+				} while (!success && d != r1_bio->read_disk);
+
+				if (success) {
+					/* write it back and re-read */
+					while (d != r1_bio->read_disk) {
+						if (d==0)
+							d = conf->raid_disks;
+						d--;
+						rdev = conf->mirrors[d].rdev;
+						if (rdev &&
+						    test_bit(In_sync, &rdev->flags)) {
+							if (sync_page_io(rdev->bdev,
+									 sect + rdev->data_offset,
+									 s<<9, conf->tmppage, WRITE) == 0 ||
+							    sync_page_io(rdev->bdev,
+									 sect + rdev->data_offset,
+									 s<<9, conf->tmppage, READ) == 0) {
+								/* Well, this device is dead */
+								md_error(mddev, rdev);
+							}
+						}
+					}
+				} else {
+					/* Cannot read from anywhere -- bye bye array */
+					md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+					break;
+				}
+				sectors -= s;
+				sect += s;
+			}
+
+
+			unfreeze_array(conf);
+
 			bio = r1_bio->bios[r1_bio->read_disk];
 			if ((disk=read_balance(conf, r1_bio)) == -1) {
 				printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
@@ -1529,6 +1619,10 @@ static int run(mddev_t *mddev)
 
 	memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);
 
+	conf->tmppage = alloc_page(GFP_KERNEL);
+	if (!conf->tmppage)
+		goto out_no_mem;
+
 	conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
 	if (!conf->poolinfo)
 		goto out_no_mem;
@@ -1635,6 +1729,7 @@ out_free_conf:
 		if (conf->r1bio_pool)
 			mempool_destroy(conf->r1bio_pool);
 		kfree(conf->mirrors);
+		__free_page(conf->tmppage);
 		kfree(conf->poolinfo);
 		kfree(conf);
 		mddev->private = NULL;
diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h
index c55674252533..cbe4238d3f9f 100644
--- a/include/linux/raid/raid1.h
+++ b/include/linux/raid/raid1.h
@@ -46,6 +46,7 @@ struct r1_private_data_s {
 	spinlock_t		resync_lock;
 	int			nr_pending;
 	int			nr_waiting;
+	int			nr_queued;
 	int			barrier;
 	sector_t		next_resync;
 	int			fullsync;  /* set to 1 if a full sync is needed,
@@ -57,6 +58,8 @@ struct r1_private_data_s {
 
 	struct pool_info	*poolinfo;
 
+	struct page		*tmppage;
+
 	mempool_t *r1bio_pool;
 	mempool_t *r1buf_pool;
 };
-- 
cgit v1.2.3-71-gd317


From cf30a473a02901fe4db37abc0b0fa26dd5ba3f72 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:23 -0800
Subject: [PATCH] md: handle errors when read-only

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid1.c         | 18 +++++++++++-------
 include/linux/raid/raid1.h |  7 +++++++
 2 files changed, 18 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 14a8fe0349c7..a8bc93d6ff63 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -154,7 +154,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
 
 	for (i = 0; i < conf->raid_disks; i++) {
 		struct bio **bio = r1_bio->bios + i;
-		if (*bio)
+		if (*bio && *bio != IO_BLOCKED)
 			bio_put(*bio);
 		*bio = NULL;
 	}
@@ -419,11 +419,13 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 		new_disk = 0;
 
 		for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
+		     r1_bio->bios[new_disk] == IO_BLOCKED ||
 		     !rdev || !test_bit(In_sync, &rdev->flags)
 			     || test_bit(WriteMostly, &rdev->flags);
 		     rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {
 
-			if (rdev && test_bit(In_sync, &rdev->flags))
+			if (rdev && test_bit(In_sync, &rdev->flags) &&
+				r1_bio->bios[new_disk] != IO_BLOCKED)
 				wonly_disk = new_disk;
 
 			if (new_disk == conf->raid_disks - 1) {
@@ -437,11 +439,13 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 
 	/* make sure the disk is operational */
 	for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
+	     r1_bio->bios[new_disk] == IO_BLOCKED ||
 	     !rdev || !test_bit(In_sync, &rdev->flags) ||
 		     test_bit(WriteMostly, &rdev->flags);
 	     rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {
 
-		if (rdev && test_bit(In_sync, &rdev->flags))
+		if (rdev && test_bit(In_sync, &rdev->flags) &&
+		    r1_bio->bios[new_disk] != IO_BLOCKED)
 			wonly_disk = new_disk;
 
 		if (new_disk <= 0)
@@ -478,7 +482,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 
-		if (!rdev ||
+		if (!rdev || r1_bio->bios[disk] == IO_BLOCKED ||
 		    !test_bit(In_sync, &rdev->flags) ||
 		    test_bit(WriteMostly, &rdev->flags))
 			continue;
@@ -1335,7 +1339,7 @@ static void raid1d(mddev_t *mddev)
 			sector_t sect = r1_bio->sector;
 			int sectors = r1_bio->sectors;
 			freeze_array(conf);
-			while(sectors) {
+			if (mddev->ro == 0) while(sectors) {
 				int s = sectors;
 				int d = r1_bio->read_disk;
 				int success = 0;
@@ -1388,7 +1392,6 @@ static void raid1d(mddev_t *mddev)
 				sect += s;
 			}
 
-
 			unfreeze_array(conf);
 
 			bio = r1_bio->bios[r1_bio->read_disk];
@@ -1399,7 +1402,8 @@ static void raid1d(mddev_t *mddev)
 				       (unsigned long long)r1_bio->sector);
 				raid_end_bio_io(r1_bio);
 			} else {
-				r1_bio->bios[r1_bio->read_disk] = NULL;
+				r1_bio->bios[r1_bio->read_disk] =
+					mddev->ro ? IO_BLOCKED : NULL;
 				r1_bio->read_disk = disk;
 				bio_put(bio);
 				bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h
index cbe4238d3f9f..9d5494aaac0f 100644
--- a/include/linux/raid/raid1.h
+++ b/include/linux/raid/raid1.h
@@ -109,6 +109,13 @@ struct r1bio_s {
 	/* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
 };
 
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error.  To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio*)1)
+
 /* bits for r1bio.state */
 #define	R1BIO_Uptodate	0
 #define	R1BIO_IsSync	1
-- 
cgit v1.2.3-71-gd317


From 9910f16af35419a5382fa7850eecc220103036fa Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:24 -0800
Subject: [PATCH] md: fix up some rdev rcu locking in raid5/6

There is this "FIXME" comment with a typo in it!!  that been annoying me for
days, so I just had to remove it.

conf->disks[i].rdev should only be accessed if
  - we know we hold a reference or
  - the mddev->reconfig_sem is down or
  - we have a rcu_readlock

handle_stripe was referencing rdev in three places without any of these.  For
the first two, get an rcu_readlock.  For the last, the same access
(md_sync_acct call) is made a little later after the rdev has been claimed
under and rcu_readlock, if R5_Syncio is set.  So just use that access...
However R5_Syncio isn't really needed as the 'syncing' variable contains the
same information.  So use that instead.

Issues, comment, and fix are identical in raid5 and raid6.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid5.c         | 16 ++++++++--------
 drivers/md/raid6main.c     | 19 ++++++++-----------
 include/linux/raid/raid5.h |  1 -
 3 files changed, 16 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0d016a844ec6..0222ba1a6d35 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -961,11 +961,11 @@ static void handle_stripe(struct stripe_head *sh)
 	syncing = test_bit(STRIPE_SYNCING, &sh->state);
 	/* Now to look around and see what can be done */
 
+	rcu_read_lock();
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
 		dev = &sh->dev[i];
 		clear_bit(R5_Insync, &dev->flags);
-		clear_bit(R5_Syncio, &dev->flags);
 
 		PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
 			i, dev->flags, dev->toread, dev->towrite, dev->written);
@@ -1004,7 +1004,7 @@ static void handle_stripe(struct stripe_head *sh)
 				non_overwrite++;
 		}
 		if (dev->written) written++;
-		rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */
+		rdev = rcu_dereference(conf->disks[i].rdev);
 		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
@@ -1017,6 +1017,7 @@ static void handle_stripe(struct stripe_head *sh)
 		} else
 			set_bit(R5_Insync, &dev->flags);
 	}
+	rcu_read_unlock();
 	PRINTK("locked=%d uptodate=%d to_read=%d"
 		" to_write=%d failed=%d failed_num=%d\n",
 		locked, uptodate, to_read, to_write, failed, failed_num);
@@ -1028,10 +1029,13 @@ static void handle_stripe(struct stripe_head *sh)
 			int bitmap_end = 0;
 
 			if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-				mdk_rdev_t *rdev = conf->disks[i].rdev;
+				mdk_rdev_t *rdev;
+				rcu_read_lock();
+				rdev = rcu_dereference(conf->disks[i].rdev);
 				if (rdev && test_bit(In_sync, &rdev->flags))
 					/* multiple read failures in one stripe */
 					md_error(conf->mddev, rdev);
+				rcu_read_unlock();
 			}
 
 			spin_lock_irq(&conf->device_lock);
@@ -1180,9 +1184,6 @@ static void handle_stripe(struct stripe_head *sh)
 					locked++;
 					PRINTK("Reading block %d (sync=%d)\n", 
 						i, syncing);
-					if (syncing)
-						md_sync_acct(conf->disks[i].rdev->bdev,
-							     STRIPE_SECTORS);
 				}
 			}
 		}
@@ -1326,7 +1327,6 @@ static void handle_stripe(struct stripe_head *sh)
 			clear_bit(STRIPE_DEGRADED, &sh->state);
 			locked++;
 			set_bit(STRIPE_INSYNC, &sh->state);
-			set_bit(R5_Syncio, &dev->flags);
 		}
 	}
 	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
@@ -1392,7 +1392,7 @@ static void handle_stripe(struct stripe_head *sh)
 		rcu_read_unlock();
  
 		if (rdev) {
-			if (test_bit(R5_Syncio, &sh->dev[i].flags))
+			if (syncing)
 				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 
 			bi->bi_bdev = rdev->bdev;
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 7a51553d8be5..b5b7a8d0b165 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1060,11 +1060,11 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 	syncing = test_bit(STRIPE_SYNCING, &sh->state);
 	/* Now to look around and see what can be done */
 
+	rcu_read_lock();
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
 		dev = &sh->dev[i];
 		clear_bit(R5_Insync, &dev->flags);
-		clear_bit(R5_Syncio, &dev->flags);
 
 		PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
 			i, dev->flags, dev->toread, dev->towrite, dev->written);
@@ -1103,7 +1103,7 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 				non_overwrite++;
 		}
 		if (dev->written) written++;
-		rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */
+		rdev = rcu_dereference(conf->disks[i].rdev);
 		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
@@ -1117,6 +1117,7 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 		} else
 			set_bit(R5_Insync, &dev->flags);
 	}
+	rcu_read_unlock();
 	PRINTK("locked=%d uptodate=%d to_read=%d"
 	       " to_write=%d failed=%d failed_num=%d,%d\n",
 	       locked, uptodate, to_read, to_write, failed,
@@ -1129,10 +1130,13 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 			int bitmap_end = 0;
 
 			if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-				mdk_rdev_t *rdev = conf->disks[i].rdev;
+				mdk_rdev_t *rdev;
+				rcu_read_lock();
+				rdev = rcu_dereference(conf->disks[i].rdev);
 				if (rdev && test_bit(In_sync, &rdev->flags))
 					/* multiple read failures in one stripe */
 					md_error(conf->mddev, rdev);
+				rcu_read_unlock();
 			}
 
 			spin_lock_irq(&conf->device_lock);
@@ -1307,9 +1311,6 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 					locked++;
 					PRINTK("Reading block %d (sync=%d)\n",
 						i, syncing);
-					if (syncing)
-						md_sync_acct(conf->disks[i].rdev->bdev,
-							     STRIPE_SECTORS);
 				}
 			}
 		}
@@ -1463,14 +1464,12 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 				locked++;
 				set_bit(R5_LOCKED, &dev->flags);
 				set_bit(R5_Wantwrite, &dev->flags);
-				set_bit(R5_Syncio, &dev->flags);
 			}
 			if (failed >= 1) {
 				dev = &sh->dev[failed_num[0]];
 				locked++;
 				set_bit(R5_LOCKED, &dev->flags);
 				set_bit(R5_Wantwrite, &dev->flags);
-				set_bit(R5_Syncio, &dev->flags);
 			}
 
 			if (update_p) {
@@ -1478,14 +1477,12 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 				locked ++;
 				set_bit(R5_LOCKED, &dev->flags);
 				set_bit(R5_Wantwrite, &dev->flags);
-				set_bit(R5_Syncio, &dev->flags);
 			}
 			if (update_q) {
 				dev = &sh->dev[qd_idx];
 				locked++;
 				set_bit(R5_LOCKED, &dev->flags);
 				set_bit(R5_Wantwrite, &dev->flags);
-				set_bit(R5_Syncio, &dev->flags);
 			}
 			clear_bit(STRIPE_DEGRADED, &sh->state);
 
@@ -1557,7 +1554,7 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 		rcu_read_unlock();
 
 		if (rdev) {
-			if (test_bit(R5_Syncio, &sh->dev[i].flags))
+			if (syncing)
 				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 
 			bi->bi_bdev = rdev->bdev;
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index e9c1c0d4f90b..28fcd7533ac4 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -152,7 +152,6 @@ struct stripe_head {
 #define	R5_Insync	3	/* rdev && rdev->in_sync at start */
 #define	R5_Wantread	4	/* want to schedule a read */
 #define	R5_Wantwrite	5
-#define	R5_Syncio	6	/* this io need to be accounted as resync io */
 #define	R5_Overlap	7	/* There is a pending overlapping request on this block */
 #define	R5_ReadError	8	/* seen a read error here recently */
 #define	R5_ReWrite	9	/* have tried to over-write the readerror */
-- 
cgit v1.2.3-71-gd317


From 4443ae10ca15d07922ceda622f03db8865fa3d13 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:28 -0800
Subject: [PATCH] md: auto-correct correctable read errors in raid10

Largely just a cross-port from raid1.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid10.c         | 127 ++++++++++++++++++++++++++++++++++++++------
 include/linux/raid/raid10.h |   2 +
 2 files changed, 114 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1fa70c34b7d2..64bb4ddc6798 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -209,6 +209,7 @@ static void reschedule_retry(r10bio_t *r10_bio)
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&r10_bio->retry_list, &conf->retry_list);
+	conf->nr_queued ++;
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
 	md_wakeup_thread(mddev->thread);
@@ -254,9 +255,9 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
-	if (!uptodate)
-		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
-	else
+	update_head_pos(slot, r10_bio);
+
+	if (uptodate) {
 		/*
 		 * Set R10BIO_Uptodate in our master bio, so that
 		 * we will return a good error code to the higher
@@ -267,15 +268,8 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int
 		 * wait for the 'master' bio.
 		 */
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
-
-	update_head_pos(slot, r10_bio);
-
-	/*
-	 * we have only one bio on the read side
-	 */
-	if (uptodate)
 		raid_end_bio_io(r10_bio);
-	else {
+	} else {
 		/*
 		 * oops, read error:
 		 */
@@ -714,6 +708,33 @@ static void allow_barrier(conf_t *conf)
 	wake_up(&conf->wait_barrier);
 }
 
+static void freeze_array(conf_t *conf)
+{
+	/* stop syncio and normal IO and wait for everything to
+	 * go quite.
+	 * We increment barrier and nr_waiting, and then
+	 * wait until barrier+nr_pending match nr_queued+2
+	 */
+	spin_lock_irq(&conf->resync_lock);
+	conf->barrier++;
+	conf->nr_waiting++;
+	wait_event_lock_irq(conf->wait_barrier,
+			    conf->barrier+conf->nr_pending == conf->nr_queued+2,
+			    conf->resync_lock,
+			    raid10_unplug(conf->mddev->queue));
+	spin_unlock_irq(&conf->resync_lock);
+}
+
+static void unfreeze_array(conf_t *conf)
+{
+	/* reverse the effect of the freeze */
+	spin_lock_irq(&conf->resync_lock);
+	conf->barrier--;
+	conf->nr_waiting--;
+	wake_up(&conf->wait_barrier);
+	spin_unlock_irq(&conf->resync_lock);
+}
+
 static int make_request(request_queue_t *q, struct bio * bio)
 {
 	mddev_t *mddev = q->queuedata;
@@ -1338,6 +1359,7 @@ static void raid10d(mddev_t *mddev)
 			break;
 		r10_bio = list_entry(head->prev, r10bio_t, retry_list);
 		list_del(head->prev);
+		conf->nr_queued--;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 
 		mddev = r10_bio->mddev;
@@ -1350,6 +1372,78 @@ static void raid10d(mddev_t *mddev)
 			unplug = 1;
 		} else {
 			int mirror;
+			/* we got a read error. Maybe the drive is bad.  Maybe just
+			 * the block and we can fix it.
+			 * We freeze all other IO, and try reading the block from
+			 * other devices.  When we find one, we re-write
+			 * and check it that fixes the read error.
+			 * This is all done synchronously while the array is
+			 * frozen.
+			 */
+			int sect = 0; /* Offset from r10_bio->sector */
+			int sectors = r10_bio->sectors;
+			freeze_array(conf);
+			if (mddev->ro == 0) while(sectors) {
+				int s = sectors;
+				int sl = r10_bio->read_slot;
+				int success = 0;
+
+				if (s > (PAGE_SIZE>>9))
+					s = PAGE_SIZE >> 9;
+
+				do {
+					int d = r10_bio->devs[sl].devnum;
+					rdev = conf->mirrors[d].rdev;
+					if (rdev &&
+					    test_bit(In_sync, &rdev->flags) &&
+					    sync_page_io(rdev->bdev,
+							 r10_bio->devs[sl].addr +
+							 sect + rdev->data_offset,
+							 s<<9,
+							 conf->tmppage, READ))
+						success = 1;
+					else {
+						sl++;
+						if (sl == conf->copies)
+							sl = 0;
+					}
+				} while (!success && sl != r10_bio->read_slot);
+
+				if (success) {
+					/* write it back and re-read */
+					while (sl != r10_bio->read_slot) {
+						int d;
+						if (sl==0)
+							sl = conf->copies;
+						sl--;
+						d = r10_bio->devs[sl].devnum;
+						rdev = conf->mirrors[d].rdev;
+						if (rdev &&
+						    test_bit(In_sync, &rdev->flags)) {
+							if (sync_page_io(rdev->bdev,
+									 r10_bio->devs[sl].addr +
+									 sect + rdev->data_offset,
+									 s<<9, conf->tmppage, WRITE) == 0 ||
+							    sync_page_io(rdev->bdev,
+									 r10_bio->devs[sl].addr +
+									 sect + rdev->data_offset,
+									 s<<9, conf->tmppage, READ) == 0) {
+								/* Well, this device is dead */
+								md_error(mddev, rdev);
+							}
+						}
+					}
+				} else {
+					/* Cannot read from anywhere -- bye bye array */
+					md_error(mddev, conf->mirrors[r10_bio->devs[r10_bio->read_slot].devnum].rdev);
+					break;
+				}
+				sectors -= s;
+				sect += s;
+			}
+
+			unfreeze_array(conf);
+
 			bio = r10_bio->devs[r10_bio->read_slot].bio;
 			r10_bio->devs[r10_bio->read_slot].bio = NULL;
 			bio_put(bio);
@@ -1793,22 +1887,24 @@ static int run(mddev_t *mddev)
 	 * bookkeeping area. [whatever we allocate in run(),
 	 * should be freed in stop()]
 	 */
-	conf = kmalloc(sizeof(conf_t), GFP_KERNEL);
+	conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
 	mddev->private = conf;
 	if (!conf) {
 		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
 			mdname(mddev));
 		goto out;
 	}
-	memset(conf, 0, sizeof(*conf));
-	conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,
+	conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
 				 GFP_KERNEL);
 	if (!conf->mirrors) {
 		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
 		       mdname(mddev));
 		goto out_free_conf;
 	}
-	memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);
+
+	conf->tmppage = alloc_page(GFP_KERNEL);
+	if (!conf->tmppage)
+		goto out_free_conf;
 
 	conf->near_copies = nc;
 	conf->far_copies = fc;
@@ -1918,6 +2014,7 @@ static int run(mddev_t *mddev)
 out_free_conf:
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
+	put_page(conf->tmppage);
 	kfree(conf->mirrors);
 	kfree(conf);
 	mddev->private = NULL;
diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h
index b660cbf628d8..dfa528385e3f 100644
--- a/include/linux/raid/raid10.h
+++ b/include/linux/raid/raid10.h
@@ -42,6 +42,7 @@ struct r10_private_data_s {
 	spinlock_t		resync_lock;
 	int nr_pending;
 	int nr_waiting;
+	int nr_queued;
 	int barrier;
 	sector_t		next_resync;
 	int			fullsync;  /* set to 1 if a full sync is needed,
@@ -53,6 +54,7 @@ struct r10_private_data_s {
 
 	mempool_t *r10bio_pool;
 	mempool_t *r10buf_pool;
+	struct page		*tmppage;
 };
 
 typedef struct r10_private_data_s conf_t;
-- 
cgit v1.2.3-71-gd317


From 0eb3ff12aa8a12538ef681dc83f4361636a0699f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:29 -0800
Subject: [PATCH] md: raid10 read-error handling - resync and read-only

Add in correct read-error handling for resync and read-only situations.

When read-only, we don't over-write, so we need to mark the failed drive in
the r10_bio so we don't re-try it.  During resync, we always read all blocks,
so if there is a read error, we simply over-write it with the good block that
we found (assuming we found one).

Note that the recovery case still isn't handled in an interesting way.  There
is nothing useful to do for the 2-copies case.  If there are 3 or more copies,
then we could try reading from one of the non-missing copies, but this is a
bit complicated and very rarely would be used, so I'm leaving it for now.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid10.c         | 56 ++++++++++++++++++++++++++++-----------------
 include/linux/raid/raid10.h |  7 ++++++
 2 files changed, 42 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 64bb4ddc6798..3f8df2ecbae3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -172,7 +172,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
 
 	for (i = 0; i < conf->copies; i++) {
 		struct bio **bio = & r10_bio->devs[i].bio;
-		if (*bio)
+		if (*bio && *bio != IO_BLOCKED)
 			bio_put(*bio);
 		*bio = NULL;
 	}
@@ -500,6 +500,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 		disk = r10_bio->devs[slot].devnum;
 
 		while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
+		       r10_bio->devs[slot].bio == IO_BLOCKED ||
 		       !test_bit(In_sync, &rdev->flags)) {
 			slot++;
 			if (slot == conf->copies) {
@@ -517,6 +518,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 	slot = 0;
 	disk = r10_bio->devs[slot].devnum;
 	while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
+	       r10_bio->devs[slot].bio == IO_BLOCKED ||
 	       !test_bit(In_sync, &rdev->flags)) {
 		slot ++;
 		if (slot == conf->copies) {
@@ -537,6 +539,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 
 
 		if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
+		    r10_bio->devs[nslot].bio == IO_BLOCKED ||
 		    !test_bit(In_sync, &rdev->flags))
 			continue;
 
@@ -1104,7 +1107,6 @@ abort:
 
 static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
 {
-	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
 	conf_t *conf = mddev_to_conf(r10_bio->mddev);
 	int i,d;
@@ -1119,7 +1121,10 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
 		BUG();
 	update_head_pos(i, r10_bio);
 	d = r10_bio->devs[i].devnum;
-	if (!uptodate)
+
+	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+		set_bit(R10BIO_Uptodate, &r10_bio->state);
+	else if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
 		md_error(r10_bio->mddev,
 			 conf->mirrors[d].rdev);
 
@@ -1209,25 +1214,30 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 	fbio = r10_bio->devs[i].bio;
 
 	/* now find blocks with errors */
-	for (i=first+1 ; i < conf->copies ; i++) {
-		int vcnt, j, d;
+	for (i=0 ; i < conf->copies ; i++) {
+		int  j, d;
+		int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
 
-		if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
-			continue;
-		/* We know that the bi_io_vec layout is the same for
-		 * both 'first' and 'i', so we just compare them.
-		 * All vec entries are PAGE_SIZE;
-		 */
 		tbio = r10_bio->devs[i].bio;
-		vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
-		for (j = 0; j < vcnt; j++)
-			if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
-				   page_address(tbio->bi_io_vec[j].bv_page),
-				   PAGE_SIZE))
-				break;
-		if (j == vcnt)
+
+		if (tbio->bi_end_io != end_sync_read)
+			continue;
+		if (i == first)
 			continue;
-		mddev->resync_mismatches += r10_bio->sectors;
+		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
+			/* We know that the bi_io_vec layout is the same for
+			 * both 'first' and 'i', so we just compare them.
+			 * All vec entries are PAGE_SIZE;
+			 */
+			for (j = 0; j < vcnt; j++)
+				if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
+					   page_address(tbio->bi_io_vec[j].bv_page),
+					   PAGE_SIZE))
+					break;
+			if (j == vcnt)
+				continue;
+			mddev->resync_mismatches += r10_bio->sectors;
+		}
 		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
 			/* Don't fix anything. */
 			continue;
@@ -1308,7 +1318,10 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 
 	atomic_inc(&conf->mirrors[d].rdev->nr_pending);
 	md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
-	generic_make_request(wbio);
+	if (test_bit(R10BIO_Uptodate, &r10_bio->state))
+		generic_make_request(wbio);
+	else
+		bio_endio(wbio, wbio->bi_size, -EIO);
 }
 
 
@@ -1445,7 +1458,8 @@ static void raid10d(mddev_t *mddev)
 			unfreeze_array(conf);
 
 			bio = r10_bio->devs[r10_bio->read_slot].bio;
-			r10_bio->devs[r10_bio->read_slot].bio = NULL;
+			r10_bio->devs[r10_bio->read_slot].bio =
+				mddev->ro ? IO_BLOCKED : NULL;
 			bio_put(bio);
 			mirror = read_balance(conf, r10_bio);
 			if (mirror == -1) {
diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h
index dfa528385e3f..b1103298a8c2 100644
--- a/include/linux/raid/raid10.h
+++ b/include/linux/raid/raid10.h
@@ -104,6 +104,13 @@ struct r10bio_s {
 	} devs[0];
 };
 
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error.  To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio*)1)
+
 /* bits for r10bio.state */
 #define	R10BIO_Uptodate	0
 #define	R10BIO_IsSync	1
-- 
cgit v1.2.3-71-gd317


From fccddba060f2b4916a30aa27acc3d03b01bb981e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:33 -0800
Subject: [PATCH] md: tidy up raid5/6 hash table code

- replace open-coded hash chain with hlist macros

- Fix hash-table size at one page - it is already quite generous, so there
  will never be a need to use multiple pages, so no need for __get_free_pages

No functional change.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid5.c         | 40 ++++++++++++++--------------------------
 drivers/md/raid6main.c     | 46 +++++++++++++++++-----------------------------
 include/linux/raid/raid5.h |  4 ++--
 3 files changed, 33 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9fc50487e2ed..6e4db95cebb1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -35,12 +35,10 @@
 #define STRIPE_SHIFT		(PAGE_SHIFT - 9)
 #define STRIPE_SECTORS		(STRIPE_SIZE>>9)
 #define	IO_THRESHOLD		1
-#define HASH_PAGES		1
-#define HASH_PAGES_ORDER	0
-#define NR_HASH			(HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
+#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
 #define HASH_MASK		(NR_HASH - 1)
 
-#define stripe_hash(conf, sect)	((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK])
+#define stripe_hash(conf, sect)	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
 
 /* bio's attached to a stripe+device for I/O are linked together in bi_sector
  * order without overlap.  There may be several bio's per stripe+device, and
@@ -113,29 +111,21 @@ static void release_stripe(struct stripe_head *sh)
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 }
 
-static void remove_hash(struct stripe_head *sh)
+static inline void remove_hash(struct stripe_head *sh)
 {
 	PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
 
-	if (sh->hash_pprev) {
-		if (sh->hash_next)
-			sh->hash_next->hash_pprev = sh->hash_pprev;
-		*sh->hash_pprev = sh->hash_next;
-		sh->hash_pprev = NULL;
-	}
+	hlist_del_init(&sh->hash);
 }
 
-static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
+static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
 {
-	struct stripe_head **shp = &stripe_hash(conf, sh->sector);
+	struct hlist_head *hp = stripe_hash(conf, sh->sector);
 
 	PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
 
 	CHECK_DEVLOCK();
-	if ((sh->hash_next = *shp) != NULL)
-		(*shp)->hash_pprev = &sh->hash_next;
-	*shp = sh;
-	sh->hash_pprev = shp;
+	hlist_add_head(&sh->hash, hp);
 }
 
 
@@ -228,10 +218,11 @@ static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_i
 static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
 {
 	struct stripe_head *sh;
+	struct hlist_node *hn;
 
 	CHECK_DEVLOCK();
 	PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
-	for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
+	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
 		if (sh->sector == sector)
 			return sh;
 	PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
@@ -1835,9 +1826,8 @@ static int run(mddev_t *mddev)
 
 	conf->mddev = mddev;
 
-	if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
+	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
 		goto abort;
-	memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
 
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
@@ -1972,9 +1962,7 @@ static int run(mddev_t *mddev)
 abort:
 	if (conf) {
 		print_raid5_conf(conf);
-		if (conf->stripe_hashtbl)
-			free_pages((unsigned long) conf->stripe_hashtbl,
-							HASH_PAGES_ORDER);
+		kfree(conf->stripe_hashtbl);
 		kfree(conf);
 	}
 	mddev->private = NULL;
@@ -1991,7 +1979,7 @@ static int stop(mddev_t *mddev)
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
 	shrink_stripes(conf);
-	free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
+	kfree(conf->stripe_hashtbl);
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
 	kfree(conf);
@@ -2019,12 +2007,12 @@ static void print_sh (struct stripe_head *sh)
 static void printall (raid5_conf_t *conf)
 {
 	struct stripe_head *sh;
+	struct hlist_node *hn;
 	int i;
 
 	spin_lock_irq(&conf->device_lock);
 	for (i = 0; i < NR_HASH; i++) {
-		sh = conf->stripe_hashtbl[i];
-		for (; sh; sh = sh->hash_next) {
+		hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
 			if (sh->raid_conf != conf)
 				continue;
 			print_sh(sh);
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 4062fc16ac2b..79b5244f44f4 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -40,12 +40,10 @@
 #define STRIPE_SHIFT		(PAGE_SHIFT - 9)
 #define STRIPE_SECTORS		(STRIPE_SIZE>>9)
 #define	IO_THRESHOLD		1
-#define HASH_PAGES		1
-#define HASH_PAGES_ORDER	0
-#define NR_HASH			(HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
+#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
 #define HASH_MASK		(NR_HASH - 1)
 
-#define stripe_hash(conf, sect)	((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK])
+#define stripe_hash(conf, sect)	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
 
 /* bio's attached to a stripe+device for I/O are linked together in bi_sector
  * order without overlap.  There may be several bio's per stripe+device, and
@@ -132,29 +130,21 @@ static void release_stripe(struct stripe_head *sh)
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 }
 
-static void remove_hash(struct stripe_head *sh)
+static inline void remove_hash(struct stripe_head *sh)
 {
 	PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
 
-	if (sh->hash_pprev) {
-		if (sh->hash_next)
-			sh->hash_next->hash_pprev = sh->hash_pprev;
-		*sh->hash_pprev = sh->hash_next;
-		sh->hash_pprev = NULL;
-	}
+	hlist_del_init(&sh->hash);
 }
 
-static __inline__ void insert_hash(raid6_conf_t *conf, struct stripe_head *sh)
+static inline void insert_hash(raid6_conf_t *conf, struct stripe_head *sh)
 {
-	struct stripe_head **shp = &stripe_hash(conf, sh->sector);
+	struct hlist_head *hp = stripe_hash(conf, sh->sector);
 
 	PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
 
 	CHECK_DEVLOCK();
-	if ((sh->hash_next = *shp) != NULL)
-		(*shp)->hash_pprev = &sh->hash_next;
-	*shp = sh;
-	sh->hash_pprev = shp;
+	hlist_add_head(&sh->hash, hp);
 }
 
 
@@ -247,10 +237,11 @@ static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_i
 static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector)
 {
 	struct stripe_head *sh;
+	struct hlist_node *hn;
 
 	CHECK_DEVLOCK();
 	PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
-	for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
+	hlist_for_each_entry (sh, hn,  stripe_hash(conf, sector), hash)
 		if (sh->sector == sector)
 			return sh;
 	PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
@@ -1931,17 +1922,15 @@ static int run(mddev_t *mddev)
 		return -EIO;
 	}
 
-	mddev->private = kmalloc (sizeof (raid6_conf_t)
-				  + mddev->raid_disks * sizeof(struct disk_info),
-				  GFP_KERNEL);
+	mddev->private = kzalloc(sizeof (raid6_conf_t)
+				 + mddev->raid_disks * sizeof(struct disk_info),
+				 GFP_KERNEL);
 	if ((conf = mddev->private) == NULL)
 		goto abort;
-	memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) );
 	conf->mddev = mddev;
 
-	if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
+	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
 		goto abort;
-	memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
 
 	conf->spare_page = alloc_page(GFP_KERNEL);
 	if (!conf->spare_page)
@@ -2085,9 +2074,7 @@ abort:
 		print_raid6_conf(conf);
 		if (conf->spare_page)
 			put_page(conf->spare_page);
-		if (conf->stripe_hashtbl)
-			free_pages((unsigned long) conf->stripe_hashtbl,
-							HASH_PAGES_ORDER);
+		kfree(conf->stripe_hashtbl);
 		kfree(conf);
 	}
 	mddev->private = NULL;
@@ -2104,7 +2091,7 @@ static int stop (mddev_t *mddev)
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
 	shrink_stripes(conf);
-	free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
+	kfree(conf->stripe_hashtbl);
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	kfree(conf);
 	mddev->private = NULL;
@@ -2131,12 +2118,13 @@ static void print_sh (struct seq_file *seq, struct stripe_head *sh)
 static void printall (struct seq_file *seq, raid6_conf_t *conf)
 {
 	struct stripe_head *sh;
+	struct hlist_node *hn;
 	int i;
 
 	spin_lock_irq(&conf->device_lock);
 	for (i = 0; i < NR_HASH; i++) {
 		sh = conf->stripe_hashtbl[i];
-		for (; sh; sh = sh->hash_next) {
+		hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
 			if (sh->raid_conf != conf)
 				continue;
 			print_sh(seq, sh);
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 28fcd7533ac4..394da8207b34 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -126,7 +126,7 @@
  */
 
 struct stripe_head {
-	struct stripe_head	*hash_next, **hash_pprev; /* hash pointers */
+	struct hlist_node	hash;
 	struct list_head	lru;			/* inactive_list or handle_list */
 	struct raid5_private_data	*raid_conf;
 	sector_t		sector;			/* sector of this row */
@@ -204,7 +204,7 @@ struct disk_info {
 };
 
 struct raid5_private_data {
-	struct stripe_head	**stripe_hashtbl;
+	struct hlist_head	*stripe_hashtbl;
 	mddev_t			*mddev;
 	struct disk_info	*spare;
 	int			chunk_size, level, algorithm;
-- 
cgit v1.2.3-71-gd317


From 2604b703b6b3db80e3c75ce472a54dfd0b7bf9f4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:36 -0800
Subject: [PATCH] md: remove personality numbering from md

md supports multiple different RAID level, each being implemented by a
'personality' (which is often in a separate module).

These personalities have fairly artificial 'numbers'.  The numbers
are use to:
 1- provide an index into an array where the various personalities
    are recorded
 2- identify the module (via an alias) which implements are particular
    personality.

Neither of these uses really justify the existence of personality numbers.
The array can be replaced by a linked list which is searched (array lookup
only happens very rarely).  Module identification can be done using an alias
based on level rather than 'personality' number.

The current 'raid5' modules support two level (4 and 5) but only one
personality.  This slight awkwardness (which was handled in the mapping from
level to personality) can be better handled by allowing raid5 to register 2
personalities.

With this change in place, the core md module does not need to have an
exhaustive list of all possible personalities, so other personalities can be
added independently.

This patch also moves the check for chunksize being non-zero into the ->run
routines for the personalities that need it, rather than having it in core-md.
 This has a side effect of allowing 'faulty' and 'linear' not to have a
chunk-size set.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/faulty.c       |  8 +++--
 drivers/md/linear.c       | 10 +++---
 drivers/md/md.c           | 79 +++++++++++++++++------------------------------
 drivers/md/multipath.c    | 11 +++----
 drivers/md/raid0.c        | 14 ++++++---
 drivers/md/raid1.c        |  9 +++---
 drivers/md/raid10.c       | 16 +++++-----
 drivers/md/raid5.c        | 34 +++++++++++++++++---
 drivers/md/raid6main.c    | 10 +++---
 include/linux/raid/md.h   |  4 +--
 include/linux/raid/md_k.h | 63 ++++++-------------------------------
 init/do_mounts_md.c       | 22 ++++++-------
 12 files changed, 125 insertions(+), 155 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 0248f8e7eac0..f12e83086897 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -316,9 +316,10 @@ static int stop(mddev_t *mddev)
 	return 0;
 }
 
-static mdk_personality_t faulty_personality =
+static struct mdk_personality faulty_personality =
 {
 	.name		= "faulty",
+	.level		= LEVEL_FAULTY,
 	.owner		= THIS_MODULE,
 	.make_request	= make_request,
 	.run		= run,
@@ -329,15 +330,16 @@ static mdk_personality_t faulty_personality =
 
 static int __init raid_init(void)
 {
-	return register_md_personality(FAULTY, &faulty_personality);
+	return register_md_personality(&faulty_personality);
 }
 
 static void raid_exit(void)
 {
-	unregister_md_personality(FAULTY);
+	unregister_md_personality(&faulty_personality);
 }
 
 module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-10"); /* faulty */
+MODULE_ALIAS("md-level--5");
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index f46c98d05b44..79dee8159217 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -351,9 +351,10 @@ static void linear_status (struct seq_file *seq, mddev_t *mddev)
 }
 
 
-static mdk_personality_t linear_personality=
+static struct mdk_personality linear_personality =
 {
 	.name		= "linear",
+	.level		= LEVEL_LINEAR,
 	.owner		= THIS_MODULE,
 	.make_request	= linear_make_request,
 	.run		= linear_run,
@@ -363,16 +364,17 @@ static mdk_personality_t linear_personality=
 
 static int __init linear_init (void)
 {
-	return register_md_personality (LINEAR, &linear_personality);
+	return register_md_personality (&linear_personality);
 }
 
 static void linear_exit (void)
 {
-	unregister_md_personality (LINEAR);
+	unregister_md_personality (&linear_personality);
 }
 
 
 module_init(linear_init);
 module_exit(linear_exit);
 MODULE_LICENSE("GPL");
-MODULE_ALIAS("md-personality-1"); /* LINEAR */
+MODULE_ALIAS("md-personality-1"); /* LINEAR - degrecated*/
+MODULE_ALIAS("md-level--1");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a6a066fc92e3..07f180f95b47 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -68,7 +68,7 @@
 static void autostart_arrays (int part);
 #endif
 
-static mdk_personality_t *pers[MAX_PERSONALITY];
+static LIST_HEAD(pers_list);
 static DEFINE_SPINLOCK(pers_lock);
 
 /*
@@ -303,6 +303,15 @@ static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
 	return NULL;
 }
 
+static struct mdk_personality *find_pers(int level)
+{
+	struct mdk_personality *pers;
+	list_for_each_entry(pers, &pers_list, list)
+		if (pers->level == level)
+			return pers;
+	return NULL;
+}
+
 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
 {
 	sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
@@ -1744,7 +1753,7 @@ static void analyze_sbs(mddev_t * mddev)
 static ssize_t
 level_show(mddev_t *mddev, char *page)
 {
-	mdk_personality_t *p = mddev->pers;
+	struct mdk_personality *p = mddev->pers;
 	if (p == NULL && mddev->raid_disks == 0)
 		return 0;
 	if (mddev->level >= 0)
@@ -1960,11 +1969,12 @@ static int start_dirty_degraded;
 
 static int do_md_run(mddev_t * mddev)
 {
-	int pnum, err;
+	int err;
 	int chunk_size;
 	struct list_head *tmp;
 	mdk_rdev_t *rdev;
 	struct gendisk *disk;
+	struct mdk_personality *pers;
 	char b[BDEVNAME_SIZE];
 
 	if (list_empty(&mddev->disks))
@@ -1981,20 +1991,8 @@ static int do_md_run(mddev_t * mddev)
 		analyze_sbs(mddev);
 
 	chunk_size = mddev->chunk_size;
-	pnum = level_to_pers(mddev->level);
 
-	if ((pnum != MULTIPATH) && (pnum != RAID1)) {
-		if (!chunk_size) {
-			/*
-			 * 'default chunksize' in the old md code used to
-			 * be PAGE_SIZE, baaad.
-			 * we abort here to be on the safe side. We don't
-			 * want to continue the bad practice.
-			 */
-			printk(KERN_ERR 
-				"no chunksize specified, see 'man raidtab'\n");
-			return -EINVAL;
-		}
+	if (chunk_size) {
 		if (chunk_size > MAX_CHUNK_SIZE) {
 			printk(KERN_ERR "too big chunk_size: %d > %d\n",
 				chunk_size, MAX_CHUNK_SIZE);
@@ -2030,10 +2028,7 @@ static int do_md_run(mddev_t * mddev)
 	}
 
 #ifdef CONFIG_KMOD
-	if (!pers[pnum])
-	{
-		request_module("md-personality-%d", pnum);
-	}
+	request_module("md-level-%d", mddev->level);
 #endif
 
 	/*
@@ -2055,14 +2050,14 @@ static int do_md_run(mddev_t * mddev)
 		return -ENOMEM;
 
 	spin_lock(&pers_lock);
-	if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) {
+	pers = find_pers(mddev->level);
+	if (!pers || !try_module_get(pers->owner)) {
 		spin_unlock(&pers_lock);
-		printk(KERN_WARNING "md: personality %d is not loaded!\n",
-		       pnum);
+		printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
+		       mddev->level);
 		return -EINVAL;
 	}
-
-	mddev->pers = pers[pnum];
+	mddev->pers = pers;
 	spin_unlock(&pers_lock);
 
 	mddev->recovery = 0;
@@ -3701,15 +3696,14 @@ static int md_seq_show(struct seq_file *seq, void *v)
 	struct list_head *tmp2;
 	mdk_rdev_t *rdev;
 	struct mdstat_info *mi = seq->private;
-	int i;
 	struct bitmap *bitmap;
 
 	if (v == (void*)1) {
+		struct mdk_personality *pers;
 		seq_printf(seq, "Personalities : ");
 		spin_lock(&pers_lock);
-		for (i = 0; i < MAX_PERSONALITY; i++)
-			if (pers[i])
-				seq_printf(seq, "[%s] ", pers[i]->name);
+		list_for_each_entry(pers, &pers_list, list)
+			seq_printf(seq, "[%s] ", pers->name);
 
 		spin_unlock(&pers_lock);
 		seq_printf(seq, "\n");
@@ -3870,35 +3864,20 @@ static struct file_operations md_seq_fops = {
 	.poll		= mdstat_poll,
 };
 
-int register_md_personality(int pnum, mdk_personality_t *p)
+int register_md_personality(struct mdk_personality *p)
 {
-	if (pnum >= MAX_PERSONALITY) {
-		printk(KERN_ERR
-		       "md: tried to install personality %s as nr %d, but max is %lu\n",
-		       p->name, pnum, MAX_PERSONALITY-1);
-		return -EINVAL;
-	}
-
 	spin_lock(&pers_lock);
-	if (pers[pnum]) {
-		spin_unlock(&pers_lock);
-		return -EBUSY;
-	}
-
-	pers[pnum] = p;
-	printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
+	list_add_tail(&p->list, &pers_list);
+	printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
 	spin_unlock(&pers_lock);
 	return 0;
 }
 
-int unregister_md_personality(int pnum)
+int unregister_md_personality(struct mdk_personality *p)
 {
-	if (pnum >= MAX_PERSONALITY)
-		return -EINVAL;
-
-	printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
+	printk(KERN_INFO "md: %s personality unregistered\n", p->name);
 	spin_lock(&pers_lock);
-	pers[pnum] = NULL;
+	list_del_init(&p->list);
 	spin_unlock(&pers_lock);
 	return 0;
 }
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 97a56aaaef6d..d4d838e3f9f8 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -35,9 +35,6 @@
 #define	NR_RESERVED_BUFS	32
 
 
-static mdk_personality_t multipath_personality;
-
-
 static void *mp_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct multipath_bh *mpb;
@@ -553,9 +550,10 @@ static int multipath_stop (mddev_t *mddev)
 	return 0;
 }
 
-static mdk_personality_t multipath_personality=
+static struct mdk_personality multipath_personality =
 {
 	.name		= "multipath",
+	.level		= LEVEL_MULTIPATH,
 	.owner		= THIS_MODULE,
 	.make_request	= multipath_make_request,
 	.run		= multipath_run,
@@ -568,15 +566,16 @@ static mdk_personality_t multipath_personality=
 
 static int __init multipath_init (void)
 {
-	return register_md_personality (MULTIPATH, &multipath_personality);
+	return register_md_personality (&multipath_personality);
 }
 
 static void __exit multipath_exit (void)
 {
-	unregister_md_personality (MULTIPATH);
+	unregister_md_personality (&multipath_personality);
 }
 
 module_init(multipath_init);
 module_exit(multipath_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
+MODULE_ALIAS("md-level--4");
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index b4eaa67fabde..7fb69e29391b 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -275,7 +275,11 @@ static int raid0_run (mddev_t *mddev)
 	mdk_rdev_t *rdev;
 	struct list_head *tmp;
 
-	printk("%s: setting max_sectors to %d, segment boundary to %d\n",
+	if (mddev->chunk_size == 0) {
+		printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
+		return -EINVAL;
+	}
+	printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n",
 	       mdname(mddev),
 	       mddev->chunk_size >> 9,
 	       (mddev->chunk_size>>1)-1);
@@ -507,9 +511,10 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev)
 	return;
 }
 
-static mdk_personality_t raid0_personality=
+static struct mdk_personality raid0_personality=
 {
 	.name		= "raid0",
+	.level		= 0,
 	.owner		= THIS_MODULE,
 	.make_request	= raid0_make_request,
 	.run		= raid0_run,
@@ -519,15 +524,16 @@ static mdk_personality_t raid0_personality=
 
 static int __init raid0_init (void)
 {
-	return register_md_personality (RAID0, &raid0_personality);
+	return register_md_personality (&raid0_personality);
 }
 
 static void raid0_exit (void)
 {
-	unregister_md_personality (RAID0);
+	unregister_md_personality (&raid0_personality);
 }
 
 module_init(raid0_init);
 module_exit(raid0_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-2"); /* RAID0 */
+MODULE_ALIAS("md-level-0");
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c42ef1c99fa0..6e0f59ed3d80 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -47,7 +47,6 @@
  */
 #define	NR_RAID1_BIOS 256
 
-static mdk_personality_t raid1_personality;
 
 static void unplug_slaves(mddev_t *mddev);
 
@@ -2036,9 +2035,10 @@ static void raid1_quiesce(mddev_t *mddev, int state)
 }
 
 
-static mdk_personality_t raid1_personality =
+static struct mdk_personality raid1_personality =
 {
 	.name		= "raid1",
+	.level		= 1,
 	.owner		= THIS_MODULE,
 	.make_request	= make_request,
 	.run		= run,
@@ -2056,15 +2056,16 @@ static mdk_personality_t raid1_personality =
 
 static int __init raid_init(void)
 {
-	return register_md_personality(RAID1, &raid1_personality);
+	return register_md_personality(&raid1_personality);
 }
 
 static void raid_exit(void)
 {
-	unregister_md_personality(RAID1);
+	unregister_md_personality(&raid1_personality);
 }
 
 module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-3"); /* RAID1 */
+MODULE_ALIAS("md-level-1");
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 253322ae9195..f23d52c5df94 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1883,11 +1883,11 @@ static int run(mddev_t *mddev)
 	int nc, fc;
 	sector_t stride, size;
 
-	if (mddev->level != 10) {
-		printk(KERN_ERR "raid10: %s: raid level not set correctly... (%d)\n",
-		       mdname(mddev), mddev->level);
-		goto out;
+	if (mddev->chunk_size == 0) {
+		printk(KERN_ERR "md/raid10: non-zero chunk size required.\n");
+		return -EINVAL;
 	}
+
 	nc = mddev->layout & 255;
 	fc = (mddev->layout >> 8) & 255;
 	if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
@@ -2072,9 +2072,10 @@ static void raid10_quiesce(mddev_t *mddev, int state)
 	}
 }
 
-static mdk_personality_t raid10_personality =
+static struct mdk_personality raid10_personality =
 {
 	.name		= "raid10",
+	.level		= 10,
 	.owner		= THIS_MODULE,
 	.make_request	= make_request,
 	.run		= run,
@@ -2090,15 +2091,16 @@ static mdk_personality_t raid10_personality =
 
 static int __init raid_init(void)
 {
-	return register_md_personality(RAID10, &raid10_personality);
+	return register_md_personality(&raid10_personality);
 }
 
 static void raid_exit(void)
 {
-	unregister_md_personality(RAID10);
+	unregister_md_personality(&raid10_personality);
 }
 
 module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-9"); /* RAID10 */
+MODULE_ALIAS("md-level-10");
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6e4db95cebb1..b0cfd3ca9ca0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2187,9 +2187,10 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 	}
 }
 
-static mdk_personality_t raid5_personality=
+static struct mdk_personality raid5_personality =
 {
 	.name		= "raid5",
+	.level		= 5,
 	.owner		= THIS_MODULE,
 	.make_request	= make_request,
 	.run		= run,
@@ -2204,17 +2205,40 @@ static mdk_personality_t raid5_personality=
 	.quiesce	= raid5_quiesce,
 };
 
-static int __init raid5_init (void)
+static struct mdk_personality raid4_personality =
 {
-	return register_md_personality (RAID5, &raid5_personality);
+	.name		= "raid4",
+	.level		= 4,
+	.owner		= THIS_MODULE,
+	.make_request	= make_request,
+	.run		= run,
+	.stop		= stop,
+	.status		= status,
+	.error_handler	= error,
+	.hot_add_disk	= raid5_add_disk,
+	.hot_remove_disk= raid5_remove_disk,
+	.spare_active	= raid5_spare_active,
+	.sync_request	= sync_request,
+	.resize		= raid5_resize,
+	.quiesce	= raid5_quiesce,
+};
+
+static int __init raid5_init(void)
+{
+	register_md_personality(&raid5_personality);
+	register_md_personality(&raid4_personality);
+	return 0;
 }
 
-static void raid5_exit (void)
+static void raid5_exit(void)
 {
-	unregister_md_personality (RAID5);
+	unregister_md_personality(&raid5_personality);
+	unregister_md_personality(&raid4_personality);
 }
 
 module_init(raid5_init);
 module_exit(raid5_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-4"); /* RAID5 */
+MODULE_ALIAS("md-level-5");
+MODULE_ALIAS("md-level-4");
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 79b5244f44f4..950e5fa6e1f2 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -2304,9 +2304,10 @@ static void raid6_quiesce(mddev_t *mddev, int state)
 	}
 }
 
-static mdk_personality_t raid6_personality=
+static struct mdk_personality raid6_personality =
 {
 	.name		= "raid6",
+	.level		= 6,
 	.owner		= THIS_MODULE,
 	.make_request	= make_request,
 	.run		= run,
@@ -2321,7 +2322,7 @@ static mdk_personality_t raid6_personality=
 	.quiesce	= raid6_quiesce,
 };
 
-static int __init raid6_init (void)
+static int __init raid6_init(void)
 {
 	int e;
 
@@ -2329,15 +2330,16 @@ static int __init raid6_init (void)
 	if ( e )
 		return e;
 
-	return register_md_personality (RAID6, &raid6_personality);
+	return register_md_personality(&raid6_personality);
 }
 
 static void raid6_exit (void)
 {
-	unregister_md_personality (RAID6);
+	unregister_md_personality(&raid6_personality);
 }
 
 module_init(raid6_init);
 module_exit(raid6_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-8"); /* RAID6 */
+MODULE_ALIAS("md-level-6");
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index 13e7c4b62367..b6e0bcad84e1 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -71,8 +71,8 @@
  */
 #define MD_PATCHLEVEL_VERSION           3
 
-extern int register_md_personality (int p_num, mdk_personality_t *p);
-extern int unregister_md_personality (int p_num);
+extern int register_md_personality (struct mdk_personality *p);
+extern int unregister_md_personality (struct mdk_personality *p);
 extern mdk_thread_t * md_register_thread (void (*run) (mddev_t *mddev),
 				mddev_t *mddev, const char *name);
 extern void md_unregister_thread (mdk_thread_t *thread);
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 1dd587b5975a..e559fb701aa1 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -18,62 +18,19 @@
 /* and dm-bio-list.h is not under include/linux because.... ??? */
 #include "../../../drivers/md/dm-bio-list.h"
 
-#define MD_RESERVED       0UL
-#define LINEAR            1UL
-#define RAID0             2UL
-#define RAID1             3UL
-#define RAID5             4UL
-#define TRANSLUCENT       5UL
-#define HSM               6UL
-#define MULTIPATH         7UL
-#define RAID6		  8UL
-#define	RAID10		  9UL
-#define FAULTY		  10UL
-#define MAX_PERSONALITY   11UL
-
 #define	LEVEL_MULTIPATH		(-4)
 #define	LEVEL_LINEAR		(-1)
 #define	LEVEL_FAULTY		(-5)
 
+/* we need a value for 'no level specified' and 0
+ * means 'raid0', so we need something else.  This is
+ * for internal use only
+ */
+#define	LEVEL_NONE		(-1000000)
+
 #define MaxSector (~(sector_t)0)
 #define MD_THREAD_NAME_MAX 14
 
-static inline int pers_to_level (int pers)
-{
-	switch (pers) {
-		case FAULTY:		return LEVEL_FAULTY;
-		case MULTIPATH:		return LEVEL_MULTIPATH;
-		case HSM:		return -3;
-		case TRANSLUCENT:	return -2;
-		case LINEAR:		return LEVEL_LINEAR;
-		case RAID0:		return 0;
-		case RAID1:		return 1;
-		case RAID5:		return 5;
-		case RAID6:		return 6;
-		case RAID10:		return 10;
-	}
-	BUG();
-	return MD_RESERVED;
-}
-
-static inline int level_to_pers (int level)
-{
-	switch (level) {
-		case LEVEL_FAULTY: return FAULTY;
-		case LEVEL_MULTIPATH: return MULTIPATH;
-		case -3: return HSM;
-		case -2: return TRANSLUCENT;
-		case LEVEL_LINEAR: return LINEAR;
-		case 0: return RAID0;
-		case 1: return RAID1;
-		case 4:
-		case 5: return RAID5;
-		case 6: return RAID6;
-		case 10: return RAID10;
-	}
-	return MD_RESERVED;
-}
-
 typedef struct mddev_s mddev_t;
 typedef struct mdk_rdev_s mdk_rdev_t;
 
@@ -140,12 +97,10 @@ struct mdk_rdev_s
 					 */
 };
 
-typedef struct mdk_personality_s mdk_personality_t;
-
 struct mddev_s
 {
 	void				*private;
-	mdk_personality_t		*pers;
+	struct mdk_personality		*pers;
 	dev_t				unit;
 	int				md_minor;
 	struct list_head 		disks;
@@ -266,9 +221,11 @@ static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sect
         atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
 }
 
-struct mdk_personality_s
+struct mdk_personality
 {
 	char *name;
+	int level;
+	struct list_head list;
 	struct module *owner;
 	int (*make_request)(request_queue_t *q, struct bio *bio);
 	int (*run)(mddev_t *mddev);
diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c
index 3fbc3555ce96..f6f36806f84a 100644
--- a/init/do_mounts_md.c
+++ b/init/do_mounts_md.c
@@ -17,7 +17,7 @@ static int __initdata raid_noautodetect, raid_autopart;
 static struct {
 	int minor;
 	int partitioned;
-	int pers;
+	int level;
 	int chunk;
 	char *device_names;
 } md_setup_args[MAX_MD_DEVS] __initdata;
@@ -47,7 +47,7 @@ extern int mdp_major;
  */
 static int __init md_setup(char *str)
 {
-	int minor, level, factor, fault, pers, partitioned = 0;
+	int minor, level, factor, fault, partitioned = 0;
 	char *pername = "";
 	char *str1;
 	int ent;
@@ -78,7 +78,7 @@ static int __init md_setup(char *str)
 	}
 	if (ent >= md_setup_ents)
 		md_setup_ents++;
-	switch (get_option(&str, &level)) {	/* RAID Personality */
+	switch (get_option(&str, &level)) {	/* RAID level */
 	case 2: /* could be 0 or -1.. */
 		if (level == 0 || level == LEVEL_LINEAR) {
 			if (get_option(&str, &factor) != 2 ||	/* Chunk Size */
@@ -86,16 +86,12 @@ static int __init md_setup(char *str)
 				printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
 				return 0;
 			}
-			md_setup_args[ent].pers = level;
+			md_setup_args[ent].level = level;
 			md_setup_args[ent].chunk = 1 << (factor+12);
-			if (level ==  LEVEL_LINEAR) {
-				pers = LINEAR;
+			if (level ==  LEVEL_LINEAR)
 				pername = "linear";
-			} else {
-				pers = RAID0;
+			else
 				pername = "raid0";
-			}
-			md_setup_args[ent].pers = pers;
 			break;
 		}
 		/* FALL THROUGH */
@@ -103,7 +99,7 @@ static int __init md_setup(char *str)
 		str = str1;
 		/* FALL THROUGH */
 	case 0:
-		md_setup_args[ent].pers = 0;
+		md_setup_args[ent].level = LEVEL_NONE;
 		pername="super-block";
 	}
 
@@ -190,10 +186,10 @@ static void __init md_setup_drive(void)
 			continue;
 		}
 
-		if (md_setup_args[ent].pers) {
+		if (md_setup_args[ent].level != LEVEL_NONE) {
 			/* non-persistent */
 			mdu_array_info_t ainfo;
-			ainfo.level = pers_to_level(md_setup_args[ent].pers);
+			ainfo.level = md_setup_args[ent].level;
 			ainfo.size = 0;
 			ainfo.nr_disks =0;
 			ainfo.raid_disks =0;
-- 
cgit v1.2.3-71-gd317


From 1345b1d8adbdeceb1c871d9a4af5e2a700b341c6 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:40 -0800
Subject: [PATCH] md: define and use safe_put_page for md

md sometimes call put_page on NULL pointers (treating it like kfree).  This is
not safe, so define and use a 'safe_put_page' which checks for NULL.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/bitmap.c       | 3 +--
 drivers/md/raid1.c        | 8 ++++----
 drivers/md/raid10.c       | 8 ++++----
 drivers/md/raid6main.c    | 3 +--
 include/linux/raid/md_k.h | 5 +++++
 5 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index abe415f0c039..ee4a3424a8a3 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -626,8 +626,7 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
 	kfree(map);
 	kfree(attr);
 
-	if (sb_page)
-		put_page(sb_page);
+	safe_put_page(sb_page);
 }
 
 static void bitmap_stop_daemon(struct bitmap *bitmap);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 39c10a65683d..feea4eeca1d9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -136,7 +136,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 out_free_pages:
 	for (i=0; i < RESYNC_PAGES ; i++)
 		for (j=0 ; j < pi->raid_disks; j++)
-			put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
+			safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
 	j = -1;
 out_free_bio:
 	while ( ++j < pi->raid_disks )
@@ -156,7 +156,7 @@ static void r1buf_pool_free(void *__r1_bio, void *data)
 			if (j == 0 ||
 			    r1bio->bios[j]->bi_io_vec[i].bv_page !=
 			    r1bio->bios[0]->bi_io_vec[i].bv_page)
-				put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
+				safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
 		}
 	for (i=0 ; i < pi->raid_disks; i++)
 		bio_put(r1bio->bios[i]);
@@ -381,7 +381,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
 			/* free extra copy of the data pages */
 			int i = bio->bi_vcnt;
 			while (i--)
-				put_page(bio->bi_io_vec[i].bv_page);
+				safe_put_page(bio->bi_io_vec[i].bv_page);
 		}
 		/* clear the bitmap if all writes complete successfully */
 		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
@@ -1907,7 +1907,7 @@ out_free_conf:
 		if (conf->r1bio_pool)
 			mempool_destroy(conf->r1bio_pool);
 		kfree(conf->mirrors);
-		put_page(conf->tmppage);
+		safe_put_page(conf->tmppage);
 		kfree(conf->poolinfo);
 		kfree(conf);
 		mddev->private = NULL;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 9647ebb0983a..fb952000fae2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -132,10 +132,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 
 out_free_pages:
 	for ( ; i > 0 ; i--)
-		put_page(bio->bi_io_vec[i-1].bv_page);
+		safe_put_page(bio->bi_io_vec[i-1].bv_page);
 	while (j--)
 		for (i = 0; i < RESYNC_PAGES ; i++)
-			put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
+			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
 	j = -1;
 out_free_bio:
 	while ( ++j < nalloc )
@@ -155,7 +155,7 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
 		struct bio *bio = r10bio->devs[j].bio;
 		if (bio) {
 			for (i = 0; i < RESYNC_PAGES; i++) {
-				put_page(bio->bi_io_vec[i].bv_page);
+				safe_put_page(bio->bi_io_vec[i].bv_page);
 				bio->bi_io_vec[i].bv_page = NULL;
 			}
 			bio_put(bio);
@@ -2042,7 +2042,7 @@ static int run(mddev_t *mddev)
 out_free_conf:
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
-	put_page(conf->tmppage);
+	safe_put_page(conf->tmppage);
 	kfree(conf->mirrors);
 	kfree(conf);
 	mddev->private = NULL;
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 950e5fa6e1f2..06b32bd671a3 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -2072,8 +2072,7 @@ static int run(mddev_t *mddev)
 abort:
 	if (conf) {
 		print_raid6_conf(conf);
-		if (conf->spare_page)
-			put_page(conf->spare_page);
+		safe_put_page(conf->spare_page);
 		kfree(conf->stripe_hashtbl);
 		kfree(conf);
 	}
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index e559fb701aa1..12b3203e3419 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -324,5 +324,10 @@ do {									\
 	__wait_event_lock_irq(wq, condition, lock, cmd);		\
 } while (0)
 
+static inline void safe_put_page(struct page *p)
+{
+	if (p) put_page(p);
+}
+
 #endif
 
-- 
cgit v1.2.3-71-gd317


From 2989ddbd6e1d9638a188311b896362c4bf7b7c25 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:43 -0800
Subject: [PATCH] md: make a couple of names in md.c static

.. because they aren't used outside md.c

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c           | 4 ++--
 include/linux/raid/md_k.h | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index c1613854f38d..3cf089349e77 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -144,7 +144,7 @@ static int start_readonly;
  *  start array, stop array, error, add device, remove device,
  *  start build, activate spare
  */
-DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
+static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
 static atomic_t md_event_count;
 void md_new_event(mddev_t *mddev)
 {
@@ -279,7 +279,7 @@ static inline void mddev_unlock(mddev_t * mddev)
 	md_wakeup_thread(mddev->thread);
 }
 
-mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
 {
 	mdk_rdev_t * rdev;
 	struct list_head *tmp;
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 12b3203e3419..0fb5af6d622d 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -263,8 +263,6 @@ static inline char * mdname (mddev_t * mddev)
 	return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
 }
 
-extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);
-
 /*
  * iterates through some rdev ringlist. It's safe to remove the
  * current 'rdev'. Dont touch 'tmp' though.
-- 
cgit v1.2.3-71-gd317


From d9d166c2a9d5d01af34396793950aa695883eed4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:51 -0800
Subject: [PATCH] md: allow array level to be set textually via sysfs

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt      |  8 +++++++
 drivers/md/faulty.c       |  1 +
 drivers/md/linear.c       |  3 ++-
 drivers/md/md.c           | 61 +++++++++++++++++++++++++++++++++++++----------
 drivers/md/multipath.c    |  1 +
 drivers/md/raid0.c        |  1 +
 drivers/md/raid1.c        |  1 +
 drivers/md/raid10.c       |  1 +
 drivers/md/raid5.c        |  2 ++
 drivers/md/raid6main.c    |  1 +
 include/linux/raid/md_k.h |  1 +
 11 files changed, 67 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index c5512afd5917..fd43fd2cad2f 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -189,6 +189,14 @@ All md devices contain:
      1.2 (newer format in varying locations) or "none" indicating that
      the kernel isn't managing metadata at all.
 
+  level
+     The raid 'level' for this array.  The name will often (but not
+     always) be the same as the name of the module that implements the
+     level.  To be auto-loaded the module must have an alias
+        md-$LEVEL  e.g. md-raid5
+     This can be written only while the array is being assembled, not
+     after it is started.
+
 As component devices are added to an md array, they appear in the 'md'
 directory as new directories named
       dev-XXX
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index f12e83086897..a7a5ab554338 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -342,4 +342,5 @@ module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-10"); /* faulty */
+MODULE_ALIAS("md-faulty");
 MODULE_ALIAS("md-level--5");
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 79dee8159217..777585458c85 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -376,5 +376,6 @@ static void linear_exit (void)
 module_init(linear_init);
 module_exit(linear_exit);
 MODULE_LICENSE("GPL");
-MODULE_ALIAS("md-personality-1"); /* LINEAR - degrecated*/
+MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
+MODULE_ALIAS("md-linear");
 MODULE_ALIAS("md-level--1");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ecc0166ba779..594d8c312e6a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -303,12 +303,15 @@ static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
 	return NULL;
 }
 
-static struct mdk_personality *find_pers(int level)
+static struct mdk_personality *find_pers(int level, char *clevel)
 {
 	struct mdk_personality *pers;
-	list_for_each_entry(pers, &pers_list, list)
-		if (pers->level == level)
+	list_for_each_entry(pers, &pers_list, list) {
+		if (level != LEVEL_NONE && pers->level == level)
 			return pers;
+		if (strcmp(pers->name, clevel)==0)
+			return pers;
+	}
 	return NULL;
 }
 
@@ -715,6 +718,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->ctime = sb->ctime;
 		mddev->utime = sb->utime;
 		mddev->level = sb->level;
+		mddev->clevel[0] = 0;
 		mddev->layout = sb->layout;
 		mddev->raid_disks = sb->raid_disks;
 		mddev->size = sb->size;
@@ -1051,6 +1055,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
 		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
 		mddev->level = le32_to_cpu(sb->level);
+		mddev->clevel[0] = 0;
 		mddev->layout = le32_to_cpu(sb->layout);
 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
 		mddev->size = le64_to_cpu(sb->size)/2;
@@ -1774,15 +1779,36 @@ static ssize_t
 level_show(mddev_t *mddev, char *page)
 {
 	struct mdk_personality *p = mddev->pers;
-	if (p == NULL && mddev->raid_disks == 0)
-		return 0;
-	if (mddev->level >= 0)
-		return sprintf(page, "raid%d\n", mddev->level);
-	else
+	if (p)
 		return sprintf(page, "%s\n", p->name);
+	else if (mddev->clevel[0])
+		return sprintf(page, "%s\n", mddev->clevel);
+	else if (mddev->level != LEVEL_NONE)
+		return sprintf(page, "%d\n", mddev->level);
+	else
+		return 0;
+}
+
+static ssize_t
+level_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int rv = len;
+	if (mddev->pers)
+		return -EBUSY;
+	if (len == 0)
+		return 0;
+	if (len >= sizeof(mddev->clevel))
+		return -ENOSPC;
+	strncpy(mddev->clevel, buf, len);
+	if (mddev->clevel[len-1] == '\n')
+		len--;
+	mddev->clevel[len] = 0;
+	mddev->level = LEVEL_NONE;
+	return rv;
 }
 
-static struct md_sysfs_entry md_level = __ATTR_RO(level);
+static struct md_sysfs_entry md_level =
+__ATTR(level, 0644, level_show, level_store);
 
 static ssize_t
 raid_disks_show(mddev_t *mddev, char *page)
@@ -2158,7 +2184,10 @@ static int do_md_run(mddev_t * mddev)
 	}
 
 #ifdef CONFIG_KMOD
-	request_module("md-level-%d", mddev->level);
+	if (mddev->level != LEVEL_NONE)
+		request_module("md-level-%d", mddev->level);
+	else if (mddev->clevel[0])
+		request_module("md-%s", mddev->clevel);
 #endif
 
 	/*
@@ -2180,15 +2209,21 @@ static int do_md_run(mddev_t * mddev)
 		return -ENOMEM;
 
 	spin_lock(&pers_lock);
-	pers = find_pers(mddev->level);
+	pers = find_pers(mddev->level, mddev->clevel);
 	if (!pers || !try_module_get(pers->owner)) {
 		spin_unlock(&pers_lock);
-		printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
-		       mddev->level);
+		if (mddev->level != LEVEL_NONE)
+			printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
+			       mddev->level);
+		else
+			printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
+			       mddev->clevel);
 		return -EINVAL;
 	}
 	mddev->pers = pers;
 	spin_unlock(&pers_lock);
+	mddev->level = pers->level;
+	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
 
 	mddev->recovery = 0;
 	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index d4d838e3f9f8..e6aa309a66d7 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -578,4 +578,5 @@ module_init(multipath_init);
 module_exit(multipath_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
+MODULE_ALIAS("md-multipath");
 MODULE_ALIAS("md-level--4");
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 7fb69e29391b..abbca150202b 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -536,4 +536,5 @@ module_init(raid0_init);
 module_exit(raid0_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-2"); /* RAID0 */
+MODULE_ALIAS("md-raid0");
 MODULE_ALIAS("md-level-0");
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7d4465f93064..181c9616d5f1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2092,4 +2092,5 @@ module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-3"); /* RAID1 */
+MODULE_ALIAS("md-raid1");
 MODULE_ALIAS("md-level-1");
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8e6f6dfddb2b..201dc7168a5f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2117,4 +2117,5 @@ module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-9"); /* RAID10 */
+MODULE_ALIAS("md-raid10");
 MODULE_ALIAS("md-level-10");
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b0cfd3ca9ca0..9cc844f455bf 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2240,5 +2240,7 @@ module_init(raid5_init);
 module_exit(raid5_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-4"); /* RAID5 */
+MODULE_ALIAS("md-raid5");
+MODULE_ALIAS("md-raid4");
 MODULE_ALIAS("md-level-5");
 MODULE_ALIAS("md-level-4");
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 06b32bd671a3..84dd875bb2f6 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -2341,4 +2341,5 @@ module_init(raid6_init);
 module_exit(raid6_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-8"); /* RAID6 */
+MODULE_ALIAS("md-raid6");
 MODULE_ALIAS("md-level-6");
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 0fb5af6d622d..686463115438 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -119,6 +119,7 @@ struct mddev_s
 	int				chunk_size;
 	time_t				ctime, utime;
 	int				level, layout;
+	char				clevel[16];
 	int				raid_disks;
 	int				max_disks;
 	sector_t			size; /* used size of component devices */
-- 
cgit v1.2.3-71-gd317


From 4dbcdc751cb25ffca3a8374cbc5ab6de961cc545 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:52 -0800
Subject: [PATCH] md: count corrected read errors per drive

Store this total in superblock (As appropriate), and make it available to
userspace via sysfs.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt      | 11 +++++++++++
 drivers/md/md.c           | 27 ++++++++++++++++++++++++++-
 drivers/md/raid1.c        |  2 ++
 drivers/md/raid10.c       | 11 ++++++++---
 drivers/md/raid5.c        |  3 +++
 drivers/md/raid6main.c    |  3 +++
 include/linux/raid/md_k.h |  4 ++++
 7 files changed, 57 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index fd43fd2cad2f..a3eadf8e1701 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -222,6 +222,17 @@ Each directory contains:
 			 of being recoverred to
 	This list make grow in future.
 
+      errors
+	An approximate count of read errors that have been detected on
+	this device but have not caused the device to be evicted from
+	the array (either because they were corrected or because they
+	happened while the array was read-only).  When using version-1
+	metadata, this value persists across restarts of the array.
+
+	This value can be written while assembling an array thus
+	providing an ongoing count for arrays with metadata managed by
+	userspace.
+
 
 An active md device will also contain and entry for each active device
 in the array.  These are named
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 594d8c312e6a..32a4e2311e43 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1000,6 +1000,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	}
 	rdev->preferred_minor = 0xffff;
 	rdev->data_offset = le64_to_cpu(sb->data_offset);
+	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
 
 	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
 	bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
@@ -1139,6 +1140,8 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	else
 		sb->resync_offset = cpu_to_le64(0);
 
+	sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors);
+
 	if (mddev->bitmap && mddev->bitmap_file == NULL) {
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
@@ -1592,9 +1595,30 @@ super_show(mdk_rdev_t *rdev, char *page)
 }
 static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
 
+static ssize_t
+errors_show(mdk_rdev_t *rdev, char *page)
+{
+	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
+}
+
+static ssize_t
+errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long n = simple_strtoul(buf, &e, 10);
+	if (*buf && (*e == 0 || *e == '\n')) {
+		atomic_set(&rdev->corrected_errors, n);
+		return len;
+	}
+	return -EINVAL;
+}
+static struct rdev_sysfs_entry rdev_errors =
+__ATTR(errors, 0644, errors_show, errors_store);
+
 static struct attribute *rdev_default_attrs[] = {
 	&rdev_state.attr,
 	&rdev_super.attr,
+	&rdev_errors.attr,
 	NULL,
 };
 static ssize_t
@@ -1674,6 +1698,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 	rdev->data_offset = 0;
 	atomic_set(&rdev->nr_pending, 0);
 	atomic_set(&rdev->read_errors, 0);
+	atomic_set(&rdev->corrected_errors, 0);
 
 	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
 	if (!size) {
@@ -4729,7 +4754,7 @@ static int set_ro(const char *val, struct kernel_param *kp)
 	int num = simple_strtoul(val, &e, 10);
 	if (*val && (*e == '\0' || *e == '\n')) {
 		start_readonly = num;
-		return 0;;
+		return 0;
 	}
 	return -EINVAL;
 }
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 181c9616d5f1..a06ff91f27e2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1265,6 +1265,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 					if (r1_bio->bios[d]->bi_end_io != end_sync_read)
 						continue;
 					rdev = conf->mirrors[d].rdev;
+					atomic_add(s, &rdev->corrected_errors);
 					if (sync_page_io(rdev->bdev,
 							 sect + rdev->data_offset,
 							 s<<9,
@@ -1463,6 +1464,7 @@ static void raid1d(mddev_t *mddev)
 							d = conf->raid_disks;
 						d--;
 						rdev = conf->mirrors[d].rdev;
+						atomic_add(s, &rdev->corrected_errors);
 						if (rdev &&
 						    test_bit(In_sync, &rdev->flags)) {
 							if (sync_page_io(rdev->bdev,
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 201dc7168a5f..9e658e519a27 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1122,9 +1122,13 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
 
 	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
-	else if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
-		md_error(r10_bio->mddev,
-			 conf->mirrors[d].rdev);
+	else {
+		atomic_add(r10_bio->sectors,
+			   &conf->mirrors[d].rdev->corrected_errors);
+		if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
+			md_error(r10_bio->mddev,
+				 conf->mirrors[d].rdev);
+	}
 
 	/* for reconstruct, we always reschedule after a read.
 	 * for resync, only after all reads
@@ -1430,6 +1434,7 @@ static void raid10d(mddev_t *mddev)
 						sl--;
 						d = r10_bio->devs[sl].devnum;
 						rdev = conf->mirrors[d].rdev;
+						atomic_add(s, &rdev->corrected_errors);
 						if (rdev &&
 						    test_bit(In_sync, &rdev->flags)) {
 							if (sync_page_io(rdev->bdev,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9cc844f455bf..54f4a9847e38 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1400,6 +1400,9 @@ static void handle_stripe(struct stripe_head *sh)
 			bi->bi_io_vec[0].bv_offset = 0;
 			bi->bi_size = STRIPE_SIZE;
 			bi->bi_next = NULL;
+			if (rw == WRITE &&
+			    test_bit(R5_ReWrite, &sh->dev[i].flags))
+				atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 			generic_make_request(bi);
 		} else {
 			if (rw == 1)
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 84dd875bb2f6..8c823d686a60 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1562,6 +1562,9 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 			bi->bi_io_vec[0].bv_offset = 0;
 			bi->bi_size = STRIPE_SIZE;
 			bi->bi_next = NULL;
+			if (rw == WRITE &&
+			    test_bit(R5_ReWrite, &sh->dev[i].flags))
+				atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 			generic_make_request(bi);
 		} else {
 			if (rw == 1)
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 686463115438..68b929c079ab 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -95,6 +95,10 @@ struct mdk_rdev_s
 	atomic_t	read_errors;	/* number of consecutive read errors that
 					 * we have tried to ignore.
 					 */
+	atomic_t	corrected_errors; /* number of corrected read errors,
+					   * for reporting to userspace and storing
+					   * in superblock.
+					   */
 };
 
 struct mddev_s
-- 
cgit v1.2.3-71-gd317


From 88202a0c84e1951d6630d1d557d4801a8cc5b5ef Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:21:36 -0800
Subject: [PATCH] md: allow sync-speed to be controlled per-device

Also export current (average) speed and status in sysfs.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt      |  22 ++++++++++
 drivers/md/md.c           | 110 +++++++++++++++++++++++++++++++++++++++++++---
 include/linux/raid/md_k.h |   4 ++
 3 files changed, 131 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index b8d172b254f7..03a13c462cf2 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -207,6 +207,28 @@ All md devices contain:
      available.  It will then appear at md/dev-XXX (depending on the
      name of the device) and further configuration is then possible.
 
+   sync_speed_min
+   sync_speed_max
+     This are similar to /proc/sys/dev/raid/speed_limit_{min,max}
+     however they only apply to the particular array.
+     If no value has been written to these, of if the word 'system'
+     is written, then the system-wide value is used.  If a value,
+     in kibibytes-per-second is written, then it is used.
+     When the files are read, they show the currently active value
+     followed by "(local)" or "(system)" depending on whether it is
+     a locally set or system-wide value.
+
+   sync_completed
+     This shows the number of sectors that have been completed of
+     whatever the current sync_action is, followed by the number of
+     sectors in total that could need to be processed.  The two
+     numbers are separated by a '/'  thus effectively showing one
+     value, a fraction of the process that is complete.
+
+   sync_speed
+     This shows the current actual speed, in K/sec, of the current
+     sync_action.  It is averaged over the last 30 seconds.
+
 
 As component devices are added to an md array, they appear in the 'md'
 directory as new directories named
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 825e235b791b..1b76fb29fb70 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -81,10 +81,22 @@ static DEFINE_SPINLOCK(pers_lock);
  * idle IO detection.
  *
  * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ * or /sys/block/mdX/md/sync_speed_{min,max}
  */
 
 static int sysctl_speed_limit_min = 1000;
 static int sysctl_speed_limit_max = 200000;
+static inline int speed_min(mddev_t *mddev)
+{
+	return mddev->sync_speed_min ?
+		mddev->sync_speed_min : sysctl_speed_limit_min;
+}
+
+static inline int speed_max(mddev_t *mddev)
+{
+	return mddev->sync_speed_max ?
+		mddev->sync_speed_max : sysctl_speed_limit_max;
+}
 
 static struct ctl_table_header *raid_table_header;
 
@@ -2197,6 +2209,90 @@ md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
 static struct md_sysfs_entry
 md_mismatches = __ATTR_RO(mismatch_cnt);
 
+static ssize_t
+sync_min_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%d (%s)\n", speed_min(mddev),
+		       mddev->sync_speed_min ? "local": "system");
+}
+
+static ssize_t
+sync_min_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int min;
+	char *e;
+	if (strncmp(buf, "system", 6)==0) {
+		mddev->sync_speed_min = 0;
+		return len;
+	}
+	min = simple_strtoul(buf, &e, 10);
+	if (buf == e || (*e && *e != '\n') || min <= 0)
+		return -EINVAL;
+	mddev->sync_speed_min = min;
+	return len;
+}
+
+static struct md_sysfs_entry md_sync_min =
+__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
+
+static ssize_t
+sync_max_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%d (%s)\n", speed_max(mddev),
+		       mddev->sync_speed_max ? "local": "system");
+}
+
+static ssize_t
+sync_max_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int max;
+	char *e;
+	if (strncmp(buf, "system", 6)==0) {
+		mddev->sync_speed_max = 0;
+		return len;
+	}
+	max = simple_strtoul(buf, &e, 10);
+	if (buf == e || (*e && *e != '\n') || max <= 0)
+		return -EINVAL;
+	mddev->sync_speed_max = max;
+	return len;
+}
+
+static struct md_sysfs_entry md_sync_max =
+__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
+
+
+static ssize_t
+sync_speed_show(mddev_t *mddev, char *page)
+{
+	unsigned long resync, dt, db;
+	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+	dt = ((jiffies - mddev->resync_mark) / HZ);
+	if (!dt) dt++;
+	db = resync - (mddev->resync_mark_cnt);
+	return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
+}
+
+static struct md_sysfs_entry
+md_sync_speed = __ATTR_RO(sync_speed);
+
+static ssize_t
+sync_completed_show(mddev_t *mddev, char *page)
+{
+	unsigned long max_blocks, resync;
+
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+		max_blocks = mddev->resync_max_sectors;
+	else
+		max_blocks = mddev->size << 1;
+
+	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+	return sprintf(page, "%lu / %lu\n", resync, max_blocks);
+}
+
+static struct md_sysfs_entry
+md_sync_completed = __ATTR_RO(sync_completed);
+
 static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
 	&md_raid_disks.attr,
@@ -2210,6 +2306,10 @@ static struct attribute *md_default_attrs[] = {
 static struct attribute *md_redundancy_attrs[] = {
 	&md_scan_mode.attr,
 	&md_mismatches.attr,
+	&md_sync_min.attr,
+	&md_sync_max.attr,
+	&md_sync_speed.attr,
+	&md_sync_completed.attr,
 	NULL,
 };
 static struct attribute_group md_redundancy_group = {
@@ -4433,10 +4533,10 @@ static void md_do_sync(mddev_t *mddev)
 
 	printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
 	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
-		" %d KB/sec/disc.\n", sysctl_speed_limit_min);
+		" %d KB/sec/disc.\n", speed_min(mddev));
 	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
 	       "(but not more than %d KB/sec) for reconstruction.\n",
-	       sysctl_speed_limit_max);
+	       speed_max(mddev));
 
 	is_mddev_idle(mddev); /* this also initializes IO event counters */
 	/* we don't use the checkpoint if there's a bitmap */
@@ -4477,7 +4577,7 @@ static void md_do_sync(mddev_t *mddev)
 
 		skipped = 0;
 		sectors = mddev->pers->sync_request(mddev, j, &skipped,
-					    currspeed < sysctl_speed_limit_min);
+					    currspeed < speed_min(mddev));
 		if (sectors == 0) {
 			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
 			goto out;
@@ -4542,8 +4642,8 @@ static void md_do_sync(mddev_t *mddev)
 		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
 			/((jiffies-mddev->resync_mark)/HZ +1) +1;
 
-		if (currspeed > sysctl_speed_limit_min) {
-			if ((currspeed > sysctl_speed_limit_max) ||
+		if (currspeed > speed_min(mddev)) {
+			if ((currspeed > speed_max(mddev)) ||
 					!is_mddev_idle(mddev)) {
 				msleep(500);
 				goto repeat;
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 68b929c079ab..617b9506c760 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -143,6 +143,10 @@ struct mddev_s
 	sector_t			resync_mismatches; /* count of sectors where
 							    * parity/replica mismatch found
 							    */
+	/* if zero, use the system-wide default */
+	int				sync_speed_min;
+	int				sync_speed_max;
+
 	int				ok_start_degraded;
 	/* recovery/resync flags 
 	 * NEEDED:   we might need to start a resync/recover
-- 
cgit v1.2.3-71-gd317


From 9b847548663ef1039dd49f0eb4463d001e596bc3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Fri, 6 Jan 2006 09:28:07 +0100
Subject: [PATCH] Suspend support for libata

This patch adds suspend patch to libata, and ata_piix in particular. For
most low level drivers, they should just need to add the 4 hooks to
work. As I can only test ata_piix, I didn't enable it for more
though.

Suspend support is the single most important feature on a notebook, and
most new notebooks have sata drives. It's quite embarrassing that we
_still_ do not support this. Right now, it's perfectly possible to
suspend the drive in mid-transfer.

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/scsi/ata_piix.c    |   4 ++
 drivers/scsi/libata-core.c | 114 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/scsi/libata-scsi.c |  16 +++++++
 drivers/scsi/scsi_sysfs.c  |  31 ++++++++++++
 include/linux/ata.h        |   2 +
 include/linux/libata.h     |   8 ++++
 include/scsi/scsi_host.h   |   6 +++
 7 files changed, 181 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/ata_piix.c b/drivers/scsi/ata_piix.c
index 0ea27873b9ff..f79630340028 100644
--- a/drivers/scsi/ata_piix.c
+++ b/drivers/scsi/ata_piix.c
@@ -166,6 +166,8 @@ static struct pci_driver piix_pci_driver = {
 	.id_table		= piix_pci_tbl,
 	.probe			= piix_init_one,
 	.remove			= ata_pci_remove_one,
+	.suspend		= ata_pci_device_suspend,
+	.resume			= ata_pci_device_resume,
 };
 
 static struct scsi_host_template piix_sht = {
@@ -186,6 +188,8 @@ static struct scsi_host_template piix_sht = {
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
 	.ordered_flush		= 1,
+	.resume			= ata_scsi_device_resume,
+	.suspend		= ata_scsi_device_suspend,
 };
 
 static const struct ata_port_operations piix_pata_ops = {
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 9ea102587914..9c66d4059399 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -4154,6 +4154,96 @@ err_out:
  *	Inherited from caller.
  */
 
+/*
+ * Execute a 'simple' command, that only consists of the opcode 'cmd' itself,
+ * without filling any other registers
+ */
+static int ata_do_simple_cmd(struct ata_port *ap, struct ata_device *dev,
+			     u8 cmd)
+{
+	struct ata_taskfile tf;
+	int err;
+
+	ata_tf_init(ap, &tf, dev->devno);
+
+	tf.command = cmd;
+	tf.flags |= ATA_TFLAG_DEVICE;
+	tf.protocol = ATA_PROT_NODATA;
+
+	err = ata_exec_internal(ap, dev, &tf, DMA_NONE, NULL, 0);
+	if (err)
+		printk(KERN_ERR "%s: ata command failed: %d\n",
+				__FUNCTION__, err);
+
+	return err;
+}
+
+static int ata_flush_cache(struct ata_port *ap, struct ata_device *dev)
+{
+	u8 cmd;
+
+	if (!ata_try_flush_cache(dev))
+		return 0;
+
+	if (ata_id_has_flush_ext(dev->id))
+		cmd = ATA_CMD_FLUSH_EXT;
+	else
+		cmd = ATA_CMD_FLUSH;
+
+	return ata_do_simple_cmd(ap, dev, cmd);
+}
+
+static int ata_standby_drive(struct ata_port *ap, struct ata_device *dev)
+{
+	return ata_do_simple_cmd(ap, dev, ATA_CMD_STANDBYNOW1);
+}
+
+static int ata_start_drive(struct ata_port *ap, struct ata_device *dev)
+{
+	return ata_do_simple_cmd(ap, dev, ATA_CMD_IDLEIMMEDIATE);
+}
+
+/**
+ *	ata_device_resume - wakeup a previously suspended devices
+ *
+ *	Kick the drive back into action, by sending it an idle immediate
+ *	command and making sure its transfer mode matches between drive
+ *	and host.
+ *
+ */
+int ata_device_resume(struct ata_port *ap, struct ata_device *dev)
+{
+	if (ap->flags & ATA_FLAG_SUSPENDED) {
+		ap->flags &= ~ATA_FLAG_SUSPENDED;
+		ata_set_mode(ap);
+	}
+	if (!ata_dev_present(dev))
+		return 0;
+	if (dev->class == ATA_DEV_ATA)
+		ata_start_drive(ap, dev);
+
+	return 0;
+}
+
+/**
+ *	ata_device_suspend - prepare a device for suspend
+ *
+ *	Flush the cache on the drive, if appropriate, then issue a
+ *	standbynow command.
+ *
+ */
+int ata_device_suspend(struct ata_port *ap, struct ata_device *dev)
+{
+	if (!ata_dev_present(dev))
+		return 0;
+	if (dev->class == ATA_DEV_ATA)
+		ata_flush_cache(ap, dev);
+
+	ata_standby_drive(ap, dev);
+	ap->flags |= ATA_FLAG_SUSPENDED;
+	return 0;
+}
+
 int ata_port_start (struct ata_port *ap)
 {
 	struct device *dev = ap->host_set->dev;
@@ -4902,6 +4992,23 @@ int pci_test_config_bits(struct pci_dev *pdev, const struct pci_bits *bits)
 
 	return (tmp == bits->val) ? 1 : 0;
 }
+
+int ata_pci_device_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+	pci_save_state(pdev);
+	pci_disable_device(pdev);
+	pci_set_power_state(pdev, PCI_D3hot);
+	return 0;
+}
+
+int ata_pci_device_resume(struct pci_dev *pdev)
+{
+	pci_set_power_state(pdev, PCI_D0);
+	pci_restore_state(pdev);
+	pci_enable_device(pdev);
+	pci_set_master(pdev);
+	return 0;
+}
 #endif /* CONFIG_PCI */
 
 
@@ -5005,4 +5112,11 @@ EXPORT_SYMBOL_GPL(ata_pci_host_stop);
 EXPORT_SYMBOL_GPL(ata_pci_init_native_mode);
 EXPORT_SYMBOL_GPL(ata_pci_init_one);
 EXPORT_SYMBOL_GPL(ata_pci_remove_one);
+EXPORT_SYMBOL_GPL(ata_pci_device_suspend);
+EXPORT_SYMBOL_GPL(ata_pci_device_resume);
 #endif /* CONFIG_PCI */
+
+EXPORT_SYMBOL_GPL(ata_device_suspend);
+EXPORT_SYMBOL_GPL(ata_device_resume);
+EXPORT_SYMBOL_GPL(ata_scsi_device_suspend);
+EXPORT_SYMBOL_GPL(ata_scsi_device_resume);
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index e0439be4b573..c1ebede14a48 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -396,6 +396,22 @@ void ata_dump_status(unsigned id, struct ata_taskfile *tf)
 	}
 }
 
+int ata_scsi_device_resume(struct scsi_device *sdev)
+{
+	struct ata_port *ap = (struct ata_port *) &sdev->host->hostdata[0];
+	struct ata_device *dev = &ap->device[sdev->id];
+
+	return ata_device_resume(ap, dev);
+}
+
+int ata_scsi_device_suspend(struct scsi_device *sdev)
+{
+	struct ata_port *ap = (struct ata_port *) &sdev->host->hostdata[0];
+	struct ata_device *dev = &ap->device[sdev->id];
+
+	return ata_device_suspend(ap, dev);
+}
+
 /**
  *	ata_to_sense_error - convert ATA error to SCSI error
  *	@id: ATA device number
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 15842b1f0f4a..ea7f3a433572 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -263,9 +263,40 @@ static int scsi_bus_match(struct device *dev, struct device_driver *gendrv)
 	return (sdp->inq_periph_qual == SCSI_INQ_PQ_CON)? 1: 0;
 }
 
+static int scsi_bus_suspend(struct device * dev, pm_message_t state)
+{
+	struct scsi_device *sdev = to_scsi_device(dev);
+	struct scsi_host_template *sht = sdev->host->hostt;
+	int err;
+
+	err = scsi_device_quiesce(sdev);
+	if (err)
+		return err;
+
+	if (sht->suspend)
+		err = sht->suspend(sdev);
+
+	return err;
+}
+
+static int scsi_bus_resume(struct device * dev)
+{
+	struct scsi_device *sdev = to_scsi_device(dev);
+	struct scsi_host_template *sht = sdev->host->hostt;
+	int err = 0;
+
+	if (sht->resume)
+		err = sht->resume(sdev);
+
+	scsi_device_resume(sdev);
+	return err;
+}
+
 struct bus_type scsi_bus_type = {
         .name		= "scsi",
         .match		= scsi_bus_match,
+	.suspend	= scsi_bus_suspend,
+	.resume		= scsi_bus_resume,
 };
 
 int scsi_sysfs_register(void)
diff --git a/include/linux/ata.h b/include/linux/ata.h
index d2873b732bb1..3eb80c391b39 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -141,6 +141,8 @@ enum {
 	ATA_CMD_PACKET		= 0xA0,
 	ATA_CMD_VERIFY		= 0x40,
 	ATA_CMD_VERIFY_EXT	= 0x42,
+ 	ATA_CMD_STANDBYNOW1	= 0xE0,
+ 	ATA_CMD_IDLEIMMEDIATE	= 0xE1,
 	ATA_CMD_INIT_DEV_PARAMS	= 0x91,
 
 	/* SETFEATURES stuff */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index e828e172ccbf..cdab75c209a0 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -124,6 +124,8 @@ enum {
 	ATA_FLAG_DEBUGMSG	= (1 << 10),
 	ATA_FLAG_NO_ATAPI	= (1 << 11), /* No ATAPI support */
 
+	ATA_FLAG_SUSPENDED	= (1 << 12), /* port is suspended */
+
 	ATA_QCFLAG_ACTIVE	= (1 << 1), /* cmd not yet ack'd to scsi lyer */
 	ATA_QCFLAG_SG		= (1 << 3), /* have s/g table? */
 	ATA_QCFLAG_SINGLE	= (1 << 4), /* no s/g, just a single buffer */
@@ -436,6 +438,8 @@ extern void ata_std_ports(struct ata_ioports *ioaddr);
 extern int ata_pci_init_one (struct pci_dev *pdev, struct ata_port_info **port_info,
 			     unsigned int n_ports);
 extern void ata_pci_remove_one (struct pci_dev *pdev);
+extern int ata_pci_device_suspend(struct pci_dev *pdev, pm_message_t state);
+extern int ata_pci_device_resume(struct pci_dev *pdev);
 #endif /* CONFIG_PCI */
 extern int ata_device_add(const struct ata_probe_ent *ent);
 extern void ata_host_set_remove(struct ata_host_set *host_set);
@@ -445,6 +449,10 @@ extern int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmn
 extern int ata_scsi_error(struct Scsi_Host *host);
 extern int ata_scsi_release(struct Scsi_Host *host);
 extern unsigned int ata_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc);
+extern int ata_scsi_device_resume(struct scsi_device *);
+extern int ata_scsi_device_suspend(struct scsi_device *);
+extern int ata_device_resume(struct ata_port *, struct ata_device *);
+extern int ata_device_suspend(struct ata_port *, struct ata_device *);
 extern int ata_ratelimit(void);
 
 /*
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 6cbb1982ed03..6297885a35e7 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -295,6 +295,12 @@ struct scsi_host_template {
 	 */
 	int (*proc_info)(struct Scsi_Host *, char *, char **, off_t, int, int);
 
+	/*
+	 * suspend support
+	 */
+	int (*resume)(struct scsi_device *);
+	int (*suspend)(struct scsi_device *);
+
 	/*
 	 * Name of proc directory
 	 */
-- 
cgit v1.2.3-71-gd317


From 22905f775dd6a8b73be99826dcad07ceec00244b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 16 Nov 2005 15:07:01 -0800
Subject: identify multipage ->writepages() calls

 NFS needs to be able to distinguish between single-page ->writepage() calls and
 multipage ->writepages() calls.

 For the single-page writepage calls NFS can kick off the I/O within the
 context of ->writepage().

 For multipage ->writepages calls, nfs_writepage() will leave the I/O pending
 and nfs_writepages() will kick off the I/O when it all has been queued up
 within NFS.

 Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
 Signed-off-by: Andrew Morton <akpm@osdl.org>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/writeback.h |  9 +++++----
 mm/page-writeback.c       | 10 ++++++++--
 2 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 64a36ba43b2f..b096159086e8 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -53,10 +53,11 @@ struct writeback_control {
 	loff_t start;
 	loff_t end;
 
-	unsigned nonblocking:1;			/* Don't get stuck on request queues */
-	unsigned encountered_congestion:1;	/* An output: a queue is full */
-	unsigned for_kupdate:1;			/* A kupdate writeback */
-	unsigned for_reclaim:1;			/* Invoked from the page allocator */
+	unsigned nonblocking:1;		/* Don't get stuck on request queues */
+	unsigned encountered_congestion:1; /* An output: a queue is full */
+	unsigned for_kupdate:1;		/* A kupdate writeback */
+	unsigned for_reclaim:1;		/* Invoked from the page allocator */
+	unsigned for_writepages:1;	/* This is a writepages() call */
 };
 
 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0166ea15c9ee..5240e426c1f7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -550,11 +550,17 @@ void __init page_writeback_init(void)
 
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
+	int ret;
+
 	if (wbc->nr_to_write <= 0)
 		return 0;
+	wbc->for_writepages = 1;
 	if (mapping->a_ops->writepages)
-		return mapping->a_ops->writepages(mapping, wbc);
-	return generic_writepages(mapping, wbc);
+		ret =  mapping->a_ops->writepages(mapping, wbc);
+	else
+		ret = generic_writepages(mapping, wbc);
+	wbc->for_writepages = 0;
+	return ret;
 }
 
 /**
-- 
cgit v1.2.3-71-gd317


From abbcf28f23d53e8ec56a91f3528743913fa2694a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 3 Jan 2006 09:55:03 +0100
Subject: SUNRPC: Yet more RPC cleanups

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/sched.h |  3 ++-
 net/sunrpc/clnt.c            | 32 +++++++++++++++++---------------
 net/sunrpc/pmap_clnt.c       |  8 +++-----
 net/sunrpc/sched.c           | 31 ++++++++++++-------------------
 4 files changed, 34 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 4d77e90d0b30..4c4b2dc8aca5 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -233,6 +233,7 @@ struct rpc_task *rpc_new_child(struct rpc_clnt *, struct rpc_task *parent);
 void		rpc_init_task(struct rpc_task *, struct rpc_clnt *,
 					rpc_action exitfunc, int flags);
 void		rpc_release_task(struct rpc_task *);
+void		rpc_exit_task(struct rpc_task *);
 void		rpc_killall_tasks(struct rpc_clnt *);
 int		rpc_execute(struct rpc_task *);
 void		rpc_run_child(struct rpc_task *parent, struct rpc_task *child,
@@ -259,7 +260,7 @@ void		rpc_destroy_mempool(void);
 static inline void rpc_exit(struct rpc_task *task, int status)
 {
 	task->tk_status = status;
-	task->tk_action = NULL;
+	task->tk_action = rpc_exit_task;
 }
 
 #ifdef RPC_DEBUG
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 61c3abeaccae..6ab4cbd8a901 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -511,7 +511,7 @@ rpc_call_setup(struct rpc_task *task, struct rpc_message *msg, int flags)
 	if (task->tk_status == 0)
 		task->tk_action = call_start;
 	else
-		task->tk_action = NULL;
+		task->tk_action = rpc_exit_task;
 }
 
 void
@@ -892,7 +892,7 @@ call_transmit(struct rpc_task *task)
 	if (task->tk_status < 0)
 		return;
 	if (!task->tk_msg.rpc_proc->p_decode) {
-		task->tk_action = NULL;
+		task->tk_action = rpc_exit_task;
 		rpc_wake_up_task(task);
 	}
 	return;
@@ -1039,13 +1039,14 @@ call_decode(struct rpc_task *task)
 				sizeof(req->rq_rcv_buf)) != 0);
 
 	/* Verify the RPC header */
-	if (!(p = call_verify(task))) {
-		if (task->tk_action == NULL)
-			return;
-		goto out_retry;
+	p = call_verify(task);
+	if (IS_ERR(p)) {
+		if (p == ERR_PTR(-EAGAIN))
+			goto out_retry;
+		return;
 	}
 
-	task->tk_action = NULL;
+	task->tk_action = rpc_exit_task;
 
 	if (decode)
 		task->tk_status = rpcauth_unwrap_resp(task, decode, req, p,
@@ -1138,7 +1139,7 @@ call_verify(struct rpc_task *task)
 
 	if ((n = ntohl(*p++)) != RPC_REPLY) {
 		printk(KERN_WARNING "call_verify: not an RPC reply: %x\n", n);
-		goto out_retry;
+		goto out_garbage;
 	}
 	if ((n = ntohl(*p++)) != RPC_MSG_ACCEPTED) {
 		if (--len < 0)
@@ -1168,7 +1169,7 @@ call_verify(struct rpc_task *task)
 							task->tk_pid);
 			rpcauth_invalcred(task);
 			task->tk_action = call_refresh;
-			return NULL;
+			goto out_retry;
 		case RPC_AUTH_BADCRED:
 		case RPC_AUTH_BADVERF:
 			/* possibly garbled cred/verf? */
@@ -1178,7 +1179,7 @@ call_verify(struct rpc_task *task)
 			dprintk("RPC: %4d call_verify: retry garbled creds\n",
 							task->tk_pid);
 			task->tk_action = call_bind;
-			return NULL;
+			goto out_retry;
 		case RPC_AUTH_TOOWEAK:
 			printk(KERN_NOTICE "call_verify: server requires stronger "
 			       "authentication.\n");
@@ -1193,7 +1194,7 @@ call_verify(struct rpc_task *task)
 	}
 	if (!(p = rpcauth_checkverf(task, p))) {
 		printk(KERN_WARNING "call_verify: auth check failed\n");
-		goto out_retry;		/* bad verifier, retry */
+		goto out_garbage;		/* bad verifier, retry */
 	}
 	len = p - (u32 *)iov->iov_base - 1;
 	if (len < 0)
@@ -1230,23 +1231,24 @@ call_verify(struct rpc_task *task)
 		/* Also retry */
 	}
 
-out_retry:
+out_garbage:
 	task->tk_client->cl_stats->rpcgarbage++;
 	if (task->tk_garb_retry) {
 		task->tk_garb_retry--;
 		dprintk("RPC %s: retrying %4d\n", __FUNCTION__, task->tk_pid);
 		task->tk_action = call_bind;
-		return NULL;
+out_retry:
+		return ERR_PTR(-EAGAIN);
 	}
 	printk(KERN_WARNING "RPC %s: retry failed, exit EIO\n", __FUNCTION__);
 out_eio:
 	error = -EIO;
 out_err:
 	rpc_exit(task, error);
-	return NULL;
+	return ERR_PTR(error);
 out_overflow:
 	printk(KERN_WARNING "RPC %s: server reply was truncated.\n", __FUNCTION__);
-	goto out_retry;
+	goto out_garbage;
 }
 
 static int rpcproc_encode_null(void *rqstp, u32 *data, void *obj)
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index a398575f94b8..cad4568fbbe2 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -90,8 +90,7 @@ bailout:
 	map->pm_binding = 0;
 	rpc_wake_up(&map->pm_bindwait);
 	spin_unlock(&pmap_lock);
-	task->tk_status = -EIO;
-	task->tk_action = NULL;
+	rpc_exit(task, -EIO);
 }
 
 #ifdef CONFIG_ROOT_NFS
@@ -138,11 +137,10 @@ pmap_getport_done(struct rpc_task *task)
 			task->tk_pid, task->tk_status, clnt->cl_port);
 	if (task->tk_status < 0) {
 		/* Make the calling task exit with an error */
-		task->tk_action = NULL;
+		task->tk_action = rpc_exit_task;
 	} else if (clnt->cl_port == 0) {
 		/* Program not registered */
-		task->tk_status = -EACCES;
-		task->tk_action = NULL;
+		rpc_exit(task, -EACCES);
 	} else {
 		/* byte-swap port number first */
 		clnt->cl_port = htons(clnt->cl_port);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 54e60a657500..3fcf7b0e1f6c 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -555,28 +555,22 @@ __rpc_atrun(struct rpc_task *task)
 }
 
 /*
- * Helper that calls task->tk_exit if it exists and then returns
- * true if we should exit __rpc_execute.
+ * Helper that calls task->tk_exit if it exists
  */
-static inline int __rpc_do_exit(struct rpc_task *task)
+void rpc_exit_task(struct rpc_task *task)
 {
+	task->tk_action = NULL;
 	if (task->tk_exit != NULL) {
-		lock_kernel();
 		task->tk_exit(task);
-		unlock_kernel();
-		/* If tk_action is non-null, we should restart the call */
 		if (task->tk_action != NULL) {
-			if (!RPC_ASSASSINATED(task)) {
-				/* Release RPC slot and buffer memory */
-				xprt_release(task);
-				rpc_free(task);
-				return 0;
-			}
-			printk(KERN_ERR "RPC: dead task tried to walk away.\n");
+			WARN_ON(RPC_ASSASSINATED(task));
+			/* Always release the RPC slot and buffer memory */
+			xprt_release(task);
+			rpc_free(task);
 		}
 	}
-	return 1;
 }
+EXPORT_SYMBOL(rpc_exit_task);
 
 static int rpc_wait_bit_interruptible(void *word)
 {
@@ -631,12 +625,11 @@ static int __rpc_execute(struct rpc_task *task)
 		 * by someone else.
 		 */
 		if (!RPC_IS_QUEUED(task)) {
-			if (task->tk_action != NULL) {
-				lock_kernel();
-				task->tk_action(task);
-				unlock_kernel();
-			} else if (__rpc_do_exit(task))
+			if (task->tk_action == NULL)
 				break;
+			lock_kernel();
+			task->tk_action(task);
+			unlock_kernel();
 		}
 
 		/*
-- 
cgit v1.2.3-71-gd317


From 963d8fe53339128ee46a7701f2e36305f0ccff8c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 3 Jan 2006 09:55:04 +0100
Subject: RPC: Clean up RPC task structure

 Shrink the RPC task structure. Instead of storing separate pointers
 for task->tk_exit and task->tk_release, put them in a structure.

 Also pass the user data pointer as a parameter instead of passing it via
 task->tk_calldata. This enables us to nest callbacks.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntproc.c          |  38 ++++++++-------
 fs/lockd/svc4proc.c          |  15 +++---
 fs/lockd/svclock.c           |  14 ++++--
 fs/lockd/svcproc.c           |  14 ++++--
 fs/nfs/direct.c              |   1 -
 fs/nfs/nfs3proc.c            |  44 +++++++++++-------
 fs/nfs/nfs4proc.c            | 107 +++++++++++++++++++++++++------------------
 fs/nfs/proc.c                |  28 +++++++----
 fs/nfs/read.c                |  10 ++--
 fs/nfs/unlink.c              |  19 ++++----
 fs/nfs/write.c               |  21 +++------
 fs/nfsd/nfs4callback.c       |  10 ++--
 include/linux/lockd/lockd.h  |   2 +-
 include/linux/nfs_fs.h       |  12 +++--
 include/linux/sunrpc/clnt.h  |   3 +-
 include/linux/sunrpc/sched.h |  20 +++++---
 net/sunrpc/clnt.c            |  15 +++---
 net/sunrpc/sched.c           |  53 +++++++++++----------
 18 files changed, 241 insertions(+), 185 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index c5a33648e9fd..816333cd377b 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -26,11 +26,12 @@
 static int	nlmclnt_test(struct nlm_rqst *, struct file_lock *);
 static int	nlmclnt_lock(struct nlm_rqst *, struct file_lock *);
 static int	nlmclnt_unlock(struct nlm_rqst *, struct file_lock *);
-static void	nlmclnt_unlock_callback(struct rpc_task *);
-static void	nlmclnt_cancel_callback(struct rpc_task *);
 static int	nlm_stat_to_errno(u32 stat);
 static void	nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host);
 
+static const struct rpc_call_ops nlmclnt_unlock_ops;
+static const struct rpc_call_ops nlmclnt_cancel_ops;
+
 /*
  * Cookie counter for NLM requests
  */
@@ -399,8 +400,7 @@ in_grace_period:
 /*
  * Generic NLM call, async version.
  */
-int
-nlmsvc_async_call(struct nlm_rqst *req, u32 proc, rpc_action callback)
+int nlmsvc_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
 {
 	struct nlm_host	*host = req->a_host;
 	struct rpc_clnt	*clnt;
@@ -419,13 +419,12 @@ nlmsvc_async_call(struct nlm_rqst *req, u32 proc, rpc_action callback)
 	msg.rpc_proc = &clnt->cl_procinfo[proc];
 
         /* bootstrap and kick off the async RPC call */
-        status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, callback, req);
+        status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, tk_ops, req);
 
 	return status;
 }
 
-static int
-nlmclnt_async_call(struct nlm_rqst *req, u32 proc, rpc_action callback)
+static int nlmclnt_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
 {
 	struct nlm_host	*host = req->a_host;
 	struct rpc_clnt	*clnt;
@@ -448,7 +447,7 @@ nlmclnt_async_call(struct nlm_rqst *req, u32 proc, rpc_action callback)
 	/* Increment host refcount */
 	nlm_get_host(host);
         /* bootstrap and kick off the async RPC call */
-        status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, callback, req);
+        status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, tk_ops, req);
 	if (status < 0)
 		nlm_release_host(host);
 	return status;
@@ -664,7 +663,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 
 	if (req->a_flags & RPC_TASK_ASYNC) {
 		status = nlmclnt_async_call(req, NLMPROC_UNLOCK,
-					nlmclnt_unlock_callback);
+					&nlmclnt_unlock_ops);
 		/* Hrmf... Do the unlock early since locks_remove_posix()
 		 * really expects us to free the lock synchronously */
 		do_vfs_lock(fl);
@@ -692,10 +691,9 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 	return -ENOLCK;
 }
 
-static void
-nlmclnt_unlock_callback(struct rpc_task *task)
+static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
 {
-	struct nlm_rqst	*req = (struct nlm_rqst *) task->tk_calldata;
+	struct nlm_rqst	*req = data;
 	int		status = req->a_res.status;
 
 	if (RPC_ASSASSINATED(task))
@@ -722,6 +720,10 @@ die:
 	rpc_restart_call(task);
 }
 
+static const struct rpc_call_ops nlmclnt_unlock_ops = {
+	.rpc_call_done = nlmclnt_unlock_callback,
+};
+
 /*
  * Cancel a blocked lock request.
  * We always use an async RPC call for this in order not to hang a
@@ -750,8 +752,7 @@ nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl)
 
 	nlmclnt_setlockargs(req, fl);
 
-	status = nlmclnt_async_call(req, NLMPROC_CANCEL,
-					nlmclnt_cancel_callback);
+	status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops);
 	if (status < 0) {
 		nlmclnt_release_lockargs(req);
 		kfree(req);
@@ -765,10 +766,9 @@ nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl)
 	return status;
 }
 
-static void
-nlmclnt_cancel_callback(struct rpc_task *task)
+static void nlmclnt_cancel_callback(struct rpc_task *task, void *data)
 {
-	struct nlm_rqst	*req = (struct nlm_rqst *) task->tk_calldata;
+	struct nlm_rqst	*req = data;
 
 	if (RPC_ASSASSINATED(task))
 		goto die;
@@ -807,6 +807,10 @@ retry_cancel:
 	rpc_delay(task, 30 * HZ);
 }
 
+static const struct rpc_call_ops nlmclnt_cancel_ops = {
+	.rpc_call_done = nlmclnt_cancel_callback,
+};
+
 /*
  * Convert an NLM status code to a generic kernel errno
  */
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 489670e21769..4063095d849e 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -22,7 +22,8 @@
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
 
 static u32	nlm4svc_callback(struct svc_rqst *, u32, struct nlm_res *);
-static void	nlm4svc_callback_exit(struct rpc_task *);
+
+static const struct rpc_call_ops nlm4svc_callback_ops;
 
 /*
  * Obtain client and file from arguments
@@ -470,7 +471,6 @@ nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res  *argp,
 }
 
 
-
 /*
  * This is the generic lockd callback for async RPC calls
  */
@@ -494,7 +494,7 @@ nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp)
 	call->a_host  = host;
 	memcpy(&call->a_args, resp, sizeof(*resp));
 
-	if (nlmsvc_async_call(call, proc, nlm4svc_callback_exit) < 0)
+	if (nlmsvc_async_call(call, proc, &nlm4svc_callback_ops) < 0)
 		goto error;
 
 	return rpc_success;
@@ -504,10 +504,9 @@ nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp)
 	return rpc_system_err;
 }
 
-static void
-nlm4svc_callback_exit(struct rpc_task *task)
+static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
 {
-	struct nlm_rqst	*call = (struct nlm_rqst *) task->tk_calldata;
+	struct nlm_rqst	*call = data;
 
 	if (task->tk_status < 0) {
 		dprintk("lockd: %4d callback failed (errno = %d)\n",
@@ -517,6 +516,10 @@ nlm4svc_callback_exit(struct rpc_task *task)
 	kfree(call);
 }
 
+static const struct rpc_call_ops nlm4svc_callback_ops = {
+	.rpc_call_done = nlm4svc_callback_exit,
+};
+
 /*
  * NLM Server procedures.
  */
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 49f959796b66..87d09a0d8f64 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -41,7 +41,8 @@
 
 static void	nlmsvc_insert_block(struct nlm_block *block, unsigned long);
 static int	nlmsvc_remove_block(struct nlm_block *block);
-static void	nlmsvc_grant_callback(struct rpc_task *task);
+
+static const struct rpc_call_ops nlmsvc_grant_ops;
 
 /*
  * The list of blocked locks to retry
@@ -562,7 +563,7 @@ callback:
 	/* Call the client */
 	nlm_get_host(block->b_call.a_host);
 	if (nlmsvc_async_call(&block->b_call, NLMPROC_GRANTED_MSG,
-						nlmsvc_grant_callback) < 0)
+						&nlmsvc_grant_ops) < 0)
 		nlm_release_host(block->b_call.a_host);
 	up(&file->f_sema);
 }
@@ -575,10 +576,9 @@ callback:
  * chain once more in order to have it removed by lockd itself (which can
  * then sleep on the file semaphore without disrupting e.g. the nfs client).
  */
-static void
-nlmsvc_grant_callback(struct rpc_task *task)
+static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
 {
-	struct nlm_rqst		*call = (struct nlm_rqst *) task->tk_calldata;
+	struct nlm_rqst		*call = data;
 	struct nlm_block	*block;
 	unsigned long		timeout;
 	struct sockaddr_in	*peer_addr = RPC_PEERADDR(task->tk_client);
@@ -614,6 +614,10 @@ nlmsvc_grant_callback(struct rpc_task *task)
 	nlm_release_host(call->a_host);
 }
 
+static const struct rpc_call_ops nlmsvc_grant_ops = {
+	.rpc_call_done = nlmsvc_grant_callback,
+};
+
 /*
  * We received a GRANT_RES callback. Try to find the corresponding
  * block.
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 757e344cf200..3bc437e0cf5b 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -23,7 +23,8 @@
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
 
 static u32	nlmsvc_callback(struct svc_rqst *, u32, struct nlm_res *);
-static void	nlmsvc_callback_exit(struct rpc_task *);
+
+static const struct rpc_call_ops nlmsvc_callback_ops;
 
 #ifdef CONFIG_LOCKD_V4
 static u32
@@ -518,7 +519,7 @@ nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp)
 	call->a_host  = host;
 	memcpy(&call->a_args, resp, sizeof(*resp));
 
-	if (nlmsvc_async_call(call, proc, nlmsvc_callback_exit) < 0)
+	if (nlmsvc_async_call(call, proc, &nlmsvc_callback_ops) < 0)
 		goto error;
 
 	return rpc_success;
@@ -528,10 +529,9 @@ nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp)
 	return rpc_system_err;
 }
 
-static void
-nlmsvc_callback_exit(struct rpc_task *task)
+static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
 {
-	struct nlm_rqst	*call = (struct nlm_rqst *) task->tk_calldata;
+	struct nlm_rqst	*call = data;
 
 	if (task->tk_status < 0) {
 		dprintk("lockd: %4d callback failed (errno = %d)\n",
@@ -541,6 +541,10 @@ nlmsvc_callback_exit(struct rpc_task *task)
 	kfree(call);
 }
 
+static const struct rpc_call_ops nlmsvc_callback_ops = {
+	.rpc_call_done = nlmsvc_callback_exit,
+};
+
 /*
  * NLM Server procedures.
  */
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 079228817603..a834423942c7 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -269,7 +269,6 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq,
 
 		data->task.tk_cookie = (unsigned long) inode;
 		data->task.tk_calldata = data;
-		data->task.tk_release = nfs_readdata_release;
 		data->complete = nfs_direct_read_result;
 
 		lock_kernel();
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 92c870d19ccd..c172a7584646 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -732,19 +732,23 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 
 extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
 
-static void
-nfs3_read_done(struct rpc_task *task)
+static void nfs3_read_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata;
+	struct nfs_read_data *data = calldata;
 
 	if (nfs3_async_handle_jukebox(task))
 		return;
 	/* Call back common NFS readpage processing */
 	if (task->tk_status >= 0)
 		nfs_refresh_inode(data->inode, &data->fattr);
-	nfs_readpage_result(task);
+	nfs_readpage_result(task, calldata);
 }
 
+static const struct rpc_call_ops nfs3_read_ops = {
+	.rpc_call_done = nfs3_read_done,
+	.rpc_release = nfs_readdata_release,
+};
+
 static void
 nfs3_proc_read_setup(struct nfs_read_data *data)
 {
@@ -762,23 +766,26 @@ nfs3_proc_read_setup(struct nfs_read_data *data)
 	flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
 
 	/* Finalize the task. */
-	rpc_init_task(task, NFS_CLIENT(inode), nfs3_read_done, flags);
+	rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_read_ops, data);
 	rpc_call_setup(task, &msg, 0);
 }
 
-static void
-nfs3_write_done(struct rpc_task *task)
+static void nfs3_write_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs_write_data *data;
+	struct nfs_write_data *data = calldata;
 
 	if (nfs3_async_handle_jukebox(task))
 		return;
-	data = (struct nfs_write_data *)task->tk_calldata;
 	if (task->tk_status >= 0)
 		nfs_post_op_update_inode(data->inode, data->res.fattr);
-	nfs_writeback_done(task);
+	nfs_writeback_done(task, calldata);
 }
 
+static const struct rpc_call_ops nfs3_write_ops = {
+	.rpc_call_done = nfs3_write_done,
+	.rpc_release = nfs_writedata_release,
+};
+
 static void
 nfs3_proc_write_setup(struct nfs_write_data *data, int how)
 {
@@ -806,23 +813,26 @@ nfs3_proc_write_setup(struct nfs_write_data *data, int how)
 	flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 
 	/* Finalize the task. */
-	rpc_init_task(task, NFS_CLIENT(inode), nfs3_write_done, flags);
+	rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_write_ops, data);
 	rpc_call_setup(task, &msg, 0);
 }
 
-static void
-nfs3_commit_done(struct rpc_task *task)
+static void nfs3_commit_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs_write_data *data;
+	struct nfs_write_data *data = calldata;
 
 	if (nfs3_async_handle_jukebox(task))
 		return;
-	data = (struct nfs_write_data *)task->tk_calldata;
 	if (task->tk_status >= 0)
 		nfs_post_op_update_inode(data->inode, data->res.fattr);
-	nfs_commit_done(task);
+	nfs_commit_done(task, calldata);
 }
 
+static const struct rpc_call_ops nfs3_commit_ops = {
+	.rpc_call_done = nfs3_commit_done,
+	.rpc_release = nfs_commit_release,
+};
+
 static void
 nfs3_proc_commit_setup(struct nfs_write_data *data, int how)
 {
@@ -840,7 +850,7 @@ nfs3_proc_commit_setup(struct nfs_write_data *data, int how)
 	flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 
 	/* Finalize the task. */
-	rpc_init_task(task, NFS_CLIENT(inode), nfs3_commit_done, flags);
+	rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_commit_ops, data);
 	rpc_call_setup(task, &msg, 0);
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f988a9417b13..3d5d3c07d621 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -196,14 +196,12 @@ static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinf
 
 /* Helper for asynchronous RPC calls */
 static int nfs4_call_async(struct rpc_clnt *clnt, rpc_action tk_begin,
-		rpc_action tk_exit, void *calldata)
+		const struct rpc_call_ops *tk_ops, void *calldata)
 {
 	struct rpc_task *task;
 
-	if (!(task = rpc_new_task(clnt, tk_exit, RPC_TASK_ASYNC)))
+	if (!(task = rpc_new_task(clnt, RPC_TASK_ASYNC, tk_ops, calldata)))
 		return -ENOMEM;
-
-	task->tk_calldata = calldata;
 	task->tk_action = tk_begin;
 	rpc_execute(task);
 	return 0;
@@ -867,10 +865,10 @@ struct nfs4_closedata {
 	struct nfs_fattr fattr;
 };
 
-static void nfs4_free_closedata(struct nfs4_closedata *calldata)
+static void nfs4_free_closedata(void *data)
 {
-	struct nfs4_state *state = calldata->state;
-	struct nfs4_state_owner *sp = state->owner;
+	struct nfs4_closedata *calldata = data;
+	struct nfs4_state_owner *sp = calldata->state->owner;
 
 	nfs4_put_open_state(calldata->state);
 	nfs_free_seqid(calldata->arg.seqid);
@@ -878,9 +876,9 @@ static void nfs4_free_closedata(struct nfs4_closedata *calldata)
 	kfree(calldata);
 }
 
-static void nfs4_close_done(struct rpc_task *task)
+static void nfs4_close_done(struct rpc_task *task, void *data)
 {
-	struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata;
+	struct nfs4_closedata *calldata = data;
 	struct nfs4_state *state = calldata->state;
 	struct nfs_server *server = NFS_SERVER(calldata->inode);
 
@@ -904,7 +902,6 @@ static void nfs4_close_done(struct rpc_task *task)
 			}
 	}
 	nfs_refresh_inode(calldata->inode, calldata->res.fattr);
-	nfs4_free_closedata(calldata);
 }
 
 static void nfs4_close_begin(struct rpc_task *task)
@@ -918,10 +915,8 @@ static void nfs4_close_begin(struct rpc_task *task)
 		.rpc_cred = state->owner->so_cred,
 	};
 	int mode = 0, old_mode;
-	int status;
 
-	status = nfs_wait_on_sequence(calldata->arg.seqid, task);
-	if (status != 0)
+	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
 		return;
 	/* Recalculate the new open mode in case someone reopened the file
 	 * while we were waiting in line to be scheduled.
@@ -937,9 +932,8 @@ static void nfs4_close_begin(struct rpc_task *task)
 	spin_unlock(&calldata->inode->i_lock);
 	spin_unlock(&state->owner->so_lock);
 	if (mode == old_mode || test_bit(NFS_DELEGATED_STATE, &state->flags)) {
-		nfs4_free_closedata(calldata);
-		task->tk_exit = NULL;
-		rpc_exit(task, 0);
+		/* Note: exit _without_ calling nfs4_close_done */
+		task->tk_action = NULL;
 		return;
 	}
 	nfs_fattr_init(calldata->res.fattr);
@@ -949,6 +943,11 @@ static void nfs4_close_begin(struct rpc_task *task)
 	rpc_call_setup(task, &msg, 0);
 }
 
+static const struct rpc_call_ops nfs4_close_ops = {
+	.rpc_call_done = nfs4_close_done,
+	.rpc_release = nfs4_free_closedata,
+};
+
 /* 
  * It is possible for data to be read/written from a mem-mapped file 
  * after the sys_close call (which hits the vfs layer as a flush).
@@ -982,7 +981,7 @@ int nfs4_do_close(struct inode *inode, struct nfs4_state *state)
 	calldata->res.server = server;
 
 	status = nfs4_call_async(server->client, nfs4_close_begin,
-			nfs4_close_done, calldata);
+			&nfs4_close_ops, calldata);
 	if (status == 0)
 		goto out;
 
@@ -2125,10 +2124,9 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 	return err;
 }
 
-static void
-nfs4_read_done(struct rpc_task *task)
+static void nfs4_read_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata;
+	struct nfs_read_data *data = calldata;
 	struct inode *inode = data->inode;
 
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) {
@@ -2138,9 +2136,14 @@ nfs4_read_done(struct rpc_task *task)
 	if (task->tk_status > 0)
 		renew_lease(NFS_SERVER(inode), data->timestamp);
 	/* Call back common NFS readpage processing */
-	nfs_readpage_result(task);
+	nfs_readpage_result(task, calldata);
 }
 
+static const struct rpc_call_ops nfs4_read_ops = {
+	.rpc_call_done = nfs4_read_done,
+	.rpc_release = nfs_readdata_release,
+};
+
 static void
 nfs4_proc_read_setup(struct nfs_read_data *data)
 {
@@ -2160,14 +2163,13 @@ nfs4_proc_read_setup(struct nfs_read_data *data)
 	flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
 
 	/* Finalize the task. */
-	rpc_init_task(task, NFS_CLIENT(inode), nfs4_read_done, flags);
+	rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_read_ops, data);
 	rpc_call_setup(task, &msg, 0);
 }
 
-static void
-nfs4_write_done(struct rpc_task *task)
+static void nfs4_write_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata;
+	struct nfs_write_data *data = calldata;
 	struct inode *inode = data->inode;
 	
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) {
@@ -2179,9 +2181,14 @@ nfs4_write_done(struct rpc_task *task)
 		nfs_post_op_update_inode(inode, data->res.fattr);
 	}
 	/* Call back common NFS writeback processing */
-	nfs_writeback_done(task);
+	nfs_writeback_done(task, calldata);
 }
 
+static const struct rpc_call_ops nfs4_write_ops = {
+	.rpc_call_done = nfs4_write_done,
+	.rpc_release = nfs_writedata_release,
+};
+
 static void
 nfs4_proc_write_setup(struct nfs_write_data *data, int how)
 {
@@ -2214,14 +2221,13 @@ nfs4_proc_write_setup(struct nfs_write_data *data, int how)
 	flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 
 	/* Finalize the task. */
-	rpc_init_task(task, NFS_CLIENT(inode), nfs4_write_done, flags);
+	rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_write_ops, data);
 	rpc_call_setup(task, &msg, 0);
 }
 
-static void
-nfs4_commit_done(struct rpc_task *task)
+static void nfs4_commit_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata;
+	struct nfs_write_data *data = calldata;
 	struct inode *inode = data->inode;
 	
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) {
@@ -2231,9 +2237,14 @@ nfs4_commit_done(struct rpc_task *task)
 	if (task->tk_status >= 0)
 		nfs_post_op_update_inode(inode, data->res.fattr);
 	/* Call back common NFS writeback processing */
-	nfs_commit_done(task);
+	nfs_commit_done(task, calldata);
 }
 
+static const struct rpc_call_ops nfs4_commit_ops = {
+	.rpc_call_done = nfs4_commit_done,
+	.rpc_release = nfs_commit_release,
+};
+
 static void
 nfs4_proc_commit_setup(struct nfs_write_data *data, int how)
 {
@@ -2255,7 +2266,7 @@ nfs4_proc_commit_setup(struct nfs_write_data *data, int how)
 	flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 
 	/* Finalize the task. */
-	rpc_init_task(task, NFS_CLIENT(inode), nfs4_commit_done, flags);
+	rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_commit_ops, data);
 	rpc_call_setup(task, &msg, 0);	
 }
 
@@ -2263,11 +2274,10 @@ nfs4_proc_commit_setup(struct nfs_write_data *data, int how)
  * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
  * standalone procedure for queueing an asynchronous RENEW.
  */
-static void
-renew_done(struct rpc_task *task)
+static void nfs4_renew_done(struct rpc_task *task, void *data)
 {
 	struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp;
-	unsigned long timestamp = (unsigned long)task->tk_calldata;
+	unsigned long timestamp = (unsigned long)data;
 
 	if (task->tk_status < 0) {
 		switch (task->tk_status) {
@@ -2284,6 +2294,10 @@ renew_done(struct rpc_task *task)
 	spin_unlock(&clp->cl_lock);
 }
 
+static const struct rpc_call_ops nfs4_renew_ops = {
+	.rpc_call_done = nfs4_renew_done,
+};
+
 int
 nfs4_proc_async_renew(struct nfs4_client *clp)
 {
@@ -2294,7 +2308,7 @@ nfs4_proc_async_renew(struct nfs4_client *clp)
 	};
 
 	return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
-			renew_done, (void *)jiffies);
+			&nfs4_renew_ops, (void *)jiffies);
 }
 
 int
@@ -2866,15 +2880,16 @@ static void nfs4_locku_release_calldata(struct nfs4_unlockdata *calldata)
 	}
 }
 
-static void nfs4_locku_complete(struct nfs4_unlockdata *calldata)
+static void nfs4_locku_complete(void *data)
 {
+	struct nfs4_unlockdata *calldata = data;
 	complete(&calldata->completion);
 	nfs4_locku_release_calldata(calldata);
 }
 
-static void nfs4_locku_done(struct rpc_task *task)
+static void nfs4_locku_done(struct rpc_task *task, void *data)
 {
-	struct nfs4_unlockdata *calldata = (struct nfs4_unlockdata *)task->tk_calldata;
+	struct nfs4_unlockdata *calldata = data;
 
 	nfs_increment_lock_seqid(task->tk_status, calldata->luargs.seqid);
 	switch (task->tk_status) {
@@ -2890,10 +2905,8 @@ static void nfs4_locku_done(struct rpc_task *task)
 		default:
 			if (nfs4_async_handle_error(task, calldata->res.server) == -EAGAIN) {
 				rpc_restart_call(task);
-				return;
 			}
 	}
-	nfs4_locku_complete(calldata);
 }
 
 static void nfs4_locku_begin(struct rpc_task *task)
@@ -2911,14 +2924,18 @@ static void nfs4_locku_begin(struct rpc_task *task)
 	if (status != 0)
 		return;
 	if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) {
-		nfs4_locku_complete(calldata);
-		task->tk_exit = NULL;
-		rpc_exit(task, 0);
+		/* Note: exit _without_ running nfs4_locku_done */
+		task->tk_action = NULL;
 		return;
 	}
 	rpc_call_setup(task, &msg, 0);
 }
 
+static const struct rpc_call_ops nfs4_locku_ops = {
+	.rpc_call_done = nfs4_locku_done,
+	.rpc_release = nfs4_locku_complete,
+};
+
 static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
 	struct nfs4_unlockdata *calldata;
@@ -2963,7 +2980,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	init_completion(&calldata->completion);
 
 	status = nfs4_call_async(NFS_SERVER(inode)->client, nfs4_locku_begin,
-			nfs4_locku_done, calldata);
+			&nfs4_locku_ops, calldata);
 	if (status == 0)
 		wait_for_completion_interruptible(&calldata->completion);
 	do_vfs_lock(request->fl_file, request);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index e1e3ca5d746b..6145e82b45e8 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -547,10 +547,9 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 
 extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int);
 
-static void
-nfs_read_done(struct rpc_task *task)
+static void nfs_read_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata;
+	struct nfs_read_data *data = calldata;
 
 	if (task->tk_status >= 0) {
 		nfs_refresh_inode(data->inode, data->res.fattr);
@@ -560,9 +559,14 @@ nfs_read_done(struct rpc_task *task)
 		if (data->args.offset + data->args.count >= data->res.fattr->size)
 			data->res.eof = 1;
 	}
-	nfs_readpage_result(task);
+	nfs_readpage_result(task, calldata);
 }
 
+static const struct rpc_call_ops nfs_read_ops = {
+	.rpc_call_done = nfs_read_done,
+	.rpc_release = nfs_readdata_release,
+};
+
 static void
 nfs_proc_read_setup(struct nfs_read_data *data)
 {
@@ -580,20 +584,24 @@ nfs_proc_read_setup(struct nfs_read_data *data)
 	flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
 
 	/* Finalize the task. */
-	rpc_init_task(task, NFS_CLIENT(inode), nfs_read_done, flags);
+	rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs_read_ops, data);
 	rpc_call_setup(task, &msg, 0);
 }
 
-static void
-nfs_write_done(struct rpc_task *task)
+static void nfs_write_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata;
+	struct nfs_write_data *data = calldata;
 
 	if (task->tk_status >= 0)
 		nfs_post_op_update_inode(data->inode, data->res.fattr);
-	nfs_writeback_done(task);
+	nfs_writeback_done(task, calldata);
 }
 
+static const struct rpc_call_ops nfs_write_ops = {
+	.rpc_call_done = nfs_write_done,
+	.rpc_release = nfs_writedata_release,
+};
+
 static void
 nfs_proc_write_setup(struct nfs_write_data *data, int how)
 {
@@ -614,7 +622,7 @@ nfs_proc_write_setup(struct nfs_write_data *data, int how)
 	flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 
 	/* Finalize the task. */
-	rpc_init_task(task, NFS_CLIENT(inode), nfs_write_done, flags);
+	rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs_write_ops, data);
 	rpc_call_setup(task, &msg, 0);
 }
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 5f20eafba8ec..21486242c3d3 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -42,9 +42,8 @@ mempool_t *nfs_rdata_mempool;
 
 #define MIN_POOL_READ	(32)
 
-void nfs_readdata_release(struct rpc_task *task)
+void nfs_readdata_release(void *data)
 {
-        struct nfs_read_data   *data = (struct nfs_read_data *)task->tk_calldata;
         nfs_readdata_free(data);
 }
 
@@ -220,9 +219,6 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 	NFS_PROTO(inode)->read_setup(data);
 
 	data->task.tk_cookie = (unsigned long)inode;
-	data->task.tk_calldata = data;
-	/* Release requests */
-	data->task.tk_release = nfs_readdata_release;
 
 	dprintk("NFS: %4d initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
 			data->task.tk_pid,
@@ -452,9 +448,9 @@ static void nfs_readpage_result_full(struct nfs_read_data *data, int status)
  * This is the callback from RPC telling us whether a reply was
  * received or some error occurred (timeout or socket shutdown).
  */
-void nfs_readpage_result(struct rpc_task *task)
+void nfs_readpage_result(struct rpc_task *task, void *calldata)
 {
-	struct nfs_read_data *data = (struct nfs_read_data *)task->tk_calldata;
+	struct nfs_read_data *data = calldata;
 	struct nfs_readargs *argp = &data->args;
 	struct nfs_readres *resp = &data->res;
 	int status = task->tk_status;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index d639d172d568..1494484ba86d 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -116,10 +116,9 @@ nfs_async_unlink_init(struct rpc_task *task)
  *
  * Do the directory attribute update.
  */
-static void
-nfs_async_unlink_done(struct rpc_task *task)
+static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs_unlinkdata	*data = (struct nfs_unlinkdata *)task->tk_calldata;
+	struct nfs_unlinkdata	*data = calldata;
 	struct dentry		*dir = data->dir;
 	struct inode		*dir_i;
 
@@ -141,13 +140,17 @@ nfs_async_unlink_done(struct rpc_task *task)
  * We need to call nfs_put_unlinkdata as a 'tk_release' task since the
  * rpc_task would be freed too.
  */
-static void
-nfs_async_unlink_release(struct rpc_task *task)
+static void nfs_async_unlink_release(void *calldata)
 {
-	struct nfs_unlinkdata	*data = (struct nfs_unlinkdata *)task->tk_calldata;
+	struct nfs_unlinkdata	*data = calldata;
 	nfs_put_unlinkdata(data);
 }
 
+static const struct rpc_call_ops nfs_unlink_ops = {
+	.rpc_call_done = nfs_async_unlink_done,
+	.rpc_release = nfs_async_unlink_release,
+};
+
 /**
  * nfs_async_unlink - asynchronous unlinking of a file
  * @dentry: dentry to unlink
@@ -179,10 +182,8 @@ nfs_async_unlink(struct dentry *dentry)
 	data->count = 1;
 
 	task = &data->task;
-	rpc_init_task(task, clnt, nfs_async_unlink_done , RPC_TASK_ASYNC);
-	task->tk_calldata = data;
+	rpc_init_task(task, clnt, RPC_TASK_ASYNC, &nfs_unlink_ops, data);
 	task->tk_action = nfs_async_unlink_init;
-	task->tk_release = nfs_async_unlink_release;
 
 	spin_lock(&dentry->d_lock);
 	dentry->d_flags |= DCACHE_NFSFS_RENAMED;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 95d00f9132d0..80bc4ea1b824 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -104,9 +104,8 @@ static inline void nfs_commit_free(struct nfs_write_data *p)
 	mempool_free(p, nfs_commit_mempool);
 }
 
-static void nfs_writedata_release(struct rpc_task *task)
+void nfs_writedata_release(void *wdata)
 {
-	struct nfs_write_data	*wdata = (struct nfs_write_data *)task->tk_calldata;
 	nfs_writedata_free(wdata);
 }
 
@@ -871,9 +870,6 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 
 	data->task.tk_priority = flush_task_priority(how);
 	data->task.tk_cookie = (unsigned long)inode;
-	data->task.tk_calldata = data;
-	/* Release requests */
-	data->task.tk_release = nfs_writedata_release;
 
 	dprintk("NFS: %4d initiated write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
 		data->task.tk_pid,
@@ -1131,9 +1127,9 @@ static void nfs_writeback_done_full(struct nfs_write_data *data, int status)
 /*
  * This function is called when the WRITE call is complete.
  */
-void nfs_writeback_done(struct rpc_task *task)
+void nfs_writeback_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs_write_data	*data = (struct nfs_write_data *) task->tk_calldata;
+	struct nfs_write_data	*data = calldata;
 	struct nfs_writeargs	*argp = &data->args;
 	struct nfs_writeres	*resp = &data->res;
 
@@ -1200,9 +1196,8 @@ void nfs_writeback_done(struct rpc_task *task)
 
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-static void nfs_commit_release(struct rpc_task *task)
+void nfs_commit_release(void *wdata)
 {
-	struct nfs_write_data	*wdata = (struct nfs_write_data *)task->tk_calldata;
 	nfs_commit_free(wdata);
 }
 
@@ -1238,9 +1233,6 @@ static void nfs_commit_rpcsetup(struct list_head *head,
 
 	data->task.tk_priority = flush_task_priority(how);
 	data->task.tk_cookie = (unsigned long)inode;
-	data->task.tk_calldata = data;
-	/* Release requests */
-	data->task.tk_release = nfs_commit_release;
 	
 	dprintk("NFS: %4d initiated commit call\n", data->task.tk_pid);
 }
@@ -1277,10 +1269,9 @@ nfs_commit_list(struct list_head *head, int how)
 /*
  * COMMIT call returned
  */
-void
-nfs_commit_done(struct rpc_task *task)
+void nfs_commit_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs_write_data	*data = (struct nfs_write_data *)task->tk_calldata;
+	struct nfs_write_data	*data = calldata;
 	struct nfs_page		*req;
 	int res = 0;
 
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 583c0710e45e..cf92008f219a 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -53,7 +53,7 @@
 #define NFSPROC4_CB_COMPOUND 1
 
 /* declarations */
-static void nfs4_cb_null(struct rpc_task *task);
+static const struct rpc_call_ops nfs4_cb_null_ops;
 
 /* Index of predefined Linux callback client operations */
 
@@ -447,7 +447,7 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 	msg.rpc_cred = nfsd4_lookupcred(clp,0);
 	if (IS_ERR(msg.rpc_cred))
 		goto out_rpciod;
-	status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, nfs4_cb_null, NULL);
+	status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, &nfs4_cb_null_ops, NULL);
 	put_rpccred(msg.rpc_cred);
 
 	if (status != 0) {
@@ -469,7 +469,7 @@ out_err:
 }
 
 static void
-nfs4_cb_null(struct rpc_task *task)
+nfs4_cb_null(struct rpc_task *task, void *dummy)
 {
 	struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp;
 	struct nfs4_callback *cb = &clp->cl_callback;
@@ -488,6 +488,10 @@ out:
 	put_nfs4_client(clp);
 }
 
+static const struct rpc_call_ops nfs4_cb_null_ops = {
+	.rpc_call_done = nfs4_cb_null,
+};
+
 /*
  * called with dp->dl_count inc'ed.
  * nfs4_lock_state() may or may not have been called.
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 16d4e5a08e1d..95c8fea293ba 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -172,7 +172,7 @@ extern struct nlm_host *nlm_find_client(void);
 /*
  * Server-side lock handling
  */
-int		  nlmsvc_async_call(struct nlm_rqst *, u32, rpc_action);
+int		  nlmsvc_async_call(struct nlm_rqst *, u32, const struct rpc_call_ops *);
 u32		  nlmsvc_lock(struct svc_rqst *, struct nlm_file *,
 					struct nlm_lock *, int, struct nlm_cookie *);
 u32		  nlmsvc_unlock(struct nlm_file *, struct nlm_lock *);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 2516adeccecf..4dff705d2ff2 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -406,10 +406,12 @@ extern int  nfs_writepage(struct page *page, struct writeback_control *wbc);
 extern int  nfs_writepages(struct address_space *, struct writeback_control *);
 extern int  nfs_flush_incompatible(struct file *file, struct page *page);
 extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
-extern void nfs_writeback_done(struct rpc_task *task);
+extern void nfs_writeback_done(struct rpc_task *task, void *data);
+extern void nfs_writedata_release(void *data);
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-extern void nfs_commit_done(struct rpc_task *);
+extern void nfs_commit_done(struct rpc_task *, void *data);
+extern void nfs_commit_release(void *data);
 #endif
 
 /*
@@ -481,7 +483,9 @@ static inline void nfs_writedata_free(struct nfs_write_data *p)
 extern int  nfs_readpage(struct file *, struct page *);
 extern int  nfs_readpages(struct file *, struct address_space *,
 		struct list_head *, unsigned);
-extern void nfs_readpage_result(struct rpc_task *);
+extern void nfs_readpage_result(struct rpc_task *, void *);
+extern void  nfs_readdata_release(void *data);
+
 
 /*
  * Allocate and free nfs_read_data structures
@@ -501,8 +505,6 @@ static inline void nfs_readdata_free(struct nfs_read_data *p)
 	mempool_free(p, nfs_rdata_mempool);
 }
 
-extern void  nfs_readdata_release(struct rpc_task *task);
-
 /*
  * linux/fs/nfs3proc.c
  */
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index ab151bbb66df..b0ab959eca65 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -126,7 +126,8 @@ int		rpc_register(u32, u32, int, unsigned short, int *);
 void		rpc_call_setup(struct rpc_task *, struct rpc_message *, int);
 
 int		rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg,
-			       int flags, rpc_action callback, void *clntdata);
+			       int flags, const struct rpc_call_ops *tk_ops,
+			       void *calldata);
 int		rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg,
 			      int flags);
 void		rpc_restart_call(struct rpc_task *);
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 4c4b2dc8aca5..581d8cdc3b86 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -27,6 +27,7 @@ struct rpc_message {
 	struct rpc_cred *	rpc_cred;	/* Credentials */
 };
 
+struct rpc_call_ops;
 struct rpc_wait_queue;
 struct rpc_wait {
 	struct list_head	list;		/* wait queue links */
@@ -61,13 +62,12 @@ struct rpc_task {
 	 * timeout_fn   to be executed by timer bottom half
 	 * callback	to be executed after waking up
 	 * action	next procedure for async tasks
-	 * exit		exit async task and report to caller
+	 * tk_ops	caller callbacks
 	 */
 	void			(*tk_timeout_fn)(struct rpc_task *);
 	void			(*tk_callback)(struct rpc_task *);
 	void			(*tk_action)(struct rpc_task *);
-	void			(*tk_exit)(struct rpc_task *);
-	void			(*tk_release)(struct rpc_task *);
+	const struct rpc_call_ops *tk_ops;
 	void *			tk_calldata;
 
 	/*
@@ -111,6 +111,12 @@ struct rpc_task {
 
 typedef void			(*rpc_action)(struct rpc_task *);
 
+struct rpc_call_ops {
+	void (*rpc_call_done)(struct rpc_task *, void *);
+	void (*rpc_release)(void *);
+};
+
+
 /*
  * RPC task flags
  */
@@ -228,10 +234,12 @@ struct rpc_wait_queue {
 /*
  * Function prototypes
  */
-struct rpc_task *rpc_new_task(struct rpc_clnt *, rpc_action, int flags);
+struct rpc_task *rpc_new_task(struct rpc_clnt *, int flags,
+				const struct rpc_call_ops *ops, void *data);
 struct rpc_task *rpc_new_child(struct rpc_clnt *, struct rpc_task *parent);
-void		rpc_init_task(struct rpc_task *, struct rpc_clnt *,
-					rpc_action exitfunc, int flags);
+void		rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt,
+				int flags, const struct rpc_call_ops *ops,
+				void *data);
 void		rpc_release_task(struct rpc_task *);
 void		rpc_exit_task(struct rpc_task *);
 void		rpc_killall_tasks(struct rpc_clnt *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 6ab4cbd8a901..8b2f75bc006d 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -374,10 +374,14 @@ out:
  * Default callback for async RPC calls
  */
 static void
-rpc_default_callback(struct rpc_task *task)
+rpc_default_callback(struct rpc_task *task, void *data)
 {
 }
 
+static const struct rpc_call_ops rpc_default_ops = {
+	.rpc_call_done = rpc_default_callback,
+};
+
 /*
  *	Export the signal mask handling for synchronous code that
  *	sleeps on RPC calls
@@ -432,7 +436,7 @@ int rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
 	BUG_ON(flags & RPC_TASK_ASYNC);
 
 	status = -ENOMEM;
-	task = rpc_new_task(clnt, NULL, flags);
+	task = rpc_new_task(clnt, flags, &rpc_default_ops, NULL);
 	if (task == NULL)
 		goto out;
 
@@ -459,7 +463,7 @@ out:
  */
 int
 rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
-	       rpc_action callback, void *data)
+	       const struct rpc_call_ops *tk_ops, void *data)
 {
 	struct rpc_task	*task;
 	sigset_t	oldset;
@@ -472,12 +476,9 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
 	flags |= RPC_TASK_ASYNC;
 
 	/* Create/initialize a new RPC task */
-	if (!callback)
-		callback = rpc_default_callback;
 	status = -ENOMEM;
-	if (!(task = rpc_new_task(clnt, callback, flags)))
+	if (!(task = rpc_new_task(clnt, flags, tk_ops, data)))
 		goto out;
-	task->tk_calldata = data;
 
 	/* Mask signals on GSS_AUTH upcalls */
 	rpc_task_sigmask(task, &oldset);		
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 3fcf7b0e1f6c..8d6233d3248b 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -555,13 +555,13 @@ __rpc_atrun(struct rpc_task *task)
 }
 
 /*
- * Helper that calls task->tk_exit if it exists
+ * Helper that calls task->tk_ops->rpc_call_done if it exists
  */
 void rpc_exit_task(struct rpc_task *task)
 {
 	task->tk_action = NULL;
-	if (task->tk_exit != NULL) {
-		task->tk_exit(task);
+	if (task->tk_ops->rpc_call_done != NULL) {
+		task->tk_ops->rpc_call_done(task, task->tk_calldata);
 		if (task->tk_action != NULL) {
 			WARN_ON(RPC_ASSASSINATED(task));
 			/* Always release the RPC slot and buffer memory */
@@ -747,7 +747,7 @@ rpc_free(struct rpc_task *task)
 /*
  * Creation and deletion of RPC task structures
  */
-void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, rpc_action callback, int flags)
+void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, int flags, const struct rpc_call_ops *tk_ops, void *calldata)
 {
 	memset(task, 0, sizeof(*task));
 	init_timer(&task->tk_timer);
@@ -755,7 +755,8 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, rpc_action call
 	task->tk_timer.function = (void (*)(unsigned long)) rpc_run_timer;
 	task->tk_client = clnt;
 	task->tk_flags  = flags;
-	task->tk_exit   = callback;
+	task->tk_ops = tk_ops;
+	task->tk_calldata = calldata;
 
 	/* Initialize retry counters */
 	task->tk_garb_retry = 2;
@@ -784,6 +785,8 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, rpc_action call
 	list_add_tail(&task->tk_task, &all_tasks);
 	spin_unlock(&rpc_sched_lock);
 
+	BUG_ON(task->tk_ops == NULL);
+
 	dprintk("RPC: %4d new task procpid %d\n", task->tk_pid,
 				current->pid);
 }
@@ -794,8 +797,7 @@ rpc_alloc_task(void)
 	return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS);
 }
 
-static void
-rpc_default_free_task(struct rpc_task *task)
+static void rpc_free_task(struct rpc_task *task)
 {
 	dprintk("RPC: %4d freeing task\n", task->tk_pid);
 	mempool_free(task, rpc_task_mempool);
@@ -806,8 +808,7 @@ rpc_default_free_task(struct rpc_task *task)
  * clean up after an allocation failure, as the client may
  * have specified "oneshot".
  */
-struct rpc_task *
-rpc_new_task(struct rpc_clnt *clnt, rpc_action callback, int flags)
+struct rpc_task *rpc_new_task(struct rpc_clnt *clnt, int flags, const struct rpc_call_ops *tk_ops, void *calldata)
 {
 	struct rpc_task	*task;
 
@@ -815,10 +816,7 @@ rpc_new_task(struct rpc_clnt *clnt, rpc_action callback, int flags)
 	if (!task)
 		goto cleanup;
 
-	rpc_init_task(task, clnt, callback, flags);
-
-	/* Replace tk_release */
-	task->tk_release = rpc_default_free_task;
+	rpc_init_task(task, clnt, flags, tk_ops, calldata);
 
 	dprintk("RPC: %4d allocated task\n", task->tk_pid);
 	task->tk_flags |= RPC_TASK_DYNAMIC;
@@ -838,6 +836,8 @@ cleanup:
 
 void rpc_release_task(struct rpc_task *task)
 {
+	const struct rpc_call_ops *tk_ops = task->tk_ops;
+	void *calldata = task->tk_calldata;
 	dprintk("RPC: %4d release task\n", task->tk_pid);
 
 #ifdef RPC_DEBUG
@@ -869,8 +869,10 @@ void rpc_release_task(struct rpc_task *task)
 #ifdef RPC_DEBUG
 	task->tk_magic = 0;
 #endif
-	if (task->tk_release)
-		task->tk_release(task);
+	if (task->tk_flags & RPC_TASK_DYNAMIC)
+		rpc_free_task(task);
+	if (tk_ops->rpc_release)
+		tk_ops->rpc_release(calldata);
 }
 
 /**
@@ -883,12 +885,11 @@ void rpc_release_task(struct rpc_task *task)
  *
  * Caller must hold childq.lock
  */
-static inline struct rpc_task *rpc_find_parent(struct rpc_task *child)
+static inline struct rpc_task *rpc_find_parent(struct rpc_task *child, struct rpc_task *parent)
 {
-	struct rpc_task	*task, *parent;
+	struct rpc_task	*task;
 	struct list_head *le;
 
-	parent = (struct rpc_task *) child->tk_calldata;
 	task_for_each(task, le, &childq.tasks[0])
 		if (task == parent)
 			return parent;
@@ -896,18 +897,22 @@ static inline struct rpc_task *rpc_find_parent(struct rpc_task *child)
 	return NULL;
 }
 
-static void rpc_child_exit(struct rpc_task *child)
+static void rpc_child_exit(struct rpc_task *child, void *calldata)
 {
 	struct rpc_task	*parent;
 
 	spin_lock_bh(&childq.lock);
-	if ((parent = rpc_find_parent(child)) != NULL) {
+	if ((parent = rpc_find_parent(child, calldata)) != NULL) {
 		parent->tk_status = child->tk_status;
 		__rpc_wake_up_task(parent);
 	}
 	spin_unlock_bh(&childq.lock);
 }
 
+static const struct rpc_call_ops rpc_child_ops = {
+	.rpc_call_done = rpc_child_exit,
+};
+
 /*
  * Note: rpc_new_task releases the client after a failure.
  */
@@ -916,11 +921,9 @@ rpc_new_child(struct rpc_clnt *clnt, struct rpc_task *parent)
 {
 	struct rpc_task	*task;
 
-	task = rpc_new_task(clnt, NULL, RPC_TASK_ASYNC | RPC_TASK_CHILD);
+	task = rpc_new_task(clnt, RPC_TASK_ASYNC | RPC_TASK_CHILD, &rpc_child_ops, parent);
 	if (!task)
 		goto fail;
-	task->tk_exit = rpc_child_exit;
-	task->tk_calldata = parent;
 	return task;
 
 fail:
@@ -1056,7 +1059,7 @@ void rpc_show_tasks(void)
 		return;
 	}
 	printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout "
-		"-rpcwait -action- --exit--\n");
+		"-rpcwait -action- ---ops--\n");
 	alltask_for_each(t, le, &all_tasks) {
 		const char *rpc_waitq = "none";
 
@@ -1071,7 +1074,7 @@ void rpc_show_tasks(void)
 			(t->tk_client ? t->tk_client->cl_prog : 0),
 			t->tk_rqstp, t->tk_timeout,
 			rpc_waitq,
-			t->tk_action, t->tk_exit);
+			t->tk_action, t->tk_ops);
 	}
 	spin_unlock(&rpc_sched_lock);
 }
-- 
cgit v1.2.3-71-gd317


From 4ce70ada1ff1d0b80916ec9ec5764ce44a50a54f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 3 Jan 2006 09:55:05 +0100
Subject: SUNRPC: Further cleanups

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c            | 21 +++++++++++----------
 fs/nfs/unlink.c              | 13 +++++--------
 include/linux/sunrpc/sched.h |  1 +
 net/sunrpc/sched.c           | 10 ++++++++++
 4 files changed, 27 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3d5d3c07d621..368b75b3bcba 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -195,14 +195,13 @@ static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinf
 }
 
 /* Helper for asynchronous RPC calls */
-static int nfs4_call_async(struct rpc_clnt *clnt, rpc_action tk_begin,
+static int nfs4_call_async(struct rpc_clnt *clnt,
 		const struct rpc_call_ops *tk_ops, void *calldata)
 {
 	struct rpc_task *task;
 
 	if (!(task = rpc_new_task(clnt, RPC_TASK_ASYNC, tk_ops, calldata)))
 		return -ENOMEM;
-	task->tk_action = tk_begin;
 	rpc_execute(task);
 	return 0;
 }
@@ -882,6 +881,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 	struct nfs4_state *state = calldata->state;
 	struct nfs_server *server = NFS_SERVER(calldata->inode);
 
+	if (RPC_ASSASSINATED(task))
+		return;
         /* hmm. we are done with the inode, and in the process of freeing
 	 * the state_owner. we keep this around to process errors
 	 */
@@ -904,9 +905,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 	nfs_refresh_inode(calldata->inode, calldata->res.fattr);
 }
 
-static void nfs4_close_begin(struct rpc_task *task)
+static void nfs4_close_prepare(struct rpc_task *task, void *data)
 {
-	struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata;
+	struct nfs4_closedata *calldata = data;
 	struct nfs4_state *state = calldata->state;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE],
@@ -944,6 +945,7 @@ static void nfs4_close_begin(struct rpc_task *task)
 }
 
 static const struct rpc_call_ops nfs4_close_ops = {
+	.rpc_call_prepare = nfs4_close_prepare,
 	.rpc_call_done = nfs4_close_done,
 	.rpc_release = nfs4_free_closedata,
 };
@@ -980,8 +982,7 @@ int nfs4_do_close(struct inode *inode, struct nfs4_state *state)
 	calldata->res.fattr = &calldata->fattr;
 	calldata->res.server = server;
 
-	status = nfs4_call_async(server->client, nfs4_close_begin,
-			&nfs4_close_ops, calldata);
+	status = nfs4_call_async(server->client, &nfs4_close_ops, calldata);
 	if (status == 0)
 		goto out;
 
@@ -2909,9 +2910,9 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 	}
 }
 
-static void nfs4_locku_begin(struct rpc_task *task)
+static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 {
-	struct nfs4_unlockdata *calldata = (struct nfs4_unlockdata *)task->tk_calldata;
+	struct nfs4_unlockdata *calldata = data;
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_LOCKU],
 		.rpc_argp       = &calldata->arg,
@@ -2932,6 +2933,7 @@ static void nfs4_locku_begin(struct rpc_task *task)
 }
 
 static const struct rpc_call_ops nfs4_locku_ops = {
+	.rpc_call_prepare = nfs4_locku_prepare,
 	.rpc_call_done = nfs4_locku_done,
 	.rpc_release = nfs4_locku_complete,
 };
@@ -2979,8 +2981,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	atomic_set(&calldata->refcount, 2);
 	init_completion(&calldata->completion);
 
-	status = nfs4_call_async(NFS_SERVER(inode)->client, nfs4_locku_begin,
-			&nfs4_locku_ops, calldata);
+	status = nfs4_call_async(NFS_SERVER(inode)->client, &nfs4_locku_ops, calldata);
 	if (status == 0)
 		wait_for_completion_interruptible(&calldata->completion);
 	do_vfs_lock(request->fl_file, request);
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 1494484ba86d..a65c7b53d558 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -87,10 +87,9 @@ nfs_copy_dname(struct dentry *dentry, struct nfs_unlinkdata *data)
  * We delay initializing RPC info until after the call to dentry_iput()
  * in order to minimize races against rename().
  */
-static void
-nfs_async_unlink_init(struct rpc_task *task)
+static void nfs_async_unlink_init(struct rpc_task *task, void *calldata)
 {
-	struct nfs_unlinkdata	*data = (struct nfs_unlinkdata *)task->tk_calldata;
+	struct nfs_unlinkdata	*data = calldata;
 	struct dentry		*dir = data->dir;
 	struct rpc_message	msg = {
 		.rpc_cred	= data->cred,
@@ -147,6 +146,7 @@ static void nfs_async_unlink_release(void *calldata)
 }
 
 static const struct rpc_call_ops nfs_unlink_ops = {
+	.rpc_call_prepare = nfs_async_unlink_init,
 	.rpc_call_done = nfs_async_unlink_done,
 	.rpc_release = nfs_async_unlink_release,
 };
@@ -160,7 +160,6 @@ nfs_async_unlink(struct dentry *dentry)
 {
 	struct dentry	*dir = dentry->d_parent;
 	struct nfs_unlinkdata	*data;
-	struct rpc_task	*task;
 	struct rpc_clnt	*clnt = NFS_CLIENT(dir->d_inode);
 	int		status = -ENOMEM;
 
@@ -181,15 +180,13 @@ nfs_async_unlink(struct dentry *dentry)
 	nfs_deletes = data;
 	data->count = 1;
 
-	task = &data->task;
-	rpc_init_task(task, clnt, RPC_TASK_ASYNC, &nfs_unlink_ops, data);
-	task->tk_action = nfs_async_unlink_init;
+	rpc_init_task(&data->task, clnt, RPC_TASK_ASYNC, &nfs_unlink_ops, data);
 
 	spin_lock(&dentry->d_lock);
 	dentry->d_flags |= DCACHE_NFSFS_RENAMED;
 	spin_unlock(&dentry->d_lock);
 
-	rpc_sleep_on(&nfs_delete_queue, task, NULL, NULL);
+	rpc_sleep_on(&nfs_delete_queue, &data->task, NULL, NULL);
 	status = 0;
  out:
 	return status;
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 581d8cdc3b86..ac1326fc3e1a 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -112,6 +112,7 @@ struct rpc_task {
 typedef void			(*rpc_action)(struct rpc_task *);
 
 struct rpc_call_ops {
+	void (*rpc_call_prepare)(struct rpc_task *, void *);
 	void (*rpc_call_done)(struct rpc_task *, void *);
 	void (*rpc_release)(void *);
 };
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 8d6233d3248b..2d74a1672028 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -554,6 +554,14 @@ __rpc_atrun(struct rpc_task *task)
 	rpc_wake_up_task(task);
 }
 
+/*
+ * Helper to call task->tk_ops->rpc_call_prepare
+ */
+static void rpc_prepare_task(struct rpc_task *task)
+{
+	task->tk_ops->rpc_call_prepare(task, task->tk_calldata);
+}
+
 /*
  * Helper that calls task->tk_ops->rpc_call_done if it exists
  */
@@ -756,6 +764,8 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, int flags, cons
 	task->tk_client = clnt;
 	task->tk_flags  = flags;
 	task->tk_ops = tk_ops;
+	if (tk_ops->rpc_call_prepare != NULL)
+		task->tk_action = rpc_prepare_task;
 	task->tk_calldata = calldata;
 
 	/* Initialize retry counters */
-- 
cgit v1.2.3-71-gd317


From 44c288732fdbd7e38460d156a40d29590bf93bce Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 3 Jan 2006 09:55:06 +0100
Subject: NFSv4: stateful NFSv4 RPC call interface

 The NFSv4 model requires us to complete all RPC calls that might
 establish state on the server whether or not the user wants to
 interrupt it. We may also need to schedule new work (including
 new RPC calls) in order to cancel the new state.

 The asynchronous RPC model will allow us to ensure that RPC calls
 always complete, but in order to allow for "synchronous" RPC, we
 want to add the ability to wait for completion.
 The waits are, of course, interruptible.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c              |  1 -
 include/linux/sunrpc/sched.h | 21 ++++++++++--
 net/sunrpc/sched.c           | 78 +++++++++++++++++++++++++++++++++-----------
 3 files changed, 78 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index a834423942c7..ae2be0744191 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -268,7 +268,6 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq,
 		NFS_PROTO(inode)->read_setup(data);
 
 		data->task.tk_cookie = (unsigned long) inode;
-		data->task.tk_calldata = data;
 		data->complete = nfs_direct_read_result;
 
 		lock_kernel();
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index ac1326fc3e1a..94b0afa4ab05 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -42,6 +42,7 @@ struct rpc_task {
 #ifdef RPC_DEBUG
 	unsigned long		tk_magic;	/* 0xf00baa */
 #endif
+	atomic_t		tk_count;	/* Reference count */
 	struct list_head	tk_task;	/* global list of tasks */
 	struct rpc_clnt *	tk_client;	/* RPC client */
 	struct rpc_rqst *	tk_rqstp;	/* RPC request */
@@ -78,7 +79,6 @@ struct rpc_task {
 	struct timer_list	tk_timer;	/* kernel timer */
 	unsigned long		tk_timeout;	/* timeout for rpc_sleep() */
 	unsigned short		tk_flags;	/* misc flags */
-	unsigned char		tk_active   : 1;/* Task has been activated */
 	unsigned char		tk_priority : 2;/* Task priority */
 	unsigned long		tk_runstate;	/* Task run status */
 	struct workqueue_struct	*tk_workqueue;	/* Normally rpciod, but could
@@ -136,7 +136,6 @@ struct rpc_call_ops {
 #define RPC_IS_SWAPPER(t)	((t)->tk_flags & RPC_TASK_SWAPPER)
 #define RPC_DO_ROOTOVERRIDE(t)	((t)->tk_flags & RPC_TASK_ROOTCREDS)
 #define RPC_ASSASSINATED(t)	((t)->tk_flags & RPC_TASK_KILLED)
-#define RPC_IS_ACTIVATED(t)	((t)->tk_active)
 #define RPC_DO_CALLBACK(t)	((t)->tk_callback != NULL)
 #define RPC_IS_SOFT(t)		((t)->tk_flags & RPC_TASK_SOFT)
 #define RPC_TASK_UNINTERRUPTIBLE(t) ((t)->tk_flags & RPC_TASK_NOINTR)
@@ -145,6 +144,7 @@ struct rpc_call_ops {
 #define RPC_TASK_QUEUED		1
 #define RPC_TASK_WAKEUP		2
 #define RPC_TASK_HAS_TIMER	3
+#define RPC_TASK_ACTIVE		4
 
 #define RPC_IS_RUNNING(t)	(test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate))
 #define rpc_set_running(t)	(set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate))
@@ -175,6 +175,15 @@ struct rpc_call_ops {
 		smp_mb__after_clear_bit(); \
 	} while (0)
 
+#define RPC_IS_ACTIVATED(t)	(test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate))
+#define rpc_set_active(t)	(set_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate))
+#define rpc_clear_active(t)	\
+	do { \
+		smp_mb__before_clear_bit(); \
+		clear_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate); \
+		smp_mb__after_clear_bit(); \
+	} while(0)
+
 /*
  * Task priorities.
  * Note: if you change these, you must also change
@@ -237,6 +246,8 @@ struct rpc_wait_queue {
  */
 struct rpc_task *rpc_new_task(struct rpc_clnt *, int flags,
 				const struct rpc_call_ops *ops, void *data);
+struct rpc_task *rpc_run_task(struct rpc_clnt *clnt, int flags,
+				const struct rpc_call_ops *ops, void *data);
 struct rpc_task *rpc_new_child(struct rpc_clnt *, struct rpc_task *parent);
 void		rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt,
 				int flags, const struct rpc_call_ops *ops,
@@ -260,6 +271,7 @@ void *		rpc_malloc(struct rpc_task *, size_t);
 int		rpciod_up(void);
 void		rpciod_down(void);
 void		rpciod_wake_up(void);
+int		__rpc_wait_for_completion_task(struct rpc_task *task, int (*)(void *));
 #ifdef RPC_DEBUG
 void		rpc_show_tasks(void);
 #endif
@@ -272,6 +284,11 @@ static inline void rpc_exit(struct rpc_task *task, int status)
 	task->tk_action = rpc_exit_task;
 }
 
+static inline int rpc_wait_for_completion_task(struct rpc_task *task)
+{
+	return __rpc_wait_for_completion_task(task, NULL);
+}
+
 #ifdef RPC_DEBUG
 static inline const char * rpc_qname(struct rpc_wait_queue *q)
 {
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 2d74a1672028..82d158dad16d 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -264,6 +264,35 @@ void rpc_init_wait_queue(struct rpc_wait_queue *queue, const char *qname)
 }
 EXPORT_SYMBOL(rpc_init_wait_queue);
 
+static int rpc_wait_bit_interruptible(void *word)
+{
+	if (signal_pending(current))
+		return -ERESTARTSYS;
+	schedule();
+	return 0;
+}
+
+/*
+ * Mark an RPC call as having completed by clearing the 'active' bit
+ */
+static inline void rpc_mark_complete_task(struct rpc_task *task)
+{
+	rpc_clear_active(task);
+	wake_up_bit(&task->tk_runstate, RPC_TASK_ACTIVE);
+}
+
+/*
+ * Allow callers to wait for completion of an RPC call
+ */
+int __rpc_wait_for_completion_task(struct rpc_task *task, int (*action)(void *))
+{
+	if (action == NULL)
+		action = rpc_wait_bit_interruptible;
+	return wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE,
+			action, TASK_INTERRUPTIBLE);
+}
+EXPORT_SYMBOL(__rpc_wait_for_completion_task);
+
 /*
  * Make an RPC task runnable.
  *
@@ -299,10 +328,7 @@ static void rpc_make_runnable(struct rpc_task *task)
 static inline void
 rpc_schedule_run(struct rpc_task *task)
 {
-	/* Don't run a child twice! */
-	if (RPC_IS_ACTIVATED(task))
-		return;
-	task->tk_active = 1;
+	rpc_set_active(task);
 	rpc_make_runnable(task);
 }
 
@@ -324,8 +350,7 @@ static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
 	}
 
 	/* Mark the task as being activated if so needed */
-	if (!RPC_IS_ACTIVATED(task))
-		task->tk_active = 1;
+	rpc_set_active(task);
 
 	__rpc_add_wait_queue(q, task);
 
@@ -580,14 +605,6 @@ void rpc_exit_task(struct rpc_task *task)
 }
 EXPORT_SYMBOL(rpc_exit_task);
 
-static int rpc_wait_bit_interruptible(void *word)
-{
-	if (signal_pending(current))
-		return -ERESTARTSYS;
-	schedule();
-	return 0;
-}
-
 /*
  * This is the RPC `scheduler' (or rather, the finite state machine).
  */
@@ -680,6 +697,8 @@ static int __rpc_execute(struct rpc_task *task)
 	dprintk("RPC: %4d exit() = %d\n", task->tk_pid, task->tk_status);
 	status = task->tk_status;
 
+	/* Wake up anyone who is waiting for task completion */
+	rpc_mark_complete_task(task);
 	/* Release all resources associated with the task */
 	rpc_release_task(task);
 	return status;
@@ -697,9 +716,7 @@ static int __rpc_execute(struct rpc_task *task)
 int
 rpc_execute(struct rpc_task *task)
 {
-	BUG_ON(task->tk_active);
-
-	task->tk_active = 1;
+	rpc_set_active(task);
 	rpc_set_running(task);
 	return __rpc_execute(task);
 }
@@ -761,6 +778,7 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, int flags, cons
 	init_timer(&task->tk_timer);
 	task->tk_timer.data     = (unsigned long) task;
 	task->tk_timer.function = (void (*)(unsigned long)) rpc_run_timer;
+	atomic_set(&task->tk_count, 1);
 	task->tk_client = clnt;
 	task->tk_flags  = flags;
 	task->tk_ops = tk_ops;
@@ -848,11 +866,13 @@ void rpc_release_task(struct rpc_task *task)
 {
 	const struct rpc_call_ops *tk_ops = task->tk_ops;
 	void *calldata = task->tk_calldata;
-	dprintk("RPC: %4d release task\n", task->tk_pid);
 
 #ifdef RPC_DEBUG
 	BUG_ON(task->tk_magic != RPC_TASK_MAGIC_ID);
 #endif
+	if (!atomic_dec_and_test(&task->tk_count))
+		return;
+	dprintk("RPC: %4d release task\n", task->tk_pid);
 
 	/* Remove from global task list */
 	spin_lock(&rpc_sched_lock);
@@ -860,7 +880,6 @@ void rpc_release_task(struct rpc_task *task)
 	spin_unlock(&rpc_sched_lock);
 
 	BUG_ON (RPC_IS_QUEUED(task));
-	task->tk_active = 0;
 
 	/* Synchronously delete any running timer */
 	rpc_delete_timer(task);
@@ -885,6 +904,27 @@ void rpc_release_task(struct rpc_task *task)
 		tk_ops->rpc_release(calldata);
 }
 
+/**
+ * rpc_run_task - Allocate a new RPC task, then run rpc_execute against it
+ * @clnt - pointer to RPC client
+ * @flags - RPC flags
+ * @ops - RPC call ops
+ * @data - user call data
+ */
+struct rpc_task *rpc_run_task(struct rpc_clnt *clnt, int flags,
+					const struct rpc_call_ops *ops,
+					void *data)
+{
+	struct rpc_task *task;
+	task = rpc_new_task(clnt, flags, ops, data);
+	if (task == NULL)
+		return ERR_PTR(-ENOMEM);
+	atomic_inc(&task->tk_count);
+	rpc_execute(task);
+	return task;
+}
+EXPORT_SYMBOL(rpc_run_task);
+
 /**
  * rpc_find_parent - find the parent of a child task.
  * @child: child task
-- 
cgit v1.2.3-71-gd317


From cdd4e68b5f0ed12c64b3e2be83655d2a47588a74 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 3 Jan 2006 09:55:12 +0100
Subject: NFSv4: Make open_confirm() asynchronous too

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 101 +++++++++++++++++++++++++++++++++++++-----------
 fs/nfs/nfs4xdr.c        |   7 +---
 include/linux/nfs_xdr.h |   2 +-
 3 files changed, 81 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index aed8701c1a36..8154f2579469 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -57,7 +57,8 @@
 #define NFS4_POLL_RETRY_MIN	(1*HZ)
 #define NFS4_POLL_RETRY_MAX	(15*HZ)
 
-static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid, struct nfs_seqid *seqid);
+struct nfs4_opendata;
+static int _nfs4_proc_open_confirm(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *);
 static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
@@ -198,6 +199,8 @@ struct nfs4_opendata {
 	atomic_t count;
 	struct nfs_openargs o_arg;
 	struct nfs_openres o_res;
+	struct nfs_open_confirmargs c_arg;
+	struct nfs_open_confirmres c_res;
 	struct nfs_fattr f_attr;
 	struct nfs_fattr dir_attr;
 	struct dentry *dentry;
@@ -249,6 +252,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 		p->o_arg.u.attrs = &p->attrs;
 		memcpy(&p->attrs, attrs, sizeof(p->attrs));
 	}
+	p->c_arg.fh = &p->o_res.fh;
+	p->c_arg.stateid = &p->o_res.stateid;
+	p->c_arg.seqid = p->o_arg.seqid;
 	return p;
 err_free:
 	kfree(p);
@@ -433,8 +439,7 @@ static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state
 	if (status != 0)
 		goto out_free;
 	if(opendata->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) {
-		status = _nfs4_proc_open_confirm(server->client, NFS_FH(inode),
-				sp, &opendata->o_res.stateid, opendata->o_arg.seqid);
+		status = _nfs4_proc_open_confirm(opendata);
 		if (status != 0)
 			goto out_free;
 	}
@@ -472,28 +477,79 @@ int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state)
 	return err;
 }
 
-static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid, struct nfs_seqid *seqid)
+static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
 {
-	struct nfs_open_confirmargs arg = {
-		.fh             = fh,
-		.seqid          = seqid,
-		.stateid	= *stateid,
-	};
-	struct nfs_open_confirmres res;
-	struct 	rpc_message msg = {
-		.rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
-		.rpc_argp       = &arg,
-		.rpc_resp       = &res,
-		.rpc_cred	= sp->so_cred,
+	struct nfs4_opendata *data = calldata;
+	struct  rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
+		.rpc_argp = &data->c_arg,
+		.rpc_resp = &data->c_res,
+		.rpc_cred = data->owner->so_cred,
 	};
+	rpc_call_setup(task, &msg, 0);
+}
+
+static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_opendata *data = calldata;
+
+	data->rpc_status = task->tk_status;
+	if (RPC_ASSASSINATED(task))
+		return;
+	if (data->rpc_status == 0)
+		memcpy(data->o_res.stateid.data, data->c_res.stateid.data,
+				sizeof(data->o_res.stateid.data));
+	nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid);
+	nfs_confirm_seqid(&data->owner->so_seqid, data->rpc_status);
+}
+
+static void nfs4_open_confirm_release(void *calldata)
+{
+	struct nfs4_opendata *data = calldata;
+	struct nfs4_state *state = NULL;
+
+	/* If this request hasn't been cancelled, do nothing */
+	if (data->cancelled == 0)
+		goto out_free;
+	/* In case of error, no cleanup! */
+	if (data->rpc_status != 0)
+		goto out_free;
+	nfs_confirm_seqid(&data->owner->so_seqid, 0);
+	state = nfs4_opendata_to_nfs4_state(data);
+	if (state != NULL)
+		nfs4_close_state(state, data->o_arg.open_flags);
+out_free:
+	nfs4_opendata_free(data);
+}
+
+static const struct rpc_call_ops nfs4_open_confirm_ops = {
+	.rpc_call_prepare = nfs4_open_confirm_prepare,
+	.rpc_call_done = nfs4_open_confirm_done,
+	.rpc_release = nfs4_open_confirm_release,
+};
+
+/*
+ * Note: On error, nfs4_proc_open_confirm will free the struct nfs4_opendata
+ */
+static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
+{
+	struct nfs_server *server = NFS_SERVER(data->dir->d_inode);
+	struct rpc_task *task;
 	int status;
 
-	status = rpc_call_sync(clnt, &msg, RPC_TASK_NOINTR);
-	/* Confirm the sequence as being established */
-	nfs_confirm_seqid(&sp->so_seqid, status);
-	nfs_increment_open_seqid(status, seqid);
-	if (status >= 0)
-		memcpy(stateid, &res.stateid, sizeof(*stateid));
+	atomic_inc(&data->count);
+	task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_confirm_ops, data);
+	if (IS_ERR(task)) {
+		nfs4_opendata_free(data);
+		return PTR_ERR(task);
+	}
+	status = nfs4_wait_for_completion_rpc_task(task);
+	if (status != 0) {
+		data->cancelled = 1;
+		smp_wmb();
+	} else
+		status = data->rpc_status;
+	rpc_release_task(task);
 	return status;
 }
 
@@ -602,8 +658,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 	} else
 		nfs_refresh_inode(dir, o_res->dir_attr);
 	if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
-		status = _nfs4_proc_open_confirm(server->client, &o_res->fh,
-				data->owner, &o_res->stateid, o_arg->seqid);
+		status = _nfs4_proc_open_confirm(data);
 		if (status != 0)
 			return status;
 	}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index db2bcf722f91..2ba9906f2a51 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -964,9 +964,9 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con
 {
 	uint32_t *p;
 
-	RESERVE_SPACE(8+sizeof(arg->stateid.data));
+	RESERVE_SPACE(8+sizeof(arg->stateid->data));
 	WRITE32(OP_OPEN_CONFIRM);
-	WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data));
+	WRITEMEM(arg->stateid->data, sizeof(arg->stateid->data));
 	WRITE32(arg->seqid->sequence->counter);
 
 	return 0;
@@ -1535,9 +1535,6 @@ static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, uint32_t *p, struct n
 	};
 	int status;
 
-	status = nfs_wait_on_sequence(args->seqid, req->rq_task);
-	if (status != 0)
-		goto out;
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
 	encode_compound_hdr(&xdr, &hdr);
 	status = encode_putfh(&xdr, args->fh);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 40718669b9c8..518cfa5cd024 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -137,7 +137,7 @@ struct nfs_openres {
  */
 struct nfs_open_confirmargs {
 	const struct nfs_fh *	fh;
-	nfs4_stateid            stateid;
+	nfs4_stateid *		stateid;
 	struct nfs_seqid *	seqid;
 };
 
-- 
cgit v1.2.3-71-gd317


From 911d1aaf26fc4d771174d98fcab710a44e2a5fa0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 3 Jan 2006 09:55:16 +0100
Subject: NFSv4: locking XDR cleanup

 Get rid of some unnecessary intermediate structures

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 192 ++++++++++++++++++++----------------------------
 fs/nfs/nfs4xdr.c        | 131 +++++++++++++++++++--------------
 include/linux/nfs_xdr.h |  52 ++++++-------
 3 files changed, 179 insertions(+), 196 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3ecb7da220f5..857125705b6f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2935,43 +2935,17 @@ nfs4_set_lock_task_retry(unsigned long timeout)
 	return timeout;
 }
 
-static inline int
-nfs4_lck_type(int cmd, struct file_lock *request)
-{
-	/* set lock type */
-	switch (request->fl_type) {
-		case F_RDLCK:
-			return IS_SETLKW(cmd) ? NFS4_READW_LT : NFS4_READ_LT;
-		case F_WRLCK:
-			return IS_SETLKW(cmd) ? NFS4_WRITEW_LT : NFS4_WRITE_LT;
-		case F_UNLCK:
-			return NFS4_WRITE_LT; 
-	}
-	BUG();
-	return 0;
-}
-
-static inline uint64_t
-nfs4_lck_length(struct file_lock *request)
-{
-	if (request->fl_end == OFFSET_MAX)
-		return ~(uint64_t)0;
-	return request->fl_end - request->fl_start + 1;
-}
-
 static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
 	struct inode *inode = state->inode;
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs4_client *clp = server->nfs4_state;
-	struct nfs_lockargs arg = {
+	struct nfs_lockt_args arg = {
 		.fh = NFS_FH(inode),
-		.type = nfs4_lck_type(cmd, request),
-		.offset = request->fl_start,
-		.length = nfs4_lck_length(request),
+		.fl = request,
 	};
-	struct nfs_lockres res = {
-		.server = server,
+	struct nfs_lockt_res res = {
+		.denied = request,
 	};
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_LOCKT],
@@ -2979,36 +2953,23 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 		.rpc_resp       = &res,
 		.rpc_cred	= state->owner->so_cred,
 	};
-	struct nfs_lowner nlo;
 	struct nfs4_lock_state *lsp;
 	int status;
 
 	down_read(&clp->cl_sem);
-	nlo.clientid = clp->cl_clientid;
+	arg.lock_owner.clientid = clp->cl_clientid;
 	status = nfs4_set_lock_state(state, request);
 	if (status != 0)
 		goto out;
 	lsp = request->fl_u.nfs4_fl.owner;
-	nlo.id = lsp->ls_id; 
-	arg.u.lockt = &nlo;
+	arg.lock_owner.id = lsp->ls_id; 
 	status = rpc_call_sync(server->client, &msg, 0);
-	if (!status) {
-		request->fl_type = F_UNLCK;
-	} else if (status == -NFS4ERR_DENIED) {
-		int64_t len, start, end;
-		start = res.u.denied.offset;
-		len = res.u.denied.length;
-		end = start + len - 1;
-		if (end < 0 || len == 0)
-			request->fl_end = OFFSET_MAX;
-		else
-			request->fl_end = (loff_t)end;
-		request->fl_start = (loff_t)start;
-		request->fl_type = F_WRLCK;
-		if (res.u.denied.type & 1)
-			request->fl_type = F_RDLCK;
-		request->fl_pid = 0;
-		status = 0;
+	switch (status) {
+		case 0:
+			request->fl_type = F_UNLCK;
+			break;
+		case -NFS4ERR_DENIED:
+			status = 0;
 	}
 out:
 	up_read(&clp->cl_sem);
@@ -3048,17 +3009,42 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
 }
 
 struct nfs4_unlockdata {
-	struct nfs_lockargs arg;
-	struct nfs_locku_opargs luargs;
-	struct nfs_lockres res;
+	struct nfs_locku_args arg;
+	struct nfs_locku_res res;
 	struct nfs4_lock_state *lsp;
 	struct nfs_open_context *ctx;
+	struct file_lock fl;
+	const struct nfs_server *server;
 };
 
+static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
+		struct nfs_open_context *ctx,
+		struct nfs4_lock_state *lsp,
+		struct nfs_seqid *seqid)
+{
+	struct nfs4_unlockdata *p;
+	struct inode *inode = lsp->ls_state->inode;
+
+	p = kmalloc(sizeof(*p), GFP_KERNEL);
+	if (p == NULL)
+		return NULL;
+	p->arg.fh = NFS_FH(inode);
+	p->arg.fl = &p->fl;
+	p->arg.seqid = seqid;
+	p->arg.stateid = &lsp->ls_stateid;
+	p->lsp = lsp;
+	atomic_inc(&lsp->ls_count);
+	/* Ensure we don't close file until we're done freeing locks! */
+	p->ctx = get_nfs_open_context(ctx);
+	memcpy(&p->fl, fl, sizeof(p->fl));
+	p->server = NFS_SERVER(inode);
+	return p;
+}
+
 static void nfs4_locku_release_calldata(void *data)
 {
 	struct nfs4_unlockdata *calldata = data;
-	nfs_free_seqid(calldata->luargs.seqid);
+	nfs_free_seqid(calldata->arg.seqid);
 	nfs4_put_lock_state(calldata->lsp);
 	put_nfs_open_context(calldata->ctx);
 	kfree(calldata);
@@ -3070,19 +3056,19 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 
 	if (RPC_ASSASSINATED(task))
 		return;
-	nfs_increment_lock_seqid(task->tk_status, calldata->luargs.seqid);
+	nfs_increment_lock_seqid(task->tk_status, calldata->arg.seqid);
 	switch (task->tk_status) {
 		case 0:
 			memcpy(calldata->lsp->ls_stateid.data,
-					calldata->res.u.stateid.data,
+					calldata->res.stateid.data,
 					sizeof(calldata->lsp->ls_stateid.data));
 			break;
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_EXPIRED:
-			nfs4_schedule_state_recovery(calldata->res.server->nfs4_state);
+			nfs4_schedule_state_recovery(calldata->server->nfs4_state);
 			break;
 		default:
-			if (nfs4_async_handle_error(task, calldata->res.server) == -EAGAIN) {
+			if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN) {
 				rpc_restart_call(task);
 			}
 	}
@@ -3097,10 +3083,8 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 		.rpc_resp       = &calldata->res,
 		.rpc_cred	= calldata->lsp->ls_state->owner->so_cred,
 	};
-	int status;
 
-	status = nfs_wait_on_sequence(calldata->luargs.seqid, task);
-	if (status != 0)
+	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
 		return;
 	if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) {
 		/* Note: exit _without_ running nfs4_locku_done */
@@ -3121,43 +3105,32 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	struct nfs4_unlockdata *calldata;
 	struct inode *inode = state->inode;
 	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_seqid *seqid;
 	struct nfs4_lock_state *lsp;
 	struct rpc_task *task;
 	int status = 0;
 
 	/* Is this a delegated lock? */
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags))
-		goto out;
+		goto out_unlock;
+	/* Is this open_owner holding any locks on the server? */
+	if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
+		goto out_unlock;
 
 	status = nfs4_set_lock_state(state, request);
 	if (status != 0)
-		goto out;
+		goto out_unlock;
 	lsp = request->fl_u.nfs4_fl.owner;
-	/* We might have lost the locks! */
-	if ((lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0)
-		goto out;
 	status = -ENOMEM;
-	calldata = kmalloc(sizeof(*calldata), GFP_KERNEL);
+	seqid = nfs_alloc_seqid(&lsp->ls_seqid);
+	if (seqid == NULL)
+		goto out_unlock;
+	calldata = nfs4_alloc_unlockdata(request, request->fl_file->private_data,
+			lsp, seqid);
 	if (calldata == NULL)
-		goto out;
-	calldata->luargs.seqid = nfs_alloc_seqid(&lsp->ls_seqid);
-	if (calldata->luargs.seqid == NULL) {
-		kfree(calldata);
-		goto out;
-	}
-	calldata->luargs.stateid = &lsp->ls_stateid;
-	calldata->arg.fh = NFS_FH(inode);
-	calldata->arg.type = nfs4_lck_type(cmd, request);
-	calldata->arg.offset = request->fl_start;
-	calldata->arg.length = nfs4_lck_length(request);
-	calldata->arg.u.locku = &calldata->luargs;
-	calldata->res.server = server;
-	calldata->lsp = lsp;
-	atomic_inc(&lsp->ls_count);
-
-	/* Ensure we don't close file until we're done freeing locks! */
-	calldata->ctx = get_nfs_open_context((struct nfs_open_context*)request->fl_file->private_data);
-
+		goto out_free_seqid;
+	/* Unlock _before_ we do the RPC call */
+	do_vfs_lock(request->fl_file, request);
 	task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_locku_ops, calldata);
 	if (!IS_ERR(task)) {
 		status = nfs4_wait_for_completion_rpc_task(task);
@@ -3166,7 +3139,10 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 		status = PTR_ERR(task);
 		nfs4_locku_release_calldata(calldata);
 	}
-out:
+	return status;
+out_free_seqid:
+	nfs_free_seqid(seqid);
+out_unlock:
 	do_vfs_lock(request->fl_file, request);
 	return status;
 }
@@ -3176,27 +3152,19 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r
 	struct inode *inode = state->inode;
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner;
-	struct nfs_lock_opargs largs = {
+	struct nfs_lock_args arg = {
+		.fh = NFS_FH(inode),
+		.fl = request,
 		.lock_stateid = &lsp->ls_stateid,
 		.open_stateid = &state->stateid,
 		.lock_owner = {
 			.clientid = server->nfs4_state->cl_clientid,
 			.id = lsp->ls_id,
 		},
+		.block = (IS_SETLKW(cmd)) ? 1 : 0,
 		.reclaim = reclaim,
 	};
-	struct nfs_lockargs arg = {
-		.fh = NFS_FH(inode),
-		.type = nfs4_lck_type(cmd, request),
-		.offset = request->fl_start,
-		.length = nfs4_lck_length(request),
-		.u = {
-			.lock = &largs,
-		},
-	};
-	struct nfs_lockres res = {
-		.server = server,
-	};
+	struct nfs_lock_res res;
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_LOCK],
 		.rpc_argp       = &arg,
@@ -3205,37 +3173,37 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r
 	};
 	int status = -ENOMEM;
 
-	largs.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid);
-	if (largs.lock_seqid == NULL)
+	arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid);
+	if (arg.lock_seqid == NULL)
 		return -ENOMEM;
 	if (!(lsp->ls_seqid.flags & NFS_SEQID_CONFIRMED)) {
 		struct nfs4_state_owner *owner = state->owner;
 
-		largs.open_seqid = nfs_alloc_seqid(&owner->so_seqid);
-		if (largs.open_seqid == NULL)
+		arg.open_seqid = nfs_alloc_seqid(&owner->so_seqid);
+		if (arg.open_seqid == NULL)
 			goto out;
-		largs.new_lock_owner = 1;
+		arg.new_lock_owner = 1;
 		status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR);
 		/* increment open seqid on success, and seqid mutating errors */
-		if (largs.new_lock_owner != 0) {
-			nfs_increment_open_seqid(status, largs.open_seqid);
+		if (arg.new_lock_owner != 0) {
+			nfs_increment_open_seqid(status, arg.open_seqid);
 			if (status == 0)
 				nfs_confirm_seqid(&lsp->ls_seqid, 0);
 		}
-		nfs_free_seqid(largs.open_seqid);
+		nfs_free_seqid(arg.open_seqid);
 	} else
 		status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR);
 	/* increment lock seqid on success, and seqid mutating errors*/
-	nfs_increment_lock_seqid(status, largs.lock_seqid);
+	nfs_increment_lock_seqid(status, arg.lock_seqid);
 	/* save the returned stateid. */
 	if (status == 0) {
-		memcpy(lsp->ls_stateid.data, res.u.stateid.data,
+		memcpy(lsp->ls_stateid.data, res.stateid.data,
 				sizeof(lsp->ls_stateid.data));
 		lsp->ls_flags |= NFS_LOCK_INITIALIZED;
 	} else if (status == -NFS4ERR_DENIED)
 		status = -EAGAIN;
 out:
-	nfs_free_seqid(largs.lock_seqid);
+	nfs_free_seqid(arg.lock_seqid);
 	return status;
 }
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 3100172822c9..a7b5de899c6d 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -742,69 +742,80 @@ static int encode_link(struct xdr_stream *xdr, const struct qstr *name)
 	return 0;
 }
 
+static inline int nfs4_lock_type(struct file_lock *fl, int block)
+{
+	if ((fl->fl_type & (F_RDLCK|F_WRLCK|F_UNLCK)) == F_RDLCK)
+		return block ? NFS4_READW_LT : NFS4_READ_LT;
+	return block ? NFS4_WRITEW_LT : NFS4_WRITE_LT;
+}
+
+static inline uint64_t nfs4_lock_length(struct file_lock *fl)
+{
+	if (fl->fl_end == OFFSET_MAX)
+		return ~(uint64_t)0;
+	return fl->fl_end - fl->fl_start + 1;
+}
+
 /*
  * opcode,type,reclaim,offset,length,new_lock_owner = 32
  * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40
  */
-static int encode_lock(struct xdr_stream *xdr, const struct nfs_lockargs *arg)
+static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
 {
 	uint32_t *p;
-	struct nfs_lock_opargs *opargs = arg->u.lock;
 
 	RESERVE_SPACE(32);
 	WRITE32(OP_LOCK);
-	WRITE32(arg->type); 
-	WRITE32(opargs->reclaim);
-	WRITE64(arg->offset);
-	WRITE64(arg->length);
-	WRITE32(opargs->new_lock_owner);
-	if (opargs->new_lock_owner){
+	WRITE32(nfs4_lock_type(args->fl, args->block));
+	WRITE32(args->reclaim);
+	WRITE64(args->fl->fl_start);
+	WRITE64(nfs4_lock_length(args->fl));
+	WRITE32(args->new_lock_owner);
+	if (args->new_lock_owner){
 		RESERVE_SPACE(40);
-		WRITE32(opargs->open_seqid->sequence->counter);
-		WRITEMEM(opargs->open_stateid->data, sizeof(opargs->open_stateid->data));
-		WRITE32(opargs->lock_seqid->sequence->counter);
-		WRITE64(opargs->lock_owner.clientid);
+		WRITE32(args->open_seqid->sequence->counter);
+		WRITEMEM(args->open_stateid->data, sizeof(args->open_stateid->data));
+		WRITE32(args->lock_seqid->sequence->counter);
+		WRITE64(args->lock_owner.clientid);
 		WRITE32(4);
-		WRITE32(opargs->lock_owner.id);
+		WRITE32(args->lock_owner.id);
 	}
 	else {
 		RESERVE_SPACE(20);
-		WRITEMEM(opargs->lock_stateid->data, sizeof(opargs->lock_stateid->data));
-		WRITE32(opargs->lock_seqid->sequence->counter);
+		WRITEMEM(args->lock_stateid->data, sizeof(args->lock_stateid->data));
+		WRITE32(args->lock_seqid->sequence->counter);
 	}
 
 	return 0;
 }
 
-static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockargs *arg)
+static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args)
 {
 	uint32_t *p;
-	struct nfs_lowner *opargs = arg->u.lockt;
 
 	RESERVE_SPACE(40);
 	WRITE32(OP_LOCKT);
-	WRITE32(arg->type);
-	WRITE64(arg->offset);
-	WRITE64(arg->length);
-	WRITE64(opargs->clientid);
+	WRITE32(nfs4_lock_type(args->fl, 0));
+	WRITE64(args->fl->fl_start);
+	WRITE64(nfs4_lock_length(args->fl));
+	WRITE64(args->lock_owner.clientid);
 	WRITE32(4);
-	WRITE32(opargs->id);
+	WRITE32(args->lock_owner.id);
 
 	return 0;
 }
 
-static int encode_locku(struct xdr_stream *xdr, const struct nfs_lockargs *arg)
+static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args)
 {
 	uint32_t *p;
-	struct nfs_locku_opargs *opargs = arg->u.locku;
 
 	RESERVE_SPACE(44);
 	WRITE32(OP_LOCKU);
-	WRITE32(arg->type);
-	WRITE32(opargs->seqid->sequence->counter);
-	WRITEMEM(opargs->stateid->data, sizeof(opargs->stateid->data));
-	WRITE64(arg->offset);
-	WRITE64(arg->length);
+	WRITE32(nfs4_lock_type(args->fl, 0));
+	WRITE32(args->seqid->sequence->counter);
+	WRITEMEM(args->stateid->data, sizeof(args->stateid->data));
+	WRITE64(args->fl->fl_start);
+	WRITE64(nfs4_lock_length(args->fl));
 
 	return 0;
 }
@@ -1596,21 +1607,20 @@ out:
 /*
  * Encode a LOCK request
  */
-static int nfs4_xdr_enc_lock(struct rpc_rqst *req, uint32_t *p, struct nfs_lockargs *args)
+static int nfs4_xdr_enc_lock(struct rpc_rqst *req, uint32_t *p, struct nfs_lock_args *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops   = 2,
 	};
-	struct nfs_lock_opargs *opargs = args->u.lock;
 	int status;
 
-	status = nfs_wait_on_sequence(opargs->lock_seqid, req->rq_task);
+	status = nfs_wait_on_sequence(args->lock_seqid, req->rq_task);
 	if (status != 0)
 		goto out;
 	/* Do we need to do an open_to_lock_owner? */
-	if (opargs->lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)
-		opargs->new_lock_owner = 0;
+	if (args->lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)
+		args->new_lock_owner = 0;
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
 	encode_compound_hdr(&xdr, &hdr);
 	status = encode_putfh(&xdr, args->fh);
@@ -1624,7 +1634,7 @@ out:
 /*
  * Encode a LOCKT request
  */
-static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, uint32_t *p, struct nfs_lockargs *args)
+static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, uint32_t *p, struct nfs_lockt_args *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1645,7 +1655,7 @@ out:
 /*
  * Encode a LOCKU request
  */
-static int nfs4_xdr_enc_locku(struct rpc_rqst *req, uint32_t *p, struct nfs_lockargs *args)
+static int nfs4_xdr_enc_locku(struct rpc_rqst *req, uint32_t *p, struct nfs_locku_args *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -2949,55 +2959,64 @@ static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 /*
  * We create the owner, so we know a proper owner.id length is 4.
  */
-static int decode_lock_denied (struct xdr_stream *xdr, struct nfs_lock_denied *denied)
+static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 {
+	uint64_t offset, length, clientid;
 	uint32_t *p;
-	uint32_t namelen;
+	uint32_t namelen, type;
 
 	READ_BUF(32);
-	READ64(denied->offset);
-	READ64(denied->length);
-	READ32(denied->type);
-	READ64(denied->owner.clientid);
+	READ64(offset);
+	READ64(length);
+	READ32(type);
+	if (fl != NULL) {
+		fl->fl_start = (loff_t)offset;
+		fl->fl_end = fl->fl_start + (loff_t)length - 1;
+		if (length == ~(uint64_t)0)
+			fl->fl_end = OFFSET_MAX;
+		fl->fl_type = F_WRLCK;
+		if (type & 1)
+			fl->fl_type = F_RDLCK;
+		fl->fl_pid = 0;
+	}
+	READ64(clientid);
 	READ32(namelen);
 	READ_BUF(namelen);
-	if (namelen == 4)
-		READ32(denied->owner.id);
 	return -NFS4ERR_DENIED;
 }
 
-static int decode_lock(struct xdr_stream *xdr, struct nfs_lockres *res)
+static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
 {
 	uint32_t *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_LOCK);
 	if (status == 0) {
-		READ_BUF(sizeof(res->u.stateid.data));
-		COPYMEM(res->u.stateid.data, sizeof(res->u.stateid.data));
+		READ_BUF(sizeof(res->stateid.data));
+		COPYMEM(res->stateid.data, sizeof(res->stateid.data));
 	} else if (status == -NFS4ERR_DENIED)
-		return decode_lock_denied(xdr, &res->u.denied);
+		return decode_lock_denied(xdr, NULL);
 	return status;
 }
 
-static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockres *res)
+static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res)
 {
 	int status;
 	status = decode_op_hdr(xdr, OP_LOCKT);
 	if (status == -NFS4ERR_DENIED)
-		return decode_lock_denied(xdr, &res->u.denied);
+		return decode_lock_denied(xdr, res->denied);
 	return status;
 }
 
-static int decode_locku(struct xdr_stream *xdr, struct nfs_lockres *res)
+static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
 {
 	uint32_t *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_LOCKU);
 	if (status == 0) {
-		READ_BUF(sizeof(res->u.stateid.data));
-		COPYMEM(res->u.stateid.data, sizeof(res->u.stateid.data));
+		READ_BUF(sizeof(res->stateid.data));
+		COPYMEM(res->stateid.data, sizeof(res->stateid.data));
 	}
 	return status;
 }
@@ -3861,7 +3880,7 @@ out:
 /*
  * Decode LOCK response
  */
-static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockres *res)
+static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lock_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3882,7 +3901,7 @@ out:
 /*
  * Decode LOCKT response
  */
-static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockres *res)
+static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockt_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3903,7 +3922,7 @@ out:
 /*
  * Decode LOCKU response
  */
-static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockres *res)
+static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_locku_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 518cfa5cd024..b8b0eed98ec9 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -165,50 +165,46 @@ struct nfs_closeres {
  *  * Arguments to the lock,lockt, and locku call.
  *   */
 struct nfs_lowner {
-	__u64           clientid;
-	u32                     id;
+	__u64			clientid;
+	u32			id;
 };
 
-struct nfs_lock_opargs {
+struct nfs_lock_args {
+	struct nfs_fh *		fh;
+	struct file_lock *	fl;
 	struct nfs_seqid *	lock_seqid;
 	nfs4_stateid *		lock_stateid;
 	struct nfs_seqid *	open_seqid;
 	nfs4_stateid *		open_stateid;
-	struct nfs_lowner       lock_owner;
-	__u32                   reclaim;
-	__u32                   new_lock_owner;
+	struct nfs_lowner	lock_owner;
+	unsigned char		block : 1;
+	unsigned char		reclaim : 1;
+	unsigned char		new_lock_owner : 1;
+};
+
+struct nfs_lock_res {
+	nfs4_stateid			stateid;
 };
 
-struct nfs_locku_opargs {
+struct nfs_locku_args {
+	struct nfs_fh *		fh;
+	struct file_lock *	fl;
 	struct nfs_seqid *	seqid;
 	nfs4_stateid *		stateid;
 };
 
-struct nfs_lockargs {
-	struct nfs_fh *         fh;
-	__u32                   type;
-	__u64                   offset; 
-	__u64                   length; 
-	union {
-		struct nfs_lock_opargs  *lock;    /* LOCK  */
-		struct nfs_lowner       *lockt;  /* LOCKT */
-		struct nfs_locku_opargs *locku;  /* LOCKU */
-	} u;
+struct nfs_locku_res {
+	nfs4_stateid			stateid;
 };
 
-struct nfs_lock_denied {
-	__u64                   offset;
-	__u64                   length;
-	__u32                   type;
-	struct nfs_lowner   	owner;
+struct nfs_lockt_args {
+	struct nfs_fh *		fh;
+	struct file_lock *	fl;
+	struct nfs_lowner	lock_owner;
 };
 
-struct nfs_lockres {
-	union {
-		nfs4_stateid            stateid;/* LOCK success, LOCKU */
-		struct nfs_lock_denied  denied; /* LOCK failed, LOCKT success */
-	} u;
-	const struct nfs_server *	server;
+struct nfs_lockt_res {
+	struct file_lock *	denied; /* LOCK, LOCKT failed */
 };
 
 struct nfs4_delegreturnargs {
-- 
cgit v1.2.3-71-gd317


From a911fd9a6046200e439b4af172e8379c0942eec3 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Wed, 30 Nov 2005 18:08:59 -0500
Subject: NFS: simplify inlined bit ops in nfs_page.h

 Minor cleanup:  inlined bit ops in nfs_page.h can be simpler.

 Test plan:
 Write-intensive workload against a server that requires COMMITs.

 Signed-off-by: Chuck Lever <cel@netapp.com>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_page.h | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index da2e077b65e2..66e2ed658527 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -79,9 +79,7 @@ extern  void nfs_clear_page_writeback(struct nfs_page *req);
 static inline int
 nfs_lock_request_dontget(struct nfs_page *req)
 {
-	if (test_and_set_bit(PG_BUSY, &req->wb_flags))
-		return 0;
-	return 1;
+	return !test_and_set_bit(PG_BUSY, &req->wb_flags);
 }
 
 /*
@@ -125,9 +123,7 @@ nfs_list_remove_request(struct nfs_page *req)
 static inline int
 nfs_defer_commit(struct nfs_page *req)
 {
-	if (test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags))
-		return 0;
-	return 1;
+	return !test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags);
 }
 
 static inline void
@@ -141,9 +137,7 @@ nfs_clear_commit(struct nfs_page *req)
 static inline int
 nfs_defer_reschedule(struct nfs_page *req)
 {
-	if (test_and_set_bit(PG_NEED_RESCHED, &req->wb_flags))
-		return 0;
-	return 1;
+	return !test_and_set_bit(PG_NEED_RESCHED, &req->wb_flags);
 }
 
 static inline void
-- 
cgit v1.2.3-71-gd317


From 40859d7ee64ed6bfad8a4e93f9bb5c1074afadff Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Wed, 30 Nov 2005 18:09:02 -0500
Subject: NFS: support large reads and writes on the wire

 Most NFS server implementations allow up to 64KB reads and writes on the
 wire.  The Solaris NFS server allows up to a megabyte, for instance.

 Now the Linux NFS client supports transfer sizes up to 1MB, too.  This will
 help reduce protocol and context switch overhead on read/write intensive NFS
 workloads, and support larger atomic read and write operations on servers
 that support them.

 Test-plan:
 Connectathon and iozone on mount point with wsize=rsize>32768 over TCP.
 Tests with NFS over UDP to verify the maximum RPC payload size cap.

 Signed-off-by: Chuck Lever <cel@netapp.com>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c            |  5 +++--
 fs/nfs/inode.c             | 25 ++++++++++---------------
 fs/nfs/nfsroot.c           |  4 ++--
 fs/nfs/read.c              |  6 +++---
 fs/nfs/write.c             | 29 ++++++++++++++++++++++-------
 include/linux/nfs_fs.h     | 41 +++++++++++++++++++++++++++++++++++------
 include/linux/nfs_xdr.h    | 29 ++++++++++++++++-------------
 include/linux/sunrpc/xdr.h |  5 -----
 8 files changed, 91 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f69d95aa78b2..fd7ac5e841c1 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -154,6 +154,7 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int
 	struct list_head *list;
 	struct nfs_direct_req *dreq;
 	unsigned int reads = 0;
+	unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
 	dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL);
 	if (!dreq)
@@ -167,7 +168,7 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int
 
 	list = &dreq->list;
 	for(;;) {
-		struct nfs_read_data *data = nfs_readdata_alloc();
+		struct nfs_read_data *data = nfs_readdata_alloc(rpages);
 
 		if (unlikely(!data)) {
 			while (!list_empty(list)) {
@@ -431,7 +432,7 @@ static ssize_t nfs_direct_write_seg(struct inode *inode,
 	struct nfs_writeverf first_verf;
 	struct nfs_write_data *wdata;
 
-	wdata = nfs_writedata_alloc();
+	wdata = nfs_writedata_alloc(NFS_SERVER(inode)->wpages);
 	if (!wdata)
 		return -ENOMEM;
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 4e6558df54b8..acde2c5725bf 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -221,10 +221,10 @@ nfs_calc_block_size(u64 tsize)
 static inline unsigned long
 nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
 {
-	if (bsize < 1024)
-		bsize = NFS_DEF_FILE_IO_BUFFER_SIZE;
-	else if (bsize >= NFS_MAX_FILE_IO_BUFFER_SIZE)
-		bsize = NFS_MAX_FILE_IO_BUFFER_SIZE;
+	if (bsize < NFS_MIN_FILE_IO_SIZE)
+		bsize = NFS_DEF_FILE_IO_SIZE;
+	else if (bsize >= NFS_MAX_FILE_IO_SIZE)
+		bsize = NFS_MAX_FILE_IO_SIZE;
 
 	return nfs_block_bits(bsize, nrbitsp);
 }
@@ -307,20 +307,15 @@ nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
 	max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
 	if (server->rsize > max_rpc_payload)
 		server->rsize = max_rpc_payload;
-	if (server->wsize > max_rpc_payload)
-		server->wsize = max_rpc_payload;
-
+	if (server->rsize > NFS_MAX_FILE_IO_SIZE)
+		server->rsize = NFS_MAX_FILE_IO_SIZE;
 	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (server->rpages > NFS_READ_MAXIOV) {
-		server->rpages = NFS_READ_MAXIOV;
-		server->rsize = server->rpages << PAGE_CACHE_SHIFT;
-	}
 
+	if (server->wsize > max_rpc_payload)
+		server->wsize = max_rpc_payload;
+	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
+		server->wsize = NFS_MAX_FILE_IO_SIZE;
 	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-        if (server->wpages > NFS_WRITE_MAXIOV) {
-		server->wpages = NFS_WRITE_MAXIOV;
-                server->wsize = server->wpages << PAGE_CACHE_SHIFT;
-	}
 
 	if (sb->s_blocksize == 0)
 		sb->s_blocksize = nfs_block_bits(server->wsize,
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 1b272a135a31..985cc53b8dd5 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -296,8 +296,8 @@ static int __init root_nfs_name(char *name)
 	nfs_port          = -1;
 	nfs_data.version  = NFS_MOUNT_VERSION;
 	nfs_data.flags    = NFS_MOUNT_NONLM;	/* No lockd in nfs root yet */
-	nfs_data.rsize    = NFS_DEF_FILE_IO_BUFFER_SIZE;
-	nfs_data.wsize    = NFS_DEF_FILE_IO_BUFFER_SIZE;
+	nfs_data.rsize    = NFS_DEF_FILE_IO_SIZE;
+	nfs_data.wsize    = NFS_DEF_FILE_IO_SIZE;
 	nfs_data.acregmin = 3;
 	nfs_data.acregmax = 60;
 	nfs_data.acdirmin = 30;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 21486242c3d3..05eb43fadf8e 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -83,7 +83,7 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
 	int		result;
 	struct nfs_read_data *rdata;
 
-	rdata = nfs_readdata_alloc();
+	rdata = nfs_readdata_alloc(1);
 	if (!rdata)
 		return -ENOMEM;
 
@@ -283,7 +283,7 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
 
 	nbytes = req->wb_bytes;
 	for(;;) {
-		data = nfs_readdata_alloc();
+		data = nfs_readdata_alloc(1);
 		if (!data)
 			goto out_bad;
 		INIT_LIST_HEAD(&data->pages);
@@ -339,7 +339,7 @@ static int nfs_pagein_one(struct list_head *head, struct inode *inode)
 	if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
 		return nfs_pagein_multi(head, inode);
 
-	data = nfs_readdata_alloc();
+	data = nfs_readdata_alloc(NFS_SERVER(inode)->rpages);
 	if (!data)
 		goto out_bad;
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 80bc4ea1b824..1ce0c200df16 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -89,18 +89,33 @@ static mempool_t *nfs_commit_mempool;
 
 static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
 
-static inline struct nfs_write_data *nfs_commit_alloc(void)
+static inline struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount)
 {
 	struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS);
+
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
+		if (pagecount < NFS_PAGEVEC_SIZE)
+			p->pagevec = &p->page_array[0];
+		else {
+			size_t size = ++pagecount * sizeof(struct page *);
+			p->pagevec = kmalloc(size, GFP_NOFS);
+			if (p->pagevec) {
+				memset(p->pagevec, 0, size);
+			} else {
+				mempool_free(p, nfs_commit_mempool);
+				p = NULL;
+			}
+		}
 	}
 	return p;
 }
 
 static inline void nfs_commit_free(struct nfs_write_data *p)
 {
+	if (p && (p->pagevec != &p->page_array[0]))
+		kfree(p->pagevec);
 	mempool_free(p, nfs_commit_mempool);
 }
 
@@ -167,7 +182,7 @@ static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode,
 	int		result, written = 0;
 	struct nfs_write_data *wdata;
 
-	wdata = nfs_writedata_alloc();
+	wdata = nfs_writedata_alloc(1);
 	if (!wdata)
 		return -ENOMEM;
 
@@ -909,7 +924,7 @@ static int nfs_flush_multi(struct list_head *head, struct inode *inode, int how)
 
 	nbytes = req->wb_bytes;
 	for (;;) {
-		data = nfs_writedata_alloc();
+		data = nfs_writedata_alloc(1);
 		if (!data)
 			goto out_bad;
 		list_add(&data->pages, &list);
@@ -973,7 +988,7 @@ static int nfs_flush_one(struct list_head *head, struct inode *inode, int how)
 	if (NFS_SERVER(inode)->wsize < PAGE_CACHE_SIZE)
 		return nfs_flush_multi(head, inode, how);
 
-	data = nfs_writedata_alloc();
+	data = nfs_writedata_alloc(NFS_SERVER(inode)->wpages);
 	if (!data)
 		goto out_bad;
 
@@ -1241,12 +1256,12 @@ static void nfs_commit_rpcsetup(struct list_head *head,
  * Commit dirty pages
  */
 static int
-nfs_commit_list(struct list_head *head, int how)
+nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 {
 	struct nfs_write_data	*data;
 	struct nfs_page         *req;
 
-	data = nfs_commit_alloc();
+	data = nfs_commit_alloc(NFS_SERVER(inode)->wpages);
 
 	if (!data)
 		goto out_bad;
@@ -1351,7 +1366,7 @@ int nfs_commit_inode(struct inode *inode, int how)
 	res = nfs_scan_commit(inode, &head, 0, 0);
 	spin_unlock(&nfsi->req_lock);
 	if (res) {
-		error = nfs_commit_list(&head, how);
+		error = nfs_commit_list(inode, &head, how);
 		if (error < 0)
 			return error;
 	}
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 4dff705d2ff2..d38010ba6477 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -38,9 +38,6 @@
 # define NFS_DEBUG
 #endif
 
-#define NFS_MAX_FILE_IO_BUFFER_SIZE	32768
-#define NFS_DEF_FILE_IO_BUFFER_SIZE	4096
-
 /* Default timeout values */
 #define NFS_MAX_UDP_TIMEOUT	(60*HZ)
 #define NFS_MAX_TCP_TIMEOUT	(600*HZ)
@@ -462,18 +459,33 @@ static inline int nfs_wb_page(struct inode *inode, struct page* page)
  */
 extern mempool_t *nfs_wdata_mempool;
 
-static inline struct nfs_write_data *nfs_writedata_alloc(void)
+static inline struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 {
 	struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS);
+
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
+		if (pagecount < NFS_PAGEVEC_SIZE)
+			p->pagevec = &p->page_array[0];
+		else {
+			size_t size = ++pagecount * sizeof(struct page *);
+			p->pagevec = kmalloc(size, GFP_NOFS);
+			if (p->pagevec) {
+				memset(p->pagevec, 0, size);
+			} else {
+				mempool_free(p, nfs_wdata_mempool);
+				p = NULL;
+			}
+		}
 	}
 	return p;
 }
 
 static inline void nfs_writedata_free(struct nfs_write_data *p)
 {
+	if (p && (p->pagevec != &p->page_array[0]))
+		kfree(p->pagevec);
 	mempool_free(p, nfs_wdata_mempool);
 }
 
@@ -492,16 +504,33 @@ extern void  nfs_readdata_release(void *data);
  */
 extern mempool_t *nfs_rdata_mempool;
 
-static inline struct nfs_read_data *nfs_readdata_alloc(void)
+static inline struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 {
 	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS);
-	if (p)
+
+	if (p) {
 		memset(p, 0, sizeof(*p));
+		INIT_LIST_HEAD(&p->pages);
+		if (pagecount < NFS_PAGEVEC_SIZE)
+			p->pagevec = &p->page_array[0];
+		else {
+			size_t size = ++pagecount * sizeof(struct page *);
+			p->pagevec = kmalloc(size, GFP_NOFS);
+			if (p->pagevec) {
+				memset(p->pagevec, 0, size);
+			} else {
+				mempool_free(p, nfs_rdata_mempool);
+				p = NULL;
+			}
+		}
+	}
 	return p;
 }
 
 static inline void nfs_readdata_free(struct nfs_read_data *p)
 {
+	if (p && (p->pagevec != &p->page_array[0]))
+		kfree(p->pagevec);
 	mempool_free(p, nfs_rdata_mempool);
 }
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index b8b0eed98ec9..9f422fd87673 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -4,6 +4,16 @@
 #include <linux/sunrpc/xprt.h>
 #include <linux/nfsacl.h>
 
+/*
+ * To change the maximum rsize and wsize supported by the NFS client, adjust
+ * NFS_MAX_FILE_IO_SIZE.  64KB is a typical maximum, but some servers can
+ * support a megabyte or more.  The default is left at 4096 bytes, which is
+ * reasonable for NFS over UDP.
+ */
+#define NFS_MAX_FILE_IO_SIZE	(1048576U)
+#define NFS_DEF_FILE_IO_SIZE	(4096U)
+#define NFS_MIN_FILE_IO_SIZE	(1024U)
+
 struct nfs4_fsid {
 	__u64 major;
 	__u64 minor;
@@ -215,12 +225,6 @@ struct nfs4_delegreturnargs {
 /*
  * Arguments to the read call.
  */
-
-#define NFS_READ_MAXIOV		(9U)
-#if (NFS_READ_MAXIOV > (MAX_IOVEC -2))
-#error "NFS_READ_MAXIOV is too large"
-#endif
-
 struct nfs_readargs {
 	struct nfs_fh *		fh;
 	struct nfs_open_context *context;
@@ -239,11 +243,6 @@ struct nfs_readres {
 /*
  * Arguments to the write call.
  */
-#define NFS_WRITE_MAXIOV	(9U)
-#if (NFS_WRITE_MAXIOV > (MAX_IOVEC -2))
-#error "NFS_WRITE_MAXIOV is too large"
-#endif
-
 struct nfs_writeargs {
 	struct nfs_fh *		fh;
 	struct nfs_open_context *context;
@@ -674,6 +673,8 @@ struct nfs4_server_caps_res {
 
 struct nfs_page;
 
+#define NFS_PAGEVEC_SIZE	(8U)
+
 struct nfs_read_data {
 	int			flags;
 	struct rpc_task		task;
@@ -682,13 +683,14 @@ struct nfs_read_data {
 	struct nfs_fattr	fattr;	/* fattr storage */
 	struct list_head	pages;	/* Coalesced read requests */
 	struct nfs_page		*req;	/* multi ops per nfs_page */
-	struct page		*pagevec[NFS_READ_MAXIOV];
+	struct page		**pagevec;
 	struct nfs_readargs args;
 	struct nfs_readres  res;
 #ifdef CONFIG_NFS_V4
 	unsigned long		timestamp;	/* For lease renewal */
 #endif
 	void (*complete) (struct nfs_read_data *, int);
+	struct page		*page_array[NFS_PAGEVEC_SIZE + 1];
 };
 
 struct nfs_write_data {
@@ -700,13 +702,14 @@ struct nfs_write_data {
 	struct nfs_writeverf	verf;
 	struct list_head	pages;		/* Coalesced requests we wish to flush */
 	struct nfs_page		*req;		/* multi ops per nfs_page */
-	struct page		*pagevec[NFS_WRITE_MAXIOV];
+	struct page		**pagevec;
 	struct nfs_writeargs	args;		/* argument struct */
 	struct nfs_writeres	res;		/* result struct */
 #ifdef CONFIG_NFS_V4
 	unsigned long		timestamp;	/* For lease renewal */
 #endif
 	void (*complete) (struct nfs_write_data *, int);
+	struct page		*page_array[NFS_PAGEVEC_SIZE + 1];
 };
 
 struct nfs_access_entry;
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 5da968729cf8..5676794ee34f 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -134,11 +134,6 @@ xdr_adjust_iovec(struct kvec *iov, u32 *p)
 	return iov->iov_len = ((u8 *) p - (u8 *) iov->iov_base);
 }
 
-/*
- * Maximum number of iov's we use.
- */
-#define MAX_IOVEC	(12)
-
 /*
  * XDR buffer helper functions
  */
-- 
cgit v1.2.3-71-gd317


From 70b9ecbdb9c5fdc731f8780bffd45d9519020c4a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 3 Jan 2006 09:55:34 +0100
Subject: NFS: Make stat() return updated mtimes after a write()

 The SuS states that a call to write() will cause mtime to be updated on
 the file. In order to satisfy that requirement, we need to flush out
 any cached writes in nfs_getattr().
 Speed things up slightly by not committing the writes.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         |  2 ++
 fs/nfs/write.c         | 23 ++++++++++++-----------
 include/linux/nfs_fs.h |  1 +
 3 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index acde2c5725bf..2c7f8aac1dec 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -952,6 +952,8 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
 	int err;
 
+	/* Flush out writes to the server in order to update c/mtime */
+	nfs_sync_inode(inode, 0, 0, FLUSH_WAIT|FLUSH_NOCOMMIT);
 	if (__IS_FLG(inode, MS_NOATIME))
 		need_atime = 0;
 	else if (__IS_FLG(inode, MS_NODIRATIME) && S_ISDIR(inode->i_mode))
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1ce0c200df16..9449b6835509 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1377,22 +1377,23 @@ int nfs_commit_inode(struct inode *inode, int how)
 int nfs_sync_inode(struct inode *inode, unsigned long idx_start,
 		  unsigned int npages, int how)
 {
-	int	error,
-		wait;
+	int nocommit = how & FLUSH_NOCOMMIT;
+	int wait = how & FLUSH_WAIT;
+	int error;
 
-	wait = how & FLUSH_WAIT;
-	how &= ~FLUSH_WAIT;
+	how &= ~(FLUSH_WAIT|FLUSH_NOCOMMIT);
 
 	do {
-		error = 0;
-		if (wait)
+		if (wait) {
 			error = nfs_wait_on_requests(inode, idx_start, npages);
-		if (error == 0)
-			error = nfs_flush_inode(inode, idx_start, npages, how);
-#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-		if (error == 0)
+			if (error != 0)
+				continue;
+		}
+		error = nfs_flush_inode(inode, idx_start, npages, how);
+		if (error != 0)
+			continue;
+		if (!nocommit)
 			error = nfs_commit_inode(inode, how);
-#endif
 	} while (error > 0);
 	return error;
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index d38010ba6477..408d82d3d97c 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -62,6 +62,7 @@
 #define FLUSH_STABLE		4	/* commit to stable storage */
 #define FLUSH_LOWPRI		8	/* low priority background flush */
 #define FLUSH_HIGHPRI		16	/* high priority memory reclaim flush */
+#define FLUSH_NOCOMMIT		32	/* Don't send the NFSv3/v4 COMMIT */
 
 #ifdef __KERNEL__
 
-- 
cgit v1.2.3-71-gd317


From fa178f29c0f8a0dce748181a5351f4a92fd4f455 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 3 Jan 2006 09:55:38 +0100
Subject: NFSv4: Ensure DELEGRETURN returns attributes

 Upon return of a write delegation, the server will almost always bump the
 change attribute. Ensure that we pick up that change so that we don't
 invalidate our data cache unnecessarily.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/delegation.c     |  2 --
 fs/nfs/nfs4proc.c       | 17 +++++++++++++----
 fs/nfs/nfs4xdr.c        | 33 ++++++++++++++++++++++-----------
 include/linux/nfs_xdr.h |  6 ++++++
 4 files changed, 41 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index d2ee09b38cee..66cc720e3927 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -159,8 +159,6 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
 {
 	int res = 0;
 
-	__nfs_revalidate_inode(NFS_SERVER(inode), inode);
-
 	res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid);
 	nfs_free_delegation(delegation);
 	return res;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b3349154994b..984ca3454d04 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2920,11 +2920,12 @@ nfs4_proc_setclientid_confirm(struct nfs4_client *clp, struct rpc_cred *cred)
 
 struct nfs4_delegreturndata {
 	struct nfs4_delegreturnargs args;
+	struct nfs4_delegreturnres res;
 	struct nfs_fh fh;
 	nfs4_stateid stateid;
 	struct rpc_cred *cred;
 	unsigned long timestamp;
-	const struct nfs_server *server;
+	struct nfs_fattr fattr;
 	int rpc_status;
 };
 
@@ -2934,8 +2935,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *calldata)
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN],
 		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
 		.rpc_cred = data->cred,
 	};
+	nfs_fattr_init(data->res.fattr);
 	rpc_call_setup(task, &msg, 0);
 }
 
@@ -2944,7 +2947,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 	struct nfs4_delegreturndata *data = calldata;
 	data->rpc_status = task->tk_status;
 	if (data->rpc_status == 0)
-		renew_lease(data->server, data->timestamp);
+		renew_lease(data->res.server, data->timestamp);
 }
 
 static void nfs4_delegreturn_release(void *calldata)
@@ -2964,6 +2967,7 @@ const static struct rpc_call_ops nfs4_delegreturn_ops = {
 static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid)
 {
 	struct nfs4_delegreturndata *data;
+	struct nfs_server *server = NFS_SERVER(inode);
 	struct rpc_task *task;
 	int status;
 
@@ -2972,11 +2976,13 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 		return -ENOMEM;
 	data->args.fhandle = &data->fh;
 	data->args.stateid = &data->stateid;
+	data->args.bitmask = server->attr_bitmask;
 	nfs_copy_fh(&data->fh, NFS_FH(inode));
 	memcpy(&data->stateid, stateid, sizeof(data->stateid));
+	data->res.fattr = &data->fattr;
+	data->res.server = server;
 	data->cred = get_rpccred(cred);
 	data->timestamp = jiffies;
-	data->server = NFS_SERVER(inode);
 	data->rpc_status = 0;
 
 	task = rpc_run_task(NFS_CLIENT(inode), RPC_TASK_ASYNC, &nfs4_delegreturn_ops, data);
@@ -2985,8 +2991,11 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 		return PTR_ERR(task);
 	}
 	status = nfs4_wait_for_completion_rpc_task(task);
-	if (status == 0)
+	if (status == 0) {
 		status = data->rpc_status;
+		if (status == 0)
+			nfs_post_op_update_inode(inode, &data->fattr);
+	}
 	rpc_release_task(task);
 	return status;
 }
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5d6bda43dfaa..12be1d682164 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -392,9 +392,11 @@ static int nfs_stat_to_errno(int);
 				decode_getattr_maxsz)
 #define NFS4_enc_delegreturn_sz	(compound_encode_hdr_maxsz + \
 				encode_putfh_maxsz + \
-				encode_delegreturn_maxsz)
+				encode_delegreturn_maxsz + \
+				encode_getattr_maxsz)
 #define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \
-				decode_delegreturn_maxsz)
+				decode_delegreturn_maxsz + \
+				decode_getattr_maxsz)
 #define NFS4_enc_getacl_sz	(compound_encode_hdr_maxsz + \
 				encode_putfh_maxsz + \
 				encode_getattr_maxsz)
@@ -1983,14 +1985,20 @@ static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, uint32_t *p, const str
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
-		.nops = 2,
+		.nops = 3,
 	};
 	int status;
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
 	encode_compound_hdr(&xdr, &hdr);
-	if ((status = encode_putfh(&xdr, args->fhandle)) == 0)
-		status = encode_delegreturn(&xdr, args->stateid);
+	status = encode_putfh(&xdr, args->fhandle);
+	if (status != 0)
+		goto out;
+	status = encode_delegreturn(&xdr, args->stateid);
+	if (status != 0)
+		goto out;
+	status = encode_getfattr(&xdr, args->bitmask);
+out:
 	return status;
 }
 
@@ -4184,7 +4192,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, s
 /*
  * DELEGRETURN request
  */
-static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, void *dummy)
+static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_delegreturnres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4192,11 +4200,14 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, void *d
 
 	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
 	status = decode_compound_hdr(&xdr, &hdr);
-	if (status == 0) {
-		status = decode_putfh(&xdr);
-		if (status == 0)
-			status = decode_delegreturn(&xdr);
-	}
+	if (status != 0)
+		goto out;
+	status = decode_putfh(&xdr);
+	if (status != 0)
+		goto out;
+	status = decode_delegreturn(&xdr);
+	decode_getfattr(&xdr, res->fattr, res->server);
+out:
 	return status;
 }
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 9f422fd87673..6d6f69ec5675 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -220,6 +220,12 @@ struct nfs_lockt_res {
 struct nfs4_delegreturnargs {
 	const struct nfs_fh *fhandle;
 	const nfs4_stateid *stateid;
+	const u32 * bitmask;
+};
+
+struct nfs4_delegreturnres {
+	struct nfs_fattr * fattr;
+	const struct nfs_server *server;
 };
 
 /*
-- 
cgit v1.2.3-71-gd317


From a72b44222d222749d54b3e370d825094352e389f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 3 Jan 2006 09:55:41 +0100
Subject: NFSv4: Allow user to set the port used by the NFSv4 callback channel

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 Documentation/kernel-parameters.txt |  4 ++
 fs/nfs/Makefile                     |  1 +
 fs/nfs/callback.c                   |  3 +-
 fs/nfs/callback.h                   |  1 +
 fs/nfs/inode.c                      | 37 ++++++++++++++++++-
 fs/nfs/sysctl.c                     | 74 +++++++++++++++++++++++++++++++++++++
 include/linux/nfs_fs.h              | 11 ++++++
 7 files changed, 128 insertions(+), 3 deletions(-)
 create mode 100644 fs/nfs/sysctl.c

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 61a56b100c62..309c9cec6e7c 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -910,6 +910,10 @@ running once the system is up.
 	nfsroot=	[NFS] nfs root filesystem for disk-less boxes.
 			See Documentation/nfsroot.txt.
 
+	nfs.callback_tcpport=
+			[NFS] set the TCP port on which the NFSv4 callback
+			channel should listen.
+
 	nmi_watchdog=	[KNL,BUGS=IA-32] Debugging features for SMP kernels
 
 	no387		[BUGS=IA-32] Tells the kernel to use the 387 maths
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 8b3bb715d177..ec61fd56a1a9 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -13,4 +13,5 @@ nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
 			   delegation.o idmap.o \
 			   callback.o callback_xdr.o callback_proc.o
 nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
+nfs-$(CONFIG_SYSCTL) += sysctl.o
 nfs-objs		:= $(nfs-y)
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 30cae3602867..fcd97406a778 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -34,6 +34,7 @@ static struct nfs_callback_data nfs_callback_info;
 static DECLARE_MUTEX(nfs_callback_sema);
 static struct svc_program nfs4_callback_program;
 
+unsigned int nfs_callback_set_tcpport;
 unsigned short nfs_callback_tcpport;
 
 /*
@@ -98,7 +99,7 @@ int nfs_callback_up(void)
 	if (!serv)
 		goto out_err;
 	/* FIXME: We don't want to register this socket with the portmapper */
-	ret = svc_makesock(serv, IPPROTO_TCP, 0);
+	ret = svc_makesock(serv, IPPROTO_TCP, nfs_callback_set_tcpport);
 	if (ret < 0)
 		goto out_destroy;
 	if (!list_empty(&serv->sv_permsocks)) {
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index a0db2d4f9415..b252e7fe53a5 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -65,6 +65,7 @@ extern unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
 extern int nfs_callback_up(void);
 extern int nfs_callback_down(void);
 
+extern unsigned int nfs_callback_set_tcpport;
 extern unsigned short nfs_callback_tcpport;
 
 #endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7270b1d73d30..648cb1aef3b1 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -40,6 +40,7 @@
 #include <asm/uaccess.h>
 
 #include "nfs4_fs.h"
+#include "callback.h"
 #include "delegation.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
@@ -2036,6 +2037,21 @@ static struct file_system_type nfs4_fs_type = {
 	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
 
+static const int nfs_set_port_min = 0;
+static const int nfs_set_port_max = 65535;
+static int param_set_port(const char *val, struct kernel_param *kp)
+{
+	char *endp;
+	int num = simple_strtol(val, &endp, 0);
+	if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
+		return -EINVAL;
+	*((int *)kp->arg) = num;
+	return 0;
+}
+
+module_param_call(callback_tcpport, param_set_port, param_get_int,
+		 &nfs_callback_set_tcpport, 0644);
+
 #define nfs4_init_once(nfsi) \
 	do { \
 		INIT_LIST_HEAD(&(nfsi)->open_states); \
@@ -2043,8 +2059,25 @@ static struct file_system_type nfs4_fs_type = {
 		nfsi->delegation_state = 0; \
 		init_rwsem(&nfsi->rwsem); \
 	} while(0)
-#define register_nfs4fs() register_filesystem(&nfs4_fs_type)
-#define unregister_nfs4fs() unregister_filesystem(&nfs4_fs_type)
+
+static inline int register_nfs4fs(void)
+{
+	int ret;
+
+	ret = nfs_register_sysctl();
+	if (ret != 0)
+		return ret;
+	ret = register_filesystem(&nfs4_fs_type);
+	if (ret != 0)
+		nfs_unregister_sysctl();
+	return ret;
+}
+
+static inline void unregister_nfs4fs(void)
+{
+	unregister_filesystem(&nfs4_fs_type);
+	nfs_unregister_sysctl();
+}
 #else
 #define nfs4_init_once(nfsi) \
 	do { } while (0)
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
new file mode 100644
index 000000000000..fdc64b59a4ee
--- /dev/null
+++ b/fs/nfs/sysctl.c
@@ -0,0 +1,74 @@
+/*
+ * linux/fs/nfs/sysctl.c
+ *
+ * Sysctl interface to NFS parameters
+ */
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/linkage.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+#include <linux/nfs4.h>
+
+#include "callback.h"
+
+static const int nfs_set_port_min = 0;
+static const int nfs_set_port_max = 65535;
+static struct ctl_table_header *nfs_callback_sysctl_table;
+/*
+ * Something that isn't CTL_ANY, CTL_NONE or a value that may clash.
+ * Use the same values as fs/lockd/svc.c
+ */
+#define CTL_UNNUMBERED -2
+
+static ctl_table nfs_cb_sysctls[] = {
+#ifdef CONFIG_NFS_V4
+	{
+		.ctl_name = CTL_UNNUMBERED,
+		.procname = "nfs_callback_tcpport",
+		.data = &nfs_callback_set_tcpport,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = &proc_dointvec_minmax,
+		.extra1 = (int *)&nfs_set_port_min,
+		.extra2 = (int *)&nfs_set_port_max,
+	},
+#endif
+	{ .ctl_name = 0 }
+};
+
+static ctl_table nfs_cb_sysctl_dir[] = {
+	{
+		.ctl_name = CTL_UNNUMBERED,
+		.procname = "nfs",
+		.mode = 0555,
+		.child = nfs_cb_sysctls,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table nfs_cb_sysctl_root[] = {
+	{
+		.ctl_name = CTL_FS,
+		.procname = "fs",
+		.mode = 0555,
+		.child = nfs_cb_sysctl_dir,
+	},
+	{ .ctl_name = 0 }
+};
+
+int nfs_register_sysctl(void)
+{
+	nfs_callback_sysctl_table = register_sysctl_table(nfs_cb_sysctl_root, 0);
+	if (nfs_callback_sysctl_table == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+void nfs_unregister_sysctl(void)
+{
+	unregister_sysctl_table(nfs_callback_sysctl_table);
+	nfs_callback_sysctl_table = NULL;
+}
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 408d82d3d97c..547d649b274e 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -391,6 +391,17 @@ extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_
  */
 extern struct inode_operations nfs_symlink_inode_operations;
 
+/*
+ * linux/fs/nfs/sysctl.c
+ */
+#ifdef CONFIG_SYSCTL
+extern int nfs_register_sysctl(void);
+extern void nfs_unregister_sysctl(void);
+#else
+#define nfs_register_sysctl() do { } while(0)
+#define nfs_unregister_sysctl() do { } while(0)
+#endif
+
 /*
  * linux/fs/nfs/unlink.c
  */
-- 
cgit v1.2.3-71-gd317


From fb459f45f7c7689714023d41b3dca999bb90a5d3 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 3 Jan 2006 09:55:41 +0100
Subject: SUNRPC: net/sunrpc/xdr.c: remove xdr_decode_string()

 This patch removes ths unused function xdr_decode_string().

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Neil Brown <neilb@suse.de>
Acked-by: Charles Lever <Charles.Lever@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xdr.h |  1 -
 net/sunrpc/xdr.c           | 21 ---------------------
 2 files changed, 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 5676794ee34f..84c35d42d250 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -91,7 +91,6 @@ struct xdr_buf {
 u32 *	xdr_encode_opaque_fixed(u32 *p, const void *ptr, unsigned int len);
 u32 *	xdr_encode_opaque(u32 *p, const void *ptr, unsigned int len);
 u32 *	xdr_encode_string(u32 *p, const char *s);
-u32 *	xdr_decode_string(u32 *p, char **sp, int *lenp, int maxlen);
 u32 *	xdr_decode_string_inplace(u32 *p, char **sp, int *lenp, int maxlen);
 u32 *	xdr_encode_netobj(u32 *p, const struct xdr_netobj *);
 u32 *	xdr_decode_netobj(u32 *p, struct xdr_netobj *);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index aaf08cdd19f0..ca4bfa57e116 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -92,27 +92,6 @@ xdr_encode_string(u32 *p, const char *string)
 	return xdr_encode_array(p, string, strlen(string));
 }
 
-u32 *
-xdr_decode_string(u32 *p, char **sp, int *lenp, int maxlen)
-{
-	unsigned int	len;
-	char		*string;
-
-	if ((len = ntohl(*p++)) > maxlen)
-		return NULL;
-	if (lenp)
-		*lenp = len;
-	if ((len % 4) != 0) {
-		string = (char *) p;
-	} else {
-		string = (char *) (p - 1);
-		memmove(string, p, len);
-	}
-	string[len] = '\0';
-	*sp = string;
-	return p + XDR_QUADLEN(len);
-}
-
 u32 *
 xdr_decode_string_inplace(u32 *p, char **sp, int *lenp, int maxlen)
 {
-- 
cgit v1.2.3-71-gd317


From 64a318ee2af9000df482d7a125c3b3e1f1007404 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Tue, 3 Jan 2006 09:55:46 +0100
Subject: NLM: Further cancel fixes

 If the server receives an NLM cancel call and finds no waiting lock to
 cancel, then chances are the lock has already been applied, and the client
 just hadn't yet processed the NLM granted callback before it sent the
 cancel.

 The Open Group text, for example, perimts a server to return either success
 (LCK_GRANTED) or failure (LCK_DENIED) in this case.  But returning an error
 seems more helpful; the client may be able to use it to recognize that a
 race has occurred and to recover from the race.

 So, modify the relevant functions to return an error in this case.

 Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/svclock.c | 15 ++++++++++-----
 fs/locks.c         |  7 ++++++-
 include/linux/fs.h |  2 +-
 3 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index b56d439bad82..9cfced65d4a2 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -227,25 +227,27 @@ failed:
  * It is the caller's responsibility to check whether the file
  * can be closed hereafter.
  */
-static void
+static int
 nlmsvc_delete_block(struct nlm_block *block, int unlock)
 {
 	struct file_lock	*fl = &block->b_call.a_args.lock.fl;
 	struct nlm_file		*file = block->b_file;
 	struct nlm_block	**bp;
+	int status = 0;
 
 	dprintk("lockd: deleting block %p...\n", block);
 
 	/* Remove block from list */
 	nlmsvc_remove_block(block);
-	posix_unblock_lock(file->f_file, fl);
+	if (unlock)
+		status = posix_unblock_lock(file->f_file, fl);
 
 	/* If the block is in the middle of a GRANT callback,
 	 * don't kill it yet. */
 	if (block->b_incall) {
 		nlmsvc_insert_block(block, NLM_NEVER);
 		block->b_done = 1;
-		return;
+		return status;
 	}
 
 	/* Remove block from file's list of blocks */
@@ -260,6 +262,7 @@ nlmsvc_delete_block(struct nlm_block *block, int unlock)
 		nlm_release_host(block->b_host);
 	nlmclnt_freegrantargs(&block->b_call);
 	kfree(block);
+	return status;
 }
 
 /*
@@ -270,6 +273,7 @@ int
 nlmsvc_traverse_blocks(struct nlm_host *host, struct nlm_file *file, int action)
 {
 	struct nlm_block	*block, *next;
+	/* XXX: Will everything get cleaned up if we don't unlock here? */
 
 	down(&file->f_sema);
 	for (block = file->f_blocks; block; block = next) {
@@ -439,6 +443,7 @@ u32
 nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
 {
 	struct nlm_block	*block;
+	int status = 0;
 
 	dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n",
 				file->f_file->f_dentry->d_inode->i_sb->s_id,
@@ -449,9 +454,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
 
 	down(&file->f_sema);
 	if ((block = nlmsvc_lookup_block(file, lock, 1)) != NULL)
-		nlmsvc_delete_block(block, 1);
+		status = nlmsvc_delete_block(block, 1);
 	up(&file->f_sema);
-	return nlm_granted;
+	return status ? nlm_lck_denied : nlm_granted;
 }
 
 /*
diff --git a/fs/locks.c b/fs/locks.c
index 75650d52fe60..fb32d6218e21 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1958,13 +1958,18 @@ EXPORT_SYMBOL(posix_block_lock);
  *
  *	lockd needs to block waiting for locks.
  */
-void
+int
 posix_unblock_lock(struct file *filp, struct file_lock *waiter)
 {
+	int status = 0;
+
 	lock_kernel();
 	if (waiter->fl_next)
 		__locks_delete_block(waiter);
+	else
+		status = -ENOENT;
 	unlock_kernel();
+	return status;
 }
 
 EXPORT_SYMBOL(posix_unblock_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 115e72be25d0..2c9c48d65630 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -760,7 +760,7 @@ extern struct file_lock *posix_test_lock(struct file *, struct file_lock *);
 extern int posix_lock_file(struct file *, struct file_lock *);
 extern int posix_lock_file_wait(struct file *, struct file_lock *);
 extern void posix_block_lock(struct file_lock *, struct file_lock *);
-extern void posix_unblock_lock(struct file *, struct file_lock *);
+extern int posix_unblock_lock(struct file *, struct file_lock *);
 extern int posix_locks_deadlock(struct file_lock *, struct file_lock *);
 extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl);
 extern int __break_lease(struct inode *inode, unsigned int flags);
-- 
cgit v1.2.3-71-gd317


From 02107148349f31eee7c0fb06fd7a880df73dbd20 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Tue, 3 Jan 2006 09:55:49 +0100
Subject: SUNRPC: switchable buffer allocation

 Add RPC client transport switch support for replacing buffer management
 on a per-transport basis.

 In the current IPv4 socket transport implementation, RPC buffers are
 allocated as needed for each RPC message that is sent.  Some transport
 implementations may choose to use pre-allocated buffers for encoding,
 sending, receiving, and unmarshalling RPC messages, however.  For
 transports capable of direct data placement, the buffers can be carved
 out of a pre-registered area of memory rather than from a slab cache.

 Test-plan:
 Millions of fsx operations.  Performance characterization with "sio" and
 "iozone".  Use oprofile and other tools to look for significant regression
 in CPU utilization.

 Signed-off-by: Chuck Lever <cel@netapp.com>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/sched.h |  3 +--
 include/linux/sunrpc/xprt.h  | 10 ++++-----
 net/sunrpc/clnt.c            | 14 +++++++------
 net/sunrpc/sched.c           | 50 ++++++++++++++++++++++++--------------------
 net/sunrpc/xprt.c            |  3 +++
 net/sunrpc/xprtsock.c        |  5 +++++
 6 files changed, 49 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 94b0afa4ab05..8b25629accd8 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -52,8 +52,6 @@ struct rpc_task {
 	 * RPC call state
 	 */
 	struct rpc_message	tk_msg;		/* RPC call info */
-	__u32 *			tk_buffer;	/* XDR buffer */
-	size_t			tk_bufsize;
 	__u8			tk_garb_retry;
 	__u8			tk_cred_retry;
 
@@ -268,6 +266,7 @@ struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *);
 void		rpc_wake_up_status(struct rpc_wait_queue *, int);
 void		rpc_delay(struct rpc_task *, unsigned long);
 void *		rpc_malloc(struct rpc_task *, size_t);
+void		rpc_free(struct rpc_task *);
 int		rpciod_up(void);
 void		rpciod_down(void);
 void		rpciod_wake_up(void);
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 3b8b6e823c70..7885b9621ce3 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -79,21 +79,19 @@ struct rpc_rqst {
 	void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */
 	struct list_head	rq_list;
 
+	__u32 *			rq_buffer;	/* XDR encode buffer */
+	size_t			rq_bufsize;
+
 	struct xdr_buf		rq_private_buf;		/* The receive buffer
 							 * used in the softirq.
 							 */
 	unsigned long		rq_majortimeo;	/* major timeout alarm */
 	unsigned long		rq_timeout;	/* Current timeout value */
 	unsigned int		rq_retries;	/* # of retries */
-	/*
-	 * For authentication (e.g. auth_des)
-	 */
-	u32			rq_creddata[2];
 	
 	/*
 	 * Partial send handling
 	 */
-	
 	u32			rq_bytes_sent;	/* Bytes we have sent */
 
 	unsigned long		rq_xtime;	/* when transmitted */
@@ -107,6 +105,8 @@ struct rpc_xprt_ops {
 	int		(*reserve_xprt)(struct rpc_task *task);
 	void		(*release_xprt)(struct rpc_xprt *xprt, struct rpc_task *task);
 	void		(*connect)(struct rpc_task *task);
+	void *		(*buf_alloc)(struct rpc_task *task, size_t size);
+	void		(*buf_free)(struct rpc_task *task);
 	int		(*send_request)(struct rpc_task *task);
 	void		(*set_retrans_timeout)(struct rpc_task *task);
 	void		(*timer)(struct rpc_task *task);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index b23c0d328c9c..25cba94c5683 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -644,24 +644,26 @@ call_reserveresult(struct rpc_task *task)
 
 /*
  * 2.	Allocate the buffer. For details, see sched.c:rpc_malloc.
- *	(Note: buffer memory is freed in rpc_task_release).
+ *	(Note: buffer memory is freed in xprt_release).
  */
 static void
 call_allocate(struct rpc_task *task)
 {
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_xprt *xprt = task->tk_xprt;
 	unsigned int	bufsiz;
 
 	dprintk("RPC: %4d call_allocate (status %d)\n", 
 				task->tk_pid, task->tk_status);
 	task->tk_action = call_bind;
-	if (task->tk_buffer)
+	if (req->rq_buffer)
 		return;
 
 	/* FIXME: compute buffer requirements more exactly using
 	 * auth->au_wslack */
 	bufsiz = task->tk_msg.rpc_proc->p_bufsiz + RPC_SLACK_SPACE;
 
-	if (rpc_malloc(task, bufsiz << 1) != NULL)
+	if (xprt->ops->buf_alloc(task, bufsiz << 1) != NULL)
 		return;
 	printk(KERN_INFO "RPC: buffer allocation failed for task %p\n", task); 
 
@@ -704,14 +706,14 @@ call_encode(struct rpc_task *task)
 				task->tk_pid, task->tk_status);
 
 	/* Default buffer setup */
-	bufsiz = task->tk_bufsize >> 1;
-	sndbuf->head[0].iov_base = (void *)task->tk_buffer;
+	bufsiz = req->rq_bufsize >> 1;
+	sndbuf->head[0].iov_base = (void *)req->rq_buffer;
 	sndbuf->head[0].iov_len  = bufsiz;
 	sndbuf->tail[0].iov_len  = 0;
 	sndbuf->page_len	 = 0;
 	sndbuf->len		 = 0;
 	sndbuf->buflen		 = bufsiz;
-	rcvbuf->head[0].iov_base = (void *)((char *)task->tk_buffer + bufsiz);
+	rcvbuf->head[0].iov_base = (void *)((char *)req->rq_buffer + bufsiz);
 	rcvbuf->head[0].iov_len  = bufsiz;
 	rcvbuf->tail[0].iov_len  = 0;
 	rcvbuf->page_len	 = 0;
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 48510e3ffa02..7415406aa1ae 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -41,8 +41,6 @@ static mempool_t	*rpc_buffer_mempool __read_mostly;
 
 static void			__rpc_default_timer(struct rpc_task *task);
 static void			rpciod_killall(void);
-static void			rpc_free(struct rpc_task *task);
-
 static void			rpc_async_schedule(void *);
 
 /*
@@ -599,7 +597,6 @@ void rpc_exit_task(struct rpc_task *task)
 			WARN_ON(RPC_ASSASSINATED(task));
 			/* Always release the RPC slot and buffer memory */
 			xprt_release(task);
-			rpc_free(task);
 		}
 	}
 }
@@ -724,17 +721,19 @@ static void rpc_async_schedule(void *arg)
 	__rpc_execute((struct rpc_task *)arg);
 }
 
-/*
- * Allocate memory for RPC purposes.
+/**
+ * rpc_malloc - allocate an RPC buffer
+ * @task: RPC task that will use this buffer
+ * @size: requested byte size
  *
  * We try to ensure that some NFS reads and writes can always proceed
  * by using a mempool when allocating 'small' buffers.
  * In order to avoid memory starvation triggering more writebacks of
  * NFS requests, we use GFP_NOFS rather than GFP_KERNEL.
  */
-void *
-rpc_malloc(struct rpc_task *task, size_t size)
+void * rpc_malloc(struct rpc_task *task, size_t size)
 {
+	struct rpc_rqst *req = task->tk_rqstp;
 	gfp_t	gfp;
 
 	if (task->tk_flags & RPC_TASK_SWAPPER)
@@ -743,27 +742,33 @@ rpc_malloc(struct rpc_task *task, size_t size)
 		gfp = GFP_NOFS;
 
 	if (size > RPC_BUFFER_MAXSIZE) {
-		task->tk_buffer =  kmalloc(size, gfp);
-		if (task->tk_buffer)
-			task->tk_bufsize = size;
+		req->rq_buffer = kmalloc(size, gfp);
+		if (req->rq_buffer)
+			req->rq_bufsize = size;
 	} else {
-		task->tk_buffer =  mempool_alloc(rpc_buffer_mempool, gfp);
-		if (task->tk_buffer)
-			task->tk_bufsize = RPC_BUFFER_MAXSIZE;
+		req->rq_buffer = mempool_alloc(rpc_buffer_mempool, gfp);
+		if (req->rq_buffer)
+			req->rq_bufsize = RPC_BUFFER_MAXSIZE;
 	}
-	return task->tk_buffer;
+	return req->rq_buffer;
 }
 
-static void
-rpc_free(struct rpc_task *task)
+/**
+ * rpc_free - free buffer allocated via rpc_malloc
+ * @task: RPC task with a buffer to be freed
+ *
+ */
+void rpc_free(struct rpc_task *task)
 {
-	if (task->tk_buffer) {
-		if (task->tk_bufsize == RPC_BUFFER_MAXSIZE)
-			mempool_free(task->tk_buffer, rpc_buffer_mempool);
+	struct rpc_rqst *req = task->tk_rqstp;
+
+	if (req->rq_buffer) {
+		if (req->rq_bufsize == RPC_BUFFER_MAXSIZE)
+			mempool_free(req->rq_buffer, rpc_buffer_mempool);
 		else
-			kfree(task->tk_buffer);
-		task->tk_buffer = NULL;
-		task->tk_bufsize = 0;
+			kfree(req->rq_buffer);
+		req->rq_buffer = NULL;
+		req->rq_bufsize = 0;
 	}
 }
 
@@ -887,7 +892,6 @@ void rpc_release_task(struct rpc_task *task)
 		xprt_release(task);
 	if (task->tk_msg.rpc_cred)
 		rpcauth_unbindcred(task);
-	rpc_free(task);
 	if (task->tk_client) {
 		rpc_release_client(task->tk_client);
 		task->tk_client = NULL;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 6dda3860351f..069a6cbd49ea 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -838,6 +838,8 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
 	req->rq_timeout = xprt->timeout.to_initval;
 	req->rq_task	= task;
 	req->rq_xprt    = xprt;
+	req->rq_buffer  = NULL;
+	req->rq_bufsize = 0;
 	req->rq_xid     = xprt_alloc_xid(xprt);
 	req->rq_release_snd_buf = NULL;
 	dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid,
@@ -867,6 +869,7 @@ void xprt_release(struct rpc_task *task)
 		mod_timer(&xprt->timer,
 				xprt->last_used + xprt->idle_timeout);
 	spin_unlock_bh(&xprt->transport_lock);
+	xprt->ops->buf_free(task);
 	task->tk_rqstp = NULL;
 	if (req->rq_release_snd_buf)
 		req->rq_release_snd_buf(req);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 77e8800d4127..51f07c9a751b 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -28,6 +28,7 @@
 #include <linux/udp.h>
 #include <linux/tcp.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/sched.h>
 #include <linux/file.h>
 
 #include <net/sock.h>
@@ -1161,6 +1162,8 @@ static struct rpc_xprt_ops xs_udp_ops = {
 	.reserve_xprt		= xprt_reserve_xprt_cong,
 	.release_xprt		= xprt_release_xprt_cong,
 	.connect		= xs_connect,
+	.buf_alloc		= rpc_malloc,
+	.buf_free		= rpc_free,
 	.send_request		= xs_udp_send_request,
 	.set_retrans_timeout	= xprt_set_retrans_timeout_rtt,
 	.timer			= xs_udp_timer,
@@ -1173,6 +1176,8 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 	.reserve_xprt		= xprt_reserve_xprt,
 	.release_xprt		= xprt_release_xprt,
 	.connect		= xs_connect,
+	.buf_alloc		= rpc_malloc,
+	.buf_free		= rpc_free,
 	.send_request		= xs_tcp_send_request,
 	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
 	.close			= xs_close,
-- 
cgit v1.2.3-71-gd317


From 35f5a422ce1af836007f811b613c440d0e348e06 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Tue, 3 Jan 2006 09:55:50 +0100
Subject: SUNRPC: new interface to force an RPC rebind

 We'd like to hide fields in rpc_xprt and rpc_clnt from upper layer protocols.
 Start by creating an API to force RPC rebind, replacing logic that simply
 sets cl_port to zero.

 Test-plan:
 Destructive testing (unplugging the network temporarily).  Connectathon
 with UDP and TCP.  NFSv2/3 and NFSv4 mounting should be carefully checked.
 Probably need to rig a server where certain services aren't running, or
 that returns an error for some typical operation.

 Signed-off-by: Chuck Lever <cel@netapp.com>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/host.c             |  4 ++--
 include/linux/sunrpc/clnt.h |  1 +
 net/sunrpc/clnt.c           | 21 +++++++++++++++------
 3 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index c4c8601096e0..82f7a0b1d8ae 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -177,7 +177,7 @@ nlm_bind_host(struct nlm_host *host)
 	if ((clnt = host->h_rpcclnt) != NULL) {
 		xprt = clnt->cl_xprt;
 		if (time_after_eq(jiffies, host->h_nextrebind)) {
-			clnt->cl_port = 0;
+			rpc_force_rebind(clnt);
 			host->h_nextrebind = jiffies + NLM_HOST_REBIND;
 			dprintk("lockd: next rebind in %ld jiffies\n",
 					host->h_nextrebind - jiffies);
@@ -217,7 +217,7 @@ nlm_rebind_host(struct nlm_host *host)
 {
 	dprintk("lockd: rebind host %s\n", host->h_name);
 	if (host->h_rpcclnt && time_after_eq(jiffies, host->h_nextrebind)) {
-		host->h_rpcclnt->cl_port = 0;
+		rpc_force_rebind(host->h_rpcclnt);
 		host->h_nextrebind = jiffies + NLM_HOST_REBIND;
 	}
 }
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index b0ab959eca65..3d605765f84b 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -135,6 +135,7 @@ void		rpc_clnt_sigmask(struct rpc_clnt *clnt, sigset_t *oldset);
 void		rpc_clnt_sigunmask(struct rpc_clnt *clnt, sigset_t *oldset);
 void		rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int);
 size_t		rpc_max_payload(struct rpc_clnt *);
+void		rpc_force_rebind(struct rpc_clnt *);
 int		rpc_ping(struct rpc_clnt *clnt, int flags);
 
 static __inline__
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 25cba94c5683..2789d3083fe7 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -538,6 +538,18 @@ size_t rpc_max_payload(struct rpc_clnt *clnt)
 }
 EXPORT_SYMBOL(rpc_max_payload);
 
+/**
+ * rpc_force_rebind - force transport to check that remote port is unchanged
+ * @clnt: client to rebind
+ *
+ */
+void rpc_force_rebind(struct rpc_clnt *clnt)
+{
+	if (clnt->cl_autobind)
+		clnt->cl_port = 0;
+}
+EXPORT_SYMBOL(rpc_force_rebind);
+
 /*
  * Restart an (async) RPC call. Usually called from within the
  * exit handler.
@@ -853,8 +865,7 @@ call_connect_status(struct rpc_task *task)
 	}
 
 	/* Something failed: remote service port may have changed */
-	if (clnt->cl_autobind)
-		clnt->cl_port = 0;
+	rpc_force_rebind(clnt);
 
 	switch (status) {
 	case -ENOTCONN:
@@ -935,8 +946,7 @@ call_status(struct rpc_task *task)
 		break;
 	case -ECONNREFUSED:
 	case -ENOTCONN:
-		if (clnt->cl_autobind)
-			clnt->cl_port = 0;
+		rpc_force_rebind(clnt);
 		task->tk_action = call_bind;
 		break;
 	case -EAGAIN:
@@ -995,8 +1005,7 @@ call_timeout(struct rpc_task *task)
 		printk(KERN_NOTICE "%s: server %s not responding, still trying\n",
 			clnt->cl_protname, clnt->cl_server);
 	}
-	if (clnt->cl_autobind)
-		clnt->cl_port = 0;
+	rpc_force_rebind(clnt);
 
 retry:
 	clnt->cl_stats->rpcretrans++;
-- 
cgit v1.2.3-71-gd317


From 922004120b10dcb0ce04b55014168e8a7a8c1a0e Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Tue, 3 Jan 2006 09:55:51 +0100
Subject: SUNRPC: transport switch API for setting port number

 At some point, transport endpoint addresses will no longer be IPv4.  To hide
 the structure of the rpc_xprt's address field from ULPs and port mappers,
 add an API for setting the port number during an RPC bind operation.

 Test-plan:
 Destructive testing (unplugging the network temporarily).  Connectathon
 with UDP and TCP.  NFSv2/3 and NFSv4 mounting should be carefully checked.
 Probably need to rig a server where certain services aren't running, or
 that returns an error for some typical operation.

 Signed-off-by: Chuck Lever <cel@netapp.com>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h |  1 +
 net/sunrpc/pmap_clnt.c      |  8 +++++---
 net/sunrpc/xprtsock.c       | 14 ++++++++++++++
 3 files changed, 20 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 7885b9621ce3..dd860128ceda 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -104,6 +104,7 @@ struct rpc_xprt_ops {
 	void		(*set_buffer_size)(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize);
 	int		(*reserve_xprt)(struct rpc_task *task);
 	void		(*release_xprt)(struct rpc_xprt *xprt, struct rpc_task *task);
+	void		(*set_port)(struct rpc_xprt *xprt, unsigned short port);
 	void		(*connect)(struct rpc_task *task);
 	void *		(*buf_alloc)(struct rpc_task *task, size_t size);
 	void		(*buf_free)(struct rpc_task *task);
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index cad4568fbbe2..0935adb91b3c 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -131,10 +131,13 @@ static void
 pmap_getport_done(struct rpc_task *task)
 {
 	struct rpc_clnt	*clnt = task->tk_client;
+	struct rpc_xprt *xprt = task->tk_xprt;
 	struct rpc_portmap *map = clnt->cl_pmap;
 
 	dprintk("RPC: %4d pmap_getport_done(status %d, port %d)\n",
 			task->tk_pid, task->tk_status, clnt->cl_port);
+
+	xprt->ops->set_port(xprt, 0);
 	if (task->tk_status < 0) {
 		/* Make the calling task exit with an error */
 		task->tk_action = rpc_exit_task;
@@ -142,9 +145,8 @@ pmap_getport_done(struct rpc_task *task)
 		/* Program not registered */
 		rpc_exit(task, -EACCES);
 	} else {
-		/* byte-swap port number first */
+		xprt->ops->set_port(xprt, clnt->cl_port);
 		clnt->cl_port = htons(clnt->cl_port);
-		clnt->cl_xprt->addr.sin_port = clnt->cl_port;
 	}
 	spin_lock(&pmap_lock);
 	map->pm_binding = 0;
@@ -205,7 +207,7 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto, int privileg
 	xprt = xprt_create_proto(proto, srvaddr, NULL);
 	if (IS_ERR(xprt))
 		return (struct rpc_clnt *)xprt;
-	xprt->addr.sin_port = htons(RPC_PMAP_PORT);
+	xprt->ops->set_port(xprt, RPC_PMAP_PORT);
 	if (!privileged)
 		xprt->resvport = 0;
 
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 51f07c9a751b..3e8893001479 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -921,6 +921,18 @@ static void xs_udp_timer(struct rpc_task *task)
 	xprt_adjust_cwnd(task, -ETIMEDOUT);
 }
 
+/**
+ * xs_set_port - reset the port number in the remote endpoint address
+ * @xprt: generic transport
+ * @port: new port number
+ *
+ */
+static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
+{
+	dprintk("RPC:      setting port for xprt %p to %u\n", xprt, port);
+	xprt->addr.sin_port = htons(port);
+}
+
 static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock)
 {
 	struct sockaddr_in myaddr = {
@@ -1161,6 +1173,7 @@ static struct rpc_xprt_ops xs_udp_ops = {
 	.set_buffer_size	= xs_udp_set_buffer_size,
 	.reserve_xprt		= xprt_reserve_xprt_cong,
 	.release_xprt		= xprt_release_xprt_cong,
+	.set_port		= xs_set_port,
 	.connect		= xs_connect,
 	.buf_alloc		= rpc_malloc,
 	.buf_free		= rpc_free,
@@ -1175,6 +1188,7 @@ static struct rpc_xprt_ops xs_udp_ops = {
 static struct rpc_xprt_ops xs_tcp_ops = {
 	.reserve_xprt		= xprt_reserve_xprt,
 	.release_xprt		= xprt_release_xprt,
+	.set_port		= xs_set_port,
 	.connect		= xs_connect,
 	.buf_alloc		= rpc_malloc,
 	.buf_free		= rpc_free,
-- 
cgit v1.2.3-71-gd317


From f518e35aec984036903c1003e867f833747a9d79 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Tue, 3 Jan 2006 09:55:52 +0100
Subject: SUNRPC: get rid of cl_chatty

 Clean up: Every ULP that uses the in-kernel RPC client, except the NLM
 client, sets cl_chatty.  There's no reason why NLM shouldn't set it, so
 just get rid of cl_chatty and always be verbose.

 Test-plan:
 Compile with CONFIG_NFS enabled.

 Signed-off-by: Chuck Lever <cel@netapp.com>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntproc.c         |  3 +--
 fs/lockd/mon.c              |  1 -
 fs/nfs/inode.c              |  2 --
 fs/nfs/mount_clnt.c         |  1 -
 fs/nfsd/nfs4callback.c      |  1 -
 include/linux/sunrpc/clnt.h |  1 -
 net/sunrpc/clnt.c           | 10 ++++------
 net/sunrpc/pmap_clnt.c      |  1 -
 8 files changed, 5 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 816333cd377b..145524039577 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -222,8 +222,7 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
 			goto done;
 		}
 		clnt->cl_softrtry = nfssrv->client->cl_softrtry;
-		clnt->cl_intr     = nfssrv->client->cl_intr;
-		clnt->cl_chatty   = nfssrv->client->cl_chatty;
+		clnt->cl_intr = nfssrv->client->cl_intr;
 	}
 
 	/* Keep the old signal mask */
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 2d144abe84ad..0edc03e67966 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -123,7 +123,6 @@ nsm_create(void)
 	if (IS_ERR(clnt))
 		goto out_err;
 	clnt->cl_softrtry = 1;
-	clnt->cl_chatty   = 1;
 	clnt->cl_oneshot  = 1;
 	return clnt;
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 648cb1aef3b1..4625479a6b62 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -413,7 +413,6 @@ nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
 
 	clnt->cl_intr     = 1;
 	clnt->cl_softrtry = 1;
-	clnt->cl_chatty   = 1;
 
 	return clnt;
 
@@ -1838,7 +1837,6 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data,
 		}
 		clnt->cl_intr     = 1;
 		clnt->cl_softrtry = 1;
-		clnt->cl_chatty   = 1;
 		clp->cl_rpcclient = clnt;
 		memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
 		nfs_idmap_new(clp);
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 0e82617f2de0..db99b8f678f8 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -82,7 +82,6 @@ mnt_create(char *hostname, struct sockaddr_in *srvaddr, int version,
 				RPC_AUTH_UNIX);
 	if (!IS_ERR(clnt)) {
 		clnt->cl_softrtry = 1;
-		clnt->cl_chatty   = 1;
 		clnt->cl_oneshot  = 1;
 		clnt->cl_intr = 1;
 	}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index cf92008f219a..d828662d737d 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -431,7 +431,6 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 	}
 	clnt->cl_intr = 0;
 	clnt->cl_softrtry = 1;
-	clnt->cl_chatty = 1;
 
 	/* Kick rpciod, put the call on the wire. */
 
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 3d605765f84b..f147e6b84332 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -49,7 +49,6 @@ struct rpc_clnt {
 
 	unsigned int		cl_softrtry : 1,/* soft timeouts */
 				cl_intr     : 1,/* interruptible */
-				cl_chatty   : 1,/* be verbose */
 				cl_autobind : 1,/* use getport() */
 				cl_oneshot  : 1,/* dispose after use */
 				cl_dead     : 1;/* abandoned */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2789d3083fe7..5530ac8c6df9 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -957,8 +957,7 @@ call_status(struct rpc_task *task)
 		rpc_exit(task, status);
 		break;
 	default:
-		if (clnt->cl_chatty)
-			printk("%s: RPC call returned error %d\n",
+		printk("%s: RPC call returned error %d\n",
 			       clnt->cl_protname, -status);
 		rpc_exit(task, status);
 		break;
@@ -993,14 +992,13 @@ call_timeout(struct rpc_task *task)
 
 	dprintk("RPC: %4d call_timeout (major)\n", task->tk_pid);
 	if (RPC_IS_SOFT(task)) {
-		if (clnt->cl_chatty)
-			printk(KERN_NOTICE "%s: server %s not responding, timed out\n",
+		printk(KERN_NOTICE "%s: server %s not responding, timed out\n",
 				clnt->cl_protname, clnt->cl_server);
 		rpc_exit(task, -EIO);
 		return;
 	}
 
-	if (clnt->cl_chatty && !(task->tk_flags & RPC_CALL_MAJORSEEN)) {
+	if (!(task->tk_flags & RPC_CALL_MAJORSEEN)) {
 		task->tk_flags |= RPC_CALL_MAJORSEEN;
 		printk(KERN_NOTICE "%s: server %s not responding, still trying\n",
 			clnt->cl_protname, clnt->cl_server);
@@ -1027,7 +1025,7 @@ call_decode(struct rpc_task *task)
 	dprintk("RPC: %4d call_decode (status %d)\n", 
 				task->tk_pid, task->tk_status);
 
-	if (clnt->cl_chatty && (task->tk_flags & RPC_CALL_MAJORSEEN)) {
+	if (task->tk_flags & RPC_CALL_MAJORSEEN) {
 		printk(KERN_NOTICE "%s: server %s OK\n",
 			clnt->cl_protname, clnt->cl_server);
 		task->tk_flags &= ~RPC_CALL_MAJORSEEN;
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index 0935adb91b3c..8139ce68e915 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -217,7 +217,6 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto, int privileg
 				RPC_AUTH_UNIX);
 	if (!IS_ERR(clnt)) {
 		clnt->cl_softrtry = 1;
-		clnt->cl_chatty   = 1;
 		clnt->cl_oneshot  = 1;
 	}
 	return clnt;
-- 
cgit v1.2.3-71-gd317


From 632e3bdc5006334cea894d078660b691685e1075 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 3 Jan 2006 09:55:55 +0100
Subject: SUNRPC: Ensure client closes the socket when server initiates a close

 If the server decides to close the RPC socket, we currently don't actually
 respond until either another RPC call is scheduled, or until xprt_autoclose()
 gets called by the socket expiry timer (which may be up to 5 minutes
 later).

 This patch ensures that xprt_autoclose() is called much sooner if the
 server closes the socket.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h |  1 +
 net/sunrpc/xprt.c           | 33 ++++++++++++++++-----------------
 net/sunrpc/xprtsock.c       | 12 ++++++++++--
 3 files changed, 27 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index dd860128ceda..6ef99b14ff09 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -254,6 +254,7 @@ int			xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to);
 #define XPRT_LOCKED		(0)
 #define XPRT_CONNECTED		(1)
 #define XPRT_CONNECTING		(2)
+#define XPRT_CLOSE_WAIT		(3)
 
 static inline void xprt_set_connected(struct rpc_xprt *xprt)
 {
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 069a6cbd49ea..8bc0d5acf0da 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -119,6 +119,17 @@ out_sleep:
 	return 0;
 }
 
+static void xprt_clear_locked(struct rpc_xprt *xprt)
+{
+	xprt->snd_task = NULL;
+	if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state) || xprt->shutdown) {
+		smp_mb__before_clear_bit();
+		clear_bit(XPRT_LOCKED, &xprt->state);
+		smp_mb__after_clear_bit();
+	} else
+		schedule_work(&xprt->task_cleanup);
+}
+
 /*
  * xprt_reserve_xprt_cong - serialize write access to transports
  * @task: task that is requesting access to the transport
@@ -145,9 +156,7 @@ int xprt_reserve_xprt_cong(struct rpc_task *task)
 		}
 		return 1;
 	}
-	smp_mb__before_clear_bit();
-	clear_bit(XPRT_LOCKED, &xprt->state);
-	smp_mb__after_clear_bit();
+	xprt_clear_locked(xprt);
 out_sleep:
 	dprintk("RPC: %4d failed to lock transport %p\n", task->tk_pid, xprt);
 	task->tk_timeout = 0;
@@ -193,9 +202,7 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt)
 	return;
 
 out_unlock:
-	smp_mb__before_clear_bit();
-	clear_bit(XPRT_LOCKED, &xprt->state);
-	smp_mb__after_clear_bit();
+	xprt_clear_locked(xprt);
 }
 
 static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt)
@@ -222,9 +229,7 @@ static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt)
 		return;
 	}
 out_unlock:
-	smp_mb__before_clear_bit();
-	clear_bit(XPRT_LOCKED, &xprt->state);
-	smp_mb__after_clear_bit();
+	xprt_clear_locked(xprt);
 }
 
 /**
@@ -237,10 +242,7 @@ out_unlock:
 void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
 {
 	if (xprt->snd_task == task) {
-		xprt->snd_task = NULL;
-		smp_mb__before_clear_bit();
-		clear_bit(XPRT_LOCKED, &xprt->state);
-		smp_mb__after_clear_bit();
+		xprt_clear_locked(xprt);
 		__xprt_lock_write_next(xprt);
 	}
 }
@@ -256,10 +258,7 @@ void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
 void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
 {
 	if (xprt->snd_task == task) {
-		xprt->snd_task = NULL;
-		smp_mb__before_clear_bit();
-		clear_bit(XPRT_LOCKED, &xprt->state);
-		smp_mb__after_clear_bit();
+		xprt_clear_locked(xprt);
 		__xprt_lock_write_next_cong(xprt);
 	}
 }
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 3e8893001479..c458f8d1d6d1 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -425,7 +425,7 @@ static void xs_close(struct rpc_xprt *xprt)
 	struct sock *sk = xprt->inet;
 
 	if (!sk)
-		return;
+		goto clear_close_wait;
 
 	dprintk("RPC:      xs_close xprt %p\n", xprt);
 
@@ -442,6 +442,10 @@ static void xs_close(struct rpc_xprt *xprt)
 	sk->sk_no_check = 0;
 
 	sock_release(sock);
+clear_close_wait:
+	smp_mb__before_clear_bit();
+	clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
+	smp_mb__after_clear_bit();
 }
 
 /**
@@ -801,9 +805,13 @@ static void xs_tcp_state_change(struct sock *sk)
 	case TCP_SYN_SENT:
 	case TCP_SYN_RECV:
 		break;
+	case TCP_CLOSE_WAIT:
+		/* Try to schedule an autoclose RPC calls */
+		set_bit(XPRT_CLOSE_WAIT, &xprt->state);
+		if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
+			schedule_work(&xprt->task_cleanup);
 	default:
 		xprt_disconnect(xprt);
-		break;
 	}
  out:
 	read_unlock(&sk->sk_callback_lock);
-- 
cgit v1.2.3-71-gd317


From 58df095b732529ade8f4051b41d7c29731afecd6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 3 Jan 2006 09:55:57 +0100
Subject: NFSv4: Allow entries in the idmap cache to expire

 If someone changes the uid/gid mapping in userland, then we do eventually
 want those changes to be propagated to the kernel. Currently the kernel
 assumes that it may cache entries forever.

 Add an expiration time + garbage collector for idmap entries.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 Documentation/kernel-parameters.txt |  4 ++++
 fs/nfs/idmap.c                      |  9 +++++++++
 fs/nfs/inode.c                      | 14 ++++++++++++++
 fs/nfs/sysctl.c                     | 10 ++++++++++
 include/linux/nfs_idmap.h           |  2 ++
 5 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 309c9cec6e7c..a482fde09bbb 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -914,6 +914,10 @@ running once the system is up.
 			[NFS] set the TCP port on which the NFSv4 callback
 			channel should listen.
 
+	nfs.idmap_cache_timeout=
+			[NFS] set the maximum lifetime for idmapper cache
+			entries.
+
 	nmi_watchdog=	[KNL,BUGS=IA-32] Debugging features for SMP kernels
 
 	no387		[BUGS=IA-32] Tells the kernel to use the 387 maths
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index ffb8df91dc34..821edd30333b 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -54,7 +54,11 @@
 
 #define IDMAP_HASH_SZ          128
 
+/* Default cache timeout is 10 minutes */
+unsigned int nfs_idmap_cache_timeout = 600 * HZ;
+
 struct idmap_hashent {
+	unsigned long ih_expires;
 	__u32 ih_id;
 	int ih_namelen;
 	char ih_name[IDMAP_NAMESZ];
@@ -149,6 +153,8 @@ idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len)
 
 	if (he->ih_namelen != len || memcmp(he->ih_name, name, len) != 0)
 		return NULL;
+	if (time_after(jiffies, he->ih_expires))
+		return NULL;
 	return he;
 }
 
@@ -164,6 +170,8 @@ idmap_lookup_id(struct idmap_hashtable *h, __u32 id)
 	struct idmap_hashent *he = idmap_id_hash(h, id);
 	if (he->ih_id != id || he->ih_namelen == 0)
 		return NULL;
+	if (time_after(jiffies, he->ih_expires))
+		return NULL;
 	return he;
 }
 
@@ -192,6 +200,7 @@ idmap_update_entry(struct idmap_hashent *he, const char *name,
 	memcpy(he->ih_name, name, namelen);
 	he->ih_name[namelen] = '\0';
 	he->ih_namelen = namelen;
+	he->ih_expires = jiffies + nfs_idmap_cache_timeout;
 }
 
 /*
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 4625479a6b62..e7bd0d92600f 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2050,6 +2050,20 @@ static int param_set_port(const char *val, struct kernel_param *kp)
 module_param_call(callback_tcpport, param_set_port, param_get_int,
 		 &nfs_callback_set_tcpport, 0644);
 
+static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
+{
+	char *endp;
+	int num = simple_strtol(val, &endp, 0);
+	int jif = num * HZ;
+	if (endp == val || *endp || num < 0 || jif < num)
+		return -EINVAL;
+	*((int *)kp->arg) = jif;
+	return 0;
+}
+
+module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
+		 &nfs_idmap_cache_timeout, 0644);
+
 #define nfs4_init_once(nfsi) \
 	do { \
 		INIT_LIST_HEAD(&(nfsi)->open_states); \
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index fdc64b59a4ee..4c486eb867ca 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -11,6 +11,7 @@
 #include <linux/sysctl.h>
 #include <linux/module.h>
 #include <linux/nfs4.h>
+#include <linux/nfs_idmap.h>
 
 #include "callback.h"
 
@@ -35,6 +36,15 @@ static ctl_table nfs_cb_sysctls[] = {
 		.extra1 = (int *)&nfs_set_port_min,
 		.extra2 = (int *)&nfs_set_port_max,
 	},
+	{
+		.ctl_name = CTL_UNNUMBERED,
+		.procname = "idmap_cache_timeout",
+		.data = &nfs_idmap_cache_timeout,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = &proc_dointvec_jiffies,
+		.strategy = &sysctl_jiffies,
+	},
 #endif
 	{ .ctl_name = 0 }
 };
diff --git a/include/linux/nfs_idmap.h b/include/linux/nfs_idmap.h
index a0f1f25e0ead..102e56094296 100644
--- a/include/linux/nfs_idmap.h
+++ b/include/linux/nfs_idmap.h
@@ -71,6 +71,8 @@ int nfs_map_name_to_uid(struct nfs4_client *, const char *, size_t, __u32 *);
 int nfs_map_group_to_gid(struct nfs4_client *, const char *, size_t, __u32 *);
 int nfs_map_uid_to_name(struct nfs4_client *, __u32, char *);
 int nfs_map_gid_to_group(struct nfs4_client *, __u32, char *);
+
+extern unsigned int nfs_idmap_cache_timeout;
 #endif /* __KERNEL__ */
 
 #endif /* NFS_IDMAP_H */
-- 
cgit v1.2.3-71-gd317


From 9eed129bbde80cbd7ffeacaa1555ba1e0c9a0997 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Tue, 3 Jan 2006 09:56:00 +0100
Subject: SUNRPC: Update the spkm3 code to use the make_checksum interface

 Also update the tokenlen calculations to accomodate g_token_size().

 Signed-off-by: Andy Adamson <andros@citi.umich.edu>
 Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/gss_spkm3.h       |  2 +-
 net/sunrpc/auth_gss/gss_spkm3_seal.c   | 11 +++++------
 net/sunrpc/auth_gss/gss_spkm3_token.c  |  3 ++-
 net/sunrpc/auth_gss/gss_spkm3_unseal.c |  2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/gss_spkm3.h b/include/linux/sunrpc/gss_spkm3.h
index 0beb2cf00a84..336e218c2782 100644
--- a/include/linux/sunrpc/gss_spkm3.h
+++ b/include/linux/sunrpc/gss_spkm3.h
@@ -48,7 +48,7 @@ u32 spkm3_read_token(struct spkm3_ctx *ctx, struct xdr_netobj *read_token, struc
 #define CKSUMTYPE_RSA_MD5            0x0007
 
 s32 make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
-                   struct xdr_netobj *cksum);
+                   int body_offset, struct xdr_netobj *cksum);
 void asn1_bitstring_len(struct xdr_netobj *in, int *enclen, int *zerobits);
 int decode_asn1_bitstring(struct xdr_netobj *out, char *in, int enclen, 
                    int explen);
diff --git a/net/sunrpc/auth_gss/gss_spkm3_seal.c b/net/sunrpc/auth_gss/gss_spkm3_seal.c
index d1e12b25d6e2..86fbf7c3e39c 100644
--- a/net/sunrpc/auth_gss/gss_spkm3_seal.c
+++ b/net/sunrpc/auth_gss/gss_spkm3_seal.c
@@ -59,7 +59,7 @@ spkm3_make_token(struct spkm3_ctx *ctx,
 	char			tokhdrbuf[25];
 	struct xdr_netobj	md5cksum = {.len = 0, .data = NULL};
 	struct xdr_netobj	mic_hdr = {.len = 0, .data = tokhdrbuf};
-	int			tmsglen, tokenlen = 0;
+	int			tokenlen = 0;
 	unsigned char		*ptr;
 	s32			now;
 	int			ctxelen = 0, ctxzbit = 0;
@@ -92,24 +92,23 @@ spkm3_make_token(struct spkm3_ctx *ctx,
 	}
 
 	if (toktype == SPKM_MIC_TOK) {
-		tmsglen = 0;
 		/* Calculate checksum over the mic-header */
 		asn1_bitstring_len(&ctx->ctx_id, &ctxelen, &ctxzbit);
 		spkm3_mic_header(&mic_hdr.data, &mic_hdr.len, ctx->ctx_id.data,
 		                         ctxelen, ctxzbit);
 
 		if (make_checksum(checksum_type, mic_hdr.data, mic_hdr.len, 
-		                             text, &md5cksum))
+		                             text, 0, &md5cksum))
 			goto out_err;
 
 		asn1_bitstring_len(&md5cksum, &md5elen, &md5zbit);
-		tokenlen = 10 + ctxelen + 1 + 2 + md5elen + 1;
+		tokenlen = 10 + ctxelen + 1 + md5elen + 1;
 
 		/* Create token header using generic routines */
-		token->len = g_token_size(&ctx->mech_used, tokenlen + tmsglen);
+		token->len = g_token_size(&ctx->mech_used, tokenlen);
 
 		ptr = token->data;
-		g_make_token_header(&ctx->mech_used, tokenlen + tmsglen, &ptr);
+		g_make_token_header(&ctx->mech_used, tokenlen, &ptr);
 
 		spkm3_make_mic_token(&ptr, tokenlen, &mic_hdr, &md5cksum, md5elen, md5zbit);
 	} else if (toktype == SPKM_WRAP_TOK) { /* Not Supported */
diff --git a/net/sunrpc/auth_gss/gss_spkm3_token.c b/net/sunrpc/auth_gss/gss_spkm3_token.c
index 1f824578d773..af0d7ce74686 100644
--- a/net/sunrpc/auth_gss/gss_spkm3_token.c
+++ b/net/sunrpc/auth_gss/gss_spkm3_token.c
@@ -182,6 +182,7 @@ spkm3_mic_header(unsigned char **hdrbuf, unsigned int *hdrlen, unsigned char *ct
  * *tokp points to the beginning of the SPKM_MIC token  described 
  * in rfc 2025, section 3.2.1: 
  *
+ * toklen is the inner token length
  */
 void
 spkm3_make_mic_token(unsigned char **tokp, int toklen, struct xdr_netobj *mic_hdr, struct xdr_netobj *md5cksum, int md5elen, int md5zbit)
@@ -189,7 +190,7 @@ spkm3_make_mic_token(unsigned char **tokp, int toklen, struct xdr_netobj *mic_hd
 	unsigned char *ict = *tokp;
 
 	*(u8 *)ict++ = 0xa4;
-	*(u8 *)ict++ = toklen - 2; 
+	*(u8 *)ict++ = toklen;
 	memcpy(ict, mic_hdr->data, mic_hdr->len);
 	ict += mic_hdr->len;
 
diff --git a/net/sunrpc/auth_gss/gss_spkm3_unseal.c b/net/sunrpc/auth_gss/gss_spkm3_unseal.c
index 241d5b30dfcb..96851b0ba1ba 100644
--- a/net/sunrpc/auth_gss/gss_spkm3_unseal.c
+++ b/net/sunrpc/auth_gss/gss_spkm3_unseal.c
@@ -95,7 +95,7 @@ spkm3_read_token(struct spkm3_ctx *ctx,
 		ret = GSS_S_DEFECTIVE_TOKEN;
 		code = make_checksum(CKSUMTYPE_RSA_MD5, ptr + 2, 
 					mic_hdrlen + 2, 
-		                        message_buffer, &md5cksum);
+		                        message_buffer, 0, &md5cksum);
 
 		if (code)
 			goto out;
-- 
cgit v1.2.3-71-gd317


From 367cb704212cd0c9273ba2b1e62523139210563b Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@mars.ravnborg.org>
Date: Fri, 6 Jan 2006 21:17:50 +0100
Subject: kbuild: un-stringnify KBUILD_MODNAME

Now when kbuild passes KBUILD_MODNAME with "" do not __stringify it when
used. Remove __stringnify for all users.
This also fixes the output of:

$ ls -l /sys/module/
drwxr-xr-x 4 root root 0 2006-01-05 14:24 pcmcia
drwxr-xr-x 4 root root 0 2006-01-05 14:24 pcmcia_core
drwxr-xr-x 3 root root 0 2006-01-05 14:24 "processor"
drwxr-xr-x 3 root root 0 2006-01-05 14:24 "psmouse"

The quoting of the module names will be gone again.
Thanks to GregKH + Kay Sievers for reproting this.

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
---
 drivers/media/dvb/cinergyT2/cinergyT2.c | 2 +-
 drivers/media/dvb/ttpci/budget.h        | 2 +-
 drivers/media/video/tda9840.c           | 2 +-
 drivers/media/video/tea6415c.c          | 2 +-
 drivers/media/video/tea6420.c           | 2 +-
 include/linux/moduleparam.h             | 2 +-
 include/media/saa7146.h                 | 6 +++---
 net/ipv4/netfilter/ip_nat_ftp.c         | 2 +-
 net/ipv4/netfilter/ip_nat_irc.c         | 2 +-
 security/capability.c                   | 6 ++----
 10 files changed, 13 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/dvb/cinergyT2/cinergyT2.c b/drivers/media/dvb/cinergyT2/cinergyT2.c
index b996fb59b7e4..1d69bf031fb9 100644
--- a/drivers/media/dvb/cinergyT2/cinergyT2.c
+++ b/drivers/media/dvb/cinergyT2/cinergyT2.c
@@ -60,7 +60,7 @@ MODULE_PARM_DESC(debug, "Turn on/off debugging (default:off).");
 #define dprintk(level, args...)						\
 do {									\
 	if ((debug & level)) {						\
-		printk("%s: %s(): ", __stringify(KBUILD_MODNAME),	\
+		printk("%s: %s(): ", KBUILD_MODNAME,			\
 		       __FUNCTION__);					\
 		printk(args); }						\
 } while (0)
diff --git a/drivers/media/dvb/ttpci/budget.h b/drivers/media/dvb/ttpci/budget.h
index fdaa3318ad3a..c8d48cfba277 100644
--- a/drivers/media/dvb/ttpci/budget.h
+++ b/drivers/media/dvb/ttpci/budget.h
@@ -19,7 +19,7 @@ extern int budget_debug;
 #endif
 
 #define dprintk(level,args...) \
-	    do { if ((budget_debug & level)) { printk("%s: %s(): ",__stringify(KBUILD_MODNAME), __FUNCTION__); printk(args); } } while (0)
+	    do { if ((budget_debug & level)) { printk("%s: %s(): ", KBUILD_MODNAME, __FUNCTION__); printk(args); } } while (0)
 
 struct budget_info {
 	char *name;
diff --git a/drivers/media/video/tda9840.c b/drivers/media/video/tda9840.c
index 1794686612c6..0cb5c7e9a884 100644
--- a/drivers/media/video/tda9840.c
+++ b/drivers/media/video/tda9840.c
@@ -34,7 +34,7 @@ static int debug = 0;		/* insmod parameter */
 module_param(debug, int, 0644);
 MODULE_PARM_DESC(debug, "Turn on/off device debugging (default:off).");
 #define dprintk(args...) \
-            do { if (debug) { printk("%s: %s()[%d]: ",__stringify(KBUILD_MODNAME), __FUNCTION__, __LINE__); printk(args); } } while (0)
+            do { if (debug) { printk("%s: %s()[%d]: ", KBUILD_MODNAME, __FUNCTION__, __LINE__); printk(args); } } while (0)
 
 #define	SWITCH		0x00
 #define	LEVEL_ADJUST	0x02
diff --git a/drivers/media/video/tea6415c.c b/drivers/media/video/tea6415c.c
index ee3688348b66..09149dad1b84 100644
--- a/drivers/media/video/tea6415c.c
+++ b/drivers/media/video/tea6415c.c
@@ -36,7 +36,7 @@ static int debug = 0;		/* insmod parameter */
 module_param(debug, int, 0644);
 MODULE_PARM_DESC(debug, "Turn on/off device debugging (default:off).");
 #define dprintk(args...) \
-            do { if (debug) { printk("%s: %s()[%d]: ",__stringify(KBUILD_MODNAME), __FUNCTION__, __LINE__); printk(args); } } while (0)
+            do { if (debug) { printk("%s: %s()[%d]: ", KBUILD_MODNAME, __FUNCTION__, __LINE__); printk(args); } } while (0)
 
 #define TEA6415C_NUM_INPUTS	8
 #define TEA6415C_NUM_OUTPUTS	6
diff --git a/drivers/media/video/tea6420.c b/drivers/media/video/tea6420.c
index 17975c19da5e..e908f917c8d2 100644
--- a/drivers/media/video/tea6420.c
+++ b/drivers/media/video/tea6420.c
@@ -36,7 +36,7 @@ static int debug = 0;		/* insmod parameter */
 module_param(debug, int, 0644);
 MODULE_PARM_DESC(debug, "Turn on/off device debugging (default:off).");
 #define dprintk(args...) \
-            do { if (debug) { printk("%s: %s()[%d]: ",__stringify(KBUILD_MODNAME), __FUNCTION__, __LINE__); printk(args); } } while (0)
+            do { if (debug) { printk("%s: %s()[%d]: ", KBUILD_MODNAME, __FUNCTION__, __LINE__); printk(args); } } while (0)
 
 /* addresses to scan, found only at 0x4c and/or 0x4d (7-Bit) */
 static unsigned short normal_i2c[] = { I2C_TEA6420_1, I2C_TEA6420_2, I2C_CLIENT_END };
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 368ec8e45bd0..b5c98c43779e 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -10,7 +10,7 @@
 #ifdef MODULE
 #define MODULE_PARAM_PREFIX /* empty */
 #else
-#define MODULE_PARAM_PREFIX __stringify(KBUILD_MODNAME) "."
+#define MODULE_PARAM_PREFIX KBUILD_MODNAME "."
 #endif
 
 #ifdef MODULE
diff --git a/include/media/saa7146.h b/include/media/saa7146.h
index e5be2b9b846b..2bc634fcb7bb 100644
--- a/include/media/saa7146.h
+++ b/include/media/saa7146.h
@@ -21,14 +21,14 @@
 
 extern unsigned int saa7146_debug;
 
-//#define DEBUG_PROLOG printk("(0x%08x)(0x%08x) %s: %s(): ",(dev==0?-1:(dev->mem==0?-1:saa7146_read(dev,RPS_ADDR0))),(dev==0?-1:(dev->mem==0?-1:saa7146_read(dev,IER))),__stringify(KBUILD_MODNAME),__FUNCTION__)
+//#define DEBUG_PROLOG printk("(0x%08x)(0x%08x) %s: %s(): ",(dev==0?-1:(dev->mem==0?-1:saa7146_read(dev,RPS_ADDR0))),(dev==0?-1:(dev->mem==0?-1:saa7146_read(dev,IER))),KBUILD_MODNAME,__FUNCTION__)
 
 #ifndef DEBUG_VARIABLE
 	#define DEBUG_VARIABLE saa7146_debug
 #endif
 
-#define DEBUG_PROLOG printk("%s: %s(): ",__stringify(KBUILD_MODNAME),__FUNCTION__)
-#define INFO(x) { printk("%s: ",__stringify(KBUILD_MODNAME)); printk x; }
+#define DEBUG_PROLOG printk("%s: %s(): ",KBUILD_MODNAME,__FUNCTION__)
+#define INFO(x) { printk("%s: ",KBUILD_MODNAME); printk x; }
 
 #define ERR(x) { DEBUG_PROLOG; printk x; }
 
diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c
index d83757a70d9f..b8daab3c64af 100644
--- a/net/ipv4/netfilter/ip_nat_ftp.c
+++ b/net/ipv4/netfilter/ip_nat_ftp.c
@@ -171,7 +171,7 @@ static int __init init(void)
 /* Prior to 2.6.11, we had a ports param.  No longer, but don't break users. */
 static int warn_set(const char *val, struct kernel_param *kp)
 {
-	printk(KERN_INFO __stringify(KBUILD_MODNAME)
+	printk(KERN_INFO KBUILD_MODNAME
 	       ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
 	return 0;
 }
diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c
index de31942babe3..461c833eaca1 100644
--- a/net/ipv4/netfilter/ip_nat_irc.c
+++ b/net/ipv4/netfilter/ip_nat_irc.c
@@ -113,7 +113,7 @@ static int __init init(void)
 /* Prior to 2.6.11, we had a ports param.  No longer, but don't break users. */
 static int warn_set(const char *val, struct kernel_param *kp)
 {
-	printk(KERN_INFO __stringify(KBUILD_MODNAME)
+	printk(KERN_INFO KBUILD_MODNAME
 	       ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
 	return 0;
 }
diff --git a/security/capability.c b/security/capability.c
index ec18d6075625..f9b35cc0b248 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -49,8 +49,6 @@ static struct security_operations capability_ops = {
 	.vm_enough_memory =             cap_vm_enough_memory,
 };
 
-#define MY_NAME __stringify(KBUILD_MODNAME)
-
 /* flag to keep track of how we were registered */
 static int secondary;
 
@@ -67,7 +65,7 @@ static int __init capability_init (void)
 	/* register ourselves with the security framework */
 	if (register_security (&capability_ops)) {
 		/* try registering with primary module */
-		if (mod_reg_security (MY_NAME, &capability_ops)) {
+		if (mod_reg_security (KBUILD_MODNAME, &capability_ops)) {
 			printk (KERN_INFO "Failure registering capabilities "
 				"with primary security module.\n");
 			return -EINVAL;
@@ -85,7 +83,7 @@ static void __exit capability_exit (void)
 		return;
 	/* remove ourselves from the security framework */
 	if (secondary) {
-		if (mod_unreg_security (MY_NAME, &capability_ops))
+		if (mod_unreg_security (KBUILD_MODNAME, &capability_ops))
 			printk (KERN_INFO "Failure unregistering capabilities "
 				"with primary module.\n");
 		return;
-- 
cgit v1.2.3-71-gd317


From 4bad4dc919573dbe9a5b41dd9edff279e99822d7 Mon Sep 17 00:00:00 2001
From: Kris Katterjohn <kjak@ispwest.com>
Date: Fri, 6 Jan 2006 13:08:20 -0800
Subject: [NET]: Change sk_run_filter()'s return type in net/core/filter.c

It should return an unsigned value, and fix sk_filter() as well.

Signed-off-by: Kris Katterjohn <kjak@ispwest.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h | 2 +-
 include/net/sock.h     | 4 ++--
 net/core/filter.c      | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 3ba843c46382..c6cb8f095088 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -143,7 +143,7 @@ static inline unsigned int sk_filter_len(struct sk_filter *fp)
 struct sk_buff;
 struct sock;
 
-extern int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen);
+extern unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen);
 extern int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
 extern int sk_chk_filter(struct sock_filter *filter, int flen);
 #endif /* __KERNEL__ */
diff --git a/include/net/sock.h b/include/net/sock.h
index 6961700ff3a0..1806e5b61419 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -856,8 +856,8 @@ static inline int sk_filter(struct sock *sk, struct sk_buff *skb, int needlock)
 		
 		filter = sk->sk_filter;
 		if (filter) {
-			int pkt_len = sk_run_filter(skb, filter->insns,
-						    filter->len);
+			unsigned int pkt_len = sk_run_filter(skb, filter->insns,
+							     filter->len);
 			if (!pkt_len)
 				err = -EPERM;
 			else
diff --git a/net/core/filter.c b/net/core/filter.c
index 8964d3445588..9eb9d0017a01 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -75,7 +75,7 @@ static inline void *load_pointer(struct sk_buff *skb, int k,
  * len is the number of filter blocks in the array.
  */
  
-int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
+unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
 {
 	struct sock_filter *fentry;	/* We walk down these */
 	void *ptr;
@@ -241,9 +241,9 @@ load_b:
 			A = X;
 			continue;
 		case BPF_RET|BPF_K:
-			return ((unsigned int)fentry->k);
+			return fentry->k;
 		case BPF_RET|BPF_A:
-			return ((unsigned int)A);
+			return A;
 		case BPF_ST:
 			mem[fentry->k] = A;
 			continue;
-- 
cgit v1.2.3-71-gd317


From 76ab608d86cf1ef5c5c46819b5733eb9f9f964f8 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 6 Jan 2006 13:24:29 -0800
Subject: [NET]: Endian-annotate struct iphdr

And fix trivial warnings that emerged.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ip.h         | 10 +++++-----
 net/ipv4/ip_fragment.c     |  2 +-
 net/ipv4/ip_output.c       |  4 ++--
 net/ipv4/ipvs/ip_vs_xmit.c |  2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ip.h b/include/linux/ip.h
index 9e2eb9a602eb..4b55cf1df732 100644
--- a/include/linux/ip.h
+++ b/include/linux/ip.h
@@ -90,14 +90,14 @@ struct iphdr {
 #error	"Please fix <asm/byteorder.h>"
 #endif
 	__u8	tos;
-	__u16	tot_len;
-	__u16	id;
-	__u16	frag_off;
+	__be16	tot_len;
+	__be16	id;
+	__be16	frag_off;
 	__u8	ttl;
 	__u8	protocol;
 	__u16	check;
-	__u32	saddr;
-	__u32	daddr;
+	__be32	saddr;
+	__be32	daddr;
 	/*The options start here. */
 };
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index ce2b70ce4018..2a8adda15e11 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -383,7 +383,7 @@ out_nomem:
  */
 static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
 {
-	__u16 id = iph->id;
+	__be16 id = iph->id;
 	__u32 saddr = iph->saddr;
 	__u32 daddr = iph->daddr;
 	__u8 protocol = iph->protocol;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 71da31818cfc..8b1c9bd0091e 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -418,7 +418,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 	struct sk_buff *skb2;
 	unsigned int mtu, hlen, left, len, ll_rs;
 	int offset;
-	int not_last_frag;
+	__be16 not_last_frag;
 	struct rtable *rt = (struct rtable*)skb->dst;
 	int err = 0;
 
@@ -1180,7 +1180,7 @@ int ip_push_pending_frames(struct sock *sk)
 	struct ip_options *opt = NULL;
 	struct rtable *rt = inet->cork.rt;
 	struct iphdr *iph;
-	int df = 0;
+	__be16 df = 0;
 	__u8 ttl;
 	int err = 0;
 
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index 3b87482049cf..52c12e9edbbc 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -322,7 +322,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct net_device *tdev;		/* Device to other host */
 	struct iphdr  *old_iph = skb->nh.iph;
 	u8     tos = old_iph->tos;
-	u16    df = old_iph->frag_off;
+	__be16 df = old_iph->frag_off;
 	struct iphdr  *iph;			/* Our new IP header */
 	int    max_headroom;			/* The extra header space needed */
 	int    mtu;
-- 
cgit v1.2.3-71-gd317


From a2167dc62e9142b9a4bfb20f7e001c0f0a26fd8c Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 6 Jan 2006 13:24:54 -0800
Subject: [NET]: Endian-annotate in_aton()

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inet.h | 2 +-
 net/core/utils.c     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/inet.h b/include/linux/inet.h
index 3b5e9fdff872..6c5587af118d 100644
--- a/include/linux/inet.h
+++ b/include/linux/inet.h
@@ -45,6 +45,6 @@
 #ifdef __KERNEL__
 #include <linux/types.h>
 
-extern __u32 in_aton(const char *str);
+extern __be32 in_aton(const char *str);
 #endif
 #endif	/* _LINUX_INET_H */
diff --git a/net/core/utils.c b/net/core/utils.c
index 587eb7787deb..ac1d1fcf8673 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -162,7 +162,7 @@ EXPORT_SYMBOL(net_srandom);
  * is otherwise not dependent on the TCP/IP stack.
  */
 
-__u32 in_aton(const char *str)
+__be32 in_aton(const char *str)
 {
 	unsigned long l;
 	unsigned int val;
-- 
cgit v1.2.3-71-gd317


From a62c80e559809e6c7851ec04d30575e85ad6f6ed Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Sat, 7 Jan 2006 13:52:45 +0000
Subject: [ARM] Move AMBA include files to include/linux/amba/

Since the ARM AMBA bus is used on MIPS as well as ARM, we need
to make the bus available for other architectures to use.  Move
the AMBA include files from include/asm-arm/hardware/ to
include/linux/amba/

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/common/amba.c                        |   2 +-
 arch/arm/mach-aaec2000/core.c                 |   2 +-
 arch/arm/mach-aaec2000/core.h                 |   2 +-
 arch/arm/mach-integrator/core.c               |   2 +-
 arch/arm/mach-integrator/impd1.c              |   4 +-
 arch/arm/mach-integrator/integrator_ap.c      |   4 +-
 arch/arm/mach-integrator/integrator_cp.c      |   6 +-
 arch/arm/mach-integrator/time.c               |   2 +-
 arch/arm/mach-realview/core.c                 |   4 +-
 arch/arm/mach-realview/core.h                 |   3 +-
 arch/arm/mach-realview/realview_eb.c          |   2 +-
 arch/arm/mach-versatile/core.c                |   4 +-
 arch/arm/mach-versatile/core.h                |   2 +-
 arch/arm/mach-versatile/versatile_ab.c        |   2 +-
 arch/arm/mach-versatile/versatile_pb.c        |   2 +-
 drivers/input/serio/ambakmi.c                 |   4 +-
 drivers/mmc/mmci.c                            |   2 +-
 drivers/serial/amba-pl010.c                   |   4 +-
 drivers/serial/amba-pl011.c                   |   4 +-
 drivers/video/amba-clcd.c                     |   5 +-
 include/asm-arm/arch-integrator/debug-macro.S |   2 +-
 include/asm-arm/arch-realview/debug-macro.S   |   2 +-
 include/asm-arm/arch-versatile/debug-macro.S  |   2 +-
 include/asm-arm/hardware/amba.h               |  55 ------
 include/asm-arm/hardware/amba_clcd.h          | 271 --------------------------
 include/asm-arm/hardware/amba_kmi.h           |  92 ---------
 include/asm-arm/hardware/amba_serial.h        | 161 ---------------
 include/linux/amba/bus.h                      |  55 ++++++
 include/linux/amba/clcd.h                     | 271 ++++++++++++++++++++++++++
 include/linux/amba/kmi.h                      |  92 +++++++++
 include/linux/amba/serial.h                   | 161 +++++++++++++++
 sound/arm/aaci.c                              |   2 +-
 32 files changed, 614 insertions(+), 614 deletions(-)
 delete mode 100644 include/asm-arm/hardware/amba.h
 delete mode 100644 include/asm-arm/hardware/amba_clcd.h
 delete mode 100644 include/asm-arm/hardware/amba_kmi.h
 delete mode 100644 include/asm-arm/hardware/amba_serial.h
 create mode 100644 include/linux/amba/bus.h
 create mode 100644 include/linux/amba/clcd.h
 create mode 100644 include/linux/amba/kmi.h
 create mode 100644 include/linux/amba/serial.h

(limited to 'include/linux')

diff --git a/arch/arm/common/amba.c b/arch/arm/common/amba.c
index e1013112c354..2bb0ce81bb69 100644
--- a/arch/arm/common/amba.c
+++ b/arch/arm/common/amba.c
@@ -12,10 +12,10 @@
 #include <linux/device.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/amba/bus.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
-#include <asm/hardware/amba.h>
 #include <asm/sizes.h>
 
 #define to_amba_device(d)	container_of(d, struct amba_device, dev)
diff --git a/arch/arm/mach-aaec2000/core.c b/arch/arm/mach-aaec2000/core.c
index 4e706d9ad368..dce4815cf53c 100644
--- a/arch/arm/mach-aaec2000/core.c
+++ b/arch/arm/mach-aaec2000/core.c
@@ -20,11 +20,11 @@
 #include <linux/interrupt.h>
 #include <linux/timex.h>
 #include <linux/signal.h>
+#include <linux/amba/bus.h>
 
 #include <asm/hardware.h>
 #include <asm/irq.h>
 #include <asm/sizes.h>
-#include <asm/hardware/amba.h>
 
 #include <asm/mach/flash.h>
 #include <asm/mach/irq.h>
diff --git a/arch/arm/mach-aaec2000/core.h b/arch/arm/mach-aaec2000/core.h
index daefc0ea14a1..b6029a95f19c 100644
--- a/arch/arm/mach-aaec2000/core.h
+++ b/arch/arm/mach-aaec2000/core.h
@@ -9,7 +9,7 @@
  *
  */
 
-#include <asm/hardware/amba_clcd.h>
+#include <linux/amba/clcd.h>
 
 struct sys_timer;
 
diff --git a/arch/arm/mach-integrator/core.c b/arch/arm/mach-integrator/core.c
index dacbf504dae2..20071a2767cc 100644
--- a/arch/arm/mach-integrator/core.c
+++ b/arch/arm/mach-integrator/core.c
@@ -15,11 +15,11 @@
 #include <linux/interrupt.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
+#include <linux/amba/bus.h>
 
 #include <asm/hardware.h>
 #include <asm/irq.h>
 #include <asm/io.h>
-#include <asm/hardware/amba.h>
 #include <asm/hardware/arm_timer.h>
 #include <asm/arch/cm.h>
 #include <asm/system.h>
diff --git a/arch/arm/mach-integrator/impd1.c b/arch/arm/mach-integrator/impd1.c
index a4bafee77a06..a85d471c5bfa 100644
--- a/arch/arm/mach-integrator/impd1.c
+++ b/arch/arm/mach-integrator/impd1.c
@@ -18,11 +18,11 @@
 #include <linux/device.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
+#include <linux/amba/bus.h>
+#include <linux/amba/clcd.h>
 
 #include <asm/io.h>
 #include <asm/hardware/icst525.h>
-#include <asm/hardware/amba.h>
-#include <asm/hardware/amba_clcd.h>
 #include <asm/arch/lm.h>
 #include <asm/arch/impd1.h>
 #include <asm/sizes.h>
diff --git a/arch/arm/mach-integrator/integrator_ap.c b/arch/arm/mach-integrator/integrator_ap.c
index 4c0f7c65facf..3afedeb56a6e 100644
--- a/arch/arm/mach-integrator/integrator_ap.c
+++ b/arch/arm/mach-integrator/integrator_ap.c
@@ -25,6 +25,8 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/sysdev.h>
+#include <linux/amba/bus.h>
+#include <linux/amba/kmi.h>
 
 #include <asm/hardware.h>
 #include <asm/io.h>
@@ -32,8 +34,6 @@
 #include <asm/setup.h>
 #include <asm/param.h>		/* HZ */
 #include <asm/mach-types.h>
-#include <asm/hardware/amba.h>
-#include <asm/hardware/amba_kmi.h>
 
 #include <asm/arch/lm.h>
 
diff --git a/arch/arm/mach-integrator/integrator_cp.c b/arch/arm/mach-integrator/integrator_cp.c
index 93f7ccb22c27..16cf2482a3e9 100644
--- a/arch/arm/mach-integrator/integrator_cp.c
+++ b/arch/arm/mach-integrator/integrator_cp.c
@@ -16,15 +16,15 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/sysdev.h>
+#include <linux/amba/bus.h>
+#include <linux/amba/kmi.h>
+#include <linux/amba/clcd.h>
 
 #include <asm/hardware.h>
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/setup.h>
 #include <asm/mach-types.h>
-#include <asm/hardware/amba.h>
-#include <asm/hardware/amba_kmi.h>
-#include <asm/hardware/amba_clcd.h>
 #include <asm/hardware/icst525.h>
 
 #include <asm/arch/cm.h>
diff --git a/arch/arm/mach-integrator/time.c b/arch/arm/mach-integrator/time.c
index 1a844ca139e0..9f46aaef8968 100644
--- a/arch/arm/mach-integrator/time.c
+++ b/arch/arm/mach-integrator/time.c
@@ -14,8 +14,8 @@
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/device.h>
+#include <linux/amba/bus.h>
 
-#include <asm/hardware/amba.h>
 #include <asm/hardware.h>
 #include <asm/io.h>
 #include <asm/uaccess.h>
diff --git a/arch/arm/mach-realview/core.c b/arch/arm/mach-realview/core.c
index af6580f1ceb8..4a222f59f2cf 100644
--- a/arch/arm/mach-realview/core.c
+++ b/arch/arm/mach-realview/core.c
@@ -24,14 +24,14 @@
 #include <linux/dma-mapping.h>
 #include <linux/sysdev.h>
 #include <linux/interrupt.h>
+#include <linux/amba/bus.h>
+#include <linux/amba/clcd.h>
 
 #include <asm/system.h>
 #include <asm/hardware.h>
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/leds.h>
-#include <asm/hardware/amba.h>
-#include <asm/hardware/amba_clcd.h>
 #include <asm/hardware/arm_timer.h>
 #include <asm/hardware/icst307.h>
 
diff --git a/arch/arm/mach-realview/core.h b/arch/arm/mach-realview/core.h
index c06e6041df41..93e86d9f439c 100644
--- a/arch/arm/mach-realview/core.h
+++ b/arch/arm/mach-realview/core.h
@@ -22,7 +22,8 @@
 #ifndef __ASM_ARCH_REALVIEW_H
 #define __ASM_ARCH_REALVIEW_H
 
-#include <asm/hardware/amba.h>
+#include <linux/amba/bus.h>
+
 #include <asm/leds.h>
 #include <asm/io.h>
 
diff --git a/arch/arm/mach-realview/realview_eb.c b/arch/arm/mach-realview/realview_eb.c
index 7dc32503fdf2..112f7592aca9 100644
--- a/arch/arm/mach-realview/realview_eb.c
+++ b/arch/arm/mach-realview/realview_eb.c
@@ -23,6 +23,7 @@
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/sysdev.h>
+#include <linux/amba/bus.h>
 
 #include <asm/hardware.h>
 #include <asm/io.h>
@@ -30,7 +31,6 @@
 #include <asm/leds.h>
 #include <asm/mach-types.h>
 #include <asm/hardware/gic.h>
-#include <asm/hardware/amba.h>
 #include <asm/hardware/icst307.h>
 
 #include <asm/mach/arch.h>
diff --git a/arch/arm/mach-versatile/core.c b/arch/arm/mach-versatile/core.c
index a1ca46630dda..90023745b23a 100644
--- a/arch/arm/mach-versatile/core.c
+++ b/arch/arm/mach-versatile/core.c
@@ -25,14 +25,14 @@
 #include <linux/platform_device.h>
 #include <linux/sysdev.h>
 #include <linux/interrupt.h>
+#include <linux/amba/bus.h>
+#include <linux/amba/clcd.h>
 
 #include <asm/system.h>
 #include <asm/hardware.h>
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/leds.h>
-#include <asm/hardware/amba.h>
-#include <asm/hardware/amba_clcd.h>
 #include <asm/hardware/arm_timer.h>
 #include <asm/hardware/icst307.h>
 
diff --git a/arch/arm/mach-versatile/core.h b/arch/arm/mach-versatile/core.h
index 588c20669d5d..afcaa858eb1f 100644
--- a/arch/arm/mach-versatile/core.h
+++ b/arch/arm/mach-versatile/core.h
@@ -22,7 +22,7 @@
 #ifndef __ASM_ARCH_VERSATILE_H
 #define __ASM_ARCH_VERSATILE_H
 
-#include <asm/hardware/amba.h>
+#include <linux/amba/bus.h>
 
 extern void __init versatile_init(void);
 extern void __init versatile_init_irq(void);
diff --git a/arch/arm/mach-versatile/versatile_ab.c b/arch/arm/mach-versatile/versatile_ab.c
index 8b0b3bef24ae..e74c8a2fbb95 100644
--- a/arch/arm/mach-versatile/versatile_ab.c
+++ b/arch/arm/mach-versatile/versatile_ab.c
@@ -23,12 +23,12 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/sysdev.h>
+#include <linux/amba/bus.h>
 
 #include <asm/hardware.h>
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/mach-types.h>
-#include <asm/hardware/amba.h>
 
 #include <asm/mach/arch.h>
 
diff --git a/arch/arm/mach-versatile/versatile_pb.c b/arch/arm/mach-versatile/versatile_pb.c
index 7c3078c38916..22d5ca07f75d 100644
--- a/arch/arm/mach-versatile/versatile_pb.c
+++ b/arch/arm/mach-versatile/versatile_pb.c
@@ -23,12 +23,12 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/sysdev.h>
+#include <linux/amba/bus.h>
 
 #include <asm/hardware.h>
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/mach-types.h>
-#include <asm/hardware/amba.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/mmc.h>
diff --git a/drivers/input/serio/ambakmi.c b/drivers/input/serio/ambakmi.c
index d847ed51cfb1..cbab5d26377b 100644
--- a/drivers/input/serio/ambakmi.c
+++ b/drivers/input/serio/ambakmi.c
@@ -19,11 +19,11 @@
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/err.h>
+#include <linux/amba/bus.h>
+#include <linux/amba/kmi.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
-#include <asm/hardware/amba.h>
-#include <asm/hardware/amba_kmi.h>
 #include <asm/hardware/clock.h>
 
 #define KMI_BASE	(kmi->base)
diff --git a/drivers/mmc/mmci.c b/drivers/mmc/mmci.c
index 31b0b6d612bf..57375bc12372 100644
--- a/drivers/mmc/mmci.c
+++ b/drivers/mmc/mmci.c
@@ -19,12 +19,12 @@
 #include <linux/highmem.h>
 #include <linux/mmc/host.h>
 #include <linux/mmc/protocol.h>
+#include <linux/amba/bus.h>
 
 #include <asm/div64.h>
 #include <asm/io.h>
 #include <asm/scatterlist.h>
 #include <asm/sizes.h>
-#include <asm/hardware/amba.h>
 #include <asm/hardware/clock.h>
 #include <asm/mach/mmc.h>
 
diff --git a/drivers/serial/amba-pl010.c b/drivers/serial/amba-pl010.c
index ddd0307fece2..48f6e872314b 100644
--- a/drivers/serial/amba-pl010.c
+++ b/drivers/serial/amba-pl010.c
@@ -47,12 +47,12 @@
 #include <linux/tty_flip.h>
 #include <linux/serial_core.h>
 #include <linux/serial.h>
+#include <linux/amba/bus.h>
+#include <linux/amba/serial.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/hardware.h>
-#include <asm/hardware/amba.h>
-#include <asm/hardware/amba_serial.h>
 
 #define UART_NR		2
 
diff --git a/drivers/serial/amba-pl011.c b/drivers/serial/amba-pl011.c
index 531b0e4f25e5..4ae4dff59795 100644
--- a/drivers/serial/amba-pl011.c
+++ b/drivers/serial/amba-pl011.c
@@ -47,12 +47,12 @@
 #include <linux/tty_flip.h>
 #include <linux/serial_core.h>
 #include <linux/serial.h>
+#include <linux/amba/bus.h>
+#include <linux/amba/serial.h>
 
 #include <asm/io.h>
 #include <asm/sizes.h>
-#include <asm/hardware/amba.h>
 #include <asm/hardware/clock.h>
-#include <asm/hardware/amba_serial.h>
 
 #define UART_NR			14
 
diff --git a/drivers/video/amba-clcd.c b/drivers/video/amba-clcd.c
index 69421c86252c..3358a1429651 100644
--- a/drivers/video/amba-clcd.c
+++ b/drivers/video/amba-clcd.c
@@ -21,13 +21,12 @@
 #include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/list.h>
+#include <linux/amba/bus.h>
+#include <linux/amba/clcd.h>
 
 #include <asm/sizes.h>
-#include <asm/hardware/amba.h>
 #include <asm/hardware/clock.h>
 
-#include <asm/hardware/amba_clcd.h>
-
 #define to_clcd(info)	container_of(info, struct clcd_fb, fb)
 
 /* This is limited to 16 characters when displayed by X startup */
diff --git a/include/asm-arm/arch-integrator/debug-macro.S b/include/asm-arm/arch-integrator/debug-macro.S
index 484a1aa47098..031d30941791 100644
--- a/include/asm-arm/arch-integrator/debug-macro.S
+++ b/include/asm-arm/arch-integrator/debug-macro.S
@@ -11,7 +11,7 @@
  *
 */
 
-#include <asm/hardware/amba_serial.h>
+#include <linux/amba/serial.h>
 
 		.macro	addruart,rx
 		mrc	p15, 0, \rx, c1, c0
diff --git a/include/asm-arm/arch-realview/debug-macro.S b/include/asm-arm/arch-realview/debug-macro.S
index ed28bd012236..017ad996848d 100644
--- a/include/asm-arm/arch-realview/debug-macro.S
+++ b/include/asm-arm/arch-realview/debug-macro.S
@@ -11,7 +11,7 @@
  *
 */
 
-#include <asm/hardware/amba_serial.h>
+#include <linux/amba/serial.h>
 
 		.macro	addruart,rx
 		mrc	p15, 0, \rx, c1, c0
diff --git a/include/asm-arm/arch-versatile/debug-macro.S b/include/asm-arm/arch-versatile/debug-macro.S
index 89e38ac1444e..ef6167116dbb 100644
--- a/include/asm-arm/arch-versatile/debug-macro.S
+++ b/include/asm-arm/arch-versatile/debug-macro.S
@@ -11,7 +11,7 @@
  *
 */
 
-#include <asm/hardware/amba_serial.h>
+#include <linux/amba/serial.h>
 
 		.macro	addruart,rx
 		mrc	p15, 0, \rx, c1, c0
diff --git a/include/asm-arm/hardware/amba.h b/include/asm-arm/hardware/amba.h
deleted file mode 100644
index 51e6e54b2aa1..000000000000
--- a/include/asm-arm/hardware/amba.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- *  linux/include/asm-arm/hardware/amba.h
- *
- *  Copyright (C) 2003 Deep Blue Solutions Ltd, All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef ASMARM_AMBA_H
-#define ASMARM_AMBA_H
-
-#define AMBA_NR_IRQS	2
-
-struct amba_device {
-	struct device		dev;
-	struct resource		res;
-	u64			dma_mask;
-	unsigned int		periphid;
-	unsigned int		irq[AMBA_NR_IRQS];
-};
-
-struct amba_id {
-	unsigned int		id;
-	unsigned int		mask;
-	void			*data;
-};
-
-struct amba_driver {
-	struct device_driver	drv;
-	int			(*probe)(struct amba_device *, void *);
-	int			(*remove)(struct amba_device *);
-	void			(*shutdown)(struct amba_device *);
-	int			(*suspend)(struct amba_device *, pm_message_t);
-	int			(*resume)(struct amba_device *);
-	struct amba_id		*id_table;
-};
-
-#define amba_get_drvdata(d)	dev_get_drvdata(&d->dev)
-#define amba_set_drvdata(d,p)	dev_set_drvdata(&d->dev, p)
-
-int amba_driver_register(struct amba_driver *);
-void amba_driver_unregister(struct amba_driver *);
-int amba_device_register(struct amba_device *, struct resource *);
-void amba_device_unregister(struct amba_device *);
-struct amba_device *amba_find_device(const char *, struct device *, unsigned int, unsigned int);
-int amba_request_regions(struct amba_device *, const char *);
-void amba_release_regions(struct amba_device *);
-
-#define amba_config(d)	(((d)->periphid >> 24) & 0xff)
-#define amba_rev(d)	(((d)->periphid >> 20) & 0x0f)
-#define amba_manf(d)	(((d)->periphid >> 12) & 0xff)
-#define amba_part(d)	((d)->periphid & 0xfff)
-
-#endif
diff --git a/include/asm-arm/hardware/amba_clcd.h b/include/asm-arm/hardware/amba_clcd.h
deleted file mode 100644
index 6b8d73dc1ab0..000000000000
--- a/include/asm-arm/hardware/amba_clcd.h
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * linux/include/asm-arm/hardware/amba_clcd.h -- Integrator LCD panel.
- *
- * David A Rusling
- *
- * Copyright (C) 2001 ARM Limited
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file COPYING in the main directory of this archive
- * for more details.
- */
-#include <linux/config.h>
-#include <linux/fb.h>
-
-/*
- * CLCD Controller Internal Register addresses
- */
-#define CLCD_TIM0		0x00000000
-#define CLCD_TIM1 		0x00000004
-#define CLCD_TIM2 		0x00000008
-#define CLCD_TIM3 		0x0000000c
-#define CLCD_UBAS 		0x00000010
-#define CLCD_LBAS 		0x00000014
-
-#if !defined(CONFIG_ARCH_VERSATILE) && !defined(CONFIG_ARCH_REALVIEW)
-#define CLCD_IENB 		0x00000018
-#define CLCD_CNTL 		0x0000001c
-#else
-/*
- * Someone rearranged these two registers on the Versatile
- * platform...
- */
-#define CLCD_IENB 		0x0000001c
-#define CLCD_CNTL 		0x00000018
-#endif
-
-#define CLCD_STAT 		0x00000020
-#define CLCD_INTR 		0x00000024
-#define CLCD_UCUR 		0x00000028
-#define CLCD_LCUR 		0x0000002C
-#define CLCD_PALL 		0x00000200
-#define CLCD_PALETTE		0x00000200
-
-#define TIM2_CLKSEL		(1 << 5)
-#define TIM2_IVS		(1 << 11)
-#define TIM2_IHS		(1 << 12)
-#define TIM2_IPC		(1 << 13)
-#define TIM2_IOE		(1 << 14)
-#define TIM2_BCD		(1 << 26)
-
-#define CNTL_LCDEN		(1 << 0)
-#define CNTL_LCDBPP1		(0 << 1)
-#define CNTL_LCDBPP2		(1 << 1)
-#define CNTL_LCDBPP4		(2 << 1)
-#define CNTL_LCDBPP8		(3 << 1)
-#define CNTL_LCDBPP16		(4 << 1)
-#define CNTL_LCDBPP24		(5 << 1)
-#define CNTL_LCDBW		(1 << 4)
-#define CNTL_LCDTFT		(1 << 5)
-#define CNTL_LCDMONO8		(1 << 6)
-#define CNTL_LCDDUAL		(1 << 7)
-#define CNTL_BGR		(1 << 8)
-#define CNTL_BEBO		(1 << 9)
-#define CNTL_BEPO		(1 << 10)
-#define CNTL_LCDPWR		(1 << 11)
-#define CNTL_LCDVCOMP(x)	((x) << 12)
-#define CNTL_LDMAFIFOTIME	(1 << 15)
-#define CNTL_WATERMARK		(1 << 16)
-
-struct clcd_panel {
-	struct fb_videomode	mode;
-	signed short		width;	/* width in mm */
-	signed short		height;	/* height in mm */
-	u32			tim2;
-	u32			tim3;
-	u32			cntl;
-	unsigned int		bpp:8,
-				fixedtimings:1,
-				grayscale:1;
-	unsigned int		connector;
-};
-
-struct clcd_regs {
-	u32			tim0;
-	u32			tim1;
-	u32			tim2;
-	u32			tim3;
-	u32			cntl;
-	unsigned long		pixclock;
-};
-
-struct clcd_fb;
-
-/*
- * the board-type specific routines
- */
-struct clcd_board {
-	const char *name;
-
-	/*
-	 * Optional.  Check whether the var structure is acceptable
-	 * for this display.
-	 */
-	int	(*check)(struct clcd_fb *fb, struct fb_var_screeninfo *var);
-
-	/*
-	 * Compulsary.  Decode fb->fb.var into regs->*.  In the case of
-	 * fixed timing, set regs->* to the register values required.
-	 */
-	void	(*decode)(struct clcd_fb *fb, struct clcd_regs *regs);
-
-	/*
-	 * Optional.  Disable any extra display hardware.
-	 */
-	void	(*disable)(struct clcd_fb *);
-
-	/*
-	 * Optional.  Enable any extra display hardware.
-	 */
-	void	(*enable)(struct clcd_fb *);
-
-	/*
-	 * Setup platform specific parts of CLCD driver
-	 */
-	int	(*setup)(struct clcd_fb *);
-
-	/*
-	 * mmap the framebuffer memory
-	 */
-	int	(*mmap)(struct clcd_fb *, struct vm_area_struct *);
-
-	/*
-	 * Remove platform specific parts of CLCD driver
-	 */
-	void	(*remove)(struct clcd_fb *);
-};
-
-struct amba_device;
-struct clk;
-
-/* this data structure describes each frame buffer device we find */
-struct clcd_fb {
-	struct fb_info		fb;
-	struct amba_device	*dev;
-	struct clk		*clk;
-	struct clcd_panel	*panel;
-	struct clcd_board	*board;
-	void			*board_data;
-	void __iomem		*regs;
-	u32			clcd_cntl;
-	u32			cmap[16];
-};
-
-static inline void clcdfb_decode(struct clcd_fb *fb, struct clcd_regs *regs)
-{
-	u32 val, cpl;
-
-	/*
-	 * Program the CLCD controller registers and start the CLCD
-	 */
-	val = ((fb->fb.var.xres / 16) - 1) << 2;
-	val |= (fb->fb.var.hsync_len - 1) << 8;
-	val |= (fb->fb.var.right_margin - 1) << 16;
-	val |= (fb->fb.var.left_margin - 1) << 24;
-	regs->tim0 = val;
-
-	val = fb->fb.var.yres;
-	if (fb->panel->cntl & CNTL_LCDDUAL)
-		val /= 2;
-	val -= 1;
-	val |= (fb->fb.var.vsync_len - 1) << 10;
-	val |= fb->fb.var.lower_margin << 16;
-	val |= fb->fb.var.upper_margin << 24;
-	regs->tim1 = val;
-
-	val = fb->panel->tim2;
-	val |= fb->fb.var.sync & FB_SYNC_HOR_HIGH_ACT  ? 0 : TIM2_IHS;
-	val |= fb->fb.var.sync & FB_SYNC_VERT_HIGH_ACT ? 0 : TIM2_IVS;
-
-	cpl = fb->fb.var.xres_virtual;
-	if (fb->panel->cntl & CNTL_LCDTFT)	  /* TFT */
-		/* / 1 */;
-	else if (!fb->fb.var.grayscale)		  /* STN color */
-		cpl = cpl * 8 / 3;
-	else if (fb->panel->cntl & CNTL_LCDMONO8) /* STN monochrome, 8bit */
-		cpl /= 8;
-	else					  /* STN monochrome, 4bit */
-		cpl /= 4;
-
-	regs->tim2 = val | ((cpl - 1) << 16);
-
-	regs->tim3 = fb->panel->tim3;
-
-	val = fb->panel->cntl;
-	if (fb->fb.var.grayscale)
-		val |= CNTL_LCDBW;
-
-	switch (fb->fb.var.bits_per_pixel) {
-	case 1:
-		val |= CNTL_LCDBPP1;
-		break;
-	case 2:
-		val |= CNTL_LCDBPP2;
-		break;
-	case 4:
-		val |= CNTL_LCDBPP4;
-		break;
-	case 8:
-		val |= CNTL_LCDBPP8;
-		break;
-	case 16:
-		val |= CNTL_LCDBPP16;
-		break;
-	case 32:
-		val |= CNTL_LCDBPP24;
-		break;
-	}
-
-	regs->cntl = val;
-	regs->pixclock = fb->fb.var.pixclock;
-}
-
-static inline int clcdfb_check(struct clcd_fb *fb, struct fb_var_screeninfo *var)
-{
-	var->xres_virtual = var->xres = (var->xres + 15) & ~15;
-	var->yres_virtual = var->yres = (var->yres + 1) & ~1;
-
-#define CHECK(e,l,h) (var->e < l || var->e > h)
-	if (CHECK(right_margin, (5+1), 256) ||	/* back porch */
-	    CHECK(left_margin, (5+1), 256) ||	/* front porch */
-	    CHECK(hsync_len, (5+1), 256) ||
-	    var->xres > 4096 ||
-	    var->lower_margin > 255 ||		/* back porch */
-	    var->upper_margin > 255 ||		/* front porch */
-	    var->vsync_len > 32 ||
-	    var->yres > 1024)
-		return -EINVAL;
-#undef CHECK
-
-	/* single panel mode: PCD = max(PCD, 1) */
-	/* dual panel mode: PCD = max(PCD, 5) */
-
-	/*
-	 * You can't change the grayscale setting, and
-	 * we can only do non-interlaced video.
-	 */
-	if (var->grayscale != fb->fb.var.grayscale ||
-	    (var->vmode & FB_VMODE_MASK) != FB_VMODE_NONINTERLACED)
-		return -EINVAL;
-
-#define CHECK(e) (var->e != fb->fb.var.e)
-	if (fb->panel->fixedtimings &&
-	    (CHECK(xres)		||
-	     CHECK(yres)		||
-	     CHECK(bits_per_pixel)	||
-	     CHECK(pixclock)		||
-	     CHECK(left_margin)		||
-	     CHECK(right_margin)	||
-	     CHECK(upper_margin)	||
-	     CHECK(lower_margin)	||
-	     CHECK(hsync_len)		||
-	     CHECK(vsync_len)		||
-	     CHECK(sync)))
-		return -EINVAL;
-#undef CHECK
-
-	var->nonstd = 0;
-	var->accel_flags = 0;
-
-	return 0;
-}
diff --git a/include/asm-arm/hardware/amba_kmi.h b/include/asm-arm/hardware/amba_kmi.h
deleted file mode 100644
index a39e5be751b3..000000000000
--- a/include/asm-arm/hardware/amba_kmi.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- *  linux/include/asm-arm/hardware/amba_kmi.h
- *
- *  Internal header file for AMBA KMI ports
- *
- *  Copyright (C) 2000 Deep Blue Solutions Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- *
- * ---------------------------------------------------------------------------
- *  From ARM PrimeCell(tm) PS2 Keyboard/Mouse Interface (PL050) Technical
- *  Reference Manual - ARM DDI 0143B - see http://www.arm.com/
- * ---------------------------------------------------------------------------
- */
-#ifndef ASM_ARM_HARDWARE_AMBA_KMI_H
-#define ASM_ARM_HARDWARE_AMBA_KMI_H
-
-/*
- * KMI control register:
- *  KMICR_TYPE       0 = PS2/AT mode, 1 = No line control bit mode
- *  KMICR_RXINTREN   1 = enable RX interrupts
- *  KMICR_TXINTREN   1 = enable TX interrupts
- *  KMICR_EN         1 = enable KMI
- *  KMICR_FD         1 = force KMI data low
- *  KMICR_FC         1 = force KMI clock low
- */
-#define KMICR		(KMI_BASE + 0x00)
-#define KMICR_TYPE		(1 << 5)
-#define KMICR_RXINTREN		(1 << 4)
-#define KMICR_TXINTREN		(1 << 3)
-#define KMICR_EN		(1 << 2)
-#define KMICR_FD		(1 << 1)
-#define KMICR_FC		(1 << 0)
-
-/*
- * KMI status register:
- *  KMISTAT_TXEMPTY  1 = transmitter register empty
- *  KMISTAT_TXBUSY   1 = currently sending data
- *  KMISTAT_RXFULL   1 = receiver register ready to be read
- *  KMISTAT_RXBUSY   1 = currently receiving data
- *  KMISTAT_RXPARITY parity of last databyte received
- *  KMISTAT_IC       current level of KMI clock input
- *  KMISTAT_ID       current level of KMI data input
- */
-#define KMISTAT		(KMI_BASE + 0x04)
-#define KMISTAT_TXEMPTY		(1 << 6)
-#define KMISTAT_TXBUSY		(1 << 5)
-#define KMISTAT_RXFULL		(1 << 4)
-#define KMISTAT_RXBUSY		(1 << 3)
-#define KMISTAT_RXPARITY	(1 << 2)
-#define KMISTAT_IC		(1 << 1)
-#define KMISTAT_ID		(1 << 0)
-
-/*
- * KMI data register
- */
-#define KMIDATA		(KMI_BASE + 0x08)
-
-/*
- * KMI clock divisor: to generate 8MHz internal clock
- *  div = (ref / 8MHz) - 1; 0 <= div <= 15
- */
-#define KMICLKDIV	(KMI_BASE + 0x0c)
-
-/*
- * KMI interrupt register:
- *  KMIIR_TXINTR     1 = transmit interrupt asserted
- *  KMIIR_RXINTR     1 = receive interrupt asserted
- */
-#define KMIIR		(KMI_BASE + 0x10)
-#define KMIIR_TXINTR		(1 << 1)
-#define KMIIR_RXINTR		(1 << 0)
-
-/*
- * The size of the KMI primecell
- */
-#define KMI_SIZE	(0x100)
-
-#endif
diff --git a/include/asm-arm/hardware/amba_serial.h b/include/asm-arm/hardware/amba_serial.h
deleted file mode 100644
index dc726ffccebd..000000000000
--- a/include/asm-arm/hardware/amba_serial.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- *  linux/include/asm-arm/hardware/serial_amba.h
- *
- *  Internal header file for AMBA serial ports
- *
- *  Copyright (C) ARM Limited
- *  Copyright (C) 2000 Deep Blue Solutions Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-#ifndef ASM_ARM_HARDWARE_SERIAL_AMBA_H
-#define ASM_ARM_HARDWARE_SERIAL_AMBA_H
-
-/* -------------------------------------------------------------------------------
- *  From AMBA UART (PL010) Block Specification
- * -------------------------------------------------------------------------------
- *  UART Register Offsets.
- */
-#define UART01x_DR		0x00	/* Data read or written from the interface. */
-#define UART01x_RSR		0x04	/* Receive status register (Read). */
-#define UART01x_ECR		0x04	/* Error clear register (Write). */
-#define UART010_LCRH		0x08	/* Line control register, high byte. */
-#define UART010_LCRM		0x0C	/* Line control register, middle byte. */
-#define UART010_LCRL		0x10	/* Line control register, low byte. */
-#define UART010_CR		0x14	/* Control register. */
-#define UART01x_FR		0x18	/* Flag register (Read only). */
-#define UART010_IIR		0x1C	/* Interrupt indentification register (Read). */
-#define UART010_ICR		0x1C	/* Interrupt clear register (Write). */
-#define UART01x_ILPR		0x20	/* IrDA low power counter register. */
-#define UART011_IBRD		0x24	/* Integer baud rate divisor register. */
-#define UART011_FBRD		0x28	/* Fractional baud rate divisor register. */
-#define UART011_LCRH		0x2c	/* Line control register. */
-#define UART011_CR		0x30	/* Control register. */
-#define UART011_IFLS		0x34	/* Interrupt fifo level select. */
-#define UART011_IMSC		0x38	/* Interrupt mask. */
-#define UART011_RIS		0x3c	/* Raw interrupt status. */
-#define UART011_MIS		0x40	/* Masked interrupt status. */
-#define UART011_ICR		0x44	/* Interrupt clear register. */
-#define UART011_DMACR		0x48	/* DMA control register. */
-
-#define UART011_DR_OE		(1 << 11)
-#define UART011_DR_BE		(1 << 10)
-#define UART011_DR_PE		(1 << 9)
-#define UART011_DR_FE		(1 << 8)
-
-#define UART01x_RSR_OE 		0x08
-#define UART01x_RSR_BE 		0x04
-#define UART01x_RSR_PE 		0x02
-#define UART01x_RSR_FE 		0x01
-
-#define UART011_FR_RI		0x100
-#define UART011_FR_TXFE		0x080
-#define UART011_FR_RXFF		0x040
-#define UART01x_FR_TXFF		0x020
-#define UART01x_FR_RXFE		0x010
-#define UART01x_FR_BUSY		0x008
-#define UART01x_FR_DCD 		0x004
-#define UART01x_FR_DSR 		0x002
-#define UART01x_FR_CTS 		0x001
-#define UART01x_FR_TMSK		(UART01x_FR_TXFF + UART01x_FR_BUSY)
-
-#define UART011_CR_CTSEN	0x8000	/* CTS hardware flow control */
-#define UART011_CR_RTSEN	0x4000	/* RTS hardware flow control */
-#define UART011_CR_OUT2		0x2000	/* OUT2 */
-#define UART011_CR_OUT1		0x1000	/* OUT1 */
-#define UART011_CR_RTS		0x0800	/* RTS */
-#define UART011_CR_DTR		0x0400	/* DTR */
-#define UART011_CR_RXE		0x0200	/* receive enable */
-#define UART011_CR_TXE		0x0100	/* transmit enable */
-#define UART011_CR_LBE		0x0080	/* loopback enable */
-#define UART010_CR_RTIE		0x0040
-#define UART010_CR_TIE 		0x0020
-#define UART010_CR_RIE 		0x0010
-#define UART010_CR_MSIE		0x0008
-#define UART01x_CR_IIRLP	0x0004	/* SIR low power mode */
-#define UART01x_CR_SIREN	0x0002	/* SIR enable */
-#define UART01x_CR_UARTEN	0x0001	/* UART enable */
- 
-#define UART011_LCRH_SPS	0x80
-#define UART01x_LCRH_WLEN_8	0x60
-#define UART01x_LCRH_WLEN_7	0x40
-#define UART01x_LCRH_WLEN_6	0x20
-#define UART01x_LCRH_WLEN_5	0x00
-#define UART01x_LCRH_FEN	0x10
-#define UART01x_LCRH_STP2	0x08
-#define UART01x_LCRH_EPS	0x04
-#define UART01x_LCRH_PEN	0x02
-#define UART01x_LCRH_BRK	0x01
-
-#define UART010_IIR_RTIS	0x08
-#define UART010_IIR_TIS		0x04
-#define UART010_IIR_RIS		0x02
-#define UART010_IIR_MIS		0x01
-
-#define UART011_IFLS_RX1_8	(0 << 3)
-#define UART011_IFLS_RX2_8	(1 << 3)
-#define UART011_IFLS_RX4_8	(2 << 3)
-#define UART011_IFLS_RX6_8	(3 << 3)
-#define UART011_IFLS_RX7_8	(4 << 3)
-#define UART011_IFLS_TX1_8	(0 << 0)
-#define UART011_IFLS_TX2_8	(1 << 0)
-#define UART011_IFLS_TX4_8	(2 << 0)
-#define UART011_IFLS_TX6_8	(3 << 0)
-#define UART011_IFLS_TX7_8	(4 << 0)
-
-#define UART011_OEIM		(1 << 10)	/* overrun error interrupt mask */
-#define UART011_BEIM		(1 << 9)	/* break error interrupt mask */
-#define UART011_PEIM		(1 << 8)	/* parity error interrupt mask */
-#define UART011_FEIM		(1 << 7)	/* framing error interrupt mask */
-#define UART011_RTIM		(1 << 6)	/* receive timeout interrupt mask */
-#define UART011_TXIM		(1 << 5)	/* transmit interrupt mask */
-#define UART011_RXIM		(1 << 4)	/* receive interrupt mask */
-#define UART011_DSRMIM		(1 << 3)	/* DSR interrupt mask */
-#define UART011_DCDMIM		(1 << 2)	/* DCD interrupt mask */
-#define UART011_CTSMIM		(1 << 1)	/* CTS interrupt mask */
-#define UART011_RIMIM		(1 << 0)	/* RI interrupt mask */
-
-#define UART011_OEIS		(1 << 10)	/* overrun error interrupt status */
-#define UART011_BEIS		(1 << 9)	/* break error interrupt status */
-#define UART011_PEIS		(1 << 8)	/* parity error interrupt status */
-#define UART011_FEIS		(1 << 7)	/* framing error interrupt status */
-#define UART011_RTIS		(1 << 6)	/* receive timeout interrupt status */
-#define UART011_TXIS		(1 << 5)	/* transmit interrupt status */
-#define UART011_RXIS		(1 << 4)	/* receive interrupt status */
-#define UART011_DSRMIS		(1 << 3)	/* DSR interrupt status */
-#define UART011_DCDMIS		(1 << 2)	/* DCD interrupt status */
-#define UART011_CTSMIS		(1 << 1)	/* CTS interrupt status */
-#define UART011_RIMIS		(1 << 0)	/* RI interrupt status */
-
-#define UART011_OEIC		(1 << 10)	/* overrun error interrupt clear */
-#define UART011_BEIC		(1 << 9)	/* break error interrupt clear */
-#define UART011_PEIC		(1 << 8)	/* parity error interrupt clear */
-#define UART011_FEIC		(1 << 7)	/* framing error interrupt clear */
-#define UART011_RTIC		(1 << 6)	/* receive timeout interrupt clear */
-#define UART011_TXIC		(1 << 5)	/* transmit interrupt clear */
-#define UART011_RXIC		(1 << 4)	/* receive interrupt clear */
-#define UART011_DSRMIC		(1 << 3)	/* DSR interrupt clear */
-#define UART011_DCDMIC		(1 << 2)	/* DCD interrupt clear */
-#define UART011_CTSMIC		(1 << 1)	/* CTS interrupt clear */
-#define UART011_RIMIC		(1 << 0)	/* RI interrupt clear */
-
-#define UART011_DMAONERR	(1 << 2)	/* disable dma on error */
-#define UART011_TXDMAE		(1 << 1)	/* enable transmit dma */
-#define UART011_RXDMAE		(1 << 0)	/* enable receive dma */
-
-#define UART01x_RSR_ANY		(UART01x_RSR_OE|UART01x_RSR_BE|UART01x_RSR_PE|UART01x_RSR_FE)
-#define UART01x_FR_MODEM_ANY	(UART01x_FR_DCD|UART01x_FR_DSR|UART01x_FR_CTS)
-
-#endif
diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
new file mode 100644
index 000000000000..51e6e54b2aa1
--- /dev/null
+++ b/include/linux/amba/bus.h
@@ -0,0 +1,55 @@
+/*
+ *  linux/include/asm-arm/hardware/amba.h
+ *
+ *  Copyright (C) 2003 Deep Blue Solutions Ltd, All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef ASMARM_AMBA_H
+#define ASMARM_AMBA_H
+
+#define AMBA_NR_IRQS	2
+
+struct amba_device {
+	struct device		dev;
+	struct resource		res;
+	u64			dma_mask;
+	unsigned int		periphid;
+	unsigned int		irq[AMBA_NR_IRQS];
+};
+
+struct amba_id {
+	unsigned int		id;
+	unsigned int		mask;
+	void			*data;
+};
+
+struct amba_driver {
+	struct device_driver	drv;
+	int			(*probe)(struct amba_device *, void *);
+	int			(*remove)(struct amba_device *);
+	void			(*shutdown)(struct amba_device *);
+	int			(*suspend)(struct amba_device *, pm_message_t);
+	int			(*resume)(struct amba_device *);
+	struct amba_id		*id_table;
+};
+
+#define amba_get_drvdata(d)	dev_get_drvdata(&d->dev)
+#define amba_set_drvdata(d,p)	dev_set_drvdata(&d->dev, p)
+
+int amba_driver_register(struct amba_driver *);
+void amba_driver_unregister(struct amba_driver *);
+int amba_device_register(struct amba_device *, struct resource *);
+void amba_device_unregister(struct amba_device *);
+struct amba_device *amba_find_device(const char *, struct device *, unsigned int, unsigned int);
+int amba_request_regions(struct amba_device *, const char *);
+void amba_release_regions(struct amba_device *);
+
+#define amba_config(d)	(((d)->periphid >> 24) & 0xff)
+#define amba_rev(d)	(((d)->periphid >> 20) & 0x0f)
+#define amba_manf(d)	(((d)->periphid >> 12) & 0xff)
+#define amba_part(d)	((d)->periphid & 0xfff)
+
+#endif
diff --git a/include/linux/amba/clcd.h b/include/linux/amba/clcd.h
new file mode 100644
index 000000000000..6b8d73dc1ab0
--- /dev/null
+++ b/include/linux/amba/clcd.h
@@ -0,0 +1,271 @@
+/*
+ * linux/include/asm-arm/hardware/amba_clcd.h -- Integrator LCD panel.
+ *
+ * David A Rusling
+ *
+ * Copyright (C) 2001 ARM Limited
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of this archive
+ * for more details.
+ */
+#include <linux/config.h>
+#include <linux/fb.h>
+
+/*
+ * CLCD Controller Internal Register addresses
+ */
+#define CLCD_TIM0		0x00000000
+#define CLCD_TIM1 		0x00000004
+#define CLCD_TIM2 		0x00000008
+#define CLCD_TIM3 		0x0000000c
+#define CLCD_UBAS 		0x00000010
+#define CLCD_LBAS 		0x00000014
+
+#if !defined(CONFIG_ARCH_VERSATILE) && !defined(CONFIG_ARCH_REALVIEW)
+#define CLCD_IENB 		0x00000018
+#define CLCD_CNTL 		0x0000001c
+#else
+/*
+ * Someone rearranged these two registers on the Versatile
+ * platform...
+ */
+#define CLCD_IENB 		0x0000001c
+#define CLCD_CNTL 		0x00000018
+#endif
+
+#define CLCD_STAT 		0x00000020
+#define CLCD_INTR 		0x00000024
+#define CLCD_UCUR 		0x00000028
+#define CLCD_LCUR 		0x0000002C
+#define CLCD_PALL 		0x00000200
+#define CLCD_PALETTE		0x00000200
+
+#define TIM2_CLKSEL		(1 << 5)
+#define TIM2_IVS		(1 << 11)
+#define TIM2_IHS		(1 << 12)
+#define TIM2_IPC		(1 << 13)
+#define TIM2_IOE		(1 << 14)
+#define TIM2_BCD		(1 << 26)
+
+#define CNTL_LCDEN		(1 << 0)
+#define CNTL_LCDBPP1		(0 << 1)
+#define CNTL_LCDBPP2		(1 << 1)
+#define CNTL_LCDBPP4		(2 << 1)
+#define CNTL_LCDBPP8		(3 << 1)
+#define CNTL_LCDBPP16		(4 << 1)
+#define CNTL_LCDBPP24		(5 << 1)
+#define CNTL_LCDBW		(1 << 4)
+#define CNTL_LCDTFT		(1 << 5)
+#define CNTL_LCDMONO8		(1 << 6)
+#define CNTL_LCDDUAL		(1 << 7)
+#define CNTL_BGR		(1 << 8)
+#define CNTL_BEBO		(1 << 9)
+#define CNTL_BEPO		(1 << 10)
+#define CNTL_LCDPWR		(1 << 11)
+#define CNTL_LCDVCOMP(x)	((x) << 12)
+#define CNTL_LDMAFIFOTIME	(1 << 15)
+#define CNTL_WATERMARK		(1 << 16)
+
+struct clcd_panel {
+	struct fb_videomode	mode;
+	signed short		width;	/* width in mm */
+	signed short		height;	/* height in mm */
+	u32			tim2;
+	u32			tim3;
+	u32			cntl;
+	unsigned int		bpp:8,
+				fixedtimings:1,
+				grayscale:1;
+	unsigned int		connector;
+};
+
+struct clcd_regs {
+	u32			tim0;
+	u32			tim1;
+	u32			tim2;
+	u32			tim3;
+	u32			cntl;
+	unsigned long		pixclock;
+};
+
+struct clcd_fb;
+
+/*
+ * the board-type specific routines
+ */
+struct clcd_board {
+	const char *name;
+
+	/*
+	 * Optional.  Check whether the var structure is acceptable
+	 * for this display.
+	 */
+	int	(*check)(struct clcd_fb *fb, struct fb_var_screeninfo *var);
+
+	/*
+	 * Compulsary.  Decode fb->fb.var into regs->*.  In the case of
+	 * fixed timing, set regs->* to the register values required.
+	 */
+	void	(*decode)(struct clcd_fb *fb, struct clcd_regs *regs);
+
+	/*
+	 * Optional.  Disable any extra display hardware.
+	 */
+	void	(*disable)(struct clcd_fb *);
+
+	/*
+	 * Optional.  Enable any extra display hardware.
+	 */
+	void	(*enable)(struct clcd_fb *);
+
+	/*
+	 * Setup platform specific parts of CLCD driver
+	 */
+	int	(*setup)(struct clcd_fb *);
+
+	/*
+	 * mmap the framebuffer memory
+	 */
+	int	(*mmap)(struct clcd_fb *, struct vm_area_struct *);
+
+	/*
+	 * Remove platform specific parts of CLCD driver
+	 */
+	void	(*remove)(struct clcd_fb *);
+};
+
+struct amba_device;
+struct clk;
+
+/* this data structure describes each frame buffer device we find */
+struct clcd_fb {
+	struct fb_info		fb;
+	struct amba_device	*dev;
+	struct clk		*clk;
+	struct clcd_panel	*panel;
+	struct clcd_board	*board;
+	void			*board_data;
+	void __iomem		*regs;
+	u32			clcd_cntl;
+	u32			cmap[16];
+};
+
+static inline void clcdfb_decode(struct clcd_fb *fb, struct clcd_regs *regs)
+{
+	u32 val, cpl;
+
+	/*
+	 * Program the CLCD controller registers and start the CLCD
+	 */
+	val = ((fb->fb.var.xres / 16) - 1) << 2;
+	val |= (fb->fb.var.hsync_len - 1) << 8;
+	val |= (fb->fb.var.right_margin - 1) << 16;
+	val |= (fb->fb.var.left_margin - 1) << 24;
+	regs->tim0 = val;
+
+	val = fb->fb.var.yres;
+	if (fb->panel->cntl & CNTL_LCDDUAL)
+		val /= 2;
+	val -= 1;
+	val |= (fb->fb.var.vsync_len - 1) << 10;
+	val |= fb->fb.var.lower_margin << 16;
+	val |= fb->fb.var.upper_margin << 24;
+	regs->tim1 = val;
+
+	val = fb->panel->tim2;
+	val |= fb->fb.var.sync & FB_SYNC_HOR_HIGH_ACT  ? 0 : TIM2_IHS;
+	val |= fb->fb.var.sync & FB_SYNC_VERT_HIGH_ACT ? 0 : TIM2_IVS;
+
+	cpl = fb->fb.var.xres_virtual;
+	if (fb->panel->cntl & CNTL_LCDTFT)	  /* TFT */
+		/* / 1 */;
+	else if (!fb->fb.var.grayscale)		  /* STN color */
+		cpl = cpl * 8 / 3;
+	else if (fb->panel->cntl & CNTL_LCDMONO8) /* STN monochrome, 8bit */
+		cpl /= 8;
+	else					  /* STN monochrome, 4bit */
+		cpl /= 4;
+
+	regs->tim2 = val | ((cpl - 1) << 16);
+
+	regs->tim3 = fb->panel->tim3;
+
+	val = fb->panel->cntl;
+	if (fb->fb.var.grayscale)
+		val |= CNTL_LCDBW;
+
+	switch (fb->fb.var.bits_per_pixel) {
+	case 1:
+		val |= CNTL_LCDBPP1;
+		break;
+	case 2:
+		val |= CNTL_LCDBPP2;
+		break;
+	case 4:
+		val |= CNTL_LCDBPP4;
+		break;
+	case 8:
+		val |= CNTL_LCDBPP8;
+		break;
+	case 16:
+		val |= CNTL_LCDBPP16;
+		break;
+	case 32:
+		val |= CNTL_LCDBPP24;
+		break;
+	}
+
+	regs->cntl = val;
+	regs->pixclock = fb->fb.var.pixclock;
+}
+
+static inline int clcdfb_check(struct clcd_fb *fb, struct fb_var_screeninfo *var)
+{
+	var->xres_virtual = var->xres = (var->xres + 15) & ~15;
+	var->yres_virtual = var->yres = (var->yres + 1) & ~1;
+
+#define CHECK(e,l,h) (var->e < l || var->e > h)
+	if (CHECK(right_margin, (5+1), 256) ||	/* back porch */
+	    CHECK(left_margin, (5+1), 256) ||	/* front porch */
+	    CHECK(hsync_len, (5+1), 256) ||
+	    var->xres > 4096 ||
+	    var->lower_margin > 255 ||		/* back porch */
+	    var->upper_margin > 255 ||		/* front porch */
+	    var->vsync_len > 32 ||
+	    var->yres > 1024)
+		return -EINVAL;
+#undef CHECK
+
+	/* single panel mode: PCD = max(PCD, 1) */
+	/* dual panel mode: PCD = max(PCD, 5) */
+
+	/*
+	 * You can't change the grayscale setting, and
+	 * we can only do non-interlaced video.
+	 */
+	if (var->grayscale != fb->fb.var.grayscale ||
+	    (var->vmode & FB_VMODE_MASK) != FB_VMODE_NONINTERLACED)
+		return -EINVAL;
+
+#define CHECK(e) (var->e != fb->fb.var.e)
+	if (fb->panel->fixedtimings &&
+	    (CHECK(xres)		||
+	     CHECK(yres)		||
+	     CHECK(bits_per_pixel)	||
+	     CHECK(pixclock)		||
+	     CHECK(left_margin)		||
+	     CHECK(right_margin)	||
+	     CHECK(upper_margin)	||
+	     CHECK(lower_margin)	||
+	     CHECK(hsync_len)		||
+	     CHECK(vsync_len)		||
+	     CHECK(sync)))
+		return -EINVAL;
+#undef CHECK
+
+	var->nonstd = 0;
+	var->accel_flags = 0;
+
+	return 0;
+}
diff --git a/include/linux/amba/kmi.h b/include/linux/amba/kmi.h
new file mode 100644
index 000000000000..a39e5be751b3
--- /dev/null
+++ b/include/linux/amba/kmi.h
@@ -0,0 +1,92 @@
+/*
+ *  linux/include/asm-arm/hardware/amba_kmi.h
+ *
+ *  Internal header file for AMBA KMI ports
+ *
+ *  Copyright (C) 2000 Deep Blue Solutions Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *
+ * ---------------------------------------------------------------------------
+ *  From ARM PrimeCell(tm) PS2 Keyboard/Mouse Interface (PL050) Technical
+ *  Reference Manual - ARM DDI 0143B - see http://www.arm.com/
+ * ---------------------------------------------------------------------------
+ */
+#ifndef ASM_ARM_HARDWARE_AMBA_KMI_H
+#define ASM_ARM_HARDWARE_AMBA_KMI_H
+
+/*
+ * KMI control register:
+ *  KMICR_TYPE       0 = PS2/AT mode, 1 = No line control bit mode
+ *  KMICR_RXINTREN   1 = enable RX interrupts
+ *  KMICR_TXINTREN   1 = enable TX interrupts
+ *  KMICR_EN         1 = enable KMI
+ *  KMICR_FD         1 = force KMI data low
+ *  KMICR_FC         1 = force KMI clock low
+ */
+#define KMICR		(KMI_BASE + 0x00)
+#define KMICR_TYPE		(1 << 5)
+#define KMICR_RXINTREN		(1 << 4)
+#define KMICR_TXINTREN		(1 << 3)
+#define KMICR_EN		(1 << 2)
+#define KMICR_FD		(1 << 1)
+#define KMICR_FC		(1 << 0)
+
+/*
+ * KMI status register:
+ *  KMISTAT_TXEMPTY  1 = transmitter register empty
+ *  KMISTAT_TXBUSY   1 = currently sending data
+ *  KMISTAT_RXFULL   1 = receiver register ready to be read
+ *  KMISTAT_RXBUSY   1 = currently receiving data
+ *  KMISTAT_RXPARITY parity of last databyte received
+ *  KMISTAT_IC       current level of KMI clock input
+ *  KMISTAT_ID       current level of KMI data input
+ */
+#define KMISTAT		(KMI_BASE + 0x04)
+#define KMISTAT_TXEMPTY		(1 << 6)
+#define KMISTAT_TXBUSY		(1 << 5)
+#define KMISTAT_RXFULL		(1 << 4)
+#define KMISTAT_RXBUSY		(1 << 3)
+#define KMISTAT_RXPARITY	(1 << 2)
+#define KMISTAT_IC		(1 << 1)
+#define KMISTAT_ID		(1 << 0)
+
+/*
+ * KMI data register
+ */
+#define KMIDATA		(KMI_BASE + 0x08)
+
+/*
+ * KMI clock divisor: to generate 8MHz internal clock
+ *  div = (ref / 8MHz) - 1; 0 <= div <= 15
+ */
+#define KMICLKDIV	(KMI_BASE + 0x0c)
+
+/*
+ * KMI interrupt register:
+ *  KMIIR_TXINTR     1 = transmit interrupt asserted
+ *  KMIIR_RXINTR     1 = receive interrupt asserted
+ */
+#define KMIIR		(KMI_BASE + 0x10)
+#define KMIIR_TXINTR		(1 << 1)
+#define KMIIR_RXINTR		(1 << 0)
+
+/*
+ * The size of the KMI primecell
+ */
+#define KMI_SIZE	(0x100)
+
+#endif
diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h
new file mode 100644
index 000000000000..dc726ffccebd
--- /dev/null
+++ b/include/linux/amba/serial.h
@@ -0,0 +1,161 @@
+/*
+ *  linux/include/asm-arm/hardware/serial_amba.h
+ *
+ *  Internal header file for AMBA serial ports
+ *
+ *  Copyright (C) ARM Limited
+ *  Copyright (C) 2000 Deep Blue Solutions Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef ASM_ARM_HARDWARE_SERIAL_AMBA_H
+#define ASM_ARM_HARDWARE_SERIAL_AMBA_H
+
+/* -------------------------------------------------------------------------------
+ *  From AMBA UART (PL010) Block Specification
+ * -------------------------------------------------------------------------------
+ *  UART Register Offsets.
+ */
+#define UART01x_DR		0x00	/* Data read or written from the interface. */
+#define UART01x_RSR		0x04	/* Receive status register (Read). */
+#define UART01x_ECR		0x04	/* Error clear register (Write). */
+#define UART010_LCRH		0x08	/* Line control register, high byte. */
+#define UART010_LCRM		0x0C	/* Line control register, middle byte. */
+#define UART010_LCRL		0x10	/* Line control register, low byte. */
+#define UART010_CR		0x14	/* Control register. */
+#define UART01x_FR		0x18	/* Flag register (Read only). */
+#define UART010_IIR		0x1C	/* Interrupt indentification register (Read). */
+#define UART010_ICR		0x1C	/* Interrupt clear register (Write). */
+#define UART01x_ILPR		0x20	/* IrDA low power counter register. */
+#define UART011_IBRD		0x24	/* Integer baud rate divisor register. */
+#define UART011_FBRD		0x28	/* Fractional baud rate divisor register. */
+#define UART011_LCRH		0x2c	/* Line control register. */
+#define UART011_CR		0x30	/* Control register. */
+#define UART011_IFLS		0x34	/* Interrupt fifo level select. */
+#define UART011_IMSC		0x38	/* Interrupt mask. */
+#define UART011_RIS		0x3c	/* Raw interrupt status. */
+#define UART011_MIS		0x40	/* Masked interrupt status. */
+#define UART011_ICR		0x44	/* Interrupt clear register. */
+#define UART011_DMACR		0x48	/* DMA control register. */
+
+#define UART011_DR_OE		(1 << 11)
+#define UART011_DR_BE		(1 << 10)
+#define UART011_DR_PE		(1 << 9)
+#define UART011_DR_FE		(1 << 8)
+
+#define UART01x_RSR_OE 		0x08
+#define UART01x_RSR_BE 		0x04
+#define UART01x_RSR_PE 		0x02
+#define UART01x_RSR_FE 		0x01
+
+#define UART011_FR_RI		0x100
+#define UART011_FR_TXFE		0x080
+#define UART011_FR_RXFF		0x040
+#define UART01x_FR_TXFF		0x020
+#define UART01x_FR_RXFE		0x010
+#define UART01x_FR_BUSY		0x008
+#define UART01x_FR_DCD 		0x004
+#define UART01x_FR_DSR 		0x002
+#define UART01x_FR_CTS 		0x001
+#define UART01x_FR_TMSK		(UART01x_FR_TXFF + UART01x_FR_BUSY)
+
+#define UART011_CR_CTSEN	0x8000	/* CTS hardware flow control */
+#define UART011_CR_RTSEN	0x4000	/* RTS hardware flow control */
+#define UART011_CR_OUT2		0x2000	/* OUT2 */
+#define UART011_CR_OUT1		0x1000	/* OUT1 */
+#define UART011_CR_RTS		0x0800	/* RTS */
+#define UART011_CR_DTR		0x0400	/* DTR */
+#define UART011_CR_RXE		0x0200	/* receive enable */
+#define UART011_CR_TXE		0x0100	/* transmit enable */
+#define UART011_CR_LBE		0x0080	/* loopback enable */
+#define UART010_CR_RTIE		0x0040
+#define UART010_CR_TIE 		0x0020
+#define UART010_CR_RIE 		0x0010
+#define UART010_CR_MSIE		0x0008
+#define UART01x_CR_IIRLP	0x0004	/* SIR low power mode */
+#define UART01x_CR_SIREN	0x0002	/* SIR enable */
+#define UART01x_CR_UARTEN	0x0001	/* UART enable */
+ 
+#define UART011_LCRH_SPS	0x80
+#define UART01x_LCRH_WLEN_8	0x60
+#define UART01x_LCRH_WLEN_7	0x40
+#define UART01x_LCRH_WLEN_6	0x20
+#define UART01x_LCRH_WLEN_5	0x00
+#define UART01x_LCRH_FEN	0x10
+#define UART01x_LCRH_STP2	0x08
+#define UART01x_LCRH_EPS	0x04
+#define UART01x_LCRH_PEN	0x02
+#define UART01x_LCRH_BRK	0x01
+
+#define UART010_IIR_RTIS	0x08
+#define UART010_IIR_TIS		0x04
+#define UART010_IIR_RIS		0x02
+#define UART010_IIR_MIS		0x01
+
+#define UART011_IFLS_RX1_8	(0 << 3)
+#define UART011_IFLS_RX2_8	(1 << 3)
+#define UART011_IFLS_RX4_8	(2 << 3)
+#define UART011_IFLS_RX6_8	(3 << 3)
+#define UART011_IFLS_RX7_8	(4 << 3)
+#define UART011_IFLS_TX1_8	(0 << 0)
+#define UART011_IFLS_TX2_8	(1 << 0)
+#define UART011_IFLS_TX4_8	(2 << 0)
+#define UART011_IFLS_TX6_8	(3 << 0)
+#define UART011_IFLS_TX7_8	(4 << 0)
+
+#define UART011_OEIM		(1 << 10)	/* overrun error interrupt mask */
+#define UART011_BEIM		(1 << 9)	/* break error interrupt mask */
+#define UART011_PEIM		(1 << 8)	/* parity error interrupt mask */
+#define UART011_FEIM		(1 << 7)	/* framing error interrupt mask */
+#define UART011_RTIM		(1 << 6)	/* receive timeout interrupt mask */
+#define UART011_TXIM		(1 << 5)	/* transmit interrupt mask */
+#define UART011_RXIM		(1 << 4)	/* receive interrupt mask */
+#define UART011_DSRMIM		(1 << 3)	/* DSR interrupt mask */
+#define UART011_DCDMIM		(1 << 2)	/* DCD interrupt mask */
+#define UART011_CTSMIM		(1 << 1)	/* CTS interrupt mask */
+#define UART011_RIMIM		(1 << 0)	/* RI interrupt mask */
+
+#define UART011_OEIS		(1 << 10)	/* overrun error interrupt status */
+#define UART011_BEIS		(1 << 9)	/* break error interrupt status */
+#define UART011_PEIS		(1 << 8)	/* parity error interrupt status */
+#define UART011_FEIS		(1 << 7)	/* framing error interrupt status */
+#define UART011_RTIS		(1 << 6)	/* receive timeout interrupt status */
+#define UART011_TXIS		(1 << 5)	/* transmit interrupt status */
+#define UART011_RXIS		(1 << 4)	/* receive interrupt status */
+#define UART011_DSRMIS		(1 << 3)	/* DSR interrupt status */
+#define UART011_DCDMIS		(1 << 2)	/* DCD interrupt status */
+#define UART011_CTSMIS		(1 << 1)	/* CTS interrupt status */
+#define UART011_RIMIS		(1 << 0)	/* RI interrupt status */
+
+#define UART011_OEIC		(1 << 10)	/* overrun error interrupt clear */
+#define UART011_BEIC		(1 << 9)	/* break error interrupt clear */
+#define UART011_PEIC		(1 << 8)	/* parity error interrupt clear */
+#define UART011_FEIC		(1 << 7)	/* framing error interrupt clear */
+#define UART011_RTIC		(1 << 6)	/* receive timeout interrupt clear */
+#define UART011_TXIC		(1 << 5)	/* transmit interrupt clear */
+#define UART011_RXIC		(1 << 4)	/* receive interrupt clear */
+#define UART011_DSRMIC		(1 << 3)	/* DSR interrupt clear */
+#define UART011_DCDMIC		(1 << 2)	/* DCD interrupt clear */
+#define UART011_CTSMIC		(1 << 1)	/* CTS interrupt clear */
+#define UART011_RIMIC		(1 << 0)	/* RI interrupt clear */
+
+#define UART011_DMAONERR	(1 << 2)	/* disable dma on error */
+#define UART011_TXDMAE		(1 << 1)	/* enable transmit dma */
+#define UART011_RXDMAE		(1 << 0)	/* enable receive dma */
+
+#define UART01x_RSR_ANY		(UART01x_RSR_OE|UART01x_RSR_BE|UART01x_RSR_PE|UART01x_RSR_FE)
+#define UART01x_FR_MODEM_ANY	(UART01x_FR_DCD|UART01x_FR_DSR|UART01x_FR_CTS)
+
+#endif
diff --git a/sound/arm/aaci.c b/sound/arm/aaci.c
index 559ead6367da..5b6cae50d0d5 100644
--- a/sound/arm/aaci.c
+++ b/sound/arm/aaci.c
@@ -17,11 +17,11 @@
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 #include <linux/err.h>
+#include <linux/amba/bus.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/sizes.h>
-#include <asm/hardware/amba.h>
 
 #include <sound/driver.h>
 #include <sound/core.h>
-- 
cgit v1.2.3-71-gd317


From f8ce25476d5f12ffa29b885e49c38cd95053437e Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Sat, 7 Jan 2006 16:15:52 +0000
Subject: [ARM] Move asm/hardware/clock.h to linux/clk.h

This is needs to be visible to other architectures using the AMBA
bus and peripherals.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mach-aaec2000/clock.c        |   2 +-
 arch/arm/mach-integrator/clock.c      |   2 +-
 arch/arm/mach-omap1/board-palmte.c    |   2 +-
 arch/arm/mach-omap1/clock.c           |   2 +-
 arch/arm/mach-omap1/serial.c          |   2 +-
 arch/arm/mach-omap2/clock.c           |   2 +-
 arch/arm/mach-omap2/serial.c          |   2 +-
 arch/arm/mach-omap2/timer-gp.c        |   3 +-
 arch/arm/mach-realview/clock.c        |   2 +-
 arch/arm/mach-s3c2410/clock.c         |   3 +-
 arch/arm/mach-s3c2410/s3c2440-clock.c |   3 +-
 arch/arm/mach-s3c2410/s3c2440.c       |   2 +-
 arch/arm/mach-s3c2410/time.c          |   2 +-
 arch/arm/mach-versatile/clock.c       |   2 +-
 arch/arm/plat-omap/clock.c            |   2 +-
 arch/arm/plat-omap/common.c           |   2 +-
 arch/arm/plat-omap/cpu-omap.c         |   3 +-
 arch/arm/plat-omap/gpio.c             |   2 +-
 arch/arm/plat-omap/mcbsp.c            |   3 +-
 arch/arm/plat-omap/ocpi.c             |   2 +-
 drivers/char/s3c2410-rtc.c            |   2 +-
 drivers/char/watchdog/s3c2410_wdt.c   |   2 +-
 drivers/i2c/busses/i2c-s3c2410.c      |   2 +-
 drivers/input/serio/ambakmi.c         |   2 +-
 drivers/mmc/mmci.c                    |   2 +-
 drivers/mtd/nand/s3c2410.c            |   2 +-
 drivers/serial/amba-pl011.c           |   2 +-
 drivers/serial/s3c2410.c              |   2 +-
 drivers/usb/host/ohci-omap.c          |   2 +-
 drivers/usb/host/ohci-s3c2410.c       |   2 +-
 drivers/video/amba-clcd.c             |   2 +-
 drivers/video/s3c2410fb.c             |   2 +-
 include/asm-arm/arch-omap/system.h    |   3 +-
 include/asm-arm/hardware/clock.h      | 124 ----------------------------------
 include/linux/clk.h                   | 124 ++++++++++++++++++++++++++++++++++
 35 files changed, 159 insertions(+), 161 deletions(-)
 delete mode 100644 include/asm-arm/hardware/clock.h
 create mode 100644 include/linux/clk.h

(limited to 'include/linux')

diff --git a/arch/arm/mach-aaec2000/clock.c b/arch/arm/mach-aaec2000/clock.c
index 828208348b76..1c84c60941e1 100644
--- a/arch/arm/mach-aaec2000/clock.c
+++ b/arch/arm/mach-aaec2000/clock.c
@@ -15,9 +15,9 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/string.h>
+#include <linux/clk.h>
 
 #include <asm/semaphore.h>
-#include <asm/hardware/clock.h>
 
 #include "clock.h"
 
diff --git a/arch/arm/mach-integrator/clock.c b/arch/arm/mach-integrator/clock.c
index bbfe46cd91fe..40684e01e865 100644
--- a/arch/arm/mach-integrator/clock.c
+++ b/arch/arm/mach-integrator/clock.c
@@ -14,9 +14,9 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/string.h>
+#include <linux/clk.h>
 
 #include <asm/semaphore.h>
-#include <asm/hardware/clock.h>
 #include <asm/hardware/icst525.h>
 
 #include "clock.h"
diff --git a/arch/arm/mach-omap1/board-palmte.c b/arch/arm/mach-omap1/board-palmte.c
index 540b20d78cca..5c975eb5c34b 100644
--- a/arch/arm/mach-omap1/board-palmte.c
+++ b/arch/arm/mach-omap1/board-palmte.c
@@ -19,6 +19,7 @@
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/notifier.h>
+#include <linux/clk.h>
 
 #include <asm/hardware.h>
 #include <asm/mach-types.h>
@@ -30,7 +31,6 @@
 #include <asm/arch/usb.h>
 #include <asm/arch/board.h>
 #include <asm/arch/common.h>
-#include <asm/hardware/clock.h>
 
 static void __init omap_generic_init_irq(void)
 {
diff --git a/arch/arm/mach-omap1/clock.c b/arch/arm/mach-omap1/clock.c
index 4277eee44ed5..9d862f86bba6 100644
--- a/arch/arm/mach-omap1/clock.c
+++ b/arch/arm/mach-omap1/clock.c
@@ -16,9 +16,9 @@
 #include <linux/list.h>
 #include <linux/errno.h>
 #include <linux/err.h>
+#include <linux/clk.h>
 
 #include <asm/io.h>
-#include <asm/hardware/clock.h>
 
 #include <asm/arch/usb.h>
 #include <asm/arch/clock.h>
diff --git a/arch/arm/mach-omap1/serial.c b/arch/arm/mach-omap1/serial.c
index 6810cfb84462..fcfb81d13cfe 100644
--- a/arch/arm/mach-omap1/serial.c
+++ b/arch/arm/mach-omap1/serial.c
@@ -17,10 +17,10 @@
 #include <linux/tty.h>
 #include <linux/serial_8250.h>
 #include <linux/serial_reg.h>
+#include <linux/clk.h>
 
 #include <asm/io.h>
 #include <asm/mach-types.h>
-#include <asm/hardware/clock.h>
 
 #include <asm/arch/board.h>
 #include <asm/arch/mux.h>
diff --git a/arch/arm/mach-omap2/clock.c b/arch/arm/mach-omap2/clock.c
index 85818d9f2635..5407b9549150 100644
--- a/arch/arm/mach-omap2/clock.c
+++ b/arch/arm/mach-omap2/clock.c
@@ -22,10 +22,10 @@
 #include <linux/list.h>
 #include <linux/errno.h>
 #include <linux/delay.h>
+#include <linux/clk.h>
 
 #include <asm/io.h>
 
-#include <asm/hardware/clock.h>
 #include <asm/arch/clock.h>
 #include <asm/arch/sram.h>
 #include <asm/arch/prcm.h>
diff --git a/arch/arm/mach-omap2/serial.c b/arch/arm/mach-omap2/serial.c
index f4df04fe1dd8..e1bd46a96e11 100644
--- a/arch/arm/mach-omap2/serial.c
+++ b/arch/arm/mach-omap2/serial.c
@@ -16,9 +16,9 @@
 #include <linux/init.h>
 #include <linux/serial_8250.h>
 #include <linux/serial_reg.h>
+#include <linux/clk.h>
 
 #include <asm/io.h>
-#include <asm/hardware/clock.h>
 
 #include <asm/arch/common.h>
 #include <asm/arch/board.h>
diff --git a/arch/arm/mach-omap2/timer-gp.c b/arch/arm/mach-omap2/timer-gp.c
index 9ec11443200f..23d36b1c40fe 100644
--- a/arch/arm/mach-omap2/timer-gp.c
+++ b/arch/arm/mach-omap2/timer-gp.c
@@ -21,10 +21,11 @@
 #include <linux/time.h>
 #include <linux/interrupt.h>
 #include <linux/err.h>
+#include <linux/clk.h>
+
 #include <asm/mach/time.h>
 #include <asm/delay.h>
 #include <asm/io.h>
-#include <asm/hardware/clock.h>
 
 #define OMAP2_GP_TIMER1_BASE	0x48028000
 #define OMAP2_GP_TIMER2_BASE	0x4802a000
diff --git a/arch/arm/mach-realview/clock.c b/arch/arm/mach-realview/clock.c
index 331e1b483aa7..ec3f7e798623 100644
--- a/arch/arm/mach-realview/clock.c
+++ b/arch/arm/mach-realview/clock.c
@@ -13,9 +13,9 @@
 #include <linux/list.h>
 #include <linux/errno.h>
 #include <linux/err.h>
+#include <linux/clk.h>
 
 #include <asm/semaphore.h>
-#include <asm/hardware/clock.h>
 #include <asm/hardware/icst307.h>
 
 #include "clock.h"
diff --git a/arch/arm/mach-s3c2410/clock.c b/arch/arm/mach-s3c2410/clock.c
index 77f321fac281..5830ae3ddd19 100644
--- a/arch/arm/mach-s3c2410/clock.c
+++ b/arch/arm/mach-s3c2410/clock.c
@@ -34,16 +34,15 @@
 #include <linux/err.h>
 #include <linux/platform_device.h>
 #include <linux/sysdev.h>
-
 #include <linux/interrupt.h>
 #include <linux/ioport.h>
+#include <linux/clk.h>
 
 #include <asm/hardware.h>
 #include <asm/atomic.h>
 #include <asm/irq.h>
 #include <asm/io.h>
 
-#include <asm/hardware/clock.h>
 #include <asm/arch/regs-clock.h>
 
 #include "clock.h"
diff --git a/arch/arm/mach-s3c2410/s3c2440-clock.c b/arch/arm/mach-s3c2410/s3c2440-clock.c
index c67e0979aec3..b557a2be8a01 100644
--- a/arch/arm/mach-s3c2410/s3c2440-clock.c
+++ b/arch/arm/mach-s3c2410/s3c2440-clock.c
@@ -29,16 +29,15 @@
 #include <linux/err.h>
 #include <linux/device.h>
 #include <linux/sysdev.h>
-
 #include <linux/interrupt.h>
 #include <linux/ioport.h>
+#include <linux/clk.h>
 
 #include <asm/hardware.h>
 #include <asm/atomic.h>
 #include <asm/irq.h>
 #include <asm/io.h>
 
-#include <asm/hardware/clock.h>
 #include <asm/arch/regs-clock.h>
 
 #include "clock.h"
diff --git a/arch/arm/mach-s3c2410/s3c2440.c b/arch/arm/mach-s3c2410/s3c2440.c
index 4d63e7133b48..b7fe6d9453fb 100644
--- a/arch/arm/mach-s3c2410/s3c2440.c
+++ b/arch/arm/mach-s3c2410/s3c2440.c
@@ -28,6 +28,7 @@
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/sysdev.h>
+#include <linux/clk.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
@@ -36,7 +37,6 @@
 #include <asm/hardware.h>
 #include <asm/io.h>
 #include <asm/irq.h>
-#include <asm/hardware/clock.h>
 
 #include <asm/arch/regs-clock.h>
 #include <asm/arch/regs-serial.h>
diff --git a/arch/arm/mach-s3c2410/time.c b/arch/arm/mach-s3c2410/time.c
index 9acda44b25a6..10a2976aefdd 100644
--- a/arch/arm/mach-s3c2410/time.c
+++ b/arch/arm/mach-s3c2410/time.c
@@ -24,6 +24,7 @@
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/err.h>
+#include <linux/clk.h>
 
 #include <asm/system.h>
 #include <asm/leds.h>
@@ -35,7 +36,6 @@
 #include <asm/arch/regs-timer.h>
 #include <asm/arch/regs-irq.h>
 #include <asm/mach/time.h>
-#include <asm/hardware/clock.h>
 
 #include "clock.h"
 #include "cpu.h"
diff --git a/arch/arm/mach-versatile/clock.c b/arch/arm/mach-versatile/clock.c
index ada3142da8dc..dcf10014f5cd 100644
--- a/arch/arm/mach-versatile/clock.c
+++ b/arch/arm/mach-versatile/clock.c
@@ -14,9 +14,9 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/string.h>
+#include <linux/clk.h>
 
 #include <asm/semaphore.h>
-#include <asm/hardware/clock.h>
 #include <asm/hardware/icst307.h>
 
 #include "clock.h"
diff --git a/arch/arm/plat-omap/clock.c b/arch/arm/plat-omap/clock.c
index 7ce39b986e23..84fd65656fcf 100644
--- a/arch/arm/plat-omap/clock.c
+++ b/arch/arm/plat-omap/clock.c
@@ -19,10 +19,10 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/string.h>
+#include <linux/clk.h>
 
 #include <asm/io.h>
 #include <asm/semaphore.h>
-#include <asm/hardware/clock.h>
 
 #include <asm/arch/clock.h>
 
diff --git a/arch/arm/plat-omap/common.c b/arch/arm/plat-omap/common.c
index ccdb452630cf..adffc5a859ee 100644
--- a/arch/arm/plat-omap/common.c
+++ b/arch/arm/plat-omap/common.c
@@ -18,12 +18,12 @@
 #include <linux/tty.h>
 #include <linux/serial_8250.h>
 #include <linux/serial_reg.h>
+#include <linux/clk.h>
 
 #include <asm/hardware.h>
 #include <asm/system.h>
 #include <asm/pgtable.h>
 #include <asm/mach/map.h>
-#include <asm/hardware/clock.h>
 #include <asm/io.h>
 #include <asm/setup.h>
 
diff --git a/arch/arm/plat-omap/cpu-omap.c b/arch/arm/plat-omap/cpu-omap.c
index fd894bb00107..98edc9fdd6d1 100644
--- a/arch/arm/plat-omap/cpu-omap.c
+++ b/arch/arm/plat-omap/cpu-omap.c
@@ -19,13 +19,12 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/err.h>
+#include <linux/clk.h>
 
 #include <asm/hardware.h>
 #include <asm/io.h>
 #include <asm/system.h>
 
-#include <asm/hardware/clock.h>
-
 /* TODO: Add support for SDRAM timing changes */
 
 int omap_verify_speed(struct cpufreq_policy *policy)
diff --git a/arch/arm/plat-omap/gpio.c b/arch/arm/plat-omap/gpio.c
index 76f721d85137..ca3681a824ac 100644
--- a/arch/arm/plat-omap/gpio.c
+++ b/arch/arm/plat-omap/gpio.c
@@ -19,9 +19,9 @@
 #include <linux/ptrace.h>
 #include <linux/sysdev.h>
 #include <linux/err.h>
+#include <linux/clk.h>
 
 #include <asm/hardware.h>
-#include <asm/hardware/clock.h>
 #include <asm/irq.h>
 #include <asm/arch/irqs.h>
 #include <asm/arch/gpio.h>
diff --git a/arch/arm/plat-omap/mcbsp.c b/arch/arm/plat-omap/mcbsp.c
index ea9475c86656..be0e0f32a598 100644
--- a/arch/arm/plat-omap/mcbsp.c
+++ b/arch/arm/plat-omap/mcbsp.c
@@ -19,6 +19,7 @@
 #include <linux/completion.h>
 #include <linux/interrupt.h>
 #include <linux/err.h>
+#include <linux/clk.h>
 
 #include <asm/delay.h>
 #include <asm/io.h>
@@ -30,8 +31,6 @@
 #include <asm/arch/dsp_common.h>
 #include <asm/arch/mcbsp.h>
 
-#include <asm/hardware/clock.h>
-
 #ifdef CONFIG_MCBSP_DEBUG
 #define DBG(x...)	printk(x)
 #else
diff --git a/arch/arm/plat-omap/ocpi.c b/arch/arm/plat-omap/ocpi.c
index b86148227480..e40fcc8b43d4 100644
--- a/arch/arm/plat-omap/ocpi.c
+++ b/arch/arm/plat-omap/ocpi.c
@@ -31,9 +31,9 @@
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/err.h>
+#include <linux/clk.h>
 
 #include <asm/io.h>
-#include <asm/hardware/clock.h>
 #include <asm/hardware.h>
 
 #define OCPI_BASE		0xfffec320
diff --git a/drivers/char/s3c2410-rtc.c b/drivers/char/s3c2410-rtc.c
index 3df7a574267b..2e308657f6f6 100644
--- a/drivers/char/s3c2410-rtc.c
+++ b/drivers/char/s3c2410-rtc.c
@@ -24,6 +24,7 @@
 #include <linux/interrupt.h>
 #include <linux/rtc.h>
 #include <linux/bcd.h>
+#include <linux/clk.h>
 
 #include <asm/hardware.h>
 #include <asm/uaccess.h>
@@ -33,7 +34,6 @@
 
 #include <asm/mach/time.h>
 
-#include <asm/hardware/clock.h>
 #include <asm/arch/regs-rtc.h>
 
 /* need this for the RTC_AF definitions */
diff --git a/drivers/char/watchdog/s3c2410_wdt.c b/drivers/char/watchdog/s3c2410_wdt.c
index 621e8a99e733..9dc54736e4eb 100644
--- a/drivers/char/watchdog/s3c2410_wdt.c
+++ b/drivers/char/watchdog/s3c2410_wdt.c
@@ -46,12 +46,12 @@
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/interrupt.h>
+#include <linux/clk.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
 #include <asm/arch/map.h>
-#include <asm/hardware/clock.h>
 
 #undef S3C24XX_VA_WATCHDOG
 #define S3C24XX_VA_WATCHDOG (0)
diff --git a/drivers/i2c/busses/i2c-s3c2410.c b/drivers/i2c/busses/i2c-s3c2410.c
index 2a2f86d8c2d8..f7d40f8e5f5c 100644
--- a/drivers/i2c/busses/i2c-s3c2410.c
+++ b/drivers/i2c/busses/i2c-s3c2410.c
@@ -34,12 +34,12 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/platform_device.h>
+#include <linux/clk.h>
 
 #include <asm/hardware.h>
 #include <asm/irq.h>
 #include <asm/io.h>
 
-#include <asm/hardware/clock.h>
 #include <asm/arch/regs-gpio.h>
 #include <asm/arch/regs-iic.h>
 #include <asm/arch/iic.h>
diff --git a/drivers/input/serio/ambakmi.c b/drivers/input/serio/ambakmi.c
index cbab5d26377b..3df5eedf8f31 100644
--- a/drivers/input/serio/ambakmi.c
+++ b/drivers/input/serio/ambakmi.c
@@ -21,10 +21,10 @@
 #include <linux/err.h>
 #include <linux/amba/bus.h>
 #include <linux/amba/kmi.h>
+#include <linux/clk.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
-#include <asm/hardware/clock.h>
 
 #define KMI_BASE	(kmi->base)
 
diff --git a/drivers/mmc/mmci.c b/drivers/mmc/mmci.c
index 57375bc12372..a0cd916ab792 100644
--- a/drivers/mmc/mmci.c
+++ b/drivers/mmc/mmci.c
@@ -20,12 +20,12 @@
 #include <linux/mmc/host.h>
 #include <linux/mmc/protocol.h>
 #include <linux/amba/bus.h>
+#include <linux/clk.h>
 
 #include <asm/div64.h>
 #include <asm/io.h>
 #include <asm/scatterlist.h>
 #include <asm/sizes.h>
-#include <asm/hardware/clock.h>
 #include <asm/mach/mmc.h>
 
 #include "mmci.h"
diff --git a/drivers/mtd/nand/s3c2410.c b/drivers/mtd/nand/s3c2410.c
index b796a9a6b924..5b55599739f3 100644
--- a/drivers/mtd/nand/s3c2410.c
+++ b/drivers/mtd/nand/s3c2410.c
@@ -53,6 +53,7 @@
 #include <linux/delay.h>
 #include <linux/err.h>
 #include <linux/slab.h>
+#include <linux/clk.h>
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nand.h>
@@ -60,7 +61,6 @@
 #include <linux/mtd/partitions.h>
 
 #include <asm/io.h>
-#include <asm/hardware/clock.h>
 
 #include <asm/arch/regs-nand.h>
 #include <asm/arch/nand.h>
diff --git a/drivers/serial/amba-pl011.c b/drivers/serial/amba-pl011.c
index 4ae4dff59795..129670556162 100644
--- a/drivers/serial/amba-pl011.c
+++ b/drivers/serial/amba-pl011.c
@@ -49,10 +49,10 @@
 #include <linux/serial.h>
 #include <linux/amba/bus.h>
 #include <linux/amba/serial.h>
+#include <linux/clk.h>
 
 #include <asm/io.h>
 #include <asm/sizes.h>
-#include <asm/hardware/clock.h>
 
 #define UART_NR			14
 
diff --git a/drivers/serial/s3c2410.c b/drivers/serial/s3c2410.c
index eb47f5b71aeb..fe83ce6fef52 100644
--- a/drivers/serial/s3c2410.c
+++ b/drivers/serial/s3c2410.c
@@ -72,12 +72,12 @@
 #include <linux/serial_core.h>
 #include <linux/serial.h>
 #include <linux/delay.h>
+#include <linux/clk.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
 
 #include <asm/hardware.h>
-#include <asm/hardware/clock.h>
 
 #include <asm/arch/regs-serial.h>
 #include <asm/arch/regs-gpio.h>
diff --git a/drivers/usb/host/ohci-omap.c b/drivers/usb/host/ohci-omap.c
index c9e29d808711..3785b3f7df1b 100644
--- a/drivers/usb/host/ohci-omap.c
+++ b/drivers/usb/host/ohci-omap.c
@@ -17,6 +17,7 @@
 #include <linux/signal.h>	/* SA_INTERRUPT */
 #include <linux/jiffies.h>
 #include <linux/platform_device.h>
+#include <linux/clk.h>
 
 #include <asm/hardware.h>
 #include <asm/io.h>
@@ -27,7 +28,6 @@
 #include <asm/arch/gpio.h>
 #include <asm/arch/fpga.h>
 #include <asm/arch/usb.h>
-#include <asm/hardware/clock.h>
 
 
 /* OMAP-1510 OHCI has its own MMU for DMA */
diff --git a/drivers/usb/host/ohci-s3c2410.c b/drivers/usb/host/ohci-s3c2410.c
index add198a4be79..372527a83593 100644
--- a/drivers/usb/host/ohci-s3c2410.c
+++ b/drivers/usb/host/ohci-s3c2410.c
@@ -20,9 +20,9 @@
 */
 
 #include <linux/platform_device.h>
+#include <linux/clk.h>
 
 #include <asm/hardware.h>
-#include <asm/hardware/clock.h>
 #include <asm/arch/usb-control.h>
 
 #define valid_port(idx) ((idx) == 1 || (idx) == 2)
diff --git a/drivers/video/amba-clcd.c b/drivers/video/amba-clcd.c
index 3358a1429651..0da4083ba908 100644
--- a/drivers/video/amba-clcd.c
+++ b/drivers/video/amba-clcd.c
@@ -23,9 +23,9 @@
 #include <linux/list.h>
 #include <linux/amba/bus.h>
 #include <linux/amba/clcd.h>
+#include <linux/clk.h>
 
 #include <asm/sizes.h>
-#include <asm/hardware/clock.h>
 
 #define to_clcd(info)	container_of(info, struct clcd_fb, fb)
 
diff --git a/drivers/video/s3c2410fb.c b/drivers/video/s3c2410fb.c
index d9c08cc7ac44..fe99d17a21d7 100644
--- a/drivers/video/s3c2410fb.c
+++ b/drivers/video/s3c2410fb.c
@@ -87,6 +87,7 @@
 #include <linux/workqueue.h>
 #include <linux/wait.h>
 #include <linux/platform_device.h>
+#include <linux/clk.h>
 
 #include <asm/io.h>
 #include <asm/uaccess.h>
@@ -96,7 +97,6 @@
 #include <asm/arch/regs-lcd.h>
 #include <asm/arch/regs-gpio.h>
 #include <asm/arch/fb.h>
-#include <asm/hardware/clock.h>
 
 #ifdef CONFIG_PM
 #include <linux/pm.h>
diff --git a/include/asm-arm/arch-omap/system.h b/include/asm-arm/arch-omap/system.h
index 9af415d2944a..6724a81bd10b 100644
--- a/include/asm-arm/arch-omap/system.h
+++ b/include/asm-arm/arch-omap/system.h
@@ -5,8 +5,9 @@
 #ifndef __ASM_ARCH_SYSTEM_H
 #define __ASM_ARCH_SYSTEM_H
 #include <linux/config.h>
+#include <linux/clk.h>
+
 #include <asm/mach-types.h>
-#include <asm/hardware/clock.h>
 #include <asm/hardware.h>
 #include <asm/arch/prcm.h>
 
diff --git a/include/asm-arm/hardware/clock.h b/include/asm-arm/hardware/clock.h
deleted file mode 100644
index 69f33215e437..000000000000
--- a/include/asm-arm/hardware/clock.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- *  linux/include/asm-arm/hardware/clock.h
- *
- *  Copyright (C) 2004 ARM Limited.
- *  Written by Deep Blue Solutions Limited.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef ASMARM_CLOCK_H
-#define ASMARM_CLOCK_H
-
-struct device;
-
-/*
- * The base API.
- */
-
-
-/*
- * struct clk - an machine class defined object / cookie.
- */
-struct clk;
-
-/**
- * clk_get - lookup and obtain a reference to a clock producer.
- * @dev: device for clock "consumer"
- * @id: clock comsumer ID
- *
- * Returns a struct clk corresponding to the clock producer, or
- * valid IS_ERR() condition containing errno.  The implementation
- * uses @dev and @id to determine the clock consumer, and thereby
- * the clock producer.  (IOW, @id may be identical strings, but
- * clk_get may return different clock producers depending on @dev.)
- *
- * Drivers must assume that the clock source is not enabled.
- */
-struct clk *clk_get(struct device *dev, const char *id);
-
-/**
- * clk_enable - inform the system when the clock source should be running.
- * @clk: clock source
- *
- * If the clock can not be enabled/disabled, this should return success.
- *
- * Returns success (0) or negative errno.
- */
-int clk_enable(struct clk *clk);
-
-/**
- * clk_disable - inform the system when the clock source is no longer required.
- * @clk: clock source
- *
- * Inform the system that a clock source is no longer required by
- * a driver and may be shut down.
- *
- * Implementation detail: if the clock source is shared between
- * multiple drivers, clk_enable() calls must be balanced by the
- * same number of clk_disable() calls for the clock source to be
- * disabled.
- */
-void clk_disable(struct clk *clk);
-
-/**
- * clk_get_rate - obtain the current clock rate (in Hz) for a clock source.
- *		  This is only valid once the clock source has been enabled.
- * @clk: clock source
- */
-unsigned long clk_get_rate(struct clk *clk);
-
-/**
- * clk_put	- "free" the clock source
- * @clk: clock source
- *
- * Note: drivers must ensure that all clk_enable calls made on this
- * clock source are balanced by clk_disable calls prior to calling
- * this function.
- */
-void clk_put(struct clk *clk);
-
-
-/*
- * The remaining APIs are optional for machine class support.
- */
-
-
-/**
- * clk_round_rate - adjust a rate to the exact rate a clock can provide
- * @clk: clock source
- * @rate: desired clock rate in Hz
- *
- * Returns rounded clock rate in Hz, or negative errno.
- */
-long clk_round_rate(struct clk *clk, unsigned long rate);
- 
-/**
- * clk_set_rate - set the clock rate for a clock source
- * @clk: clock source
- * @rate: desired clock rate in Hz
- *
- * Returns success (0) or negative errno.
- */
-int clk_set_rate(struct clk *clk, unsigned long rate);
- 
-/**
- * clk_set_parent - set the parent clock source for this clock
- * @clk: clock source
- * @parent: parent clock source
- *
- * Returns success (0) or negative errno.
- */
-int clk_set_parent(struct clk *clk, struct clk *parent);
-
-/**
- * clk_get_parent - get the parent clock source for this clock
- * @clk: clock source
- *
- * Returns struct clk corresponding to parent clock source, or
- * valid IS_ERR() condition containing errno.
- */
-struct clk *clk_get_parent(struct clk *clk);
-
-#endif
diff --git a/include/linux/clk.h b/include/linux/clk.h
new file mode 100644
index 000000000000..12848f81bb37
--- /dev/null
+++ b/include/linux/clk.h
@@ -0,0 +1,124 @@
+/*
+ *  linux/include/linux/clk.h
+ *
+ *  Copyright (C) 2004 ARM Limited.
+ *  Written by Deep Blue Solutions Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef ASMARM_CLOCK_H
+#define ASMARM_CLOCK_H
+
+struct device;
+
+/*
+ * The base API.
+ */
+
+
+/*
+ * struct clk - an machine class defined object / cookie.
+ */
+struct clk;
+
+/**
+ * clk_get - lookup and obtain a reference to a clock producer.
+ * @dev: device for clock "consumer"
+ * @id: clock comsumer ID
+ *
+ * Returns a struct clk corresponding to the clock producer, or
+ * valid IS_ERR() condition containing errno.  The implementation
+ * uses @dev and @id to determine the clock consumer, and thereby
+ * the clock producer.  (IOW, @id may be identical strings, but
+ * clk_get may return different clock producers depending on @dev.)
+ *
+ * Drivers must assume that the clock source is not enabled.
+ */
+struct clk *clk_get(struct device *dev, const char *id);
+
+/**
+ * clk_enable - inform the system when the clock source should be running.
+ * @clk: clock source
+ *
+ * If the clock can not be enabled/disabled, this should return success.
+ *
+ * Returns success (0) or negative errno.
+ */
+int clk_enable(struct clk *clk);
+
+/**
+ * clk_disable - inform the system when the clock source is no longer required.
+ * @clk: clock source
+ *
+ * Inform the system that a clock source is no longer required by
+ * a driver and may be shut down.
+ *
+ * Implementation detail: if the clock source is shared between
+ * multiple drivers, clk_enable() calls must be balanced by the
+ * same number of clk_disable() calls for the clock source to be
+ * disabled.
+ */
+void clk_disable(struct clk *clk);
+
+/**
+ * clk_get_rate - obtain the current clock rate (in Hz) for a clock source.
+ *		  This is only valid once the clock source has been enabled.
+ * @clk: clock source
+ */
+unsigned long clk_get_rate(struct clk *clk);
+
+/**
+ * clk_put	- "free" the clock source
+ * @clk: clock source
+ *
+ * Note: drivers must ensure that all clk_enable calls made on this
+ * clock source are balanced by clk_disable calls prior to calling
+ * this function.
+ */
+void clk_put(struct clk *clk);
+
+
+/*
+ * The remaining APIs are optional for machine class support.
+ */
+
+
+/**
+ * clk_round_rate - adjust a rate to the exact rate a clock can provide
+ * @clk: clock source
+ * @rate: desired clock rate in Hz
+ *
+ * Returns rounded clock rate in Hz, or negative errno.
+ */
+long clk_round_rate(struct clk *clk, unsigned long rate);
+ 
+/**
+ * clk_set_rate - set the clock rate for a clock source
+ * @clk: clock source
+ * @rate: desired clock rate in Hz
+ *
+ * Returns success (0) or negative errno.
+ */
+int clk_set_rate(struct clk *clk, unsigned long rate);
+ 
+/**
+ * clk_set_parent - set the parent clock source for this clock
+ * @clk: clock source
+ * @parent: parent clock source
+ *
+ * Returns success (0) or negative errno.
+ */
+int clk_set_parent(struct clk *clk, struct clk *parent);
+
+/**
+ * clk_get_parent - get the parent clock source for this clock
+ * @clk: clock source
+ *
+ * Returns struct clk corresponding to parent clock source, or
+ * valid IS_ERR() condition containing errno.
+ */
+struct clk *clk_get_parent(struct clk *clk);
+
+#endif
-- 
cgit v1.2.3-71-gd317


From 16a6677fdf1d1194f688f8291b06fbaff248c353 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 6 Jan 2006 23:01:48 -0800
Subject: [XFRM]: Netfilter IPsec output hooks

Call netfilter hooks before IPsec transforms. Packets visit the
FORWARD/LOCAL_OUT and POST_ROUTING hook before the first encapsulation
and the LOCAL_OUT and POST_ROUTING hook before each following tunnel mode
transform.

Patch from Herbert Xu <herbert@gondor.apana.org.au>:

Move the loop from dst_output into xfrm4_output/xfrm6_output since they're
the only ones who need to it. xfrm{4,6}_output_one() processes the first SA
all subsequent transport mode SAs and is called in a loop that calls the
netfilter hooks between each two calls.

In order to avoid the tail call issue, I've added the inline function
nf_hook which is nf_hook_slow plus the empty list check.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter.h | 61 +++++++++++++++++++++++---------------
 include/net/dst.h         | 11 +------
 net/ipv4/xfrm4_output.c   | 71 +++++++++++++++++++++++++++++++++-----------
 net/ipv6/xfrm6_output.c   | 75 +++++++++++++++++++++++++++++++++++------------
 4 files changed, 148 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index be365e70ee99..79bb977afeac 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -168,6 +168,37 @@ void nf_log_packet(int pf,
 		   const struct net_device *out,
 		   struct nf_loginfo *li,
 		   const char *fmt, ...);
+
+int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
+		 struct net_device *indev, struct net_device *outdev,
+		 int (*okfn)(struct sk_buff *), int thresh);
+
+/**
+ *	nf_hook_thresh - call a netfilter hook
+ *	
+ *	Returns 1 if the hook has allowed the packet to pass.  The function
+ *	okfn must be invoked by the caller in this case.  Any other return
+ *	value indicates the packet has been consumed by the hook.
+ */
+static inline int nf_hook_thresh(int pf, unsigned int hook,
+				 struct sk_buff **pskb,
+				 struct net_device *indev,
+				 struct net_device *outdev,
+				 int (*okfn)(struct sk_buff *), int thresh)
+{
+#ifndef CONFIG_NETFILTER_DEBUG
+	if (list_empty(&nf_hooks[pf][hook]))
+		return 1;
+#endif
+	return nf_hook_slow(pf, hook, pskb, indev, outdev, okfn, thresh);
+}
+
+static inline int nf_hook(int pf, unsigned int hook, struct sk_buff **pskb,
+			  struct net_device *indev, struct net_device *outdev,
+			  int (*okfn)(struct sk_buff *))
+{
+	return nf_hook_thresh(pf, hook, pskb, indev, outdev, okfn, INT_MIN);
+}
                    
 /* Activate hook; either okfn or kfree_skb called, unless a hook
    returns NF_STOLEN (in which case, it's up to the hook to deal with
@@ -188,35 +219,17 @@ void nf_log_packet(int pf,
 
 /* This is gross, but inline doesn't cut it for avoiding the function
    call in fast path: gcc doesn't inline (needs value tracking?). --RR */
-#ifdef CONFIG_NETFILTER_DEBUG
-#define NF_HOOK(pf, hook, skb, indev, outdev, okfn)			       \
-({int __ret;								       \
-if ((__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, INT_MIN)) == 1) \
-	__ret = (okfn)(skb);						       \
-__ret;})
-#define NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, thresh)	       \
-({int __ret;								       \
-if ((__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, thresh)) == 1)  \
-	__ret = (okfn)(skb);						       \
-__ret;})
-#else
-#define NF_HOOK(pf, hook, skb, indev, outdev, okfn)			       \
-({int __ret;								       \
-if (list_empty(&nf_hooks[pf][hook]) ||					       \
-    (__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, INT_MIN)) == 1) \
-	__ret = (okfn)(skb);						       \
-__ret;})
+
+/* HX: It's slightly less gross now. */
+
 #define NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, thresh)	       \
 ({int __ret;								       \
-if (list_empty(&nf_hooks[pf][hook]) ||					       \
-    (__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, thresh)) == 1)  \
+if ((__ret=nf_hook_thresh(pf, hook, &(skb), indev, outdev, okfn, thresh)) == 1)\
 	__ret = (okfn)(skb);						       \
 __ret;})
-#endif
 
-int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
-		 struct net_device *indev, struct net_device *outdev,
-		 int (*okfn)(struct sk_buff *), int thresh);
+#define NF_HOOK(pf, hook, skb, indev, outdev, okfn) \
+	NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, INT_MIN)
 
 /* Call setsockopt() */
 int nf_setsockopt(struct sock *sk, int pf, int optval, char __user *opt, 
diff --git a/include/net/dst.h b/include/net/dst.h
index bee8b84d329d..5161e89017f9 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -225,16 +225,7 @@ static inline void dst_set_expires(struct dst_entry *dst, int timeout)
 /* Output packet to network from transport.  */
 static inline int dst_output(struct sk_buff *skb)
 {
-	int err;
-
-	for (;;) {
-		err = skb->dst->output(skb);
-
-		if (likely(err == 0))
-			return err;
-		if (unlikely(err != NET_XMIT_BYPASS))
-			return err;
-	}
+	return skb->dst->output(skb);
 }
 
 /* Input packet from network to transport.  */
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 66620a95942a..51fabb8f7c54 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -8,8 +8,10 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/compiler.h>
 #include <linux/skbuff.h>
 #include <linux/spinlock.h>
+#include <linux/netfilter_ipv4.h>
 #include <net/inet_ecn.h>
 #include <net/ip.h>
 #include <net/xfrm.h>
@@ -95,7 +97,7 @@ out:
 	return ret;
 }
 
-int xfrm4_output(struct sk_buff *skb)
+static int xfrm4_output_one(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb->dst;
 	struct xfrm_state *x = dst->xfrm;
@@ -113,27 +115,32 @@ int xfrm4_output(struct sk_buff *skb)
 			goto error_nolock;
 	}
 
-	spin_lock_bh(&x->lock);
-	err = xfrm_state_check(x, skb);
-	if (err)
-		goto error;
+	do {
+		spin_lock_bh(&x->lock);
+		err = xfrm_state_check(x, skb);
+		if (err)
+			goto error;
 
-	xfrm4_encap(skb);
+		xfrm4_encap(skb);
 
-	err = x->type->output(x, skb);
-	if (err)
-		goto error;
+		err = x->type->output(x, skb);
+		if (err)
+			goto error;
 
-	x->curlft.bytes += skb->len;
-	x->curlft.packets++;
+		x->curlft.bytes += skb->len;
+		x->curlft.packets++;
 
-	spin_unlock_bh(&x->lock);
+		spin_unlock_bh(&x->lock);
 	
-	if (!(skb->dst = dst_pop(dst))) {
-		err = -EHOSTUNREACH;
-		goto error_nolock;
-	}
-	err = NET_XMIT_BYPASS;
+		if (!(skb->dst = dst_pop(dst))) {
+			err = -EHOSTUNREACH;
+			goto error_nolock;
+		}
+		dst = skb->dst;
+		x = dst->xfrm;
+	} while (x && !x->props.mode);
+
+	err = 0;
 
 out_exit:
 	return err;
@@ -143,3 +150,33 @@ error_nolock:
 	kfree_skb(skb);
 	goto out_exit;
 }
+
+static int xfrm4_output_finish(struct sk_buff *skb)
+{
+	int err;
+
+	while (likely((err = xfrm4_output_one(skb)) == 0)) {
+		nf_reset(skb);
+
+		err = nf_hook(PF_INET, NF_IP_LOCAL_OUT, &skb, NULL,
+			      skb->dst->dev, dst_output);
+		if (unlikely(err != 1))
+			break;
+
+		if (!skb->dst->xfrm)
+			return dst_output(skb);
+
+		err = nf_hook(PF_INET, NF_IP_POST_ROUTING, &skb, NULL,
+			      skb->dst->dev, xfrm4_output_finish);
+		if (unlikely(err != 1))
+			break;
+	}
+
+	return err;
+}
+
+int xfrm4_output(struct sk_buff *skb)
+{
+	return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev,
+		       xfrm4_output_finish);
+}
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 6b9867717d11..fc0ea38953c4 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -9,9 +9,11 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/compiler.h>
 #include <linux/skbuff.h>
 #include <linux/spinlock.h>
 #include <linux/icmpv6.h>
+#include <linux/netfilter_ipv6.h>
 #include <net/dsfield.h>
 #include <net/inet_ecn.h>
 #include <net/ipv6.h>
@@ -92,7 +94,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb)
 	return ret;
 }
 
-int xfrm6_output(struct sk_buff *skb)
+static int xfrm6_output_one(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb->dst;
 	struct xfrm_state *x = dst->xfrm;
@@ -110,29 +112,34 @@ int xfrm6_output(struct sk_buff *skb)
 			goto error_nolock;
 	}
 
-	spin_lock_bh(&x->lock);
-	err = xfrm_state_check(x, skb);
-	if (err)
-		goto error;
+	do {
+		spin_lock_bh(&x->lock);
+		err = xfrm_state_check(x, skb);
+		if (err)
+			goto error;
 
-	xfrm6_encap(skb);
+		xfrm6_encap(skb);
 
-	err = x->type->output(x, skb);
-	if (err)
-		goto error;
+		err = x->type->output(x, skb);
+		if (err)
+			goto error;
 
-	x->curlft.bytes += skb->len;
-	x->curlft.packets++;
+		x->curlft.bytes += skb->len;
+		x->curlft.packets++;
 
-	spin_unlock_bh(&x->lock);
+		spin_unlock_bh(&x->lock);
 
-	skb->nh.raw = skb->data;
-	
-	if (!(skb->dst = dst_pop(dst))) {
-		err = -EHOSTUNREACH;
-		goto error_nolock;
-	}
-	err = NET_XMIT_BYPASS;
+		skb->nh.raw = skb->data;
+		
+		if (!(skb->dst = dst_pop(dst))) {
+			err = -EHOSTUNREACH;
+			goto error_nolock;
+		}
+		dst = skb->dst;
+		x = dst->xfrm;
+	} while (x && !x->props.mode);
+
+	err = 0;
 
 out_exit:
 	return err;
@@ -142,3 +149,33 @@ error_nolock:
 	kfree_skb(skb);
 	goto out_exit;
 }
+
+static int xfrm6_output_finish(struct sk_buff *skb)
+{
+	int err;
+
+	while (likely((err = xfrm6_output_one(skb)) == 0)) {
+		nf_reset(skb);
+	
+		err = nf_hook(PF_INET6, NF_IP6_LOCAL_OUT, &skb, NULL,
+			      skb->dst->dev, dst_output);
+		if (unlikely(err != 1))
+			break;
+
+		if (!skb->dst->xfrm)
+			return dst_output(skb);
+
+		err = nf_hook(PF_INET6, NF_IP6_POST_ROUTING, &skb, NULL,
+			      skb->dst->dev, xfrm6_output_finish);
+		if (unlikely(err != 1))
+			break;
+	}
+
+	return err;
+}
+
+int xfrm6_output(struct sk_buff *skb)
+{
+	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dst->dev,
+		       xfrm6_output_finish);
+}
-- 
cgit v1.2.3-71-gd317


From 951dbc8ac714b04c36296b8b5c36c8e036ce433f Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 6 Jan 2006 23:02:34 -0800
Subject: [IPV6]: Move nextheader offset to the IP6CB

Move nextheader offset to the IP6CB to make it possible to pass a
packet to ip6_input_finish multiple times and have it skip already
parsed headers. As a nice side effect this gets rid of the manual
hopopts skipping in ip6_input_finish.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h    |  1 +
 include/net/protocol.h  |  2 +-
 include/net/xfrm.h      |  6 +++---
 net/dccp/ipv6.c         |  2 +-
 net/ipv6/exthdrs.c      | 19 ++++++++++++-------
 net/ipv6/icmp.c         |  4 ++--
 net/ipv6/ip6_input.c    | 21 ++++++---------------
 net/ipv6/ip6_tunnel.c   |  2 +-
 net/ipv6/reassembly.c   | 11 +++++------
 net/ipv6/tcp_ipv6.c     |  2 +-
 net/ipv6/udp.c          |  2 +-
 net/ipv6/xfrm6_input.c  |  8 ++++----
 net/ipv6/xfrm6_tunnel.c |  6 +++---
 net/sctp/ipv6.c         |  2 +-
 14 files changed, 42 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 93bbed5c6cf4..5cfc71529595 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -191,6 +191,7 @@ struct inet6_skb_parm {
 	__u16			srcrt;
 	__u16			dst1;
 	__u16			lastopt;
+	__u32			nhoff;
 };
 
 #define IP6CB(skb)	((struct inet6_skb_parm*)((skb)->cb))
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 63f7db99c2a6..6dc5970612d7 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -43,7 +43,7 @@ struct net_protocol {
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 struct inet6_protocol 
 {
-	int	(*handler)(struct sk_buff **skb, unsigned int *nhoffp);
+	int	(*handler)(struct sk_buff **skb);
 
 	void	(*err_handler)(struct sk_buff *skb,
 			       struct inet6_skb_parm *opt,
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 07d7b50cdd76..297d09d28fe4 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -831,7 +831,7 @@ struct xfrm_tunnel {
 };
 
 struct xfrm6_tunnel {
-	int (*handler)(struct sk_buff **pskb, unsigned int *nhoffp);
+	int (*handler)(struct sk_buff **pskb);
 	void (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			    int type, int code, int offset, __u32 info);
 };
@@ -868,8 +868,8 @@ extern int xfrm4_rcv(struct sk_buff *skb);
 extern int xfrm4_output(struct sk_buff *skb);
 extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler);
 extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler);
-extern int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi);
-extern int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp);
+extern int xfrm6_rcv_spi(struct sk_buff **pskb, u32 spi);
+extern int xfrm6_rcv(struct sk_buff **pskb);
 extern int xfrm6_tunnel_register(struct xfrm6_tunnel *handler);
 extern int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler);
 extern u32 xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 683250a05f58..df074259f9c3 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1029,7 +1029,7 @@ discard:
 	return 0;
 }
 
-static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int dccp_v6_rcv(struct sk_buff **pskb)
 {
 	const struct dccp_hdr *dh;
 	struct sk_buff *skb = *pskb;
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 113374dc342c..2a1e7e45b890 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -152,7 +152,7 @@ static struct tlvtype_proc tlvprocdestopt_lst[] = {
 	{-1,			NULL}
 };
 
-static int ipv6_destopt_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+static int ipv6_destopt_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 	struct inet6_skb_parm *opt = IP6CB(skb);
@@ -169,7 +169,7 @@ static int ipv6_destopt_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
 
 	if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) {
 		skb->h.raw += ((skb->h.raw[1]+1)<<3);
-		*nhoffp = opt->dst1;
+		opt->nhoff = opt->dst1;
 		return 1;
 	}
 
@@ -192,7 +192,7 @@ void __init ipv6_destopt_init(void)
   NONE header. No data in packet.
  ********************************/
 
-static int ipv6_nodata_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+static int ipv6_nodata_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 
@@ -215,7 +215,7 @@ void __init ipv6_nodata_init(void)
   Routing header.
  ********************************/
 
-static int ipv6_rthdr_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+static int ipv6_rthdr_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 	struct inet6_skb_parm *opt = IP6CB(skb);
@@ -249,7 +249,7 @@ looped_back:
 		skb->h.raw += (hdr->hdrlen + 1) << 3;
 		opt->dst0 = opt->dst1;
 		opt->dst1 = 0;
-		*nhoffp = (&hdr->nexthdr) - skb->nh.raw;
+		opt->nhoff = (&hdr->nexthdr) - skb->nh.raw;
 		return 1;
 	}
 
@@ -487,9 +487,14 @@ static struct tlvtype_proc tlvprochopopt_lst[] = {
 
 int ipv6_parse_hopopts(struct sk_buff *skb, int nhoff)
 {
-	IP6CB(skb)->hop = sizeof(struct ipv6hdr);
-	if (ip6_parse_tlv(tlvprochopopt_lst, skb))
+	struct inet6_skb_parm *opt = IP6CB(skb);
+
+	opt->hop = sizeof(struct ipv6hdr);
+	if (ip6_parse_tlv(tlvprochopopt_lst, skb)) {
+		skb->h.raw += (skb->h.raw[1]+1)<<3;
+		opt->nhoff = sizeof(struct ipv6hdr);
 		return sizeof(struct ipv6hdr);
+	}
 	return -1;
 }
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 6ec6a2b549bb..53c81fcd20ba 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -79,7 +79,7 @@ DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics) __read_mostly;
 static DEFINE_PER_CPU(struct socket *, __icmpv6_socket) = NULL;
 #define icmpv6_socket	__get_cpu_var(__icmpv6_socket)
 
-static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp);
+static int icmpv6_rcv(struct sk_buff **pskb);
 
 static struct inet6_protocol icmpv6_protocol = {
 	.handler	=	icmpv6_rcv,
@@ -581,7 +581,7 @@ static void icmpv6_notify(struct sk_buff *skb, int type, int code, u32 info)
  *	Handle icmp messages
  */
 
-static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int icmpv6_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct net_device *dev = skb->dev;
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index a6026d2787d2..13d724150f33 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -97,6 +97,9 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 	if (hdr->version != 6)
 		goto err;
 
+	skb->h.raw = (u8 *)(hdr + 1);
+	IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
+
 	pkt_len = ntohs(hdr->payload_len);
 
 	/* pkt_len may be zero if Jumbo payload option is present */
@@ -111,8 +114,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 	}
 
 	if (hdr->nexthdr == NEXTHDR_HOP) {
-		skb->h.raw = (u8*)(hdr+1);
-		if (ipv6_parse_hopopts(skb, offsetof(struct ipv6hdr, nexthdr)) < 0) {
+		if (ipv6_parse_hopopts(skb, IP6CB(skb)->nhoff) < 0) {
 			IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
 			return 0;
 		}
@@ -143,26 +145,15 @@ static inline int ip6_input_finish(struct sk_buff *skb)
 	int nexthdr;
 	u8 hash;
 
-	skb->h.raw = skb->nh.raw + sizeof(struct ipv6hdr);
-
 	/*
 	 *	Parse extension headers
 	 */
 
-	nexthdr = skb->nh.ipv6h->nexthdr;
-	nhoff = offsetof(struct ipv6hdr, nexthdr);
-
-	/* Skip hop-by-hop options, they are already parsed. */
-	if (nexthdr == NEXTHDR_HOP) {
-		nhoff = sizeof(struct ipv6hdr);
-		nexthdr = skb->h.raw[0];
-		skb->h.raw += (skb->h.raw[1]+1)<<3;
-	}
-
 	rcu_read_lock();
 resubmit:
 	if (!pskb_pull(skb, skb->h.raw - skb->data))
 		goto discard;
+	nhoff = IP6CB(skb)->nhoff;
 	nexthdr = skb->nh.raw[nhoff];
 
 	raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]);
@@ -194,7 +185,7 @@ resubmit:
 		    !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) 
 			goto discard;
 		
-		ret = ipprot->handler(&skb, &nhoff);
+		ret = ipprot->handler(&skb);
 		if (ret > 0)
 			goto resubmit;
 		else if (ret == 0)
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index e315d0f80af1..f079621c8b67 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -510,7 +510,7 @@ static inline void ip6ip6_ecn_decapsulate(struct ipv6hdr *outer_iph,
  **/
 
 static int 
-ip6ip6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+ip6ip6_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct ipv6hdr *ipv6h;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 5d316cb72ec9..15e1456b3f18 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -581,7 +581,6 @@ err:
  *	the last and the first frames arrived and all the bits are here.
  */
 static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in,
-			  unsigned int *nhoffp,
 			  struct net_device *dev)
 {
 	struct sk_buff *fp, *head = fq->fragments;
@@ -654,6 +653,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in,
 	head->dev = dev;
 	skb_set_timestamp(head, &fq->stamp);
 	head->nh.ipv6h->payload_len = htons(payload_len);
+	IP6CB(head)->nhoff = nhoff;
 
 	*skb_in = head;
 
@@ -663,7 +663,6 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in,
 
 	IP6_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
 	fq->fragments = NULL;
-	*nhoffp = nhoff;
 	return 1;
 
 out_oversize:
@@ -678,7 +677,7 @@ out_fail:
 	return -1;
 }
 
-static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+static int ipv6_frag_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp; 
 	struct net_device *dev = skb->dev;
@@ -710,7 +709,7 @@ static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
 		skb->h.raw += sizeof(struct frag_hdr);
 		IP6_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
 
-		*nhoffp = (u8*)fhdr - skb->nh.raw;
+		IP6CB(skb)->nhoff = (u8*)fhdr - skb->nh.raw;
 		return 1;
 	}
 
@@ -722,11 +721,11 @@ static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
 
 		spin_lock(&fq->lock);
 
-		ip6_frag_queue(fq, skb, fhdr, *nhoffp);
+		ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
 
 		if (fq->last_in == (FIRST_IN|LAST_IN) &&
 		    fq->meat == fq->len)
-			ret = ip6_frag_reasm(fq, skbp, nhoffp, dev);
+			ret = ip6_frag_reasm(fq, skbp, dev);
 
 		spin_unlock(&fq->lock);
 		fq_put(fq, NULL);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2947bc56d8a0..a25f4e8a8ada 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1153,7 +1153,7 @@ ipv6_pktoptions:
 	return 0;
 }
 
-static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int tcp_v6_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct tcphdr *th;	
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d8538dcea813..c47648892c04 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -435,7 +435,7 @@ out:
 	read_unlock(&udp_hash_lock);
 }
 
-static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int udpv6_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct sock *sk;
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 28c29d78338e..1079e47f3933 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -26,7 +26,7 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
 		IP6_ECN_set_ce(inner_iph);
 }
 
-int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi)
+int xfrm6_rcv_spi(struct sk_buff **pskb, u32 spi)
 {
 	struct sk_buff *skb = *pskb;
 	int err;
@@ -38,7 +38,7 @@ int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi)
 	int nexthdr;
 	unsigned int nhoff;
 
-	nhoff = *nhoffp;
+	nhoff = IP6CB(skb)->nhoff;
 	nexthdr = skb->nh.raw[nhoff];
 
 	seq = 0;
@@ -144,7 +144,7 @@ drop:
 
 EXPORT_SYMBOL(xfrm6_rcv_spi);
 
-int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+int xfrm6_rcv(struct sk_buff **pskb)
 {
-	return xfrm6_rcv_spi(pskb, nhoffp, 0);
+	return xfrm6_rcv_spi(pskb, 0);
 }
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index fbef7826a74f..da09ff258648 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -397,7 +397,7 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler)
 
 EXPORT_SYMBOL(xfrm6_tunnel_deregister);
 
-static int xfrm6_tunnel_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int xfrm6_tunnel_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct xfrm6_tunnel *handler = xfrm6_tunnel_handler;
@@ -405,11 +405,11 @@ static int xfrm6_tunnel_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
 	u32 spi;
 
 	/* device-like_ip6ip6_handler() */
-	if (handler && handler->handler(pskb, nhoffp) == 0)
+	if (handler && handler->handler(pskb) == 0)
 		return 0;
 
 	spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&iph->saddr);
-	return xfrm6_rcv_spi(pskb, nhoffp, spi);
+	return xfrm6_rcv_spi(pskb, spi);
 }
 
 static void xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 15c05165c905..04c7fab4edc4 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -905,7 +905,7 @@ static struct inet_protosw sctpv6_stream_protosw = {
 	.flags         = SCTP_PROTOSW_FLAG,
 };
 
-static int sctp6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int sctp6_rcv(struct sk_buff **pskb)
 {
 	return sctp_rcv(*pskb) ? -1 : 0;
 }
-- 
cgit v1.2.3-71-gd317


From 3e3850e989c5d2eb1aab6f0fd9257759f0f4cbc6 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 6 Jan 2006 23:04:54 -0800
Subject: [NETFILTER]: Fix xfrm lookup in
 ip_route_me_harder/ip6_route_me_harder

ip_route_me_harder doesn't use the port numbers of the xfrm lookup and
uses ip_route_input for non-local addresses which doesn't do a xfrm
lookup, ip6_route_me_harder doesn't do a xfrm lookup at all.

Use xfrm_decode_session and do the lookup manually, make sure both
only do the lookup if the packet hasn't been transformed already.

Makeing sure the lookup only happens once needs a new field in the
IP6CB, which exceeds the size of skb->cb. The size of skb->cb is
increased to 48b. Apparently the IPv6 mobile extensions need some
more room anyway.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h    |  3 +++
 include/linux/skbuff.h  |  2 +-
 include/net/ip.h        |  3 ++-
 include/net/xfrm.h      |  2 +-
 net/ipv4/ip_gre.c       |  2 +-
 net/ipv4/ipip.c         |  2 +-
 net/ipv4/netfilter.c    | 12 ++++++++++--
 net/ipv4/xfrm4_output.c |  1 +
 net/ipv6/netfilter.c    |  9 ++++++++-
 net/ipv6/xfrm6_output.c |  1 +
 net/xfrm/xfrm_policy.c  |  9 +++++----
 11 files changed, 34 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 5cfc71529595..9c8f4c9ed429 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -192,6 +192,9 @@ struct inet6_skb_parm {
 	__u16			dst1;
 	__u16			lastopt;
 	__u32			nhoff;
+	__u16			flags;
+
+#define IP6SKB_XFRM_TRANSFORMED	1
 };
 
 #define IP6CB(skb)	((struct inet6_skb_parm*)((skb)->cb))
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 483cfc47ec34..e5fd66c5650b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -251,7 +251,7 @@ struct sk_buff {
 	 * want to keep them across layers you have to do a skb_clone()
 	 * first. This is owned by whoever has the skb queued ATM.
 	 */
-	char			cb[40];
+	char			cb[48];
 
 	unsigned int		len,
 				data_len,
diff --git a/include/net/ip.h b/include/net/ip.h
index 52f4d9c69704..a494d04e5dea 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -39,7 +39,8 @@ struct inet_skb_parm
 
 #define IPSKB_FORWARDED		1
 #define IPSKB_XFRM_TUNNEL_SIZE	2
-#define IPSKB_FRAG_COMPLETE	4
+#define IPSKB_XFRM_TRANSFORMED	4
+#define IPSKB_FRAG_COMPLETE	8
 };
 
 struct ipcm_cookie
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 297d09d28fe4..d6111a2f0a23 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -668,7 +668,7 @@ static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *s
 	return xfrm_policy_check(sk, dir, skb, AF_INET6);
 }
 
-
+extern int xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family);
 extern int __xfrm_route_forward(struct sk_buff *skb, unsigned short family);
 
 static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 65c3a91ed85e..de16e944777f 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -832,7 +832,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	skb->h.raw = skb->nh.raw;
 	skb->nh.raw = skb_push(skb, gre_hlen);
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-	IPCB(skb)->flags &= ~IPSKB_XFRM_TUNNEL_SIZE;
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE|IPSKB_XFRM_TRANSFORMED);
 	dst_release(skb->dst);
 	skb->dst = &rt->u.dst;
 
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 078b59be91f4..bbd85f5ec985 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -621,7 +621,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	skb->h.raw = skb->nh.raw;
 	skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-	IPCB(skb)->flags &= ~IPSKB_XFRM_TUNNEL_SIZE;
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE|IPSKB_XFRM_TRANSFORMED);
 	dst_release(skb->dst);
 	skb->dst = &rt->u.dst;
 
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index ae0779d82c5d..4c637a1cbd23 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -7,11 +7,13 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
 
+#include <linux/ip.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/icmp.h>
 #include <net/route.h>
-#include <linux/ip.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
 
 /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
 int ip_route_me_harder(struct sk_buff **pskb)
@@ -33,7 +35,6 @@ int ip_route_me_harder(struct sk_buff **pskb)
 #ifdef CONFIG_IP_ROUTE_FWMARK
 		fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
 #endif
-		fl.proto = iph->protocol;
 		if (ip_route_output_key(&rt, &fl) != 0)
 			return -1;
 
@@ -60,6 +61,13 @@ int ip_route_me_harder(struct sk_buff **pskb)
 	if ((*pskb)->dst->error)
 		return -1;
 
+#ifdef CONFIG_XFRM
+	if (!(IPCB(*pskb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+	    xfrm_decode_session(*pskb, &fl, AF_INET) == 0)
+		if (xfrm_lookup(&(*pskb)->dst, &fl, (*pskb)->sk, 0))
+			return -1;
+#endif
+
 	/* Change in oif may mean change in hh_len. */
 	hh_len = (*pskb)->dst->dev->hard_header_len;
 	if (skb_headroom(*pskb) < hh_len) {
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 51fabb8f7c54..160c48800ab8 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -140,6 +140,7 @@ static int xfrm4_output_one(struct sk_buff *skb)
 		x = dst->xfrm;
 	} while (x && !x->props.mode);
 
+	IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
 	err = 0;
 
 out_exit:
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index f8626ebf90fd..b63678328a3b 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -10,6 +10,7 @@
 #include <net/dst.h>
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#include <net/xfrm.h>
 
 int ip6_route_me_harder(struct sk_buff *skb)
 {
@@ -21,11 +22,17 @@ int ip6_route_me_harder(struct sk_buff *skb)
 		{ .ip6_u =
 		  { .daddr = iph->daddr,
 		    .saddr = iph->saddr, } },
-		.proto = iph->nexthdr,
 	};
 
 	dst = ip6_route_output(skb->sk, &fl);
 
+#ifdef CONFIG_XFRM
+	if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
+	    xfrm_decode_session(skb, &fl, AF_INET6) == 0)
+		if (xfrm_lookup(&skb->dst, &fl, skb->sk, 0))
+			return -1;
+#endif
+
 	if (dst->error) {
 		IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 		LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n");
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index fc0ea38953c4..80242172a5df 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -139,6 +139,7 @@ static int xfrm6_output_one(struct sk_buff *skb)
 		x = dst->xfrm;
 	} while (x && !x->props.mode);
 
+	IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED;
 	err = 0;
 
 out_exit:
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 64a447375fdb..f2edc9225b6a 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -951,8 +951,8 @@ xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start,
 	return start;
 }
 
-static int
-_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family)
+int
+xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family)
 {
 	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
 
@@ -963,6 +963,7 @@ _decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family)
 	xfrm_policy_put_afinfo(afinfo);
 	return 0;
 }
+EXPORT_SYMBOL(xfrm_decode_session);
 
 static inline int secpath_has_tunnel(struct sec_path *sp, int k)
 {
@@ -982,7 +983,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 	u8 fl_dir = policy_to_flow_dir(dir);
 	u32 sk_sid;
 
-	if (_decode_session(skb, &fl, family) < 0)
+	if (xfrm_decode_session(skb, &fl, family) < 0)
 		return 0;
 
 	sk_sid = security_sk_sid(sk, &fl, fl_dir);
@@ -1055,7 +1056,7 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
 {
 	struct flowi fl;
 
-	if (_decode_session(skb, &fl, family) < 0)
+	if (xfrm_decode_session(skb, &fl, family) < 0)
 		return 0;
 
 	return xfrm_lookup(&skb->dst, &fl, NULL, 0) == 0;
-- 
cgit v1.2.3-71-gd317


From eb9c7ebe6980c41cf6ae889e301c3b49f473ee9f Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 6 Jan 2006 23:06:30 -0800
Subject: [NETFILTER]: Handle NAT in IPsec policy checks

Handle NAT of decapsulated IPsec packets by reconstructing the struct flowi
of the original packet from the conntrack information for IPsec policy
checks.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter.h              | 16 +++++++++++
 net/dccp/ipv4.c                        |  2 +-
 net/ipv4/netfilter.c                   |  3 ++
 net/ipv4/netfilter/ip_nat_standalone.c | 50 ++++++++++++++++++++++++++++++++--
 net/xfrm/xfrm_policy.c                 |  2 ++
 5 files changed, 70 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 79bb977afeac..84506dfa1f37 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -274,6 +274,20 @@ struct nf_queue_rerouter {
 extern int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer);
 extern int nf_unregister_queue_rerouter(int pf);
 
+#include <net/flow.h>
+extern void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *);
+
+static inline void
+nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, int family)
+{
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+	void (*decodefn)(struct sk_buff *, struct flowi *);
+
+	if (family == AF_INET && (decodefn = ip_nat_decode_session) != NULL)
+		decodefn(skb, fl);
+#endif
+}
+
 #ifdef CONFIG_PROC_FS
 #include <linux/proc_fs.h>
 extern struct proc_dir_entry *proc_net_netfilter;
@@ -282,6 +296,8 @@ extern struct proc_dir_entry *proc_net_netfilter;
 #else /* !CONFIG_NETFILTER */
 #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb)
 static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
+static inline void
+nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, int family) {}
 #endif /*CONFIG_NETFILTER*/
 
 #endif /*__KERNEL__*/
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 23ba177c1150..00f983226672 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -986,6 +986,7 @@ int dccp_v4_rcv(struct sk_buff *skb)
 
 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
 		goto discard_and_relse;
+	nf_reset(skb);
 
 	return sk_receive_skb(sk, skb);
 
@@ -1099,7 +1100,6 @@ int dccp_v4_destroy_sock(struct sock *sk)
 		kfree_skb(sk->sk_send_head);
 		sk->sk_send_head = NULL;
 	}
-	nf_reset(skb);
 
 	/* Clean up a referenced DCCP bind bucket. */
 	if (inet_csk(sk)->icsk_bind_hash != NULL)
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 4c637a1cbd23..3321092b0914 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -86,6 +86,9 @@ int ip_route_me_harder(struct sk_buff **pskb)
 }
 EXPORT_SYMBOL(ip_route_me_harder);
 
+void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *);
+EXPORT_SYMBOL(ip_nat_decode_session);
+
 /*
  * Extra routing may needed on local out, as the QUEUE target never
  * returns control to the table.
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index b518697af4db..8b8a1f00bbf4 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -55,6 +55,44 @@
 			         : ((hooknum) == NF_IP_LOCAL_IN ? "LOCAL_IN"  \
 				    : "*ERROR*")))
 
+#ifdef CONFIG_XFRM
+static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
+{
+	struct ip_conntrack *ct;
+	struct ip_conntrack_tuple *t;
+	enum ip_conntrack_info ctinfo;
+	enum ip_conntrack_dir dir;
+	unsigned long statusbit;
+
+	ct = ip_conntrack_get(skb, &ctinfo);
+	if (ct == NULL)
+		return;
+	dir = CTINFO2DIR(ctinfo);
+	t = &ct->tuplehash[dir].tuple;
+
+	if (dir == IP_CT_DIR_ORIGINAL)
+		statusbit = IPS_DST_NAT;
+	else
+		statusbit = IPS_SRC_NAT;
+
+	if (ct->status & statusbit) {
+		fl->fl4_dst = t->dst.ip;
+		if (t->dst.protonum == IPPROTO_TCP ||
+		    t->dst.protonum == IPPROTO_UDP)
+			fl->fl_ip_dport = t->dst.u.tcp.port;
+	}
+
+	statusbit ^= IPS_NAT_MASK;
+
+	if (ct->status & statusbit) {
+		fl->fl4_src = t->src.ip;
+		if (t->dst.protonum == IPPROTO_TCP ||
+		    t->dst.protonum == IPPROTO_UDP)
+			fl->fl_ip_sport = t->src.u.tcp.port;
+	}
+}
+#endif
+		
 static unsigned int
 ip_nat_fn(unsigned int hooknum,
 	  struct sk_buff **pskb,
@@ -330,10 +368,14 @@ static int init_or_cleanup(int init)
 
 	if (!init) goto cleanup;
 
+#ifdef CONFIG_XFRM
+	BUG_ON(ip_nat_decode_session != NULL);
+	ip_nat_decode_session = nat_decode_session;
+#endif
 	ret = ip_nat_rule_init();
 	if (ret < 0) {
 		printk("ip_nat_init: can't setup rules.\n");
-		goto cleanup_nothing;
+		goto cleanup_decode_session;
 	}
 	ret = nf_register_hook(&ip_nat_in_ops);
 	if (ret < 0) {
@@ -381,7 +423,11 @@ static int init_or_cleanup(int init)
 	nf_unregister_hook(&ip_nat_in_ops);
  cleanup_rule_init:
 	ip_nat_rule_cleanup();
- cleanup_nothing:
+ cleanup_decode_session:
+#ifdef CONFIG_XFRM
+	ip_nat_decode_session = NULL;
+	synchronize_net();
+#endif
 	return ret;
 }
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index f2edc9225b6a..59614a994b4e 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -22,6 +22,7 @@
 #include <linux/workqueue.h>
 #include <linux/notifier.h>
 #include <linux/netdevice.h>
+#include <linux/netfilter.h>
 #include <linux/module.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
@@ -985,6 +986,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 
 	if (xfrm_decode_session(skb, &fl, family) < 0)
 		return 0;
+	nf_nat_decode_session(skb, &fl, family);
 
 	sk_sid = security_sk_sid(sk, &fl, fl_dir);
 
-- 
cgit v1.2.3-71-gd317


From e16a8f0b8c53312beb1d8b52e463aae79aa809c7 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 6 Jan 2006 23:06:48 -0800
Subject: [NETFILTER]: Add ipt_policy/ip6t_policy matches

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ipt_policy.h  |  52 +++++++++
 include/linux/netfilter_ipv6/ip6t_policy.h |  52 +++++++++
 net/ipv4/netfilter/Kconfig                 |  10 ++
 net/ipv4/netfilter/Makefile                |   1 +
 net/ipv4/netfilter/ipt_policy.c            | 170 ++++++++++++++++++++++++++++
 net/ipv6/netfilter/Kconfig                 |  10 ++
 net/ipv6/netfilter/Makefile                |   1 +
 net/ipv6/netfilter/ip6t_policy.c           | 175 +++++++++++++++++++++++++++++
 8 files changed, 471 insertions(+)
 create mode 100644 include/linux/netfilter_ipv4/ipt_policy.h
 create mode 100644 include/linux/netfilter_ipv6/ip6t_policy.h
 create mode 100644 net/ipv4/netfilter/ipt_policy.c
 create mode 100644 net/ipv6/netfilter/ip6t_policy.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ipt_policy.h b/include/linux/netfilter_ipv4/ipt_policy.h
new file mode 100644
index 000000000000..7fd1bec453f1
--- /dev/null
+++ b/include/linux/netfilter_ipv4/ipt_policy.h
@@ -0,0 +1,52 @@
+#ifndef _IPT_POLICY_H
+#define _IPT_POLICY_H
+
+#define IPT_POLICY_MAX_ELEM	4
+
+enum ipt_policy_flags
+{
+	IPT_POLICY_MATCH_IN	= 0x1,
+	IPT_POLICY_MATCH_OUT	= 0x2,
+	IPT_POLICY_MATCH_NONE	= 0x4,
+	IPT_POLICY_MATCH_STRICT	= 0x8,
+};
+
+enum ipt_policy_modes
+{
+	IPT_POLICY_MODE_TRANSPORT,
+	IPT_POLICY_MODE_TUNNEL
+};
+
+struct ipt_policy_spec
+{
+	u_int8_t	saddr:1,
+			daddr:1,
+			proto:1,
+			mode:1,
+			spi:1,
+			reqid:1;
+};
+
+struct ipt_policy_elem
+{
+	u_int32_t	saddr;
+	u_int32_t	smask;
+	u_int32_t	daddr;
+	u_int32_t	dmask;
+	u_int32_t	spi;
+	u_int32_t	reqid;
+	u_int8_t	proto;
+	u_int8_t	mode;
+
+	struct ipt_policy_spec	match;
+	struct ipt_policy_spec	invert;
+};
+
+struct ipt_policy_info
+{
+	struct ipt_policy_elem pol[IPT_POLICY_MAX_ELEM];
+	u_int16_t flags;
+	u_int16_t len;
+};
+
+#endif /* _IPT_POLICY_H */
diff --git a/include/linux/netfilter_ipv6/ip6t_policy.h b/include/linux/netfilter_ipv6/ip6t_policy.h
new file mode 100644
index 000000000000..5a93afcd2ff1
--- /dev/null
+++ b/include/linux/netfilter_ipv6/ip6t_policy.h
@@ -0,0 +1,52 @@
+#ifndef _IP6T_POLICY_H
+#define _IP6T_POLICY_H
+
+#define IP6T_POLICY_MAX_ELEM	4
+
+enum ip6t_policy_flags
+{
+	IP6T_POLICY_MATCH_IN		= 0x1,
+	IP6T_POLICY_MATCH_OUT		= 0x2,
+	IP6T_POLICY_MATCH_NONE		= 0x4,
+	IP6T_POLICY_MATCH_STRICT	= 0x8,
+};
+
+enum ip6t_policy_modes
+{
+	IP6T_POLICY_MODE_TRANSPORT,
+	IP6T_POLICY_MODE_TUNNEL
+};
+
+struct ip6t_policy_spec
+{
+	u_int8_t	saddr:1,
+			daddr:1,
+			proto:1,
+			mode:1,
+			spi:1,
+			reqid:1;
+};
+
+struct ip6t_policy_elem
+{
+	struct in6_addr	saddr;
+	struct in6_addr	smask;
+	struct in6_addr	daddr;
+	struct in6_addr	dmask;
+	u_int32_t	spi;
+	u_int32_t	reqid;
+	u_int8_t	proto;
+	u_int8_t	mode;
+
+	struct ip6t_policy_spec	match;
+	struct ip6t_policy_spec	invert;
+};
+
+struct ip6t_policy_info
+{
+	struct ip6t_policy_elem pol[IP6T_POLICY_MAX_ELEM];
+	u_int16_t flags;
+	u_int16_t len;
+};
+
+#endif /* _IP6T_POLICY_H */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 88a60650e6b8..a9893ec03e02 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -487,6 +487,16 @@ config IP_NF_MATCH_STRING
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP_NF_MATCH_POLICY
+       tristate "IPsec policy match support"
+       depends on IP_NF_IPTABLES && XFRM
+       help
+         Policy matching allows you to match packets based on the
+         IPsec policy that was used during decapsulation/will
+         be used during encapsulation.
+
+         To compile it as a module, choose M here.  If unsure, say N.
+
 # `filter', generic and specific targets
 config IP_NF_FILTER
 	tristate "Packet filtering"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index d0a447e520a2..549b01a648b3 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -72,6 +72,7 @@ obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o
 obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o
 obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
 obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
+obj-$(CONFIG_IP_NF_MATCH_POLICY) += ipt_policy.o
 obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o
 obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o
 
diff --git a/net/ipv4/netfilter/ipt_policy.c b/net/ipv4/netfilter/ipt_policy.c
new file mode 100644
index 000000000000..709debcc69c9
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_policy.c
@@ -0,0 +1,170 @@
+/* IP tables module for matching IPsec policy
+ *
+ * Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/xfrm.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_policy.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("IPtables IPsec policy matching module");
+MODULE_LICENSE("GPL");
+
+
+static inline int
+match_xfrm_state(struct xfrm_state *x, const struct ipt_policy_elem *e)
+{
+#define MATCH(x,y)	(!e->match.x || ((e->x == (y)) ^ e->invert.x))
+
+	return MATCH(saddr, x->props.saddr.a4 & e->smask) &&
+	       MATCH(daddr, x->id.daddr.a4 & e->dmask) &&
+	       MATCH(proto, x->id.proto) &&
+	       MATCH(mode, x->props.mode) &&
+	       MATCH(spi, x->id.spi) &&
+	       MATCH(reqid, x->props.reqid);
+}
+
+static int
+match_policy_in(const struct sk_buff *skb, const struct ipt_policy_info *info)
+{
+	const struct ipt_policy_elem *e;
+	struct sec_path *sp = skb->sp;
+	int strict = info->flags & IPT_POLICY_MATCH_STRICT;
+	int i, pos;
+
+	if (sp == NULL)
+		return -1;
+	if (strict && info->len != sp->len)
+		return 0;
+
+	for (i = sp->len - 1; i >= 0; i--) {
+		pos = strict ? i - sp->len + 1 : 0;
+		if (pos >= info->len)
+			return 0;
+		e = &info->pol[pos];
+
+		if (match_xfrm_state(sp->x[i].xvec, e)) {
+			if (!strict)
+				return 1;
+		} else if (strict)
+			return 0;
+	}
+
+	return strict ? 1 : 0;
+}
+
+static int
+match_policy_out(const struct sk_buff *skb, const struct ipt_policy_info *info)
+{
+	const struct ipt_policy_elem *e;
+	struct dst_entry *dst = skb->dst;
+	int strict = info->flags & IPT_POLICY_MATCH_STRICT;
+	int i, pos;
+
+	if (dst->xfrm == NULL)
+		return -1;
+
+	for (i = 0; dst && dst->xfrm; dst = dst->child, i++) {
+		pos = strict ? i : 0;
+		if (pos >= info->len)
+			return 0;
+		e = &info->pol[pos];
+
+		if (match_xfrm_state(dst->xfrm, e)) {
+			if (!strict)
+				return 1;
+		} else if (strict)
+			return 0;
+	}
+
+	return strict ? 1 : 0;
+}
+
+static int match(const struct sk_buff *skb,
+                 const struct net_device *in,
+                 const struct net_device *out,
+                 const void *matchinfo, int offset, int *hotdrop)
+{
+	const struct ipt_policy_info *info = matchinfo;
+	int ret;
+
+	if (info->flags & IPT_POLICY_MATCH_IN)
+		ret = match_policy_in(skb, info);
+	else
+		ret = match_policy_out(skb, info);
+
+	if (ret < 0)
+		ret = info->flags & IPT_POLICY_MATCH_NONE ? 1 : 0;
+	else if (info->flags & IPT_POLICY_MATCH_NONE)
+		ret = 0;
+
+	return ret;
+}
+
+static int checkentry(const char *tablename, const struct ipt_ip *ip,
+                      void *matchinfo, unsigned int matchsize,
+                      unsigned int hook_mask)
+{
+	struct ipt_policy_info *info = matchinfo;
+
+	if (matchsize != IPT_ALIGN(sizeof(*info))) {
+		printk(KERN_ERR "ipt_policy: matchsize %u != %zu\n",
+		       matchsize, IPT_ALIGN(sizeof(*info)));
+		return 0;
+	}
+	if (!(info->flags & (IPT_POLICY_MATCH_IN|IPT_POLICY_MATCH_OUT))) {
+		printk(KERN_ERR "ipt_policy: neither incoming nor "
+		                "outgoing policy selected\n");
+		return 0;
+	}
+	if (hook_mask & (1 << NF_IP_PRE_ROUTING | 1 << NF_IP_LOCAL_IN)
+	    && info->flags & IPT_POLICY_MATCH_OUT) {
+		printk(KERN_ERR "ipt_policy: output policy not valid in "
+		                "PRE_ROUTING and INPUT\n");
+		return 0;
+	}
+	if (hook_mask & (1 << NF_IP_POST_ROUTING | 1 << NF_IP_LOCAL_OUT)
+	    && info->flags & IPT_POLICY_MATCH_IN) {
+		printk(KERN_ERR "ipt_policy: input policy not valid in "
+		                "POST_ROUTING and OUTPUT\n");
+		return 0;
+	}
+	if (info->len > IPT_POLICY_MAX_ELEM) {
+		printk(KERN_ERR "ipt_policy: too many policy elements\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_match policy_match = {
+	.name		= "policy",
+	.match		= match,
+	.checkentry 	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&policy_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&policy_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 04912f9b35c3..105dd69ee9fb 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -179,6 +179,16 @@ config IP6_NF_MATCH_PHYSDEV
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP6_NF_MATCH_POLICY
+	tristate "IPsec policy match support"
+	depends on IP6_NF_IPTABLES && XFRM
+	help
+	  Policy matching allows you to match packets based on the
+	  IPsec policy that was used during decapsulation/will
+	  be used during encapsulation.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 # The targets
 config IP6_NF_FILTER
 	tristate "Packet filtering"
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 9ab5b2ca1f59..c0c809b426e8 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -13,6 +13,7 @@ obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o ip6t_dst.o
 obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o
 obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o
 obj-$(CONFIG_IP6_NF_MATCH_AHESP) += ip6t_esp.o ip6t_ah.o
+obj-$(CONFIG_IP6_NF_MATCH_POLICY) += ip6t_policy.o
 obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o
 obj-$(CONFIG_IP6_NF_MATCH_MULTIPORT) += ip6t_multiport.o
 obj-$(CONFIG_IP6_NF_MATCH_OWNER) += ip6t_owner.o
diff --git a/net/ipv6/netfilter/ip6t_policy.c b/net/ipv6/netfilter/ip6t_policy.c
new file mode 100644
index 000000000000..13fedad48c1d
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_policy.c
@@ -0,0 +1,175 @@
+/* IP tables module for matching IPsec policy
+ *
+ * Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/xfrm.h>
+
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_policy.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("IPtables IPsec policy matching module");
+MODULE_LICENSE("GPL");
+
+
+static inline int
+match_xfrm_state(struct xfrm_state *x, const struct ip6t_policy_elem *e)
+{
+#define MATCH_ADDR(x,y,z)	(!e->match.x || \
+				 ((ip6_masked_addrcmp((z), &e->x, &e->y)) == 0) ^ e->invert.x)
+#define MATCH(x,y)		(!e->match.x || ((e->x == (y)) ^ e->invert.x))
+	
+	return MATCH_ADDR(saddr, smask, (struct in6_addr *)&x->props.saddr.a6) &&
+	       MATCH_ADDR(daddr, dmask, (struct in6_addr *)&x->id.daddr.a6) &&
+	       MATCH(proto, x->id.proto) &&
+	       MATCH(mode, x->props.mode) &&
+	       MATCH(spi, x->id.spi) &&
+	       MATCH(reqid, x->props.reqid);
+}
+
+static int
+match_policy_in(const struct sk_buff *skb, const struct ip6t_policy_info *info)
+{
+	const struct ip6t_policy_elem *e;
+	struct sec_path *sp = skb->sp;
+	int strict = info->flags & IP6T_POLICY_MATCH_STRICT;
+	int i, pos;
+
+	if (sp == NULL)
+		return -1;
+	if (strict && info->len != sp->len)
+		return 0;
+
+	for (i = sp->len - 1; i >= 0; i--) {
+		pos = strict ? i - sp->len + 1 : 0;
+		if (pos >= info->len)
+			return 0;
+		e = &info->pol[pos];
+
+		if (match_xfrm_state(sp->x[i].xvec, e)) {
+			if (!strict)
+				return 1;
+		} else if (strict)
+			return 0;
+	}
+
+	return strict ? 1 : 0;
+}
+
+static int
+match_policy_out(const struct sk_buff *skb, const struct ip6t_policy_info *info)
+{
+	const struct ip6t_policy_elem *e;
+	struct dst_entry *dst = skb->dst;
+	int strict = info->flags & IP6T_POLICY_MATCH_STRICT;
+	int i, pos;
+
+	if (dst->xfrm == NULL)
+		return -1;
+
+	for (i = 0; dst && dst->xfrm; dst = dst->child, i++) {
+		pos = strict ? i : 0;
+		if (pos >= info->len)
+			return 0;
+		e = &info->pol[pos];
+
+		if (match_xfrm_state(dst->xfrm, e)) {
+			if (!strict)
+				return 1;
+		} else if (strict)
+			return 0;
+	}
+
+	return strict ? 1 : 0;
+}
+
+static int match(const struct sk_buff *skb,
+                 const struct net_device *in,
+                 const struct net_device *out,
+                 const void *matchinfo,
+		 int offset,
+		 unsigned int protoff,
+		 int *hotdrop)
+{
+	const struct ip6t_policy_info *info = matchinfo;
+	int ret;
+
+	if (info->flags & IP6T_POLICY_MATCH_IN)
+		ret = match_policy_in(skb, info);
+	else
+		ret = match_policy_out(skb, info);
+
+	if (ret < 0)
+		ret = info->flags & IP6T_POLICY_MATCH_NONE ? 1 : 0;
+	else if (info->flags & IP6T_POLICY_MATCH_NONE)
+		ret = 0;
+
+	return ret;
+}
+
+static int checkentry(const char *tablename, const struct ip6t_ip6 *ip,
+                      void *matchinfo, unsigned int matchsize,
+                      unsigned int hook_mask)
+{
+	struct ip6t_policy_info *info = matchinfo;
+
+	if (matchsize != IP6T_ALIGN(sizeof(*info))) {
+		printk(KERN_ERR "ip6t_policy: matchsize %u != %zu\n",
+		       matchsize, IP6T_ALIGN(sizeof(*info)));
+		return 0;
+	}
+	if (!(info->flags & (IP6T_POLICY_MATCH_IN|IP6T_POLICY_MATCH_OUT))) {
+		printk(KERN_ERR "ip6t_policy: neither incoming nor "
+		                "outgoing policy selected\n");
+		return 0;
+	}
+	if (hook_mask & (1 << NF_IP6_PRE_ROUTING | 1 << NF_IP6_LOCAL_IN)
+	    && info->flags & IP6T_POLICY_MATCH_OUT) {
+		printk(KERN_ERR "ip6t_policy: output policy not valid in "
+		                "PRE_ROUTING and INPUT\n");
+		return 0;
+	}
+	if (hook_mask & (1 << NF_IP6_POST_ROUTING | 1 << NF_IP6_LOCAL_OUT)
+	    && info->flags & IP6T_POLICY_MATCH_IN) {
+		printk(KERN_ERR "ip6t_policy: input policy not valid in "
+		                "POST_ROUTING and OUTPUT\n");
+		return 0;
+	}
+	if (info->len > IP6T_POLICY_MAX_ELEM) {
+		printk(KERN_ERR "ip6t_policy: too many policy elements\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ip6t_match policy_match = {
+	.name		= "policy",
+	.match		= match,
+	.checkentry 	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ip6t_register_match(&policy_match);
+}
+
+static void __exit fini(void)
+{
+	ip6t_unregister_match(&policy_match);
+}
+
+module_init(init);
+module_exit(fini);
-- 
cgit v1.2.3-71-gd317


From f53b61d8c385140fe7f09e0c9187ae813ee9f330 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Sat, 7 Jan 2006 12:50:27 -0800
Subject: [NETFILTER]: Add dummy nf_hook{_thresh}() when NETFILTER is disabled.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 84506dfa1f37..4cf6088625c1 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -295,7 +295,22 @@ extern struct proc_dir_entry *proc_net_netfilter;
 
 #else /* !CONFIG_NETFILTER */
 #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb)
+static inline int nf_hook_thresh(int pf, unsigned int hook,
+				 struct sk_buff **pskb,
+				 struct net_device *indev,
+				 struct net_device *outdev,
+				 int (*okfn)(struct sk_buff *), int thresh)
+{
+	return okfn(*pskb);
+}
+static inline int nf_hook(int pf, unsigned int hook, struct sk_buff **pskb,
+			  struct net_device *indev, struct net_device *outdev,
+			  int (*okfn)(struct sk_buff *))
+{
+	return okfn(*pskb);
+}
 static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
+struct flowi;
 static inline void
 nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, int family) {}
 #endif /*CONFIG_NETFILTER*/
-- 
cgit v1.2.3-71-gd317


From 67207b9664a8d603138ef1556141e6d0a102bea7 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 15 Nov 2005 15:53:48 -0500
Subject: [PATCH] spufs: The SPU file system, base

This is the current version of the spu file system, used
for driving SPEs on the Cell Broadband Engine.

This release is almost identical to the version for the
2.6.14 kernel posted earlier, which is available as part
of the Cell BE Linux distribution from
http://www.bsc.es/projects/deepcomputing/linuxoncell/.

The first patch provides all the interfaces for running
spu application, but does not have any support for
debugging SPU tasks or for scheduling. Both these
functionalities are added in the subsequent patches.

See Documentation/filesystems/spufs.txt on how to use
spufs.

Signed-off-by: Arnd Bergmann <arndb@de.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 Documentation/filesystems/spufs.txt          | 521 +++++++++++++++++++
 arch/powerpc/Kconfig                         |   1 +
 arch/powerpc/kernel/systbl.S                 |   2 +
 arch/powerpc/mm/hash_utils_64.c              |   1 +
 arch/powerpc/platforms/cell/Kconfig          |  13 +
 arch/powerpc/platforms/cell/Makefile         |   3 +
 arch/powerpc/platforms/cell/spu_base.c       | 740 +++++++++++++++++++++++++++
 arch/powerpc/platforms/cell/spu_syscalls.c   |  86 ++++
 arch/powerpc/platforms/cell/spufs/Makefile   |   3 +
 arch/powerpc/platforms/cell/spufs/context.c  |  67 +++
 arch/powerpc/platforms/cell/spufs/file.c     | 596 +++++++++++++++++++++
 arch/powerpc/platforms/cell/spufs/inode.c    | 470 +++++++++++++++++
 arch/powerpc/platforms/cell/spufs/spufs.h    |  71 +++
 arch/powerpc/platforms/cell/spufs/syscalls.c | 106 ++++
 arch/ppc/kernel/ppc_ksyms.c                  |   1 -
 include/asm-powerpc/spu.h                    | 498 ++++++++++++++++++
 include/asm-powerpc/unistd.h                 |   2 +
 include/linux/syscalls.h                     |   5 +
 kernel/sys_ni.c                              |   2 +
 mm/memory.c                                  |   2 +
 20 files changed, 3189 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/filesystems/spufs.txt
 create mode 100644 arch/powerpc/platforms/cell/Kconfig
 create mode 100644 arch/powerpc/platforms/cell/spu_base.c
 create mode 100644 arch/powerpc/platforms/cell/spu_syscalls.c
 create mode 100644 arch/powerpc/platforms/cell/spufs/Makefile
 create mode 100644 arch/powerpc/platforms/cell/spufs/context.c
 create mode 100644 arch/powerpc/platforms/cell/spufs/file.c
 create mode 100644 arch/powerpc/platforms/cell/spufs/inode.c
 create mode 100644 arch/powerpc/platforms/cell/spufs/spufs.h
 create mode 100644 arch/powerpc/platforms/cell/spufs/syscalls.c
 create mode 100644 include/asm-powerpc/spu.h

(limited to 'include/linux')

diff --git a/Documentation/filesystems/spufs.txt b/Documentation/filesystems/spufs.txt
new file mode 100644
index 000000000000..8edc3952eff4
--- /dev/null
+++ b/Documentation/filesystems/spufs.txt
@@ -0,0 +1,521 @@
+SPUFS(2)                   Linux Programmer's Manual                  SPUFS(2)
+
+
+
+NAME
+       spufs - the SPU file system
+
+
+DESCRIPTION
+       The SPU file system is used on PowerPC machines that implement the Cell
+       Broadband Engine Architecture in order to access Synergistic  Processor
+       Units (SPUs).
+
+       The file system provides a name space similar to posix shared memory or
+       message queues. Users that have write permissions on  the  file  system
+       can use spu_create(2) to establish SPU contexts in the spufs root.
+
+       Every SPU context is represented by a directory containing a predefined
+       set of files. These files can be used for manipulating the state of the
+       logical SPU. Users can change permissions on those files, but not actu-
+       ally add or remove files.
+
+
+MOUNT OPTIONS
+       uid=<uid>
+              set the user owning the mount point, the default is 0 (root).
+
+       gid=<gid>
+              set the group owning the mount point, the default is 0 (root).
+
+
+FILES
+       The files in spufs mostly follow the standard behavior for regular sys-
+       tem  calls like read(2) or write(2), but often support only a subset of
+       the operations supported on regular file systems. This list details the
+       supported  operations  and  the  deviations  from  the behaviour in the
+       respective man pages.
+
+       All files that support the read(2) operation also support readv(2)  and
+       all  files  that support the write(2) operation also support writev(2).
+       All files support the access(2) and stat(2) family of  operations,  but
+       only  the  st_mode,  st_nlink,  st_uid and st_gid fields of struct stat
+       contain reliable information.
+
+       All files support the chmod(2)/fchmod(2) and chown(2)/fchown(2)  opera-
+       tions,  but  will  not be able to grant permissions that contradict the
+       possible operations, e.g. read access on the wbox file.
+
+       The current set of files is:
+
+
+   /mem
+       the contents of the local storage memory  of  the  SPU.   This  can  be
+       accessed  like  a regular shared memory file and contains both code and
+       data in the address space of the SPU.  The possible  operations  on  an
+       open mem file are:
+
+       read(2), pread(2), write(2), pwrite(2), lseek(2)
+              These  operate  as  documented, with the exception that seek(2),
+              write(2) and pwrite(2) are not supported beyond the end  of  the
+              file. The file size is the size of the local storage of the SPU,
+              which normally is 256 kilobytes.
+
+       mmap(2)
+              Mapping mem into the process address space gives access  to  the
+              SPU  local  storage  within  the  process  address  space.  Only
+              MAP_SHARED mappings are allowed.
+
+
+   /mbox
+       The first SPU to CPU communication mailbox. This file is read-only  and
+       can  be  read  in  units of 32 bits.  The file can only be used in non-
+       blocking mode and it even poll() will not block on  it.   The  possible
+       operations on an open mbox file are:
+
+       read(2)
+              If  a  count smaller than four is requested, read returns -1 and
+              sets errno to EINVAL.  If there is no data available in the mail
+              box,  the  return  value  is set to -1 and errno becomes EAGAIN.
+              When data has been read successfully, four bytes are  placed  in
+              the data buffer and the value four is returned.
+
+
+   /ibox
+       The  second  SPU  to CPU communication mailbox. This file is similar to
+       the first mailbox file, but can be read in blocking I/O mode,  and  the
+       poll  familiy of system calls can be used to wait for it.  The possible
+       operations on an open ibox file are:
+
+       read(2)
+              If a count smaller than four is requested, read returns  -1  and
+              sets errno to EINVAL.  If there is no data available in the mail
+              box and the file descriptor has been opened with O_NONBLOCK, the
+              return value is set to -1 and errno becomes EAGAIN.
+
+              If  there  is  no  data  available  in the mail box and the file
+              descriptor has been opened without  O_NONBLOCK,  the  call  will
+              block  until  the  SPU  writes to its interrupt mailbox channel.
+              When data has been read successfully, four bytes are  placed  in
+              the data buffer and the value four is returned.
+
+       poll(2)
+              Poll  on  the  ibox  file returns (POLLIN | POLLRDNORM) whenever
+              data is available for reading.
+
+
+   /wbox
+       The CPU to SPU communation mailbox. It is write-only can can be written
+       in  units  of  32  bits. If the mailbox is full, write() will block and
+       poll can be used to wait for it becoming  empty  again.   The  possible
+       operations  on  an open wbox file are: write(2) If a count smaller than
+       four is requested, write returns -1 and sets errno to EINVAL.  If there
+       is  no space available in the mail box and the file descriptor has been
+       opened with O_NONBLOCK, the return value is set to -1 and errno becomes
+       EAGAIN.
+
+       If  there is no space available in the mail box and the file descriptor
+       has been opened without O_NONBLOCK, the call will block until  the  SPU
+       reads  from  its PPE mailbox channel.  When data has been read success-
+       fully, four bytes are placed in the data buffer and the value  four  is
+       returned.
+
+       poll(2)
+              Poll  on  the  ibox file returns (POLLOUT | POLLWRNORM) whenever
+              space is available for writing.
+
+
+   /mbox_stat
+   /ibox_stat
+   /wbox_stat
+       Read-only files that contain the length of the current queue, i.e.  how
+       many  words  can  be  read  from  mbox or ibox or how many words can be
+       written to wbox without blocking.  The files can be read only in 4-byte
+       units  and  return  a  big-endian  binary integer number.  The possible
+       operations on an open *box_stat file are:
+
+       read(2)
+              If a count smaller than four is requested, read returns  -1  and
+              sets errno to EINVAL.  Otherwise, a four byte value is placed in
+              the data buffer, containing the number of elements that  can  be
+              read  from  (for  mbox_stat  and  ibox_stat)  or written to (for
+              wbox_stat) the respective mail box without blocking or resulting
+              in EAGAIN.
+
+
+   /npc
+   /decr
+   /decr_status
+   /spu_tag_mask
+   /event_mask
+   /srr0
+       Internal  registers  of  the SPU. The representation is an ASCII string
+       with the numeric value of the next instruction to  be  executed.  These
+       can  be  used in read/write mode for debugging, but normal operation of
+       programs should not rely on them because access to any of  them  except
+       npc requires an SPU context save and is therefore very inefficient.
+
+       The contents of these files are:
+
+       npc                 Next Program Counter
+
+       decr                SPU Decrementer
+
+       decr_status         Decrementer Status
+
+       spu_tag_mask        MFC tag mask for SPU DMA
+
+       event_mask          Event mask for SPU interrupts
+
+       srr0                Interrupt Return address register
+
+
+       The   possible   operations   on   an   open  npc,  decr,  decr_status,
+       spu_tag_mask, event_mask or srr0 file are:
+
+       read(2)
+              When the count supplied to the read call  is  shorter  than  the
+              required  length for the pointer value plus a newline character,
+              subsequent reads from the same file descriptor  will  result  in
+              completing  the string, regardless of changes to the register by
+              a running SPU task.  When a complete string has been  read,  all
+              subsequent read operations will return zero bytes and a new file
+              descriptor needs to be opened to read the value again.
+
+       write(2)
+              A write operation on the file results in setting the register to
+              the  value  given  in  the string. The string is parsed from the
+              beginning to the first non-numeric character or the end  of  the
+              buffer.  Subsequent writes to the same file descriptor overwrite
+              the previous setting.
+
+
+   /fpcr
+       This file gives access to the Floating Point Status and Control  Regis-
+       ter as a four byte long file. The operations on the fpcr file are:
+
+       read(2)
+              If  a  count smaller than four is requested, read returns -1 and
+              sets errno to EINVAL.  Otherwise, a four byte value is placed in
+              the data buffer, containing the current value of the fpcr regis-
+              ter.
+
+       write(2)
+              If a count smaller than four is requested, write returns -1  and
+              sets  errno  to  EINVAL.  Otherwise, a four byte value is copied
+              from the data buffer, updating the value of the fpcr register.
+
+
+   /signal1
+   /signal2
+       The two signal notification channels of an SPU.  These  are  read-write
+       files  that  operate  on  a 32 bit word.  Writing to one of these files
+       triggers an interrupt on the SPU. The  value  writting  to  the  signal
+       files can be read from the SPU through a channel read or from host user
+       space through the file.  After the value has been read by the  SPU,  it
+       is  reset  to zero.  The possible operations on an open signal1 or sig-
+       nal2 file are:
+
+       read(2)
+              If a count smaller than four is requested, read returns  -1  and
+              sets errno to EINVAL.  Otherwise, a four byte value is placed in
+              the data buffer, containing the current value of  the  specified
+              signal notification register.
+
+       write(2)
+              If  a count smaller than four is requested, write returns -1 and
+              sets errno to EINVAL.  Otherwise, a four byte  value  is  copied
+              from the data buffer, updating the value of the specified signal
+              notification register.  The signal  notification  register  will
+              either be replaced with the input data or will be updated to the
+              bitwise OR or the old value and the input data, depending on the
+              contents  of  the  signal1_type,  or  signal2_type respectively,
+              file.
+
+
+   /signal1_type
+   /signal2_type
+       These two files change the behavior of the signal1 and signal2  notifi-
+       cation  files.  The  contain  a numerical ASCII string which is read as
+       either "1" or "0".  In mode 0 (overwrite), the  hardware  replaces  the
+       contents of the signal channel with the data that is written to it.  in
+       mode 1 (logical OR), the hardware accumulates the bits that are  subse-
+       quently written to it.  The possible operations on an open signal1_type
+       or signal2_type file are:
+
+       read(2)
+              When the count supplied to the read call  is  shorter  than  the
+              required  length  for the digit plus a newline character, subse-
+              quent reads from the same file descriptor will  result  in  com-
+              pleting  the  string.  When a complete string has been read, all
+              subsequent read operations will return zero bytes and a new file
+              descriptor needs to be opened to read the value again.
+
+       write(2)
+              A write operation on the file results in setting the register to
+              the value given in the string. The string  is  parsed  from  the
+              beginning  to  the first non-numeric character or the end of the
+              buffer.  Subsequent writes to the same file descriptor overwrite
+              the previous setting.
+
+
+EXAMPLES
+       /etc/fstab entry
+              none      /spu      spufs     gid=spu   0    0
+
+
+AUTHORS
+       Arnd  Bergmann  <arndb@de.ibm.com>,  Mark  Nutter <mnutter@us.ibm.com>,
+       Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
+
+SEE ALSO
+       capabilities(7), close(2), spu_create(2), spu_run(2), spufs(7)
+
+
+
+Linux                             2005-09-28                          SPUFS(2)
+
+------------------------------------------------------------------------------
+
+SPU_RUN(2)                 Linux Programmer's Manual                SPU_RUN(2)
+
+
+
+NAME
+       spu_run - execute an spu context
+
+
+SYNOPSIS
+       #include <sys/spu.h>
+
+       int spu_run(int fd, unsigned int *npc, unsigned int *event);
+
+DESCRIPTION
+       The  spu_run system call is used on PowerPC machines that implement the
+       Cell Broadband Engine Architecture in order to access Synergistic  Pro-
+       cessor  Units  (SPUs).  It  uses the fd that was returned from spu_cre-
+       ate(2) to address a specific SPU context. When the context gets  sched-
+       uled  to a physical SPU, it starts execution at the instruction pointer
+       passed in npc.
+
+       Execution of SPU code happens synchronously, meaning that spu_run  does
+       not  return  while the SPU is still running. If there is a need to exe-
+       cute SPU code in parallel with other code on either  the  main  CPU  or
+       other  SPUs,  you  need to create a new thread of execution first, e.g.
+       using the pthread_create(3) call.
+
+       When spu_run returns, the current value of the SPU instruction  pointer
+       is  written back to npc, so you can call spu_run again without updating
+       the pointers.
+
+       event can be a NULL pointer or point to an extended  status  code  that
+       gets  filled  when spu_run returns. It can be one of the following con-
+       stants:
+
+       SPE_EVENT_DMA_ALIGNMENT
+              A DMA alignment error
+
+       SPE_EVENT_SPE_DATA_SEGMENT
+              A DMA segmentation error
+
+       SPE_EVENT_SPE_DATA_STORAGE
+              A DMA storage error
+
+       If NULL is passed as the event argument, these errors will result in  a
+       signal delivered to the calling process.
+
+RETURN VALUE
+       spu_run  returns the value of the spu_status register or -1 to indicate
+       an error and set errno to one of the error  codes  listed  below.   The
+       spu_status  register  value  contains  a  bit  mask of status codes and
+       optionally a 14 bit code returned from the stop-and-signal  instruction
+       on the SPU. The bit masks for the status codes are:
+
+       0x02   SPU was stopped by stop-and-signal.
+
+       0x04   SPU was stopped by halt.
+
+       0x08   SPU is waiting for a channel.
+
+       0x10   SPU is in single-step mode.
+
+       0x20   SPU has tried to execute an invalid instruction.
+
+       0x40   SPU has tried to access an invalid channel.
+
+       0x3fff0000
+              The  bits  masked with this value contain the code returned from
+              stop-and-signal.
+
+       There are always one or more of the lower eight bits set  or  an  error
+       code is returned from spu_run.
+
+ERRORS
+       EAGAIN or EWOULDBLOCK
+              fd is in non-blocking mode and spu_run would block.
+
+       EBADF  fd is not a valid file descriptor.
+
+       EFAULT npc is not a valid pointer or status is neither NULL nor a valid
+              pointer.
+
+       EINTR  A signal occured while spu_run was in progress.  The  npc  value
+              has  been updated to the new program counter value if necessary.
+
+       EINVAL fd is not a file descriptor returned from spu_create(2).
+
+       ENOMEM Insufficient memory was available to handle a page fault result-
+              ing from an MFC direct memory access.
+
+       ENOSYS the functionality is not provided by the current system, because
+              either the hardware does not provide SPUs or the spufs module is
+              not loaded.
+
+
+NOTES
+       spu_run  is  meant  to  be  used  from  libraries that implement a more
+       abstract interface to SPUs, not to be used from  regular  applications.
+       See  http://www.bsc.es/projects/deepcomputing/linuxoncell/ for the rec-
+       ommended libraries.
+
+
+CONFORMING TO
+       This call is Linux specific and only implemented by the ppc64 architec-
+       ture. Programs using this system call are not portable.
+
+
+BUGS
+       The code does not yet fully implement all features lined out here.
+
+
+AUTHOR
+       Arnd Bergmann <arndb@de.ibm.com>
+
+SEE ALSO
+       capabilities(7), close(2), spu_create(2), spufs(7)
+
+
+
+Linux                             2005-09-28                        SPU_RUN(2)
+
+------------------------------------------------------------------------------
+
+SPU_CREATE(2)              Linux Programmer's Manual             SPU_CREATE(2)
+
+
+
+NAME
+       spu_create - create a new spu context
+
+
+SYNOPSIS
+       #include <sys/types.h>
+       #include <sys/spu.h>
+
+       int spu_create(const char *pathname, int flags, mode_t mode);
+
+DESCRIPTION
+       The  spu_create  system call is used on PowerPC machines that implement
+       the Cell Broadband Engine Architecture in order to  access  Synergistic
+       Processor  Units (SPUs). It creates a new logical context for an SPU in
+       pathname and returns a handle to associated  with  it.   pathname  must
+       point  to  a  non-existing directory in the mount point of the SPU file
+       system (spufs).  When spu_create is successful, a directory  gets  cre-
+       ated on pathname and it is populated with files.
+
+       The  returned  file  handle can only be passed to spu_run(2) or closed,
+       other operations are not defined on it. When it is closed, all  associ-
+       ated  directory entries in spufs are removed. When the last file handle
+       pointing either inside  of  the  context  directory  or  to  this  file
+       descriptor is closed, the logical SPU context is destroyed.
+
+       The  parameter flags can be zero or any bitwise or'd combination of the
+       following constants:
+
+       SPU_RAWIO
+              Allow mapping of some of the hardware registers of the SPU  into
+              user space. This flag requires the CAP_SYS_RAWIO capability, see
+              capabilities(7).
+
+       The mode parameter specifies the permissions used for creating the  new
+       directory  in  spufs.   mode is modified with the user's umask(2) value
+       and then used for both the directory and the files contained in it. The
+       file permissions mask out some more bits of mode because they typically
+       support only read or write access. See stat(2) for a full list  of  the
+       possible mode values.
+
+
+RETURN VALUE
+       spu_create  returns a new file descriptor. It may return -1 to indicate
+       an error condition and set errno to  one  of  the  error  codes  listed
+       below.
+
+
+ERRORS
+       EACCESS
+              The  current  user does not have write access on the spufs mount
+              point.
+
+       EEXIST An SPU context already exists at the given path name.
+
+       EFAULT pathname is not a valid string pointer in  the  current  address
+              space.
+
+       EINVAL pathname is not a directory in the spufs mount point.
+
+       ELOOP  Too many symlinks were found while resolving pathname.
+
+       EMFILE The process has reached its maximum open file limit.
+
+       ENAMETOOLONG
+              pathname was too long.
+
+       ENFILE The system has reached the global open file limit.
+
+       ENOENT Part of pathname could not be resolved.
+
+       ENOMEM The kernel could not allocate all resources required.
+
+       ENOSPC There  are  not  enough  SPU resources available to create a new
+              context or the user specific limit for the number  of  SPU  con-
+              texts has been reached.
+
+       ENOSYS the functionality is not provided by the current system, because
+              either the hardware does not provide SPUs or the spufs module is
+              not loaded.
+
+       ENOTDIR
+              A part of pathname is not a directory.
+
+
+
+NOTES
+       spu_create  is  meant  to  be used from libraries that implement a more
+       abstract interface to SPUs, not to be used from  regular  applications.
+       See  http://www.bsc.es/projects/deepcomputing/linuxoncell/ for the rec-
+       ommended libraries.
+
+
+FILES
+       pathname must point to a location beneath the mount point of spufs.  By
+       convention, it gets mounted in /spu.
+
+
+CONFORMING TO
+       This call is Linux specific and only implemented by the ppc64 architec-
+       ture. Programs using this system call are not portable.
+
+
+BUGS
+       The code does not yet fully implement all features lined out here.
+
+
+AUTHOR
+       Arnd Bergmann <arndb@de.ibm.com>
+
+SEE ALSO
+       capabilities(7), close(2), spu_run(2), spufs(7)
+
+
+
+Linux                             2005-09-28                     SPU_CREATE(2)
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 4d71aa3ecbb5..39ca7b9da369 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -482,6 +482,7 @@ source arch/powerpc/platforms/embedded6xx/Kconfig
 source arch/powerpc/platforms/4xx/Kconfig
 source arch/powerpc/platforms/85xx/Kconfig
 source arch/powerpc/platforms/8xx/Kconfig
+source arch/powerpc/platforms/cell/Kconfig
 
 menu "Kernel options"
 
diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S
index 4bb3650420b4..989f6286991a 100644
--- a/arch/powerpc/kernel/systbl.S
+++ b/arch/powerpc/kernel/systbl.S
@@ -319,3 +319,5 @@ COMPAT_SYS(ioprio_get)
 SYSCALL(inotify_init)
 SYSCALL(inotify_add_watch)
 SYSCALL(inotify_rm_watch)
+SYSCALL(spu_run)
+SYSCALL(spu_create)
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index a606504678bd..846a1894cf95 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -644,6 +644,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 	DBG_LOW(" -> rc=%d\n", rc);
 	return rc;
 }
+EXPORT_SYMBOL_GPL(hash_page);
 
 void hash_preload(struct mm_struct *mm, unsigned long ea,
 		  unsigned long access, unsigned long trap)
diff --git a/arch/powerpc/platforms/cell/Kconfig b/arch/powerpc/platforms/cell/Kconfig
new file mode 100644
index 000000000000..3157071e241c
--- /dev/null
+++ b/arch/powerpc/platforms/cell/Kconfig
@@ -0,0 +1,13 @@
+menu "Cell Broadband Engine options"
+	depends on PPC_CELL
+
+config SPU_FS
+	tristate "SPU file system"
+	default m
+	depends on PPC_CELL
+	help
+	  The SPU file system is used to access Synergistic Processing
+	  Units on machines implementing the Broadband Processor
+	  Architecture.
+
+endmenu
diff --git a/arch/powerpc/platforms/cell/Makefile b/arch/powerpc/platforms/cell/Makefile
index 55e094b96bc0..74616cf13af9 100644
--- a/arch/powerpc/platforms/cell/Makefile
+++ b/arch/powerpc/platforms/cell/Makefile
@@ -1,2 +1,5 @@
 obj-y			+= interrupt.o iommu.o setup.o spider-pic.o
 obj-$(CONFIG_SMP)	+= smp.o
+obj-$(CONFIG_SPU_FS)	+= spufs/ spu_base.o
+builtin-spufs-$(CONFIG_SPU_FS)	+= spu_syscalls.o
+obj-y			+= $(builtin-spufs-m)
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
new file mode 100644
index 000000000000..9e9096590a07
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -0,0 +1,740 @@
+/*
+ * Low-level SPU handling
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005
+ *
+ * Author: Arnd Bergmann <arndb@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG 1
+
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/semaphore.h>
+#include <asm/spu.h>
+#include <asm/mmu_context.h>
+
+#include "interrupt.h"
+
+static int __spu_trap_invalid_dma(struct spu *spu)
+{
+	pr_debug("%s\n", __FUNCTION__);
+	force_sig(SIGBUS, /* info, */ current);
+	return 0;
+}
+
+static int __spu_trap_dma_align(struct spu *spu)
+{
+	pr_debug("%s\n", __FUNCTION__);
+	force_sig(SIGBUS, /* info, */ current);
+	return 0;
+}
+
+static int __spu_trap_error(struct spu *spu)
+{
+	pr_debug("%s\n", __FUNCTION__);
+	force_sig(SIGILL, /* info, */ current);
+	return 0;
+}
+
+static void spu_restart_dma(struct spu *spu)
+{
+	struct spu_priv2 __iomem *priv2 = spu->priv2;
+	out_be64(&priv2->mfc_control_RW, MFC_CNTL_RESTART_DMA_COMMAND);
+}
+
+static int __spu_trap_data_seg(struct spu *spu, unsigned long ea)
+{
+	struct spu_priv2 __iomem *priv2;
+	struct mm_struct *mm;
+
+	pr_debug("%s\n", __FUNCTION__);
+
+	if (REGION_ID(ea) != USER_REGION_ID) {
+		pr_debug("invalid region access at %016lx\n", ea);
+		return 1;
+	}
+
+	priv2 = spu->priv2;
+	mm = spu->mm;
+
+	if (spu->slb_replace >= 8)
+		spu->slb_replace = 0;
+
+	out_be64(&priv2->slb_index_W, spu->slb_replace);
+	out_be64(&priv2->slb_vsid_RW,
+		(get_vsid(mm->context.id, ea) << SLB_VSID_SHIFT)
+						 | SLB_VSID_USER);
+	out_be64(&priv2->slb_esid_RW, (ea & ESID_MASK) | SLB_ESID_V);
+
+	spu_restart_dma(spu);
+
+	pr_debug("set slb %d context %lx, ea %016lx, vsid %016lx, esid %016lx\n",
+		spu->slb_replace, mm->context.id, ea,
+		(get_vsid(mm->context.id, ea) << SLB_VSID_SHIFT)| SLB_VSID_USER,
+		 (ea & ESID_MASK) | SLB_ESID_V);
+	return 0;
+}
+
+static int __spu_trap_data_map(struct spu *spu, unsigned long ea)
+{
+	unsigned long dsisr;
+	struct spu_priv1 __iomem *priv1;
+
+	pr_debug("%s\n", __FUNCTION__);
+	priv1 = spu->priv1;
+	dsisr = in_be64(&priv1->mfc_dsisr_RW);
+
+	wake_up(&spu->stop_wq);
+
+	return 0;
+}
+
+static int __spu_trap_mailbox(struct spu *spu)
+{
+	wake_up_all(&spu->ibox_wq);
+	kill_fasync(&spu->ibox_fasync, SIGIO, POLLIN);
+
+	/* atomically disable SPU mailbox interrupts */
+	spin_lock(&spu->register_lock);
+	out_be64(&spu->priv1->int_mask_class2_RW,
+		in_be64(&spu->priv1->int_mask_class2_RW) & ~0x1);
+	spin_unlock(&spu->register_lock);
+	return 0;
+}
+
+static int __spu_trap_stop(struct spu *spu)
+{
+	pr_debug("%s\n", __FUNCTION__);
+	spu->stop_code = in_be32(&spu->problem->spu_status_R);
+	wake_up(&spu->stop_wq);
+	return 0;
+}
+
+static int __spu_trap_halt(struct spu *spu)
+{
+	pr_debug("%s\n", __FUNCTION__);
+	spu->stop_code = in_be32(&spu->problem->spu_status_R);
+	wake_up(&spu->stop_wq);
+	return 0;
+}
+
+static int __spu_trap_tag_group(struct spu *spu)
+{
+	pr_debug("%s\n", __FUNCTION__);
+	/* wake_up(&spu->dma_wq); */
+	return 0;
+}
+
+static int __spu_trap_spubox(struct spu *spu)
+{
+	wake_up_all(&spu->wbox_wq);
+	kill_fasync(&spu->wbox_fasync, SIGIO, POLLOUT);
+
+	/* atomically disable SPU mailbox interrupts */
+	spin_lock(&spu->register_lock);
+	out_be64(&spu->priv1->int_mask_class2_RW,
+		in_be64(&spu->priv1->int_mask_class2_RW) & ~0x10);
+	spin_unlock(&spu->register_lock);
+	return 0;
+}
+
+static irqreturn_t
+spu_irq_class_0(int irq, void *data, struct pt_regs *regs)
+{
+	struct spu *spu;
+
+	spu = data;
+	spu->class_0_pending = 1;
+	wake_up(&spu->stop_wq);
+
+	return IRQ_HANDLED;
+}
+
+static int
+spu_irq_class_0_bottom(struct spu *spu)
+{
+	unsigned long stat;
+
+	spu->class_0_pending = 0;
+
+	stat = in_be64(&spu->priv1->int_stat_class0_RW);
+
+	if (stat & 1) /* invalid MFC DMA */
+		__spu_trap_invalid_dma(spu);
+
+	if (stat & 2) /* invalid DMA alignment */
+		__spu_trap_dma_align(spu);
+
+	if (stat & 4) /* error on SPU */
+		__spu_trap_error(spu);
+
+	out_be64(&spu->priv1->int_stat_class0_RW, stat);
+	return 0;
+}
+
+static irqreturn_t
+spu_irq_class_1(int irq, void *data, struct pt_regs *regs)
+{
+	struct spu *spu;
+	unsigned long stat, dar;
+
+	spu = data;
+	stat  = in_be64(&spu->priv1->int_stat_class1_RW);
+	dar   = in_be64(&spu->priv1->mfc_dar_RW);
+
+	if (stat & 1) /* segment fault */
+		__spu_trap_data_seg(spu, dar);
+
+	if (stat & 2) { /* mapping fault */
+		__spu_trap_data_map(spu, dar);
+	}
+
+	if (stat & 4) /* ls compare & suspend on get */
+		;
+
+	if (stat & 8) /* ls compare & suspend on put */
+		;
+
+	out_be64(&spu->priv1->int_stat_class1_RW, stat);
+	return stat ? IRQ_HANDLED : IRQ_NONE;
+}
+
+static irqreturn_t
+spu_irq_class_2(int irq, void *data, struct pt_regs *regs)
+{
+	struct spu *spu;
+	unsigned long stat;
+
+	spu = data;
+	stat = in_be64(&spu->priv1->int_stat_class2_RW);
+
+	pr_debug("class 2 interrupt %d, %lx, %lx\n", irq, stat,
+		in_be64(&spu->priv1->int_mask_class2_RW));
+
+
+	if (stat & 1)  /* PPC core mailbox */
+		__spu_trap_mailbox(spu);
+
+	if (stat & 2) /* SPU stop-and-signal */
+		__spu_trap_stop(spu);
+
+	if (stat & 4) /* SPU halted */
+		__spu_trap_halt(spu);
+
+	if (stat & 8) /* DMA tag group complete */
+		__spu_trap_tag_group(spu);
+
+	if (stat & 0x10) /* SPU mailbox threshold */
+		__spu_trap_spubox(spu);
+
+	out_be64(&spu->priv1->int_stat_class2_RW, stat);
+	return stat ? IRQ_HANDLED : IRQ_NONE;
+}
+
+static int
+spu_request_irqs(struct spu *spu)
+{
+	int ret;
+	int irq_base;
+
+	irq_base = IIC_NODE_STRIDE * spu->node + IIC_SPE_OFFSET;
+
+	snprintf(spu->irq_c0, sizeof (spu->irq_c0), "spe%02d.0", spu->number);
+	ret = request_irq(irq_base + spu->isrc,
+		 spu_irq_class_0, 0, spu->irq_c0, spu);
+	if (ret)
+		goto out;
+	out_be64(&spu->priv1->int_mask_class0_RW, 0x7);
+
+	snprintf(spu->irq_c1, sizeof (spu->irq_c1), "spe%02d.1", spu->number);
+	ret = request_irq(irq_base + IIC_CLASS_STRIDE + spu->isrc,
+		 spu_irq_class_1, 0, spu->irq_c1, spu);
+	if (ret)
+		goto out1;
+	out_be64(&spu->priv1->int_mask_class1_RW, 0x3);
+
+	snprintf(spu->irq_c2, sizeof (spu->irq_c2), "spe%02d.2", spu->number);
+	ret = request_irq(irq_base + 2*IIC_CLASS_STRIDE + spu->isrc,
+		 spu_irq_class_2, 0, spu->irq_c2, spu);
+	if (ret)
+		goto out2;
+	out_be64(&spu->priv1->int_mask_class2_RW, 0xe);
+	goto out;
+
+out2:
+	free_irq(irq_base + IIC_CLASS_STRIDE + spu->isrc, spu);
+out1:
+	free_irq(irq_base + spu->isrc, spu);
+out:
+	return ret;
+}
+
+static void
+spu_free_irqs(struct spu *spu)
+{
+	int irq_base;
+
+	irq_base = IIC_NODE_STRIDE * spu->node + IIC_SPE_OFFSET;
+
+	free_irq(irq_base + spu->isrc, spu);
+	free_irq(irq_base + IIC_CLASS_STRIDE + spu->isrc, spu);
+	free_irq(irq_base + 2*IIC_CLASS_STRIDE + spu->isrc, spu);
+}
+
+static LIST_HEAD(spu_list);
+static DECLARE_MUTEX(spu_mutex);
+
+static void spu_init_channels(struct spu *spu)
+{
+	static const struct {
+		 unsigned channel;
+		 unsigned count;
+	} zero_list[] = {
+		{ 0x00, 1, }, { 0x01, 1, }, { 0x03, 1, }, { 0x04, 1, },
+		{ 0x18, 1, }, { 0x19, 1, }, { 0x1b, 1, }, { 0x1d, 1, },
+	}, count_list[] = {
+		{ 0x00, 0, }, { 0x03, 0, }, { 0x04, 0, }, { 0x15, 16, },
+		{ 0x17, 1, }, { 0x18, 0, }, { 0x19, 0, }, { 0x1b, 0, },
+		{ 0x1c, 1, }, { 0x1d, 0, }, { 0x1e, 1, },
+	};
+	struct spu_priv2 *priv2;
+	int i;
+
+	priv2 = spu->priv2;
+
+	/* initialize all channel data to zero */
+	for (i = 0; i < ARRAY_SIZE(zero_list); i++) {
+		int count;
+
+		out_be64(&priv2->spu_chnlcntptr_RW, zero_list[i].channel);
+		for (count = 0; count < zero_list[i].count; count++)
+			out_be64(&priv2->spu_chnldata_RW, 0);
+	}
+
+	/* initialize channel counts to meaningful values */
+	for (i = 0; i < ARRAY_SIZE(count_list); i++) {
+		out_be64(&priv2->spu_chnlcntptr_RW, count_list[i].channel);
+		out_be64(&priv2->spu_chnlcnt_RW, count_list[i].count);
+	}
+}
+
+static void spu_init_regs(struct spu *spu)
+{
+	out_be64(&spu->priv1->int_mask_class0_RW, 0x7);
+	out_be64(&spu->priv1->int_mask_class1_RW, 0x3);
+	out_be64(&spu->priv1->int_mask_class2_RW, 0xe);
+}
+
+struct spu *spu_alloc(void)
+{
+	struct spu *spu;
+
+	down(&spu_mutex);
+	if (!list_empty(&spu_list)) {
+		spu = list_entry(spu_list.next, struct spu, list);
+		list_del_init(&spu->list);
+		pr_debug("Got SPU %x %d\n", spu->isrc, spu->number);
+	} else {
+		pr_debug("No SPU left\n");
+		spu = NULL;
+	}
+	up(&spu_mutex);
+
+	if (spu) {
+		spu_init_channels(spu);
+		spu_init_regs(spu);
+	}
+
+	return spu;
+}
+EXPORT_SYMBOL(spu_alloc);
+
+void spu_free(struct spu *spu)
+{
+	down(&spu_mutex);
+	spu->ibox_fasync = NULL;
+	spu->wbox_fasync = NULL;
+	list_add_tail(&spu->list, &spu_list);
+	up(&spu_mutex);
+}
+EXPORT_SYMBOL(spu_free);
+
+extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap); //XXX
+static int spu_handle_mm_fault(struct spu *spu)
+{
+	struct spu_priv1 __iomem *priv1;
+	struct mm_struct *mm = spu->mm;
+	struct vm_area_struct *vma;
+	u64 ea, dsisr, is_write;
+	int ret;
+
+	priv1 = spu->priv1;
+	ea = in_be64(&priv1->mfc_dar_RW);
+	dsisr = in_be64(&priv1->mfc_dsisr_RW);
+#if 0
+	if (!IS_VALID_EA(ea)) {
+		return -EFAULT;
+	}
+#endif /* XXX */
+	if (mm == NULL) {
+		return -EFAULT;
+	}
+	if (mm->pgd == NULL) {
+		return -EFAULT;
+	}
+
+	down_read(&mm->mmap_sem);
+	vma = find_vma(mm, ea);
+	if (!vma)
+		goto bad_area;
+	if (vma->vm_start <= ea)
+		goto good_area;
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		goto bad_area;
+#if 0
+	if (expand_stack(vma, ea))
+		goto bad_area;
+#endif /* XXX */
+good_area:
+	is_write = dsisr & MFC_DSISR_ACCESS_PUT;
+	if (is_write) {
+		if (!(vma->vm_flags & VM_WRITE))
+			goto bad_area;
+	} else {
+		if (dsisr & MFC_DSISR_ACCESS_DENIED)
+			goto bad_area;
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+			goto bad_area;
+	}
+	ret = 0;
+	switch (handle_mm_fault(mm, vma, ea, is_write)) {
+	case VM_FAULT_MINOR:
+		current->min_flt++;
+		break;
+	case VM_FAULT_MAJOR:
+		current->maj_flt++;
+		break;
+	case VM_FAULT_SIGBUS:
+		ret = -EFAULT;
+		goto bad_area;
+	case VM_FAULT_OOM:
+		ret = -ENOMEM;
+		goto bad_area;
+	default:
+		BUG();
+	}
+	up_read(&mm->mmap_sem);
+	return ret;
+
+bad_area:
+	up_read(&mm->mmap_sem);
+	return -EFAULT;
+}
+
+static int spu_handle_pte_fault(struct spu *spu)
+{
+	struct spu_priv1 __iomem *priv1;
+	u64 ea, dsisr, access, error = 0UL;
+	int ret = 0;
+
+	priv1 = spu->priv1;
+	ea = in_be64(&priv1->mfc_dar_RW);
+	dsisr = in_be64(&priv1->mfc_dsisr_RW);
+	access = (_PAGE_PRESENT | _PAGE_USER);
+	if (dsisr & MFC_DSISR_PTE_NOT_FOUND) {
+		if (hash_page(ea, access, 0x300) != 0)
+			error |= CLASS1_ENABLE_STORAGE_FAULT_INTR;
+	}
+	if ((error & CLASS1_ENABLE_STORAGE_FAULT_INTR) ||
+	    (dsisr & MFC_DSISR_ACCESS_DENIED)) {
+		if ((ret = spu_handle_mm_fault(spu)) != 0)
+			error |= CLASS1_ENABLE_STORAGE_FAULT_INTR;
+		else
+			error &= ~CLASS1_ENABLE_STORAGE_FAULT_INTR;
+	}
+	if (!error)
+		spu_restart_dma(spu);
+
+	return ret;
+}
+
+int spu_run(struct spu *spu)
+{
+	struct spu_problem __iomem *prob;
+	struct spu_priv1 __iomem *priv1;
+	struct spu_priv2 __iomem *priv2;
+	unsigned long status;
+	int ret;
+
+	prob = spu->problem;
+	priv1 = spu->priv1;
+	priv2 = spu->priv2;
+
+	/* Let SPU run.  */
+	spu->mm = current->mm;
+	eieio();
+	out_be32(&prob->spu_runcntl_RW, SPU_RUNCNTL_RUNNABLE);
+
+	do {
+		ret = wait_event_interruptible(spu->stop_wq,
+			 (!((status = in_be32(&prob->spu_status_R)) & 0x1))
+			|| (in_be64(&priv1->mfc_dsisr_RW) & MFC_DSISR_PTE_NOT_FOUND)
+			|| spu->class_0_pending);
+
+		if (status & SPU_STATUS_STOPPED_BY_STOP)
+			ret = -EAGAIN;
+		else if (status & SPU_STATUS_STOPPED_BY_HALT)
+			ret = -EIO;
+		else if (in_be64(&priv1->mfc_dsisr_RW) & MFC_DSISR_PTE_NOT_FOUND)
+			ret = spu_handle_pte_fault(spu);
+
+		if (spu->class_0_pending)
+			spu_irq_class_0_bottom(spu);
+
+		if (!ret && signal_pending(current))
+			ret = -ERESTARTSYS;
+
+	} while (!ret);
+
+	/* Ensure SPU is stopped.  */
+	out_be32(&prob->spu_runcntl_RW, SPU_RUNCNTL_STOP);
+	eieio();
+	while (in_be32(&prob->spu_status_R) & SPU_STATUS_RUNNING)
+		cpu_relax();
+
+	out_be64(&priv2->slb_invalidate_all_W, 0);
+	out_be64(&priv1->tlb_invalidate_entry_W, 0UL);
+	eieio();
+
+	spu->mm = NULL;
+
+	/* Check for SPU breakpoint.  */
+	if (unlikely(current->ptrace & PT_PTRACED)) {
+		status = in_be32(&prob->spu_status_R);
+
+		if ((status & SPU_STATUS_STOPPED_BY_STOP)
+		    && status >> SPU_STOP_STATUS_SHIFT == 0x3fff) {
+			force_sig(SIGTRAP, current);
+			ret = -ERESTARTSYS;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(spu_run);
+
+static void __iomem * __init map_spe_prop(struct device_node *n,
+						 const char *name)
+{
+	struct address_prop {
+		unsigned long address;
+		unsigned int len;
+	} __attribute__((packed)) *prop;
+
+	void *p;
+	int proplen;
+
+	p = get_property(n, name, &proplen);
+	if (proplen != sizeof (struct address_prop))
+		return NULL;
+
+	prop = p;
+
+	return ioremap(prop->address, prop->len);
+}
+
+static void spu_unmap(struct spu *spu)
+{
+	iounmap(spu->priv2);
+	iounmap(spu->priv1);
+	iounmap(spu->problem);
+	iounmap((u8 __iomem *)spu->local_store);
+}
+
+static int __init spu_map_device(struct spu *spu, struct device_node *spe)
+{
+	char *prop;
+	int ret;
+
+	ret = -ENODEV;
+	prop = get_property(spe, "isrc", NULL);
+	if (!prop)
+		goto out;
+	spu->isrc = *(unsigned int *)prop;
+
+	spu->name = get_property(spe, "name", NULL);
+	if (!spu->name)
+		goto out;
+
+	prop = get_property(spe, "local-store", NULL);
+	if (!prop)
+		goto out;
+	spu->local_store_phys = *(unsigned long *)prop;
+
+	/* we use local store as ram, not io memory */
+	spu->local_store = (void __force *)map_spe_prop(spe, "local-store");
+	if (!spu->local_store)
+		goto out;
+
+	spu->problem= map_spe_prop(spe, "problem");
+	if (!spu->problem)
+		goto out_unmap;
+
+	spu->priv1= map_spe_prop(spe, "priv1");
+	if (!spu->priv1)
+		goto out_unmap;
+
+	spu->priv2= map_spe_prop(spe, "priv2");
+	if (!spu->priv2)
+		goto out_unmap;
+	ret = 0;
+	goto out;
+
+out_unmap:
+	spu_unmap(spu);
+out:
+	return ret;
+}
+
+static int __init find_spu_node_id(struct device_node *spe)
+{
+	unsigned int *id;
+	struct device_node *cpu;
+
+	cpu = spe->parent->parent;
+	id = (unsigned int *)get_property(cpu, "node-id", NULL);
+
+	return id ? *id : 0;
+}
+
+static int __init create_spu(struct device_node *spe)
+{
+	struct spu *spu;
+	int ret;
+	static int number;
+
+	ret = -ENOMEM;
+	spu = kmalloc(sizeof (*spu), GFP_KERNEL);
+	if (!spu)
+		goto out;
+
+	ret = spu_map_device(spu, spe);
+	if (ret)
+		goto out_free;
+
+	spu->node = find_spu_node_id(spe);
+	spu->stop_code = 0;
+	spu->slb_replace = 0;
+	spu->mm = NULL;
+	spu->class_0_pending = 0;
+	spin_lock_init(&spu->register_lock);
+
+	out_be64(&spu->priv1->mfc_sdr_RW, mfspr(SPRN_SDR1));
+	out_be64(&spu->priv1->mfc_sr1_RW, 0x33);
+
+	init_waitqueue_head(&spu->stop_wq);
+	init_waitqueue_head(&spu->wbox_wq);
+	init_waitqueue_head(&spu->ibox_wq);
+
+	spu->ibox_fasync = NULL;
+	spu->wbox_fasync = NULL;
+
+	down(&spu_mutex);
+	spu->number = number++;
+	ret = spu_request_irqs(spu);
+	if (ret)
+		goto out_unmap;
+
+	list_add(&spu->list, &spu_list);
+	up(&spu_mutex);
+
+	pr_debug(KERN_DEBUG "Using SPE %s %02x %p %p %p %p %d\n",
+		spu->name, spu->isrc, spu->local_store,
+		spu->problem, spu->priv1, spu->priv2, spu->number);
+	goto out;
+
+out_unmap:
+	up(&spu_mutex);
+	spu_unmap(spu);
+out_free:
+	kfree(spu);
+out:
+	return ret;
+}
+
+static void destroy_spu(struct spu *spu)
+{
+	list_del_init(&spu->list);
+
+	spu_free_irqs(spu);
+	spu_unmap(spu);
+	kfree(spu);
+}
+
+static void cleanup_spu_base(void)
+{
+	struct spu *spu, *tmp;
+	down(&spu_mutex);
+	list_for_each_entry_safe(spu, tmp, &spu_list, list)
+		destroy_spu(spu);
+	up(&spu_mutex);
+}
+module_exit(cleanup_spu_base);
+
+static int __init init_spu_base(void)
+{
+	struct device_node *node;
+	int ret;
+
+	ret = -ENODEV;
+	for (node = of_find_node_by_type(NULL, "spe");
+			node; node = of_find_node_by_type(node, "spe")) {
+		ret = create_spu(node);
+		if (ret) {
+			printk(KERN_WARNING "%s: Error initializing %s\n",
+				__FUNCTION__, node->name);
+			cleanup_spu_base();
+			break;
+		}
+	}
+	/* in some old firmware versions, the spe is called 'spc', so we
+	   look for that as well */
+	for (node = of_find_node_by_type(NULL, "spc");
+			node; node = of_find_node_by_type(node, "spc")) {
+		ret = create_spu(node);
+		if (ret) {
+			printk(KERN_WARNING "%s: Error initializing %s\n",
+				__FUNCTION__, node->name);
+			cleanup_spu_base();
+			break;
+		}
+	}
+	return ret;
+}
+module_init(init_spu_base);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnd Bergmann <arndb@de.ibm.com>");
diff --git a/arch/powerpc/platforms/cell/spu_syscalls.c b/arch/powerpc/platforms/cell/spu_syscalls.c
new file mode 100644
index 000000000000..43e0b187ffde
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spu_syscalls.c
@@ -0,0 +1,86 @@
+/*
+ * SPU file system -- system call stubs
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005
+ *
+ * Author: Arnd Bergmann <arndb@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/file.h>
+#include <linux/module.h>
+#include <linux/syscalls.h>
+
+#include <asm/spu.h>
+
+struct spufs_calls spufs_calls = {
+	.owner = NULL,
+};
+
+/* These stub syscalls are needed to have the actual implementation
+ * within a loadable module. When spufs is built into the kernel,
+ * this file is not used and the syscalls directly enter the fs code */
+
+asmlinkage long sys_spu_create(const char __user *name,
+		unsigned int flags, mode_t mode)
+{
+	long ret;
+
+	ret = -ENOSYS;
+	if (try_module_get(spufs_calls.owner)) {
+		ret = spufs_calls.create_thread(name, flags, mode);
+		module_put(spufs_calls.owner);
+	}
+	return ret;
+}
+
+asmlinkage long sys_spu_run(int fd, __u32 __user *unpc, __u32 __user *ustatus)
+{
+	long ret;
+	struct file *filp;
+	int fput_needed;
+
+	ret = -ENOSYS;
+	if (try_module_get(spufs_calls.owner)) {
+		ret = -EBADF;
+		filp = fget_light(fd, &fput_needed);
+		if (filp) {
+			ret = spufs_calls.spu_run(filp, unpc, ustatus);
+			fput_light(filp, fput_needed);
+		}
+		module_put(spufs_calls.owner);
+	}
+	return ret;
+}
+
+int register_spu_syscalls(struct spufs_calls *calls)
+{
+	if (spufs_calls.owner)
+		return -EBUSY;
+
+	spufs_calls.create_thread = calls->create_thread;
+	spufs_calls.spu_run = calls->spu_run;
+	smp_mb();
+	spufs_calls.owner = calls->owner;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_spu_syscalls);
+
+void unregister_spu_syscalls(struct spufs_calls *calls)
+{
+	BUG_ON(spufs_calls.owner != calls->owner);
+	spufs_calls.owner = NULL;
+}
+EXPORT_SYMBOL_GPL(unregister_spu_syscalls);
diff --git a/arch/powerpc/platforms/cell/spufs/Makefile b/arch/powerpc/platforms/cell/spufs/Makefile
new file mode 100644
index 000000000000..6f496e37bcb7
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spufs/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_SPU_FS) += spufs.o
+
+spufs-y += inode.o file.o context.o syscalls.o
diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c
new file mode 100644
index 000000000000..a69b85e2778a
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spufs/context.c
@@ -0,0 +1,67 @@
+/*
+ * SPU file system -- SPU context management
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005
+ *
+ * Author: Arnd Bergmann <arndb@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/slab.h>
+#include <asm/spu.h>
+#include "spufs.h"
+
+struct spu_context *alloc_spu_context(void)
+{
+	struct spu_context *ctx;
+	ctx = kmalloc(sizeof *ctx, GFP_KERNEL);
+	if (!ctx)
+		goto out;
+	ctx->spu = spu_alloc();
+	if (!ctx->spu)
+		goto out_free;
+	init_rwsem(&ctx->backing_sema);
+	spin_lock_init(&ctx->mmio_lock);
+	kref_init(&ctx->kref);
+	goto out;
+out_free:
+	kfree(ctx);
+	ctx = NULL;
+out:
+	return ctx;
+}
+
+void destroy_spu_context(struct kref *kref)
+{
+	struct spu_context *ctx;
+	ctx = container_of(kref, struct spu_context, kref);
+	if (ctx->spu)
+		spu_free(ctx->spu);
+	kfree(ctx);
+}
+
+struct spu_context * get_spu_context(struct spu_context *ctx)
+{
+	kref_get(&ctx->kref);
+	return ctx;
+}
+
+int put_spu_context(struct spu_context *ctx)
+{
+	return kref_put(&ctx->kref, &destroy_spu_context);
+}
+
+
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
new file mode 100644
index 000000000000..c1e643310494
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -0,0 +1,596 @@
+/*
+ * SPU file system -- file contents
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005
+ *
+ * Author: Arnd Bergmann <arndb@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/ioctl.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+
+#include <asm/io.h>
+#include <asm/semaphore.h>
+#include <asm/spu.h>
+#include <asm/uaccess.h>
+
+#include "spufs.h"
+
+static int
+spufs_mem_open(struct inode *inode, struct file *file)
+{
+	struct spufs_inode_info *i = SPUFS_I(inode);
+	file->private_data = i->i_ctx;
+	return 0;
+}
+
+static ssize_t
+spufs_mem_read(struct file *file, char __user *buffer,
+				size_t size, loff_t *pos)
+{
+	struct spu *spu;
+	struct spu_context *ctx;
+	int ret;
+
+	ctx = file->private_data;
+	spu = ctx->spu;
+
+	down_read(&ctx->backing_sema);
+	if (spu->number & 0/*1*/) {
+		ret = generic_file_read(file, buffer, size, pos);
+		goto out;
+	}
+
+	ret = simple_read_from_buffer(buffer, size, pos,
+					spu->local_store, LS_SIZE);
+out:
+	up_read(&ctx->backing_sema);
+	return ret;
+}
+
+static ssize_t
+spufs_mem_write(struct file *file, const char __user *buffer,
+					size_t size, loff_t *pos)
+{
+	struct spu_context *ctx = file->private_data;
+	struct spu *spu = ctx->spu;
+
+	if (spu->number & 0) //1)
+		return generic_file_write(file, buffer, size, pos);
+
+	size = min_t(ssize_t, LS_SIZE - *pos, size);
+	if (size <= 0)
+		return -EFBIG;
+	*pos += size;
+	return copy_from_user(spu->local_store + *pos - size,
+				buffer, size) ? -EFAULT : size;
+}
+
+static int
+spufs_mem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct spu_context *ctx = file->private_data;
+	struct spu *spu = ctx->spu;
+	unsigned long pfn;
+
+	if (spu->number & 0) //1)
+		return generic_file_mmap(file, vma);
+
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_page_prot = __pgprot(pgprot_val (vma->vm_page_prot)
+							| _PAGE_NO_CACHE);
+	pfn = spu->local_store_phys >> PAGE_SHIFT;
+	/*
+	 * This will work for actual SPUs, but not for vmalloc memory:
+	 */
+	if (remap_pfn_range(vma, vma->vm_start, pfn,
+				vma->vm_end-vma->vm_start, vma->vm_page_prot))
+		return -EAGAIN;
+	return 0;
+}
+
+static struct file_operations spufs_mem_fops = {
+	.open	 = spufs_mem_open,
+	.read    = spufs_mem_read,
+	.write   = spufs_mem_write,
+	.mmap    = spufs_mem_mmap,
+	.llseek  = generic_file_llseek,
+};
+
+/* generic open function for all pipe-like files */
+static int spufs_pipe_open(struct inode *inode, struct file *file)
+{
+	struct spufs_inode_info *i = SPUFS_I(inode);
+	file->private_data = i->i_ctx;
+
+	return nonseekable_open(inode, file);
+}
+
+static ssize_t spufs_mbox_read(struct file *file, char __user *buf,
+			size_t len, loff_t *pos)
+{
+	struct spu_context *ctx;
+	struct spu_problem __iomem *prob;
+	u32 mbox_stat;
+	u32 mbox_data;
+
+	if (len < 4)
+		return -EINVAL;
+
+	ctx = file->private_data;
+	prob = ctx->spu->problem;
+	mbox_stat = in_be32(&prob->mb_stat_R);
+	if (!(mbox_stat & 0x0000ff))
+		return -EAGAIN;
+
+	mbox_data = in_be32(&prob->pu_mb_R);
+
+	if (copy_to_user(buf, &mbox_data, sizeof mbox_data))
+		return -EFAULT;
+
+	return 4;
+}
+
+static struct file_operations spufs_mbox_fops = {
+	.open	= spufs_pipe_open,
+	.read	= spufs_mbox_read,
+};
+
+static ssize_t spufs_mbox_stat_read(struct file *file, char __user *buf,
+			size_t len, loff_t *pos)
+{
+	struct spu_context *ctx;
+	u32 mbox_stat;
+
+	if (len < 4)
+		return -EINVAL;
+
+	ctx = file->private_data;
+	mbox_stat = in_be32(&ctx->spu->problem->mb_stat_R) & 0xff;
+
+	if (copy_to_user(buf, &mbox_stat, sizeof mbox_stat))
+		return -EFAULT;
+
+	return 4;
+}
+
+static struct file_operations spufs_mbox_stat_fops = {
+	.open	= spufs_pipe_open,
+	.read	= spufs_mbox_stat_read,
+};
+
+/* low-level ibox access function */
+size_t spu_ibox_read(struct spu *spu, u32 *data)
+{
+	int ret;
+
+	spin_lock_irq(&spu->register_lock);
+
+	if (in_be32(&spu->problem->mb_stat_R) & 0xff0000) {
+		/* read the first available word */
+		*data = in_be64(&spu->priv2->puint_mb_R);
+		ret = 4;
+	} else {
+		/* make sure we get woken up by the interrupt */
+		out_be64(&spu->priv1->int_mask_class2_RW,
+			in_be64(&spu->priv1->int_mask_class2_RW) | 0x1);
+		ret = 0;
+	}
+
+	spin_unlock_irq(&spu->register_lock);
+	return ret;
+}
+EXPORT_SYMBOL(spu_ibox_read);
+
+static int spufs_ibox_fasync(int fd, struct file *file, int on)
+{
+	struct spu_context *ctx;
+	ctx = file->private_data;
+	return fasync_helper(fd, file, on, &ctx->spu->ibox_fasync);
+}
+
+static ssize_t spufs_ibox_read(struct file *file, char __user *buf,
+			size_t len, loff_t *pos)
+{
+	struct spu_context *ctx;
+	u32 ibox_data;
+	ssize_t ret;
+
+	if (len < 4)
+		return -EINVAL;
+
+	ctx = file->private_data;
+
+	ret = 0;
+	if (file->f_flags & O_NONBLOCK) {
+		if (!spu_ibox_read(ctx->spu, &ibox_data))
+			ret = -EAGAIN;
+	} else {
+		ret = wait_event_interruptible(ctx->spu->ibox_wq,
+				 spu_ibox_read(ctx->spu, &ibox_data));
+	}
+
+	if (ret)
+		return ret;
+
+	ret = 4;
+	if (copy_to_user(buf, &ibox_data, sizeof ibox_data))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+static unsigned int spufs_ibox_poll(struct file *file, poll_table *wait)
+{
+	struct spu_context *ctx;
+	struct spu_problem __iomem *prob;
+	u32 mbox_stat;
+	unsigned int mask;
+
+	ctx = file->private_data;
+	prob = ctx->spu->problem;
+	mbox_stat = in_be32(&prob->mb_stat_R);
+
+	poll_wait(file, &ctx->spu->ibox_wq, wait);
+
+	mask = 0;
+	if (mbox_stat & 0xff0000)
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+static struct file_operations spufs_ibox_fops = {
+	.open	= spufs_pipe_open,
+	.read	= spufs_ibox_read,
+	.poll	= spufs_ibox_poll,
+	.fasync	= spufs_ibox_fasync,
+};
+
+static ssize_t spufs_ibox_stat_read(struct file *file, char __user *buf,
+			size_t len, loff_t *pos)
+{
+	struct spu_context *ctx;
+	u32 ibox_stat;
+
+	if (len < 4)
+		return -EINVAL;
+
+	ctx = file->private_data;
+	ibox_stat = (in_be32(&ctx->spu->problem->mb_stat_R) >> 16) & 0xff;
+
+	if (copy_to_user(buf, &ibox_stat, sizeof ibox_stat))
+		return -EFAULT;
+
+	return 4;
+}
+
+static struct file_operations spufs_ibox_stat_fops = {
+	.open	= spufs_pipe_open,
+	.read	= spufs_ibox_stat_read,
+};
+
+/* low-level mailbox write */
+size_t spu_wbox_write(struct spu *spu, u32 data)
+{
+	int ret;
+
+	spin_lock_irq(&spu->register_lock);
+
+	if (in_be32(&spu->problem->mb_stat_R) & 0x00ff00) {
+		/* we have space to write wbox_data to */
+		out_be32(&spu->problem->spu_mb_W, data);
+		ret = 4;
+	} else {
+		/* make sure we get woken up by the interrupt when space
+		   becomes available */
+		out_be64(&spu->priv1->int_mask_class2_RW,
+			in_be64(&spu->priv1->int_mask_class2_RW) | 0x10);
+		ret = 0;
+	}
+
+	spin_unlock_irq(&spu->register_lock);
+	return ret;
+}
+EXPORT_SYMBOL(spu_wbox_write);
+
+static int spufs_wbox_fasync(int fd, struct file *file, int on)
+{
+	struct spu_context *ctx;
+	ctx = file->private_data;
+	return fasync_helper(fd, file, on, &ctx->spu->wbox_fasync);
+}
+
+static ssize_t spufs_wbox_write(struct file *file, const char __user *buf,
+			size_t len, loff_t *pos)
+{
+	struct spu_context *ctx;
+	u32 wbox_data;
+	int ret;
+
+	if (len < 4)
+		return -EINVAL;
+
+	ctx = file->private_data;
+
+	if (copy_from_user(&wbox_data, buf, sizeof wbox_data))
+		return -EFAULT;
+
+	ret = 0;
+	if (file->f_flags & O_NONBLOCK) {
+		if (!spu_wbox_write(ctx->spu, wbox_data))
+			ret = -EAGAIN;
+	} else {
+		ret = wait_event_interruptible(ctx->spu->wbox_wq,
+			spu_wbox_write(ctx->spu, wbox_data));
+	}
+
+	return ret ? ret : sizeof wbox_data;
+}
+
+static unsigned int spufs_wbox_poll(struct file *file, poll_table *wait)
+{
+	struct spu_context *ctx;
+	struct spu_problem __iomem *prob;
+	u32 mbox_stat;
+	unsigned int mask;
+
+	ctx = file->private_data;
+	prob = ctx->spu->problem;
+	mbox_stat = in_be32(&prob->mb_stat_R);
+
+	poll_wait(file, &ctx->spu->wbox_wq, wait);
+
+	mask = 0;
+	if (mbox_stat & 0x00ff00)
+		mask = POLLOUT | POLLWRNORM;
+
+	return mask;
+}
+
+static struct file_operations spufs_wbox_fops = {
+	.open	= spufs_pipe_open,
+	.write	= spufs_wbox_write,
+	.poll	= spufs_wbox_poll,
+	.fasync	= spufs_wbox_fasync,
+};
+
+static ssize_t spufs_wbox_stat_read(struct file *file, char __user *buf,
+			size_t len, loff_t *pos)
+{
+	struct spu_context *ctx;
+	u32 wbox_stat;
+
+	if (len < 4)
+		return -EINVAL;
+
+	ctx = file->private_data;
+	wbox_stat = (in_be32(&ctx->spu->problem->mb_stat_R) >> 8) & 0xff;
+
+	if (copy_to_user(buf, &wbox_stat, sizeof wbox_stat))
+		return -EFAULT;
+
+	return 4;
+}
+
+static struct file_operations spufs_wbox_stat_fops = {
+	.open	= spufs_pipe_open,
+	.read	= spufs_wbox_stat_read,
+};
+
+long spufs_run_spu(struct file *file, struct spu_context *ctx,
+		u32 *npc, u32 *status)
+{
+	struct spu_problem __iomem *prob;
+	int ret;
+
+	if (file->f_flags & O_NONBLOCK) {
+		ret = -EAGAIN;
+		if (!down_write_trylock(&ctx->backing_sema))
+			goto out;
+	} else {
+		down_write(&ctx->backing_sema);
+	}
+
+	prob = ctx->spu->problem;
+	out_be32(&prob->spu_npc_RW, *npc);
+
+	ret = spu_run(ctx->spu);
+
+	*status = in_be32(&prob->spu_status_R);
+	*npc = in_be32(&prob->spu_npc_RW);
+
+	up_write(&ctx->backing_sema);
+
+out:
+	return ret;
+}
+
+static ssize_t spufs_signal1_read(struct file *file, char __user *buf,
+			size_t len, loff_t *pos)
+{
+	struct spu_context *ctx;
+	struct spu_problem *prob;
+	u32 data;
+
+	ctx = file->private_data;
+	prob = ctx->spu->problem;
+
+	if (len < 4)
+		return -EINVAL;
+
+	data = in_be32(&prob->signal_notify1);
+	if (copy_to_user(buf, &data, 4))
+		return -EFAULT;
+
+	return 4;
+}
+
+static ssize_t spufs_signal1_write(struct file *file, const char __user *buf,
+			size_t len, loff_t *pos)
+{
+	struct spu_context *ctx;
+	struct spu_problem *prob;
+	u32 data;
+
+	ctx = file->private_data;
+	prob = ctx->spu->problem;
+
+	if (len < 4)
+		return -EINVAL;
+
+	if (copy_from_user(&data, buf, 4))
+		return -EFAULT;
+
+	out_be32(&prob->signal_notify1, data);
+
+	return 4;
+}
+
+static struct file_operations spufs_signal1_fops = {
+	.open = spufs_pipe_open,
+	.read = spufs_signal1_read,
+	.write = spufs_signal1_write,
+};
+
+static ssize_t spufs_signal2_read(struct file *file, char __user *buf,
+			size_t len, loff_t *pos)
+{
+	struct spu_context *ctx;
+	struct spu_problem *prob;
+	u32 data;
+
+	ctx = file->private_data;
+	prob = ctx->spu->problem;
+
+	if (len < 4)
+		return -EINVAL;
+
+	data = in_be32(&prob->signal_notify2);
+	if (copy_to_user(buf, &data, 4))
+		return -EFAULT;
+
+	return 4;
+}
+
+static ssize_t spufs_signal2_write(struct file *file, const char __user *buf,
+			size_t len, loff_t *pos)
+{
+	struct spu_context *ctx;
+	struct spu_problem *prob;
+	u32 data;
+
+	ctx = file->private_data;
+	prob = ctx->spu->problem;
+
+	if (len < 4)
+		return -EINVAL;
+
+	if (copy_from_user(&data, buf, 4))
+		return -EFAULT;
+
+	out_be32(&prob->signal_notify2, data);
+
+	return 4;
+}
+
+static struct file_operations spufs_signal2_fops = {
+	.open = spufs_pipe_open,
+	.read = spufs_signal2_read,
+	.write = spufs_signal2_write,
+};
+
+static void spufs_signal1_type_set(void *data, u64 val)
+{
+	struct spu_context *ctx = data;
+	struct spu_priv2 *priv2 = ctx->spu->priv2;
+	u64 tmp;
+
+	spin_lock_irq(&ctx->spu->register_lock);
+	tmp = in_be64(&priv2->spu_cfg_RW);
+	if (val)
+		tmp |= 1;
+	else
+		tmp &= ~1;
+	out_be64(&priv2->spu_cfg_RW, tmp);
+	spin_unlock_irq(&ctx->spu->register_lock);
+}
+
+static u64 spufs_signal1_type_get(void *data)
+{
+	struct spu_context *ctx = data;
+	return (in_be64(&ctx->spu->priv2->spu_cfg_RW) & 1) != 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(spufs_signal1_type, spufs_signal1_type_get,
+					spufs_signal1_type_set, "%llu");
+
+static void spufs_signal2_type_set(void *data, u64 val)
+{
+	struct spu_context *ctx = data;
+	struct spu_priv2 *priv2 = ctx->spu->priv2;
+	u64 tmp;
+
+	spin_lock_irq(&ctx->spu->register_lock);
+	tmp = in_be64(&priv2->spu_cfg_RW);
+	if (val)
+		tmp |= 2;
+	else
+		tmp &= ~2;
+	out_be64(&priv2->spu_cfg_RW, tmp);
+	spin_unlock_irq(&ctx->spu->register_lock);
+}
+
+static u64 spufs_signal2_type_get(void *data)
+{
+	struct spu_context *ctx = data;
+	return (in_be64(&ctx->spu->priv2->spu_cfg_RW) & 2) != 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(spufs_signal2_type, spufs_signal2_type_get,
+					spufs_signal2_type_set, "%llu");
+
+static void spufs_npc_set(void *data, u64 val)
+{
+	struct spu_context *ctx = data;
+	out_be32(&ctx->spu->problem->spu_npc_RW, val);
+}
+
+static u64 spufs_npc_get(void *data)
+{
+	struct spu_context *ctx = data;
+	u64 ret;
+	ret = in_be32(&ctx->spu->problem->spu_npc_RW);
+	return ret;
+}
+DEFINE_SIMPLE_ATTRIBUTE(spufs_npc_ops, spufs_npc_get, spufs_npc_set, "%llx\n")
+
+struct tree_descr spufs_dir_contents[] = {
+	{ "mem",  &spufs_mem_fops,  0666, },
+	{ "mbox", &spufs_mbox_fops, 0444, },
+	{ "ibox", &spufs_ibox_fops, 0444, },
+	{ "wbox", &spufs_wbox_fops, 0222, },
+	{ "mbox_stat", &spufs_mbox_stat_fops, 0444, },
+	{ "ibox_stat", &spufs_ibox_stat_fops, 0444, },
+	{ "wbox_stat", &spufs_wbox_stat_fops, 0444, },
+	{ "signal1", &spufs_signal1_fops, 0666, },
+	{ "signal2", &spufs_signal2_fops, 0666, },
+	{ "signal1_type", &spufs_signal1_type, 0666, },
+	{ "signal2_type", &spufs_signal2_type, 0666, },
+	{ "npc", &spufs_npc_ops, 0666, },
+	{},
+};
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
new file mode 100644
index 000000000000..f7aa0a6b1ce5
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -0,0 +1,470 @@
+/*
+ * SPU file system
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005
+ *
+ * Author: Arnd Bergmann <arndb@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/backing-dev.h>
+#include <linux/init.h>
+#include <linux/ioctl.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+
+#include <asm/io.h>
+#include <asm/semaphore.h>
+#include <asm/spu.h>
+#include <asm/uaccess.h>
+
+#include "spufs.h"
+
+static kmem_cache_t *spufs_inode_cache;
+
+/* Information about the backing dev, same as ramfs */
+#if 0
+static struct backing_dev_info spufs_backing_dev_info = {
+	.ra_pages       = 0,    /* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK |
+	  BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | BDI_CAP_READ_MAP |
+	  BDI_CAP_WRITE_MAP,
+};
+
+static struct address_space_operations spufs_aops = {
+	.readpage       = simple_readpage,
+	.prepare_write  = simple_prepare_write,
+	.commit_write   = simple_commit_write,
+};
+#endif
+
+/* Inode operations */
+
+static struct inode *
+spufs_alloc_inode(struct super_block *sb)
+{
+	struct spufs_inode_info *ei;
+
+	ei = kmem_cache_alloc(spufs_inode_cache, SLAB_KERNEL);
+	if (!ei)
+		return NULL;
+	return &ei->vfs_inode;
+}
+
+static void
+spufs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(spufs_inode_cache, SPUFS_I(inode));
+}
+
+static void
+spufs_init_once(void *p, kmem_cache_t * cachep, unsigned long flags)
+{
+	struct spufs_inode_info *ei = p;
+
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR) {
+		inode_init_once(&ei->vfs_inode);
+	}
+}
+
+static struct inode *
+spufs_new_inode(struct super_block *sb, int mode)
+{
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (!inode)
+		goto out;
+
+	inode->i_mode = mode;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_blocks = 0;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+out:
+	return inode;
+}
+
+static int
+spufs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+
+/*	dump_stack();
+	pr_debug("ia_size %lld, i_size:%lld\n", attr->ia_size, inode->i_size);
+*/
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    (attr->ia_size != inode->i_size))
+		return -EINVAL;
+	return inode_setattr(inode, attr);
+}
+
+
+static int
+spufs_new_file(struct super_block *sb, struct dentry *dentry,
+		struct file_operations *fops, int mode,
+		struct spu_context *ctx)
+{
+	static struct inode_operations spufs_file_iops = {
+		.getattr = simple_getattr,
+		.setattr = spufs_setattr,
+		.unlink  = simple_unlink,
+	};
+	struct inode *inode;
+	int ret;
+
+	ret = -ENOSPC;
+	inode = spufs_new_inode(sb, S_IFREG | mode);
+	if (!inode)
+		goto out;
+
+	ret = 0;
+	inode->i_op = &spufs_file_iops;
+	inode->i_fop = fops;
+	inode->u.generic_ip = SPUFS_I(inode)->i_ctx = get_spu_context(ctx);
+	d_add(dentry, inode);
+out:
+	return ret;
+}
+
+static void
+spufs_delete_inode(struct inode *inode)
+{
+	if (SPUFS_I(inode)->i_ctx)
+		put_spu_context(SPUFS_I(inode)->i_ctx);
+	clear_inode(inode);
+}
+
+static int
+spufs_fill_dir(struct dentry *dir, struct tree_descr *files,
+		int mode, struct spu_context *ctx)
+{
+	struct dentry *dentry;
+	int ret;
+
+	while (files->name && files->name[0]) {
+		ret = -ENOMEM;
+		dentry = d_alloc_name(dir, files->name);
+		if (!dentry)
+			goto out;
+		ret = spufs_new_file(dir->d_sb, dentry, files->ops,
+					files->mode & mode, ctx);
+		if (ret)
+			goto out;
+		files++;
+	}
+	return 0;
+out:
+	// FIXME: remove all files that are left
+
+	return ret;
+}
+
+static int spufs_rmdir(struct inode *root, struct dentry *dir_dentry)
+{
+	struct dentry *dentry;
+	int err;
+
+	spin_lock(&dcache_lock);
+	/* remove all entries */
+	err = 0;
+	list_for_each_entry(dentry, &dir_dentry->d_subdirs, d_child) {
+		if (d_unhashed(dentry) || !dentry->d_inode)
+			continue;
+		atomic_dec(&dentry->d_count);
+		spin_lock(&dentry->d_lock);
+		__d_drop(dentry);
+		spin_unlock(&dentry->d_lock);
+	}
+	spin_unlock(&dcache_lock);
+	if (!err) {
+		shrink_dcache_parent(dir_dentry);
+		err = simple_rmdir(root, dir_dentry);
+	}
+	return err;
+}
+
+static int spufs_dir_close(struct inode *inode, struct file *file)
+{
+	struct inode *dir;
+	struct dentry *dentry;
+	int ret;
+
+	dentry = file->f_dentry;
+	dir = dentry->d_parent->d_inode;
+	down(&dir->i_sem);
+	ret = spufs_rmdir(dir, file->f_dentry);
+	WARN_ON(ret);
+	up(&dir->i_sem);
+	return dcache_dir_close(inode, file);
+}
+
+struct inode_operations spufs_dir_inode_operations = {
+	.lookup = simple_lookup,
+};
+
+struct file_operations spufs_autodelete_dir_operations = {
+	.open		= dcache_dir_open,
+	.release	= spufs_dir_close,
+	.llseek		= dcache_dir_lseek,
+	.read		= generic_read_dir,
+	.readdir	= dcache_readdir,
+	.fsync		= simple_sync_file,
+};
+
+static int
+spufs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	int ret;
+	struct inode *inode;
+	struct spu_context *ctx;
+
+	ret = -ENOSPC;
+	inode = spufs_new_inode(dir->i_sb, mode | S_IFDIR);
+	if (!inode)
+		goto out;
+
+	if (dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		inode->i_mode &= S_ISGID;
+	}
+	ctx = alloc_spu_context();
+	SPUFS_I(inode)->i_ctx = ctx;
+	if (!ctx)
+		goto out_iput;
+
+	inode->i_op = &spufs_dir_inode_operations;
+	inode->i_fop = &simple_dir_operations;
+	ret = spufs_fill_dir(dentry, spufs_dir_contents, mode, ctx);
+	if (ret)
+		goto out_free_ctx;
+
+	d_instantiate(dentry, inode);
+	dget(dentry);
+	dir->i_nlink++;
+	goto out;
+
+out_free_ctx:
+	put_spu_context(ctx);
+out_iput:
+	iput(inode);
+out:
+	return ret;
+}
+
+long
+spufs_create_thread(struct nameidata *nd, const char *name,
+			unsigned int flags, mode_t mode)
+{
+	struct dentry *dentry;
+	struct file *filp;
+	int ret;
+
+	/* need to be at the root of spufs */
+	ret = -EINVAL;
+	if (nd->dentry->d_sb->s_magic != SPUFS_MAGIC ||
+		nd->dentry != nd->dentry->d_sb->s_root)
+		goto out;
+
+	dentry = lookup_create(nd, 1);
+	ret = PTR_ERR(dentry);
+	if (IS_ERR(dentry))
+		goto out_dir;
+
+	ret = -EEXIST;
+	if (dentry->d_inode)
+		goto out_dput;
+
+	mode &= ~current->fs->umask;
+	ret = spufs_mkdir(nd->dentry->d_inode, dentry, mode & S_IRWXUGO);
+	if (ret)
+		goto out_dput;
+
+	ret = get_unused_fd();
+	if (ret < 0)
+		goto out_dput;
+
+	dentry->d_inode->i_nlink++;
+
+	filp = filp_open(name, O_RDONLY, mode);
+	if (IS_ERR(filp)) {
+		// FIXME: remove directory again
+		put_unused_fd(ret);
+		ret = PTR_ERR(filp);
+	} else {
+		filp->f_op = &spufs_autodelete_dir_operations;
+		fd_install(ret, filp);
+	}
+
+out_dput:
+	dput(dentry);
+out_dir:
+	up(&nd->dentry->d_inode->i_sem);
+out:
+	return ret;
+}
+
+/* File system initialization */
+enum {
+	Opt_uid, Opt_gid, Opt_err,
+};
+
+static match_table_t spufs_tokens = {
+	{ Opt_uid, "uid=%d" },
+	{ Opt_gid, "gid=%d" },
+	{ Opt_err, NULL  },
+};
+
+static int
+spufs_parse_options(char *options, struct inode *root)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token, option;
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, spufs_tokens, args);
+		switch (token) {
+		case Opt_uid:
+			if (match_int(&args[0], &option))
+				return 0;
+			root->i_uid = option;
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				return 0;
+			root->i_gid = option;
+			break;
+		default:
+			return 0;
+		}
+	}
+	return 1;
+}
+
+static int
+spufs_create_root(struct super_block *sb, void *data) {
+	struct inode *inode;
+	int ret;
+
+	ret = -ENOMEM;
+	inode = spufs_new_inode(sb, S_IFDIR | 0775);
+	if (!inode)
+		goto out;
+
+	inode->i_op = &spufs_dir_inode_operations;
+	inode->i_fop = &simple_dir_operations;
+	SPUFS_I(inode)->i_ctx = NULL;
+
+	ret = -EINVAL;
+	if (!spufs_parse_options(data, inode))
+		goto out_iput;
+
+	ret = -ENOMEM;
+	sb->s_root = d_alloc_root(inode);
+	if (!sb->s_root)
+		goto out_iput;
+
+	return 0;
+out_iput:
+	iput(inode);
+out:
+	return ret;
+}
+
+static int
+spufs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	static struct super_operations s_ops = {
+		.alloc_inode = spufs_alloc_inode,
+		.destroy_inode = spufs_destroy_inode,
+		.statfs = simple_statfs,
+		.delete_inode = spufs_delete_inode,
+		.drop_inode = generic_delete_inode,
+	};
+
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = SPUFS_MAGIC;
+	sb->s_op = &s_ops;
+
+	return spufs_create_root(sb, data);
+}
+
+static struct super_block *
+spufs_get_sb(struct file_system_type *fstype, int flags,
+		const char *name, void *data)
+{
+	return get_sb_single(fstype, flags, data, spufs_fill_super);
+}
+
+static struct file_system_type spufs_type = {
+	.owner = THIS_MODULE,
+	.name = "spufs",
+	.get_sb = spufs_get_sb,
+	.kill_sb = kill_litter_super,
+};
+
+static int spufs_init(void)
+{
+	int ret;
+	ret = -ENOMEM;
+	spufs_inode_cache = kmem_cache_create("spufs_inode_cache",
+			sizeof(struct spufs_inode_info), 0,
+			SLAB_HWCACHE_ALIGN, spufs_init_once, NULL);
+
+	if (!spufs_inode_cache)
+		goto out;
+	ret = register_filesystem(&spufs_type);
+	if (ret)
+		goto out_cache;
+	ret = register_spu_syscalls(&spufs_calls);
+	if (ret)
+		goto out_fs;
+	return 0;
+out_fs:
+	unregister_filesystem(&spufs_type);
+out_cache:
+	kmem_cache_destroy(spufs_inode_cache);
+out:
+	return ret;
+}
+module_init(spufs_init);
+
+static void spufs_exit(void)
+{
+	unregister_spu_syscalls(&spufs_calls);
+	unregister_filesystem(&spufs_type);
+	kmem_cache_destroy(spufs_inode_cache);
+}
+module_exit(spufs_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnd Bergmann <arndb@de.ibm.com>");
+
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
new file mode 100644
index 000000000000..b37fe797ea1c
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -0,0 +1,71 @@
+/*
+ * SPU file system
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005
+ *
+ * Author: Arnd Bergmann <arndb@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef SPUFS_H
+#define SPUFS_H
+
+#include <linux/kref.h>
+#include <linux/rwsem.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+
+#include <asm/spu.h>
+
+/* The magic number for our file system */
+enum {
+	SPUFS_MAGIC = 0x23c9b64e,
+};
+
+struct spu_context {
+	struct spu *spu;		  /* pointer to a physical SPU */
+	struct rw_semaphore backing_sema; /* protects the above */
+	spinlock_t mmio_lock;		  /* protects mmio access */
+
+	struct kref kref;
+};
+
+struct spufs_inode_info {
+	struct spu_context *i_ctx;
+	struct inode vfs_inode;
+};
+#define SPUFS_I(inode) \
+	container_of(inode, struct spufs_inode_info, vfs_inode)
+
+extern struct tree_descr spufs_dir_contents[];
+
+/* system call implementation */
+long spufs_run_spu(struct file *file,
+		   struct spu_context *ctx, u32 *npc, u32 *status);
+long spufs_create_thread(struct nameidata *nd, const char *name,
+			 unsigned int flags, mode_t mode);
+
+/* context management */
+struct spu_context * alloc_spu_context(void);
+void destroy_spu_context(struct kref *kref);
+struct spu_context * get_spu_context(struct spu_context *ctx);
+int put_spu_context(struct spu_context *ctx);
+
+void spu_acquire(struct spu_context *ctx);
+void spu_release(struct spu_context *ctx);
+void spu_acquire_runnable(struct spu_context *ctx);
+void spu_acquire_saved(struct spu_context *ctx);
+
+#endif
diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c
new file mode 100644
index 000000000000..3f71bb5e9d8e
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spufs/syscalls.c
@@ -0,0 +1,106 @@
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+
+#include <asm/uaccess.h>
+
+#include "spufs.h"
+
+/**
+ * sys_spu_run - run code loaded into an SPU
+ *
+ * @unpc:    next program counter for the SPU
+ * @ustatus: status of the SPU
+ *
+ * This system call transfers the control of execution of a
+ * user space thread to an SPU. It will return when the
+ * SPU has finished executing or when it hits an error
+ * condition and it will be interrupted if a signal needs
+ * to be delivered to a handler in user space.
+ *
+ * The next program counter is set to the passed value
+ * before the SPU starts fetching code and the user space
+ * pointer gets updated with the new value when returning
+ * from kernel space.
+ *
+ * The status value returned from spu_run reflects the
+ * value of the spu_status register after the SPU has stopped.
+ *
+ */
+long do_spu_run(struct file *filp, __u32 __user *unpc, __u32 __user *ustatus)
+{
+	long ret;
+	struct spufs_inode_info *i;
+	u32 npc, status;
+
+	ret = -EFAULT;
+	if (get_user(npc, unpc))
+		goto out;
+
+	ret = -EINVAL;
+	if (filp->f_vfsmnt->mnt_sb->s_magic != SPUFS_MAGIC)
+		goto out;
+
+	i = SPUFS_I(filp->f_dentry->d_inode);
+	ret = spufs_run_spu(filp, i->i_ctx, &npc, &status);
+
+	if (ret ==-EAGAIN || ret == -EIO)
+		ret = status;
+
+	if (put_user(npc, unpc))
+		ret = -EFAULT;
+
+	if (ustatus && put_user(status, ustatus))
+		ret = -EFAULT;
+out:
+	return ret;
+}
+
+#ifndef MODULE
+asmlinkage long sys_spu_run(int fd, __u32 __user *unpc, __u32 __user *ustatus)
+{
+	int fput_needed;
+	struct file *filp;
+	long ret;
+
+	ret = -EBADF;
+	filp = fget_light(fd, &fput_needed);
+	if (filp) {
+		ret = do_spu_run(filp, unpc, ustatus);
+		fput_light(filp, fput_needed);
+	}
+
+	return ret;
+}
+#endif
+
+asmlinkage long sys_spu_create(const char __user *pathname,
+					unsigned int flags, mode_t mode)
+{
+	char *tmp;
+	int ret;
+
+	tmp = getname(pathname);
+	ret = PTR_ERR(tmp);
+	if (!IS_ERR(tmp)) {
+		struct nameidata nd;
+
+		ret = path_lookup(tmp, LOOKUP_PARENT|
+				LOOKUP_OPEN|LOOKUP_CREATE, &nd);
+		if (!ret) {
+			ret = spufs_create_thread(&nd, pathname, flags, mode);
+			path_release(&nd);
+		}
+		putname(tmp);
+	}
+
+	return ret;
+}
+
+struct spufs_calls spufs_calls = {
+	.create_thread = sys_spu_create,
+	.spu_run = do_spu_run,
+	.owner = THIS_MODULE,
+};
diff --git a/arch/ppc/kernel/ppc_ksyms.c b/arch/ppc/kernel/ppc_ksyms.c
index 28f1082e5040..95075f99a6d4 100644
--- a/arch/ppc/kernel/ppc_ksyms.c
+++ b/arch/ppc/kernel/ppc_ksyms.c
@@ -307,7 +307,6 @@ EXPORT_SYMBOL(__res);
 
 EXPORT_SYMBOL(next_mmu_context);
 EXPORT_SYMBOL(set_context);
-EXPORT_SYMBOL_GPL(__handle_mm_fault); /* For MOL */
 EXPORT_SYMBOL(disarm_decr);
 #ifdef CONFIG_PPC_STD_MMU
 extern long mol_trampoline;
diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h
new file mode 100644
index 000000000000..b036385cd831
--- /dev/null
+++ b/include/asm-powerpc/spu.h
@@ -0,0 +1,498 @@
+/*
+ * SPU core / file system interface and HW structures
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005
+ *
+ * Author: Arnd Bergmann <arndb@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _SPU_H
+#define _SPU_H
+#include <linux/config.h>
+#include <linux/kref.h>
+#include <linux/workqueue.h>
+
+#define LS_ORDER (6)		/* 256 kb */
+
+#define LS_SIZE (PAGE_SIZE << LS_ORDER)
+
+struct spu {
+	char *name;
+	unsigned long local_store_phys;
+	u8 *local_store;
+	struct spu_problem __iomem *problem;
+	struct spu_priv1 __iomem *priv1;
+	struct spu_priv2 __iomem *priv2;
+	struct list_head list;
+	int number;
+	u32 isrc;
+	u32 node;
+	struct kref kref;
+	size_t ls_size;
+	unsigned int slb_replace;
+	struct mm_struct *mm;
+	int class_0_pending;
+	spinlock_t register_lock;
+
+	u32 stop_code;
+	wait_queue_head_t stop_wq;
+	wait_queue_head_t ibox_wq;
+	wait_queue_head_t wbox_wq;
+	struct fasync_struct *ibox_fasync;
+	struct fasync_struct *wbox_fasync;
+
+	char irq_c0[8];
+	char irq_c1[8];
+	char irq_c2[8];
+};
+
+struct spu *spu_alloc(void);
+void spu_free(struct spu *spu);
+int spu_run(struct spu *spu);
+
+size_t spu_wbox_write(struct spu *spu, u32 data);
+size_t spu_ibox_read(struct spu *spu, u32 *data);
+
+extern struct spufs_calls {
+	asmlinkage long (*create_thread)(const char __user *name,
+					unsigned int flags, mode_t mode);
+	asmlinkage long (*spu_run)(struct file *filp, __u32 __user *unpc,
+						__u32 __user *ustatus);
+	struct module *owner;
+} spufs_calls;
+
+#ifdef CONFIG_SPU_FS_MODULE
+int register_spu_syscalls(struct spufs_calls *calls);
+void unregister_spu_syscalls(struct spufs_calls *calls);
+#else
+static inline int register_spu_syscalls(struct spufs_calls *calls)
+{
+	return 0;
+}
+static inline void unregister_spu_syscalls(struct spufs_calls *calls)
+{
+}
+#endif /* MODULE */
+
+
+/*
+ * This defines the Local Store, Problem Area and Privlege Area of an SPU.
+ */
+
+union mfc_tag_size_class_cmd {
+	struct {
+		u16 mfc_size;
+		u16 mfc_tag;
+		u8  pad;
+		u8  mfc_rclassid;
+		u16 mfc_cmd;
+	} u;
+	struct {
+		u32 mfc_size_tag32;
+		u32 mfc_class_cmd32;
+	} by32;
+	u64 all64;
+};
+
+struct mfc_cq_sr {
+	u64 mfc_cq_data0_RW;
+	u64 mfc_cq_data1_RW;
+	u64 mfc_cq_data2_RW;
+	u64 mfc_cq_data3_RW;
+};
+
+struct spu_problem {
+#define MS_SYNC_PENDING         1L
+	u64 spc_mssync_RW;					/* 0x0000 */
+	u8  pad_0x0008_0x3000[0x3000 - 0x0008];
+
+	/* DMA Area */
+	u8  pad_0x3000_0x3004[0x4];				/* 0x3000 */
+	u32 mfc_lsa_W;						/* 0x3004 */
+	u64 mfc_ea_W;						/* 0x3008 */
+	union mfc_tag_size_class_cmd mfc_union_W;			/* 0x3010 */
+	u8  pad_0x3018_0x3104[0xec];				/* 0x3018 */
+	u32 dma_qstatus_R;					/* 0x3104 */
+	u8  pad_0x3108_0x3204[0xfc];				/* 0x3108 */
+	u32 dma_querytype_RW;					/* 0x3204 */
+	u8  pad_0x3208_0x321c[0x14];				/* 0x3208 */
+	u32 dma_querymask_RW;					/* 0x321c */
+	u8  pad_0x3220_0x322c[0xc];				/* 0x3220 */
+	u32 dma_tagstatus_R;					/* 0x322c */
+#define DMA_TAGSTATUS_INTR_ANY	1u
+#define DMA_TAGSTATUS_INTR_ALL	2u
+	u8  pad_0x3230_0x4000[0x4000 - 0x3230]; 		/* 0x3230 */
+
+	/* SPU Control Area */
+	u8  pad_0x4000_0x4004[0x4];				/* 0x4000 */
+	u32 pu_mb_R;						/* 0x4004 */
+	u8  pad_0x4008_0x400c[0x4];				/* 0x4008 */
+	u32 spu_mb_W;						/* 0x400c */
+	u8  pad_0x4010_0x4014[0x4];				/* 0x4010 */
+	u32 mb_stat_R;						/* 0x4014 */
+	u8  pad_0x4018_0x401c[0x4];				/* 0x4018 */
+	u32 spu_runcntl_RW;					/* 0x401c */
+#define SPU_RUNCNTL_STOP	0L
+#define SPU_RUNCNTL_RUNNABLE	1L
+	u8  pad_0x4020_0x4024[0x4];				/* 0x4020 */
+	u32 spu_status_R;					/* 0x4024 */
+#define SPU_STOP_STATUS_SHIFT           16
+#define SPU_STATUS_STOPPED		0x0
+#define SPU_STATUS_RUNNING		0x1
+#define SPU_STATUS_STOPPED_BY_STOP	0x2
+#define SPU_STATUS_STOPPED_BY_HALT	0x4
+#define SPU_STATUS_WAITING_FOR_CHANNEL	0x8
+#define SPU_STATUS_SINGLE_STEP		0x10
+#define SPU_STATUS_INVALID_INSTR        0x20
+#define SPU_STATUS_INVALID_CH           0x40
+#define SPU_STATUS_ISOLATED_STATE       0x80
+#define SPU_STATUS_ISOLATED_LOAD_STAUTUS 0x200
+#define SPU_STATUS_ISOLATED_EXIT_STAUTUS 0x400
+	u8  pad_0x4028_0x402c[0x4];				/* 0x4028 */
+	u32 spu_spe_R;						/* 0x402c */
+	u8  pad_0x4030_0x4034[0x4];				/* 0x4030 */
+	u32 spu_npc_RW;						/* 0x4034 */
+	u8  pad_0x4038_0x14000[0x14000 - 0x4038];		/* 0x4038 */
+
+	/* Signal Notification Area */
+	u8  pad_0x14000_0x1400c[0xc];				/* 0x14000 */
+	u32 signal_notify1;					/* 0x1400c */
+	u8  pad_0x14010_0x1c00c[0x7ffc];			/* 0x14010 */
+	u32 signal_notify2;					/* 0x1c00c */
+} __attribute__ ((aligned(0x20000)));
+
+/* SPU Privilege 2 State Area */
+struct spu_priv2 {
+	/* MFC Registers */
+	u8  pad_0x0000_0x1100[0x1100 - 0x0000]; 		/* 0x0000 */
+
+	/* SLB Management Registers */
+	u8  pad_0x1100_0x1108[0x8];				/* 0x1100 */
+	u64 slb_index_W;					/* 0x1108 */
+#define SLB_INDEX_MASK				0x7L
+	u64 slb_esid_RW;					/* 0x1110 */
+	u64 slb_vsid_RW;					/* 0x1118 */
+#define SLB_VSID_SUPERVISOR_STATE	(0x1ull << 11)
+#define SLB_VSID_SUPERVISOR_STATE_MASK	(0x1ull << 11)
+#define SLB_VSID_PROBLEM_STATE		(0x1ull << 10)
+#define SLB_VSID_PROBLEM_STATE_MASK	(0x1ull << 10)
+#define SLB_VSID_EXECUTE_SEGMENT	(0x1ull << 9)
+#define SLB_VSID_NO_EXECUTE_SEGMENT	(0x1ull << 9)
+#define SLB_VSID_EXECUTE_SEGMENT_MASK	(0x1ull << 9)
+#define SLB_VSID_4K_PAGE		(0x0 << 8)
+#define SLB_VSID_LARGE_PAGE		(0x1ull << 8)
+#define SLB_VSID_PAGE_SIZE_MASK		(0x1ull << 8)
+#define SLB_VSID_CLASS_MASK		(0x1ull << 7)
+#define SLB_VSID_VIRTUAL_PAGE_SIZE_MASK	(0x1ull << 6)
+	u64 slb_invalidate_entry_W;				/* 0x1120 */
+	u64 slb_invalidate_all_W;				/* 0x1128 */
+	u8  pad_0x1130_0x2000[0x2000 - 0x1130]; 		/* 0x1130 */
+
+	/* Context Save / Restore Area */
+	struct mfc_cq_sr spuq[16];				/* 0x2000 */
+	struct mfc_cq_sr puq[8];				/* 0x2200 */
+	u8  pad_0x2300_0x3000[0x3000 - 0x2300]; 		/* 0x2300 */
+
+	/* MFC Control */
+	u64 mfc_control_RW;					/* 0x3000 */
+#define MFC_CNTL_RESUME_DMA_QUEUE		(0ull << 0)
+#define MFC_CNTL_SUSPEND_DMA_QUEUE		(1ull << 0)
+#define MFC_CNTL_SUSPEND_DMA_QUEUE_MASK		(1ull << 0)
+#define MFC_CNTL_NORMAL_DMA_QUEUE_OPERATION	(0ull << 8)
+#define MFC_CNTL_SUSPEND_IN_PROGRESS		(1ull << 8)
+#define MFC_CNTL_SUSPEND_COMPLETE		(3ull << 8)
+#define MFC_CNTL_SUSPEND_DMA_STATUS_MASK	(3ull << 8)
+#define MFC_CNTL_DMA_QUEUES_EMPTY		(1ull << 14)
+#define MFC_CNTL_DMA_QUEUES_EMPTY_MASK		(1ull << 14)
+#define MFC_CNTL_PURGE_DMA_REQUEST		(1ull << 15)
+#define MFC_CNTL_PURGE_DMA_IN_PROGRESS		(1ull << 24)
+#define MFC_CNTL_PURGE_DMA_COMPLETE		(3ull << 24)
+#define MFC_CNTL_PURGE_DMA_STATUS_MASK		(3ull << 24)
+#define MFC_CNTL_RESTART_DMA_COMMAND		(1ull << 32)
+#define MFC_CNTL_DMA_COMMAND_REISSUE_PENDING	(1ull << 32)
+#define MFC_CNTL_DMA_COMMAND_REISSUE_STATUS_MASK (1ull << 32)
+#define MFC_CNTL_MFC_PRIVILEGE_STATE		(2ull << 33)
+#define MFC_CNTL_MFC_PROBLEM_STATE		(3ull << 33)
+#define MFC_CNTL_MFC_KEY_PROTECTION_STATE_MASK	(3ull << 33)
+#define MFC_CNTL_DECREMENTER_HALTED		(1ull << 35)
+#define MFC_CNTL_DECREMENTER_RUNNING		(1ull << 40)
+#define MFC_CNTL_DECREMENTER_STATUS_MASK	(1ull << 40)
+	u8  pad_0x3008_0x4000[0x4000 - 0x3008]; 		/* 0x3008 */
+
+	/* Interrupt Mailbox */
+	u64 puint_mb_R;						/* 0x4000 */
+	u8  pad_0x4008_0x4040[0x4040 - 0x4008]; 		/* 0x4008 */
+
+	/* SPU Control */
+	u64 spu_privcntl_RW;					/* 0x4040 */
+#define SPU_PRIVCNTL_MODE_NORMAL		(0x0ull << 0)
+#define SPU_PRIVCNTL_MODE_SINGLE_STEP		(0x1ull << 0)
+#define SPU_PRIVCNTL_MODE_MASK			(0x1ull << 0)
+#define SPU_PRIVCNTL_NO_ATTENTION_EVENT		(0x0ull << 1)
+#define SPU_PRIVCNTL_ATTENTION_EVENT		(0x1ull << 1)
+#define SPU_PRIVCNTL_ATTENTION_EVENT_MASK	(0x1ull << 1)
+#define SPU_PRIVCNT_LOAD_REQUEST_NORMAL		(0x0ull << 2)
+#define SPU_PRIVCNT_LOAD_REQUEST_ENABLE_MASK	(0x1ull << 2)
+	u8  pad_0x4048_0x4058[0x10];				/* 0x4048 */
+	u64 spu_lslr_RW;					/* 0x4058 */
+	u64 spu_chnlcntptr_RW;					/* 0x4060 */
+	u64 spu_chnlcnt_RW;					/* 0x4068 */
+	u64 spu_chnldata_RW;					/* 0x4070 */
+	u64 spu_cfg_RW;						/* 0x4078 */
+	u8  pad_0x4080_0x5000[0x5000 - 0x4080]; 		/* 0x4080 */
+
+	/* PV2_ImplRegs: Implementation-specific privileged-state 2 regs */
+	u64 spu_pm_trace_tag_status_RW;				/* 0x5000 */
+	u64 spu_tag_status_query_RW;				/* 0x5008 */
+#define TAG_STATUS_QUERY_CONDITION_BITS (0x3ull << 32)
+#define TAG_STATUS_QUERY_MASK_BITS (0xffffffffull)
+	u64 spu_cmd_buf1_RW;					/* 0x5010 */
+#define SPU_COMMAND_BUFFER_1_LSA_BITS (0x7ffffull << 32)
+#define SPU_COMMAND_BUFFER_1_EAH_BITS (0xffffffffull)
+	u64 spu_cmd_buf2_RW;					/* 0x5018 */
+#define SPU_COMMAND_BUFFER_2_EAL_BITS ((0xffffffffull) << 32)
+#define SPU_COMMAND_BUFFER_2_TS_BITS (0xffffull << 16)
+#define SPU_COMMAND_BUFFER_2_TAG_BITS (0x3full)
+	u64 spu_atomic_status_RW;				/* 0x5020 */
+} __attribute__ ((aligned(0x20000)));
+
+/* SPU Privilege 1 State Area */
+struct spu_priv1 {
+	/* Control and Configuration Area */
+	u64 mfc_sr1_RW;						/* 0x000 */
+#define MFC_STATE1_LOCAL_STORAGE_DECODE_MASK	0x01ull
+#define MFC_STATE1_BUS_TLBIE_MASK		0x02ull
+#define MFC_STATE1_REAL_MODE_OFFSET_ENABLE_MASK	0x04ull
+#define MFC_STATE1_PROBLEM_STATE_MASK		0x08ull
+#define MFC_STATE1_RELOCATE_MASK		0x10ull
+#define MFC_STATE1_MASTER_RUN_CONTROL_MASK	0x20ull
+	u64 mfc_lpid_RW;					/* 0x008 */
+	u64 spu_idr_RW;						/* 0x010 */
+	u64 mfc_vr_RO;						/* 0x018 */
+#define MFC_VERSION_BITS		(0xffff << 16)
+#define MFC_REVISION_BITS		(0xffff)
+#define MFC_GET_VERSION_BITS(vr)	(((vr) & MFC_VERSION_BITS) >> 16)
+#define MFC_GET_REVISION_BITS(vr)	((vr) & MFC_REVISION_BITS)
+	u64 spu_vr_RO;						/* 0x020 */
+#define SPU_VERSION_BITS		(0xffff << 16)
+#define SPU_REVISION_BITS		(0xffff)
+#define SPU_GET_VERSION_BITS(vr)	(vr & SPU_VERSION_BITS) >> 16
+#define SPU_GET_REVISION_BITS(vr)	(vr & SPU_REVISION_BITS)
+	u8  pad_0x28_0x100[0x100 - 0x28];			/* 0x28 */
+
+
+	/* Interrupt Area */
+	u64 int_mask_class0_RW;					/* 0x100 */
+#define CLASS0_ENABLE_DMA_ALIGNMENT_INTR		0x1L
+#define CLASS0_ENABLE_INVALID_DMA_COMMAND_INTR		0x2L
+#define CLASS0_ENABLE_SPU_ERROR_INTR			0x4L
+#define CLASS0_ENABLE_MFC_FIR_INTR			0x8L
+	u64 int_mask_class1_RW;					/* 0x108 */
+#define CLASS1_ENABLE_SEGMENT_FAULT_INTR		0x1L
+#define CLASS1_ENABLE_STORAGE_FAULT_INTR		0x2L
+#define CLASS1_ENABLE_LS_COMPARE_SUSPEND_ON_GET_INTR	0x4L
+#define CLASS1_ENABLE_LS_COMPARE_SUSPEND_ON_PUT_INTR	0x8L
+	u64 int_mask_class2_RW;					/* 0x110 */
+#define CLASS2_ENABLE_MAILBOX_INTR			0x1L
+#define CLASS2_ENABLE_SPU_STOP_INTR			0x2L
+#define CLASS2_ENABLE_SPU_HALT_INTR			0x4L
+#define CLASS2_ENABLE_SPU_DMA_TAG_GROUP_COMPLETE_INTR	0x8L
+	u8  pad_0x118_0x140[0x28];				/* 0x118 */
+	u64 int_stat_class0_RW;					/* 0x140 */
+	u64 int_stat_class1_RW;					/* 0x148 */
+	u64 int_stat_class2_RW;					/* 0x150 */
+	u8  pad_0x158_0x180[0x28];				/* 0x158 */
+	u64 int_route_RW;					/* 0x180 */
+
+	/* Interrupt Routing */
+	u8  pad_0x188_0x200[0x200 - 0x188];			/* 0x188 */
+
+	/* Atomic Unit Control Area */
+	u64 mfc_atomic_flush_RW;				/* 0x200 */
+#define mfc_atomic_flush_enable			0x1L
+	u8  pad_0x208_0x280[0x78];				/* 0x208 */
+	u64 resource_allocation_groupID_RW;			/* 0x280 */
+	u64 resource_allocation_enable_RW; 			/* 0x288 */
+	u8  pad_0x290_0x3c8[0x3c8 - 0x290];			/* 0x290 */
+
+	/* SPU_Cache_ImplRegs: Implementation-dependent cache registers */
+
+	u64 smf_sbi_signal_sel;					/* 0x3c8 */
+#define smf_sbi_mask_lsb	56
+#define smf_sbi_shift		(63 - smf_sbi_mask_lsb)
+#define smf_sbi_mask		(0x301LL << smf_sbi_shift)
+#define smf_sbi_bus0_bits	(0x001LL << smf_sbi_shift)
+#define smf_sbi_bus2_bits	(0x100LL << smf_sbi_shift)
+#define smf_sbi2_bus0_bits	(0x201LL << smf_sbi_shift)
+#define smf_sbi2_bus2_bits	(0x300LL << smf_sbi_shift)
+	u64 smf_ato_signal_sel;					/* 0x3d0 */
+#define smf_ato_mask_lsb	35
+#define smf_ato_shift		(63 - smf_ato_mask_lsb)
+#define smf_ato_mask		(0x3LL << smf_ato_shift)
+#define smf_ato_bus0_bits	(0x2LL << smf_ato_shift)
+#define smf_ato_bus2_bits	(0x1LL << smf_ato_shift)
+	u8  pad_0x3d8_0x400[0x400 - 0x3d8];			/* 0x3d8 */
+
+	/* TLB Management Registers */
+	u64 mfc_sdr_RW;						/* 0x400 */
+	u8  pad_0x408_0x500[0xf8];				/* 0x408 */
+	u64 tlb_index_hint_RO;					/* 0x500 */
+	u64 tlb_index_W;					/* 0x508 */
+	u64 tlb_vpn_RW;						/* 0x510 */
+	u64 tlb_rpn_RW;						/* 0x518 */
+	u8  pad_0x520_0x540[0x20];				/* 0x520 */
+	u64 tlb_invalidate_entry_W;				/* 0x540 */
+	u64 tlb_invalidate_all_W;				/* 0x548 */
+	u8  pad_0x550_0x580[0x580 - 0x550];			/* 0x550 */
+
+	/* SPU_MMU_ImplRegs: Implementation-dependent MMU registers */
+	u64 smm_hid;						/* 0x580 */
+#define PAGE_SIZE_MASK		0xf000000000000000ull
+#define PAGE_SIZE_16MB_64KB	0x2000000000000000ull
+	u8  pad_0x588_0x600[0x600 - 0x588];			/* 0x588 */
+
+	/* MFC Status/Control Area */
+	u64 mfc_accr_RW;					/* 0x600 */
+#define MFC_ACCR_EA_ACCESS_GET		(1 << 0)
+#define MFC_ACCR_EA_ACCESS_PUT		(1 << 1)
+#define MFC_ACCR_LS_ACCESS_GET		(1 << 3)
+#define MFC_ACCR_LS_ACCESS_PUT		(1 << 4)
+	u8  pad_0x608_0x610[0x8];				/* 0x608 */
+	u64 mfc_dsisr_RW;					/* 0x610 */
+#define MFC_DSISR_PTE_NOT_FOUND		(1 << 30)
+#define MFC_DSISR_ACCESS_DENIED		(1 << 27)
+#define MFC_DSISR_ATOMIC		(1 << 26)
+#define MFC_DSISR_ACCESS_PUT		(1 << 25)
+#define MFC_DSISR_ADDR_MATCH		(1 << 22)
+#define MFC_DSISR_LS			(1 << 17)
+#define MFC_DSISR_L			(1 << 16)
+#define MFC_DSISR_ADDRESS_OVERFLOW	(1 << 0)
+	u8  pad_0x618_0x620[0x8];				/* 0x618 */
+	u64 mfc_dar_RW;						/* 0x620 */
+	u8  pad_0x628_0x700[0x700 - 0x628];			/* 0x628 */
+
+	/* Replacement Management Table (RMT) Area */
+	u64 rmt_index_RW;					/* 0x700 */
+	u8  pad_0x708_0x710[0x8];				/* 0x708 */
+	u64 rmt_data1_RW;					/* 0x710 */
+	u8  pad_0x718_0x800[0x800 - 0x718];			/* 0x718 */
+
+	/* Control/Configuration Registers */
+	u64 mfc_dsir_R;						/* 0x800 */
+#define MFC_DSIR_Q			(1 << 31)
+#define MFC_DSIR_SPU_QUEUE		MFC_DSIR_Q
+	u64 mfc_lsacr_RW;					/* 0x808 */
+#define MFC_LSACR_COMPARE_MASK		((~0ull) << 32)
+#define MFC_LSACR_COMPARE_ADDR		((~0ull) >> 32)
+	u64 mfc_lscrr_R;					/* 0x810 */
+#define MFC_LSCRR_Q			(1 << 31)
+#define MFC_LSCRR_SPU_QUEUE		MFC_LSCRR_Q
+#define MFC_LSCRR_QI_SHIFT		32
+#define MFC_LSCRR_QI_MASK		((~0ull) << MFC_LSCRR_QI_SHIFT)
+	u8  pad_0x818_0x820[0x8];				/* 0x818 */
+	u64 mfc_tclass_id_RW;					/* 0x820 */
+#define MFC_TCLASS_ID_ENABLE		(1L << 0L)
+#define MFC_TCLASS_SLOT2_ENABLE		(1L << 5L)
+#define MFC_TCLASS_SLOT1_ENABLE		(1L << 6L)
+#define MFC_TCLASS_SLOT0_ENABLE		(1L << 7L)
+#define MFC_TCLASS_QUOTA_2_SHIFT	8L
+#define MFC_TCLASS_QUOTA_1_SHIFT	16L
+#define MFC_TCLASS_QUOTA_0_SHIFT	24L
+#define MFC_TCLASS_QUOTA_2_MASK		(0x1FL << MFC_TCLASS_QUOTA_2_SHIFT)
+#define MFC_TCLASS_QUOTA_1_MASK		(0x1FL << MFC_TCLASS_QUOTA_1_SHIFT)
+#define MFC_TCLASS_QUOTA_0_MASK		(0x1FL << MFC_TCLASS_QUOTA_0_SHIFT)
+	u8  pad_0x828_0x900[0x900 - 0x828];			/* 0x828 */
+
+	/* Real Mode Support Registers */
+	u64 mfc_rm_boundary;					/* 0x900 */
+	u8  pad_0x908_0x938[0x30];				/* 0x908 */
+	u64 smf_dma_signal_sel;					/* 0x938 */
+#define mfc_dma1_mask_lsb	41
+#define mfc_dma1_shift		(63 - mfc_dma1_mask_lsb)
+#define mfc_dma1_mask		(0x3LL << mfc_dma1_shift)
+#define mfc_dma1_bits		(0x1LL << mfc_dma1_shift)
+#define mfc_dma2_mask_lsb	43
+#define mfc_dma2_shift		(63 - mfc_dma2_mask_lsb)
+#define mfc_dma2_mask		(0x3LL << mfc_dma2_shift)
+#define mfc_dma2_bits		(0x1LL << mfc_dma2_shift)
+	u8  pad_0x940_0xa38[0xf8];				/* 0x940 */
+	u64 smm_signal_sel;					/* 0xa38 */
+#define smm_sig_mask_lsb	12
+#define smm_sig_shift		(63 - smm_sig_mask_lsb)
+#define smm_sig_mask		(0x3LL << smm_sig_shift)
+#define smm_sig_bus0_bits	(0x2LL << smm_sig_shift)
+#define smm_sig_bus2_bits	(0x1LL << smm_sig_shift)
+	u8  pad_0xa40_0xc00[0xc00 - 0xa40];			/* 0xa40 */
+
+	/* DMA Command Error Area */
+	u64 mfc_cer_R;						/* 0xc00 */
+#define MFC_CER_Q		(1 << 31)
+#define MFC_CER_SPU_QUEUE	MFC_CER_Q
+	u8  pad_0xc08_0x1000[0x1000 - 0xc08];			/* 0xc08 */
+
+	/* PV1_ImplRegs: Implementation-dependent privileged-state 1 regs */
+	/* DMA Command Error Area */
+	u64 spu_ecc_cntl_RW;					/* 0x1000 */
+#define SPU_ECC_CNTL_E			(1ull << 0ull)
+#define SPU_ECC_CNTL_ENABLE		SPU_ECC_CNTL_E
+#define SPU_ECC_CNTL_DISABLE		(~SPU_ECC_CNTL_E & 1L)
+#define SPU_ECC_CNTL_S			(1ull << 1ull)
+#define SPU_ECC_STOP_AFTER_ERROR	SPU_ECC_CNTL_S
+#define SPU_ECC_CONTINUE_AFTER_ERROR	(~SPU_ECC_CNTL_S & 2L)
+#define SPU_ECC_CNTL_B			(1ull << 2ull)
+#define SPU_ECC_BACKGROUND_ENABLE	SPU_ECC_CNTL_B
+#define SPU_ECC_BACKGROUND_DISABLE	(~SPU_ECC_CNTL_B & 4L)
+#define SPU_ECC_CNTL_I_SHIFT		3ull
+#define SPU_ECC_CNTL_I_MASK		(3ull << SPU_ECC_CNTL_I_SHIFT)
+#define SPU_ECC_WRITE_ALWAYS		(~SPU_ECC_CNTL_I & 12L)
+#define SPU_ECC_WRITE_CORRECTABLE	(1ull << SPU_ECC_CNTL_I_SHIFT)
+#define SPU_ECC_WRITE_UNCORRECTABLE	(3ull << SPU_ECC_CNTL_I_SHIFT)
+#define SPU_ECC_CNTL_D			(1ull << 5ull)
+#define SPU_ECC_DETECTION_ENABLE	SPU_ECC_CNTL_D
+#define SPU_ECC_DETECTION_DISABLE	(~SPU_ECC_CNTL_D & 32L)
+	u64 spu_ecc_stat_RW;					/* 0x1008 */
+#define SPU_ECC_CORRECTED_ERROR		(1ull << 0ul)
+#define SPU_ECC_UNCORRECTED_ERROR	(1ull << 1ul)
+#define SPU_ECC_SCRUB_COMPLETE		(1ull << 2ul)
+#define SPU_ECC_SCRUB_IN_PROGRESS	(1ull << 3ul)
+#define SPU_ECC_INSTRUCTION_ERROR	(1ull << 4ul)
+#define SPU_ECC_DATA_ERROR		(1ull << 5ul)
+#define SPU_ECC_DMA_ERROR		(1ull << 6ul)
+#define SPU_ECC_STATUS_CNT_MASK		(256ull << 8)
+	u64 spu_ecc_addr_RW;					/* 0x1010 */
+	u64 spu_err_mask_RW;					/* 0x1018 */
+#define SPU_ERR_ILLEGAL_INSTR		(1ull << 0ul)
+#define SPU_ERR_ILLEGAL_CHANNEL		(1ull << 1ul)
+	u8  pad_0x1020_0x1028[0x1028 - 0x1020];			/* 0x1020 */
+
+	/* SPU Debug-Trace Bus (DTB) Selection Registers */
+	u64 spu_trig0_sel;					/* 0x1028 */
+	u64 spu_trig1_sel;					/* 0x1030 */
+	u64 spu_trig2_sel;					/* 0x1038 */
+	u64 spu_trig3_sel;					/* 0x1040 */
+	u64 spu_trace_sel;					/* 0x1048 */
+#define spu_trace_sel_mask		0x1f1fLL
+#define spu_trace_sel_bus0_bits		0x1000LL
+#define spu_trace_sel_bus2_bits		0x0010LL
+	u64 spu_event0_sel;					/* 0x1050 */
+	u64 spu_event1_sel;					/* 0x1058 */
+	u64 spu_event2_sel;					/* 0x1060 */
+	u64 spu_event3_sel;					/* 0x1068 */
+	u64 spu_trace_cntl;					/* 0x1070 */
+} __attribute__ ((aligned(0x2000)));
+
+#endif
diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h
index 0991dfceef1d..9606349855da 100644
--- a/include/asm-powerpc/unistd.h
+++ b/include/asm-powerpc/unistd.h
@@ -296,6 +296,8 @@
 #define __NR_inotify_init	275
 #define __NR_inotify_add_watch	276
 #define __NR_inotify_rm_watch	277
+#define __NR_spu_run		278
+#define __NR_spu_create		279
 
 #define __NR_syscalls		278
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c7007b1db91d..44fdd48d38e6 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -512,4 +512,9 @@ asmlinkage long sys_ioprio_get(int which, int who);
 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 					unsigned long maxnode);
 
+asmlinkage long sys_spu_run(int fd, __u32 __user *unpc,
+				 __u32 __user *ustatus);
+asmlinkage long sys_spu_create(const char __user *name,
+		unsigned int flags, mode_t mode);
+
 #endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1ab2370e2efa..d4739a475d23 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -90,3 +90,5 @@ cond_syscall(sys_pciconfig_iobase);
 cond_syscall(sys32_ipc);
 cond_syscall(sys32_sysctl);
 cond_syscall(ppc_rtas);
+cond_syscall(sys_spu_run);
+cond_syscall(sys_spu_create);
diff --git a/mm/memory.c b/mm/memory.c
index 7197f9bcd384..3944fec38012 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2267,6 +2267,8 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
 }
 
+EXPORT_SYMBOL_GPL(__handle_mm_fault);
+
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
  * Allocate page upper directory.
-- 
cgit v1.2.3-71-gd317


From 1beb6a7d6cbed3ac03500ce9b5b9bb632c512039 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 14 Dec 2005 13:10:10 +1100
Subject: [PATCH] powerpc: Experimental support for new G5 Macs (#2)

This adds some very basic support for the new machines, including the
Quad G5 (tested), and other new dual core based machines and iMac G5
iSight (untested). This is still experimental !  There is no thermal
control yet, there is no proper handing of MSIs, etc.. but it
boots, I have all 4 cores up on my machine. Compared to the previous
version of this patch, this one adds DART IOMMU support for the U4
chipset and thus should work fine on setups with more than 2Gb of RAM.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/Kconfig                      |   1 +
 arch/powerpc/kernel/pci_64.c              |  35 ++-
 arch/powerpc/kernel/prom.c                |  26 ++-
 arch/powerpc/kernel/udbg.c                |   2 +
 arch/powerpc/platforms/maple/setup.c      |   4 +-
 arch/powerpc/platforms/powermac/feature.c |  65 ++++--
 arch/powerpc/platforms/powermac/pci.c     | 210 +++++++++++++++---
 arch/powerpc/platforms/powermac/pic.c     |  72 +++---
 arch/powerpc/platforms/powermac/setup.c   |  13 +-
 arch/powerpc/platforms/powermac/smp.c     | 319 ++++++++++++++-------------
 arch/powerpc/sysdev/Makefile              |   2 +-
 arch/powerpc/sysdev/dart.h                |  41 ++--
 arch/powerpc/sysdev/dart_iommu.c          | 350 ++++++++++++++++++++++++++++++
 arch/powerpc/sysdev/mpic.c                | 199 +++++++++++++----
 arch/powerpc/sysdev/u3_iommu.c            | 327 ----------------------------
 drivers/ide/ppc/pmac.c                    |   2 +-
 drivers/macintosh/smu.c                   |   8 +-
 include/asm-powerpc/iommu.h               |   6 +-
 include/asm-powerpc/mpic.h                |   3 +-
 include/asm-powerpc/pmac_feature.h        |   2 +
 include/linux/pci_regs.h                  |   1 +
 21 files changed, 1059 insertions(+), 629 deletions(-)
 create mode 100644 arch/powerpc/sysdev/dart_iommu.c
 delete mode 100644 arch/powerpc/sysdev/u3_iommu.c

(limited to 'include/linux')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 773b880d5577..5692edb3491e 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -300,6 +300,7 @@ config PPC_PMAC64
 	bool
 	depends on PPC_PMAC && POWER4
 	select U3_DART
+	select MPIC_BROKEN_U3
 	select GENERIC_TBSYNC
 	default y
 
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index 523f35087e81..f73a16e9867a 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -34,7 +34,7 @@
 
 #ifdef DEBUG
 #include <asm/udbg.h>
-#define DBG(fmt...) udbg_printf(fmt)
+#define DBG(fmt...) printk(fmt)
 #else
 #define DBG(fmt...)
 #endif
@@ -323,6 +323,7 @@ static void pci_parse_of_addrs(struct device_node *node, struct pci_dev *dev)
 	addrs = (u32 *) get_property(node, "assigned-addresses", &proplen);
 	if (!addrs)
 		return;
+	DBG("    parse addresses (%d bytes) @ %p\n", proplen, addrs);
 	for (; proplen >= 20; proplen -= 20, addrs += 5) {
 		flags = pci_parse_of_flags(addrs[0]);
 		if (!flags)
@@ -332,6 +333,9 @@ static void pci_parse_of_addrs(struct device_node *node, struct pci_dev *dev)
 		if (!size)
 			continue;
 		i = addrs[0] & 0xff;
+		DBG("  base: %llx, size: %llx, i: %x\n",
+		    (unsigned long long)base, (unsigned long long)size, i);
+
 		if (PCI_BASE_ADDRESS_0 <= i && i <= PCI_BASE_ADDRESS_5) {
 			res = &dev->resource[(i - PCI_BASE_ADDRESS_0) >> 2];
 		} else if (i == dev->rom_base_reg) {
@@ -362,6 +366,8 @@ struct pci_dev *of_create_pci_dev(struct device_node *node,
 	if (type == NULL)
 		type = "";
 
+	DBG("    create device, devfn: %x, type: %s\n", devfn, type);
+
 	memset(dev, 0, sizeof(struct pci_dev));
 	dev->bus = bus;
 	dev->sysdata = node;
@@ -381,6 +387,8 @@ struct pci_dev *of_create_pci_dev(struct device_node *node,
 		dev->bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn));
 	dev->class = get_int_prop(node, "class-code", 0);
 
+	DBG("    class: 0x%x\n", dev->class);
+
 	dev->current_state = 4;		/* unknown power state */
 
 	if (!strcmp(type, "pci")) {
@@ -402,6 +410,8 @@ struct pci_dev *of_create_pci_dev(struct device_node *node,
 
 	pci_parse_of_addrs(node, dev);
 
+	DBG("    adding to system ...\n");
+
 	pci_device_add(dev, bus);
 
 	/* XXX pci_scan_msi_device(dev); */
@@ -418,15 +428,21 @@ void __devinit of_scan_bus(struct device_node *node,
 	int reglen, devfn;
 	struct pci_dev *dev;
 
+	DBG("of_scan_bus(%s) bus no %d... \n", node->full_name, bus->number);
+
 	while ((child = of_get_next_child(node, child)) != NULL) {
+		DBG("  * %s\n", child->full_name);
 		reg = (u32 *) get_property(child, "reg", &reglen);
 		if (reg == NULL || reglen < 20)
 			continue;
 		devfn = (reg[0] >> 8) & 0xff;
+
 		/* create a new pci_dev for this device */
 		dev = of_create_pci_dev(child, bus, devfn);
 		if (!dev)
 			continue;
+		DBG("dev header type: %x\n", dev->hdr_type);
+
 		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
 		    dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
 			of_scan_pci_bridge(child, dev);
@@ -446,16 +462,18 @@ void __devinit of_scan_pci_bridge(struct device_node *node,
 	unsigned int flags;
 	u64 size;
 
+	DBG("of_scan_pci_bridge(%s)\n", node->full_name);
+
 	/* parse bus-range property */
 	busrange = (u32 *) get_property(node, "bus-range", &len);
 	if (busrange == NULL || len != 8) {
-		printk(KERN_ERR "Can't get bus-range for PCI-PCI bridge %s\n",
+		printk(KERN_DEBUG "Can't get bus-range for PCI-PCI bridge %s\n",
 		       node->full_name);
 		return;
 	}
 	ranges = (u32 *) get_property(node, "ranges", &len);
 	if (ranges == NULL) {
-		printk(KERN_ERR "Can't get ranges for PCI-PCI bridge %s\n",
+		printk(KERN_DEBUG "Can't get ranges for PCI-PCI bridge %s\n",
 		       node->full_name);
 		return;
 	}
@@ -509,10 +527,13 @@ void __devinit of_scan_pci_bridge(struct device_node *node,
 	}
 	sprintf(bus->name, "PCI Bus %04x:%02x", pci_domain_nr(bus),
 		bus->number);
+	DBG("    bus name: %s\n", bus->name);
 
 	mode = PCI_PROBE_NORMAL;
 	if (ppc_md.pci_probe_mode)
 		mode = ppc_md.pci_probe_mode(bus);
+	DBG("    probe mode: %d\n", mode);
+
 	if (mode == PCI_PROBE_DEVTREE)
 		of_scan_bus(node, bus);
 	else if (mode == PCI_PROBE_NORMAL)
@@ -528,6 +549,8 @@ void __devinit scan_phb(struct pci_controller *hose)
 	int i, mode;
 	struct resource *res;
 
+	DBG("Scanning PHB %s\n", node ? node->full_name : "<NO NAME>");
+
 	bus = pci_create_bus(NULL, hose->first_busno, hose->ops, node);
 	if (bus == NULL) {
 		printk(KERN_ERR "Failed to create bus for PCI domain %04x\n",
@@ -552,8 +575,9 @@ void __devinit scan_phb(struct pci_controller *hose)
 
 	mode = PCI_PROBE_NORMAL;
 #ifdef CONFIG_PPC_MULTIPLATFORM
-	if (ppc_md.pci_probe_mode)
+	if (node && ppc_md.pci_probe_mode)
 		mode = ppc_md.pci_probe_mode(bus);
+	DBG("    probe mode: %d\n", mode);
 	if (mode == PCI_PROBE_DEVTREE) {
 		bus->subordinate = hose->last_busno;
 		of_scan_bus(node, bus);
@@ -842,8 +866,7 @@ pgprot_t pci_phys_mem_access_prot(struct file *file,
  * Returns a negative error code on failure, zero on success.
  */
 int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
-			enum pci_mmap_state mmap_state,
-			int write_combine)
+			enum pci_mmap_state mmap_state, int write_combine)
 {
 	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
 	struct resource *rp;
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 1b97e13657e5..977ee3adaf2d 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -298,6 +298,16 @@ static int __devinit finish_node_interrupts(struct device_node *np,
 	int i, j, n, sense;
 	unsigned int *irq, virq;
 	struct device_node *ic;
+	int trace = 0;
+
+	//#define TRACE(fmt...) do { if (trace) { printk(fmt); mdelay(1000); } } while(0)
+#define TRACE(fmt...)
+
+	if (!strcmp(np->name, "smu-doorbell"))
+		trace = 1;
+
+	TRACE("Finishing SMU doorbell ! num_interrupt_controllers = %d\n",
+	      num_interrupt_controllers);
 
 	if (num_interrupt_controllers == 0) {
 		/*
@@ -332,11 +342,12 @@ static int __devinit finish_node_interrupts(struct device_node *np,
 	}
 
 	ints = (unsigned int *) get_property(np, "interrupts", &intlen);
+	TRACE("ints=%p, intlen=%d\n", ints, intlen);
 	if (ints == NULL)
 		return 0;
 	intrcells = prom_n_intr_cells(np);
 	intlen /= intrcells * sizeof(unsigned int);
-
+	TRACE("intrcells=%d, new intlen=%d\n", intrcells, intlen);
 	np->intrs = prom_alloc(intlen * sizeof(*(np->intrs)), mem_start);
 	if (!np->intrs)
 		return -ENOMEM;
@@ -347,6 +358,7 @@ static int __devinit finish_node_interrupts(struct device_node *np,
 	intrcount = 0;
 	for (i = 0; i < intlen; ++i, ints += intrcells) {
 		n = map_interrupt(&irq, &ic, np, ints, intrcells);
+		TRACE("map, irq=%d, ic=%p, n=%d\n", irq, ic, n);
 		if (n <= 0)
 			continue;
 
@@ -357,6 +369,7 @@ static int __devinit finish_node_interrupts(struct device_node *np,
 			np->intrs[intrcount].sense = map_isa_senses[sense];
 		} else {
 			virq = virt_irq_create_mapping(irq[0]);
+			TRACE("virq=%d\n", virq);
 #ifdef CONFIG_PPC64
 			if (virq == NO_IRQ) {
 				printk(KERN_CRIT "Could not allocate interrupt"
@@ -366,6 +379,12 @@ static int __devinit finish_node_interrupts(struct device_node *np,
 #endif
 			np->intrs[intrcount].line = irq_offset_up(virq);
 			sense = (n > 1)? (irq[1] & 3): 1;
+
+			/* Apple uses bits in there in a different way, let's
+			 * only keep the real sense bit on macs
+			 */
+			if (_machine == PLATFORM_POWERMAC)
+				sense &= 0x1;
 			np->intrs[intrcount].sense = map_mpic_senses[sense];
 		}
 
@@ -375,12 +394,13 @@ static int __devinit finish_node_interrupts(struct device_node *np,
 			char *name = get_property(ic->parent, "name", NULL);
 			if (name && !strcmp(name, "u3"))
 				np->intrs[intrcount].line += 128;
-			else if (!(name && !strcmp(name, "mac-io")))
+			else if (!(name && (!strcmp(name, "mac-io") ||
+					    !strcmp(name, "u4"))))
 				/* ignore other cascaded controllers, such as
 				   the k2-sata-root */
 				break;
 		}
-#endif
+#endif /* CONFIG_PPC64 */
 		if (n > 2) {
 			printk("hmmm, got %d intr cells for %s:", n,
 			       np->full_name);
diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c
index a058285a70e7..9567d9474c80 100644
--- a/arch/powerpc/kernel/udbg.c
+++ b/arch/powerpc/kernel/udbg.c
@@ -110,10 +110,12 @@ static int early_console_initialized;
 
 void __init disable_early_printk(void)
 {
+#if 1
 	if (!early_console_initialized)
 		return;
 	unregister_console(&udbg_console);
 	early_console_initialized = 0;
+#endif
 }
 
 /* called by setup_system */
diff --git a/arch/powerpc/platforms/maple/setup.c b/arch/powerpc/platforms/maple/setup.c
index 65fe4c166a68..dd73e38bfb7d 100644
--- a/arch/powerpc/platforms/maple/setup.c
+++ b/arch/powerpc/platforms/maple/setup.c
@@ -195,7 +195,7 @@ static void __init maple_init_early(void)
 	/* Setup interrupt mapping options */
 	ppc64_interrupt_controller = IC_OPEN_PIC;
 
-	iommu_init_early_u3();
+	iommu_init_early_dart();
 
 	DBG(" <- maple_init_early\n");
 }
@@ -257,7 +257,7 @@ static int __init maple_probe(int platform)
 	 * occupies having to be broken up so the DART itself is not
 	 * part of the cacheable linar mapping
 	 */
-	alloc_u3_dart_table();
+	alloc_dart_table();
 
 	return 1;
 }
diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c
index b1f896952b1b..d2915d64d45e 100644
--- a/arch/powerpc/platforms/powermac/feature.c
+++ b/arch/powerpc/platforms/powermac/feature.c
@@ -101,7 +101,8 @@ static const char *macio_names[] =
 	"Keylargo",
 	"Pangea",
 	"Intrepid",
-	"K2"
+	"K2",
+	"Shasta",
 };
 
 
@@ -119,7 +120,7 @@ static const char *macio_names[] =
 static struct device_node *uninorth_node;
 static u32 __iomem *uninorth_base;
 static u32 uninorth_rev;
-static int uninorth_u3;
+static int uninorth_maj;
 static void __iomem *u3_ht;
 
 /*
@@ -1399,8 +1400,15 @@ static long g5_fw_enable(struct device_node *node, long param, long value)
 static long g5_mpic_enable(struct device_node *node, long param, long value)
 {
 	unsigned long flags;
+	struct device_node *parent = of_get_parent(node);
+	int is_u3;
 
-	if (node->parent == NULL || strcmp(node->parent->name, "u3"))
+	if (parent == NULL)
+		return 0;
+	is_u3 = strcmp(parent->name, "u3") == 0 ||
+		strcmp(parent->name, "u4") == 0;
+	of_node_put(parent);
+	if (!is_u3)
 		return 0;
 
 	LOCK(flags);
@@ -1464,7 +1472,7 @@ static long g5_i2s_enable(struct device_node *node, long param, long value)
 		},
 	};
 
-	if (macio->type != macio_keylargo2 /* && macio->type != macio_shasta*/)
+	if (macio->type != macio_keylargo2 && macio->type != macio_shasta)
 		return -ENODEV;
 	if (strncmp(node->name, "i2s-", 4))
 		return -ENODEV;
@@ -1473,11 +1481,9 @@ static long g5_i2s_enable(struct device_node *node, long param, long value)
 	case 0:
 	case 1:
 		break;
-#if 0
 	case 2:
 		if (macio->type == macio_shasta)
 			break;
-#endif
 	default:
 		return -ENODEV;
 	}
@@ -1508,7 +1514,7 @@ static long g5_reset_cpu(struct device_node *node, long param, long value)
 	struct device_node *np;
 
 	macio = &macio_chips[0];
-	if (macio->type != macio_keylargo2)
+	if (macio->type != macio_keylargo2 && macio->type != macio_shasta)
 		return -ENODEV;
 
 	np = find_path_device("/cpus");
@@ -1547,7 +1553,8 @@ static long g5_reset_cpu(struct device_node *node, long param, long value)
  */
 void g5_phy_disable_cpu1(void)
 {
-	UN_OUT(U3_API_PHY_CONFIG_1, 0);
+	if (uninorth_maj == 3)
+		UN_OUT(U3_API_PHY_CONFIG_1, 0);
 }
 #endif /* CONFIG_POWER4 */
 
@@ -2462,6 +2469,14 @@ static struct pmac_mb_def pmac_mb_defs[] = {
 		PMAC_TYPE_POWERMAC_G5_U3L,	g5_features,
 		0,
 	},
+	{	"PowerMac11,2",			"PowerMac G5 Dual Core",
+		PMAC_TYPE_POWERMAC_G5_U3L,	g5_features,
+		0,
+	},
+	{	"PowerMac12,1",			"iMac G5 (iSight)",
+		PMAC_TYPE_POWERMAC_G5_U3L,	g5_features,
+		0,
+	},
 	{       "RackMac3,1",                   "XServe G5",
 		PMAC_TYPE_XSERVE_G5,		g5_features,
 		0,
@@ -2574,6 +2589,11 @@ static int __init probe_motherboard(void)
 		pmac_mb.model_name = "Unknown K2-based";
 		pmac_mb.features = g5_features;
 		break;
+	case macio_shasta:
+		pmac_mb.model_id = PMAC_TYPE_UNKNOWN_SHASTA;
+		pmac_mb.model_name = "Unknown Shasta-based";
+		pmac_mb.features = g5_features;
+		break;
 #endif /* CONFIG_POWER4 */
 	default:
 		return -ENODEV;
@@ -2651,7 +2671,12 @@ static void __init probe_uninorth(void)
 	/* Locate G5 u3 */
 	if (uninorth_node == NULL) {
 		uninorth_node = of_find_node_by_name(NULL, "u3");
-		uninorth_u3 = 1;
+		uninorth_maj = 3;
+	}
+	/* Locate G5 u4 */
+	if (uninorth_node == NULL) {
+		uninorth_node = of_find_node_by_name(NULL, "u4");
+		uninorth_maj = 4;
 	}
 	if (uninorth_node == NULL)
 		return;
@@ -2664,12 +2689,13 @@ static void __init probe_uninorth(void)
 		return;
 	uninorth_base = ioremap(address, 0x40000);
 	uninorth_rev = in_be32(UN_REG(UNI_N_VERSION));
-	if (uninorth_u3)
+	if (uninorth_maj == 3 || uninorth_maj == 4)
 		u3_ht = ioremap(address + U3_HT_CONFIG_BASE, 0x1000);
 
-	printk(KERN_INFO "Found %s memory controller & host bridge,"
-	       " revision: %d\n", uninorth_u3 ? "U3" : "UniNorth",
-	       uninorth_rev);
+	printk(KERN_INFO "Found %s memory controller & host bridge"
+	       " @ 0x%08x revision: 0x%02x\n", uninorth_maj == 3 ? "U3" :
+	       uninorth_maj == 4 ? "U4" : "UniNorth",
+	       (unsigned int)address, uninorth_rev);
 	printk(KERN_INFO "Mapped at 0x%08lx\n", (unsigned long)uninorth_base);
 
 	/* Set the arbitrer QAck delay according to what Apple does
@@ -2677,7 +2703,8 @@ static void __init probe_uninorth(void)
 	if (uninorth_rev < 0x11) {
 		actrl = UN_IN(UNI_N_ARB_CTRL) & ~UNI_N_ARB_CTRL_QACK_DELAY_MASK;
 		actrl |= ((uninorth_rev < 3) ? UNI_N_ARB_CTRL_QACK_DELAY105 :
-			UNI_N_ARB_CTRL_QACK_DELAY) << UNI_N_ARB_CTRL_QACK_DELAY_SHIFT;
+			UNI_N_ARB_CTRL_QACK_DELAY) <<
+			UNI_N_ARB_CTRL_QACK_DELAY_SHIFT;
 		UN_OUT(UNI_N_ARB_CTRL, actrl);
 	}
 
@@ -2685,7 +2712,8 @@ static void __init probe_uninorth(void)
 	 * revs 1.5 to 2.O and Pangea. Seem to toggle the UniN Maxbus/PCI
 	 * memory timeout
 	 */
-	if ((uninorth_rev >= 0x11 && uninorth_rev <= 0x24) || uninorth_rev == 0xc0)
+	if ((uninorth_rev >= 0x11 && uninorth_rev <= 0x24) ||
+	    uninorth_rev == 0xc0)
 		UN_OUT(0x2160, UN_IN(0x2160) & 0x00ffffff);
 }
 
@@ -2736,12 +2764,14 @@ static void __init probe_one_macio(const char *name, const char *compat, int typ
 		       node->full_name);
 		return;
 	}
-	if (type == macio_keylargo) {
+	if (type == macio_keylargo || type == macio_keylargo2) {
 		u32 *did = (u32 *)get_property(node, "device-id", NULL);
 		if (*did == 0x00000025)
 			type = macio_pangea;
 		if (*did == 0x0000003e)
 			type = macio_intrepid;
+		if (*did == 0x0000004f)
+			type = macio_shasta;
 	}
 	macio_chips[i].of_node	= node;
 	macio_chips[i].type	= type;
@@ -2840,7 +2870,8 @@ set_initial_features(void)
 	}
 
 #ifdef CONFIG_POWER4
-	if (macio_chips[0].type == macio_keylargo2) {
+	if (macio_chips[0].type == macio_keylargo2 ||
+	    macio_chips[0].type == macio_shasta) {
 #ifndef CONFIG_SMP
 		/* On SMP machines running UP, we have the second CPU eating
 		 * bus cycles. We need to take it off the bus. This is done
diff --git a/arch/powerpc/platforms/powermac/pci.c b/arch/powerpc/platforms/powermac/pci.c
index 5aab261075de..f671ed253901 100644
--- a/arch/powerpc/platforms/powermac/pci.c
+++ b/arch/powerpc/platforms/powermac/pci.c
@@ -1,7 +1,7 @@
 /*
  * Support for PCI bridges found on Power Macintoshes.
  *
- * Copyright (C) 2003 Benjamin Herrenschmuidt (benh@kernel.crashing.org)
+ * Copyright (C) 2003-2005 Benjamin Herrenschmuidt (benh@kernel.crashing.org)
  * Copyright (C) 1997 Paul Mackerras (paulus@samba.org)
  *
  * This program is free software; you can redistribute it and/or
@@ -25,7 +25,7 @@
 #include <asm/pmac_feature.h>
 #include <asm/grackle.h>
 #ifdef CONFIG_PPC64
-#include <asm/iommu.h>
+//#include <asm/iommu.h>
 #include <asm/ppc-pci.h>
 #endif
 
@@ -44,6 +44,7 @@ static int add_bridge(struct device_node *dev);
 static int has_uninorth;
 #ifdef CONFIG_PPC64
 static struct pci_controller *u3_agp;
+static struct pci_controller *u4_pcie;
 static struct pci_controller *u3_ht;
 #endif /* CONFIG_PPC64 */
 
@@ -97,11 +98,8 @@ static void __init fixup_bus_range(struct device_node *bridge)
 
 	/* Lookup the "bus-range" property for the hose */
 	bus_range = (int *) get_property(bridge, "bus-range", &len);
-	if (bus_range == NULL || len < 2 * sizeof(int)) {
-		printk(KERN_WARNING "Can't get bus-range for %s\n",
-			       bridge->full_name);
+	if (bus_range == NULL || len < 2 * sizeof(int))
 		return;
-	}
 	bus_range[1] = fixup_one_level_bus_range(bridge->child, bus_range[1]);
 }
 
@@ -128,14 +126,14 @@ static void __init fixup_bus_range(struct device_node *bridge)
  */
 
 #define MACRISC_CFA0(devfn, off)	\
-	((1 << (unsigned long)PCI_SLOT(dev_fn)) \
-	| (((unsigned long)PCI_FUNC(dev_fn)) << 8) \
-	| (((unsigned long)(off)) & 0xFCUL))
+	((1 << (unsigned int)PCI_SLOT(dev_fn)) \
+	| (((unsigned int)PCI_FUNC(dev_fn)) << 8) \
+	| (((unsigned int)(off)) & 0xFCUL))
 
 #define MACRISC_CFA1(bus, devfn, off)	\
-	((((unsigned long)(bus)) << 16) \
-	|(((unsigned long)(devfn)) << 8) \
-	|(((unsigned long)(off)) & 0xFCUL) \
+	((((unsigned int)(bus)) << 16) \
+	|(((unsigned int)(devfn)) << 8) \
+	|(((unsigned int)(off)) & 0xFCUL) \
 	|1UL)
 
 static unsigned long macrisc_cfg_access(struct pci_controller* hose,
@@ -168,7 +166,8 @@ static int macrisc_read_config(struct pci_bus *bus, unsigned int devfn,
 	hose = pci_bus_to_host(bus);
 	if (hose == NULL)
 		return PCIBIOS_DEVICE_NOT_FOUND;
-
+	if (offset >= 0x100)
+		return  PCIBIOS_BAD_REGISTER_NUMBER;
 	addr = macrisc_cfg_access(hose, bus->number, devfn, offset);
 	if (!addr)
 		return PCIBIOS_DEVICE_NOT_FOUND;
@@ -199,7 +198,8 @@ static int macrisc_write_config(struct pci_bus *bus, unsigned int devfn,
 	hose = pci_bus_to_host(bus);
 	if (hose == NULL)
 		return PCIBIOS_DEVICE_NOT_FOUND;
-
+	if (offset >= 0x100)
+		return  PCIBIOS_BAD_REGISTER_NUMBER;
 	addr = macrisc_cfg_access(hose, bus->number, devfn, offset);
 	if (!addr)
 		return PCIBIOS_DEVICE_NOT_FOUND;
@@ -234,12 +234,13 @@ static struct pci_ops macrisc_pci_ops =
 /*
  * Verify that a specific (bus, dev_fn) exists on chaos
  */
-static int
-chaos_validate_dev(struct pci_bus *bus, int devfn, int offset)
+static int chaos_validate_dev(struct pci_bus *bus, int devfn, int offset)
 {
 	struct device_node *np;
 	u32 *vendor, *device;
 
+	if (offset >= 0x100)
+		return  PCIBIOS_BAD_REGISTER_NUMBER;
 	np = pci_busdev_to_OF_node(bus, devfn);
 	if (np == NULL)
 		return PCIBIOS_DEVICE_NOT_FOUND;
@@ -341,10 +342,10 @@ static int u3_ht_skip_device(struct pci_controller *hose,
 }
 
 #define U3_HT_CFA0(devfn, off)		\
-		((((unsigned long)devfn) << 8) | offset)
+		((((unsigned int)devfn) << 8) | offset)
 #define U3_HT_CFA1(bus, devfn, off)	\
 		(U3_HT_CFA0(devfn, off) \
-		+ (((unsigned long)bus) << 16) \
+		+ (((unsigned int)bus) << 16) \
 		+ 0x01000000UL)
 
 static unsigned long u3_ht_cfg_access(struct pci_controller* hose,
@@ -370,7 +371,8 @@ static int u3_ht_read_config(struct pci_bus *bus, unsigned int devfn,
 	hose = pci_bus_to_host(bus);
 	if (hose == NULL)
 		return PCIBIOS_DEVICE_NOT_FOUND;
-
+	if (offset >= 0x100)
+		return  PCIBIOS_BAD_REGISTER_NUMBER;
 	addr = u3_ht_cfg_access(hose, bus->number, devfn, offset);
 	if (!addr)
 		return PCIBIOS_DEVICE_NOT_FOUND;
@@ -419,7 +421,8 @@ static int u3_ht_write_config(struct pci_bus *bus, unsigned int devfn,
 	hose = pci_bus_to_host(bus);
 	if (hose == NULL)
 		return PCIBIOS_DEVICE_NOT_FOUND;
-
+	if (offset >= 0x100)
+		return  PCIBIOS_BAD_REGISTER_NUMBER;
 	addr = u3_ht_cfg_access(hose, bus->number, devfn, offset);
 	if (!addr)
 		return PCIBIOS_DEVICE_NOT_FOUND;
@@ -459,6 +462,112 @@ static struct pci_ops u3_ht_pci_ops =
 	u3_ht_read_config,
 	u3_ht_write_config
 };
+
+#define U4_PCIE_CFA0(devfn, off)	\
+	((1 << ((unsigned int)PCI_SLOT(dev_fn)))	\
+	 | (((unsigned int)PCI_FUNC(dev_fn)) << 8)	\
+	 | ((((unsigned int)(off)) >> 8) << 28) \
+	 | (((unsigned int)(off)) & 0xfcU))
+
+#define U4_PCIE_CFA1(bus, devfn, off)	\
+	((((unsigned int)(bus)) << 16) \
+	 |(((unsigned int)(devfn)) << 8)	\
+	 | ((((unsigned int)(off)) >> 8) << 28) \
+	 |(((unsigned int)(off)) & 0xfcU)	\
+	 |1UL)
+
+static unsigned long u4_pcie_cfg_access(struct pci_controller* hose,
+					u8 bus, u8 dev_fn, int offset)
+{
+	unsigned int caddr;
+
+	if (bus == hose->first_busno) {
+		caddr = U4_PCIE_CFA0(dev_fn, offset);
+	} else
+		caddr = U4_PCIE_CFA1(bus, dev_fn, offset);
+
+	/* Uninorth will return garbage if we don't read back the value ! */
+	do {
+		out_le32(hose->cfg_addr, caddr);
+	} while (in_le32(hose->cfg_addr) != caddr);
+
+	offset &= 0x03;
+	return ((unsigned long)hose->cfg_data) + offset;
+}
+
+static int u4_pcie_read_config(struct pci_bus *bus, unsigned int devfn,
+			       int offset, int len, u32 *val)
+{
+	struct pci_controller *hose;
+	unsigned long addr;
+
+	hose = pci_bus_to_host(bus);
+	if (hose == NULL)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	if (offset >= 0x1000)
+		return  PCIBIOS_BAD_REGISTER_NUMBER;
+	addr = u4_pcie_cfg_access(hose, bus->number, devfn, offset);
+	if (!addr)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	/*
+	 * Note: the caller has already checked that offset is
+	 * suitably aligned and that len is 1, 2 or 4.
+	 */
+	switch (len) {
+	case 1:
+		*val = in_8((u8 *)addr);
+		break;
+	case 2:
+		*val = in_le16((u16 *)addr);
+		break;
+	default:
+		*val = in_le32((u32 *)addr);
+		break;
+	}
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static int u4_pcie_write_config(struct pci_bus *bus, unsigned int devfn,
+				int offset, int len, u32 val)
+{
+	struct pci_controller *hose;
+	unsigned long addr;
+
+	hose = pci_bus_to_host(bus);
+	if (hose == NULL)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	if (offset >= 0x1000)
+		return  PCIBIOS_BAD_REGISTER_NUMBER;
+	addr = u4_pcie_cfg_access(hose, bus->number, devfn, offset);
+	if (!addr)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	/*
+	 * Note: the caller has already checked that offset is
+	 * suitably aligned and that len is 1, 2 or 4.
+	 */
+	switch (len) {
+	case 1:
+		out_8((u8 *)addr, val);
+		(void) in_8((u8 *)addr);
+		break;
+	case 2:
+		out_le16((u16 *)addr, val);
+		(void) in_le16((u16 *)addr);
+		break;
+	default:
+		out_le32((u32 *)addr, val);
+		(void) in_le32((u32 *)addr);
+		break;
+	}
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static struct pci_ops u4_pcie_pci_ops =
+{
+	u4_pcie_read_config,
+	u4_pcie_write_config
+};
+
 #endif /* CONFIG_PPC64 */
 
 #ifdef CONFIG_PPC32
@@ -628,15 +737,36 @@ static void __init setup_u3_agp(struct pci_controller* hose)
 	hose->ops = &macrisc_pci_ops;
 	hose->cfg_addr = ioremap(0xf0000000 + 0x800000, 0x1000);
 	hose->cfg_data = ioremap(0xf0000000 + 0xc00000, 0x1000);
-
 	u3_agp = hose;
 }
 
+static void __init setup_u4_pcie(struct pci_controller* hose)
+{
+	/* We currently only implement the "non-atomic" config space, to
+	 * be optimised later.
+	 */
+	hose->ops = &u4_pcie_pci_ops;
+	hose->cfg_addr = ioremap(0xf0000000 + 0x800000, 0x1000);
+	hose->cfg_data = ioremap(0xf0000000 + 0xc00000, 0x1000);
+
+	/* The bus contains a bridge from root -> device, we need to
+	 * make it visible on bus 0 so that we pick the right type
+	 * of config cycles. If we didn't, we would have to force all
+	 * config cycles to be type 1. So we override the "bus-range"
+	 * property here
+	 */
+	hose->first_busno = 0x00;
+	hose->last_busno = 0xff;
+	u4_pcie = hose;
+}
+
 static void __init setup_u3_ht(struct pci_controller* hose)
 {
 	struct device_node *np = (struct device_node *)hose->arch_data;
+	struct pci_controller *other = NULL;
 	int i, cur;
 
+
 	hose->ops = &u3_ht_pci_ops;
 
 	/* We hard code the address because of the different size of
@@ -670,11 +800,20 @@ static void __init setup_u3_ht(struct pci_controller* hose)
 
 	u3_ht = hose;
 
-	if (u3_agp == NULL) {
-		DBG("U3 has no AGP, using full resource range\n");
+	if (u3_agp != NULL)
+		other = u3_agp;
+	else if (u4_pcie != NULL)
+		other = u4_pcie;
+
+	if (other == NULL) {
+		DBG("U3/4 has no AGP/PCIE, using full resource range\n");
 		return;
 	}
 
+	/* Fixup bus range vs. PCIE */
+	if (u4_pcie)
+		hose->last_busno = u4_pcie->first_busno - 1;
+
 	/* We "remove" the AGP resources from the resources allocated to HT,
 	 * that is we create "holes". However, that code does assumptions
 	 * that so far happen to be true (cross fingers...), typically that
@@ -682,7 +821,7 @@ static void __init setup_u3_ht(struct pci_controller* hose)
 	 */
 	cur = 0;
 	for (i=0; i<3; i++) {
-		struct resource *res = &u3_agp->mem_resources[i];
+		struct resource *res = &other->mem_resources[i];
 		if (res->flags != IORESOURCE_MEM)
 			continue;
 		/* We don't care about "fine" resources */
@@ -777,9 +916,13 @@ static int __init add_bridge(struct device_node *dev)
 		setup_u3_ht(hose);
 		disp_name = "U3-HT";
 		primary = 1;
+	} else if (device_is_compatible(dev, "u4-pcie")) {
+		setup_u4_pcie(hose);
+		disp_name = "U4-PCIE";
+		primary = 0;
 	}
-	printk(KERN_INFO "Found %s PCI host bridge.  Firmware bus number: %d->%d\n",
-		disp_name, hose->first_busno, hose->last_busno);
+	printk(KERN_INFO "Found %s PCI host bridge.  Firmware bus number:"
+	       " %d->%d\n", disp_name, hose->first_busno, hose->last_busno);
 #endif /* CONFIG_PPC64 */
 
 	/* 32 bits only bridges */
@@ -900,6 +1043,8 @@ void __init pmac_pci_init(void)
 		pci_setup_phb_io(u3_ht, 1);
 	if (u3_agp)
 		pci_setup_phb_io(u3_agp, 0);
+	if (u4_pcie)
+		pci_setup_phb_io(u4_pcie, 0);
 
 	/*
 	 * On ppc64, fixup the IO resources on our host bridges as
@@ -912,7 +1057,8 @@ void __init pmac_pci_init(void)
 
 	/* Fixup the PCI<->OF mapping for U3 AGP due to bus renumbering. We
 	 * assume there is no P2P bridge on the AGP bus, which should be a
-	 * safe assumptions hopefully.
+	 * safe assumptions for now. We should do something better in the
+	 * future though
 	 */
 	if (u3_agp) {
 		struct device_node *np = u3_agp->arch_data;
@@ -920,7 +1066,6 @@ void __init pmac_pci_init(void)
 		for (np = np->child; np; np = np->sibling)
 			PCI_DN(np)->busno = 0xf0;
 	}
-
 	/* pmac_check_ht_link(); */
 
 	/* Tell pci.c to not use the common resource allocation mechanism */
@@ -1127,7 +1272,8 @@ void pmac_pci_fixup_pciata(struct pci_dev* dev)
  good:
 	pci_read_config_byte(dev, PCI_CLASS_PROG, &progif);
 	if ((progif & 5) != 5) {
-		printk(KERN_INFO "Forcing PCI IDE into native mode: %s\n", pci_name(dev));
+		printk(KERN_INFO "Forcing PCI IDE into native mode: %s\n",
+		       pci_name(dev));
 		(void) pci_write_config_byte(dev, PCI_CLASS_PROG, progif|5);
 		if (pci_read_config_byte(dev, PCI_CLASS_PROG, &progif) ||
 		    (progif & 5) != 5)
@@ -1153,7 +1299,8 @@ static void fixup_k2_sata(struct pci_dev* dev)
 		for (i = 0; i < 6; i++) {
 			dev->resource[i].start = dev->resource[i].end = 0;
 			dev->resource[i].flags = 0;
-			pci_write_config_dword(dev, PCI_BASE_ADDRESS_0 + 4 * i, 0);
+			pci_write_config_dword(dev, PCI_BASE_ADDRESS_0 + 4 * i,
+					       0);
 		}
 	} else {
 		pci_read_config_word(dev, PCI_COMMAND, &cmd);
@@ -1162,7 +1309,8 @@ static void fixup_k2_sata(struct pci_dev* dev)
 		for (i = 0; i < 5; i++) {
 			dev->resource[i].start = dev->resource[i].end = 0;
 			dev->resource[i].flags = 0;
-			pci_write_config_dword(dev, PCI_BASE_ADDRESS_0 + 4 * i, 0);
+			pci_write_config_dword(dev, PCI_BASE_ADDRESS_0 + 4 * i,
+					       0);
 		}
 	}
 }
diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c
index dbb524a851aa..18bf3011d1e3 100644
--- a/arch/powerpc/platforms/powermac/pic.c
+++ b/arch/powerpc/platforms/powermac/pic.c
@@ -524,18 +524,56 @@ static void __init pmac_pic_setup_mpic_nmi(struct mpic *mpic)
 #endif	/* defined(CONFIG_XMON) && defined(CONFIG_PPC32) */
 }
 
+static struct mpic * __init pmac_setup_one_mpic(struct device_node *np,
+						int master)
+{
+	unsigned char senses[128];
+	int offset = master ? 0 : 128;
+	int count = master ? 128 : 124;
+	const char *name = master ? " MPIC 1   " : " MPIC 2   ";
+	struct resource r;
+	struct mpic *mpic;
+	unsigned int flags = master ? MPIC_PRIMARY : 0;
+	int rc;
+
+	rc = of_address_to_resource(np, 0, &r);
+	if (rc)
+		return NULL;
+
+	pmac_call_feature(PMAC_FTR_ENABLE_MPIC, np, 0, 0);
+
+	prom_get_irq_senses(senses, offset, offset + count);
+
+	flags |= MPIC_WANTS_RESET;
+	if (get_property(np, "big-endian", NULL))
+		flags |= MPIC_BIG_ENDIAN;
+
+	/* Primary Big Endian means HT interrupts. This is quite dodgy
+	 * but works until I find a better way
+	 */
+	if (master && (flags & MPIC_BIG_ENDIAN))
+		flags |= MPIC_BROKEN_U3;
+
+	mpic = mpic_alloc(r.start, flags, 0, offset, count, master ? 252 : 0,
+			  senses, count, name);
+	if (mpic == NULL)
+		return NULL;
+
+	mpic_init(mpic);
+
+	return mpic;
+ }
+
 static int __init pmac_pic_probe_mpic(void)
 {
 	struct mpic *mpic1, *mpic2;
 	struct device_node *np, *master = NULL, *slave = NULL;
-	unsigned char senses[128];
-	struct resource r;
 
 	/* We can have up to 2 MPICs cascaded */
 	for (np = NULL; (np = of_find_node_by_type(np, "open-pic"))
 		     != NULL;) {
 		if (master == NULL &&
-		    get_property(np, "interrupt-parent", NULL) != NULL)
+		    get_property(np, "interrupts", NULL) == NULL)
 			master = of_node_get(np);
 		else if (slave == NULL)
 			slave = of_node_get(np);
@@ -557,13 +595,8 @@ static int __init pmac_pic_probe_mpic(void)
 	ppc_md.get_irq = mpic_get_irq;
 
 	/* Setup master */
-	BUG_ON(of_address_to_resource(master, 0, &r));
-	pmac_call_feature(PMAC_FTR_ENABLE_MPIC, master, 0, 0);
-	prom_get_irq_senses(senses, 0, 128);
-	mpic1 = mpic_alloc(r.start, MPIC_PRIMARY | MPIC_WANTS_RESET,
-			   0, 0, 128, 252, senses, 128, " OpenPIC  ");
+	mpic1 = pmac_setup_one_mpic(master, 1);
 	BUG_ON(mpic1 == NULL);
-	mpic_init(mpic1);
 
 	/* Install NMI if any */
 	pmac_pic_setup_mpic_nmi(mpic1);
@@ -574,27 +607,12 @@ static int __init pmac_pic_probe_mpic(void)
 	if (slave == NULL || slave->n_intrs < 1)
 		return 0;
 
-	/* Setup slave, failures are non-fatal */
-	if (of_address_to_resource(slave, 0, &r)) {
-		printk(KERN_ERR "Can't get address of MPIC %s\n",
-		       slave->full_name);
-		return 0;
-	}
-	pmac_call_feature(PMAC_FTR_ENABLE_MPIC, slave, 0, 0);
-	prom_get_irq_senses(senses, 128, 128 + 124);
-
-	/* We don't need to set MPIC_BROKEN_U3 here since we don't have
-	 * hypertransport interrupts routed to it, at least not on currently
-	 * supported machines, that may change.
-	 */
-	mpic2 = mpic_alloc(r.start, MPIC_BIG_ENDIAN | MPIC_WANTS_RESET,
-			   0, 128, 124, 0, senses, 124, " U3-MPIC  ");
+	mpic2 = pmac_setup_one_mpic(slave, 0);
 	if (mpic2 == NULL) {
-		printk(KERN_ERR "Can't create slave MPIC %s\n",
-		       slave->full_name);
+		printk(KERN_ERR "Failed to setup slave MPIC\n");
+		of_node_put(slave);
 		return 0;
 	}
-	mpic_init(mpic2);
 	mpic_setup_cascade(slave->intrs[0].line, pmac_u3_cascade, mpic2);
 
 	of_node_put(slave);
diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c
index 18c5620f87fa..1daa5a06e9ea 100644
--- a/arch/powerpc/platforms/powermac/setup.c
+++ b/arch/powerpc/platforms/powermac/setup.c
@@ -345,7 +345,7 @@ void __init pmac_setup_arch(void)
 
 #ifdef CONFIG_SMP
 	/* Check for Core99 */
-	if (find_devices("uni-n") || find_devices("u3"))
+	if (find_devices("uni-n") || find_devices("u3") || find_devices("u4"))
 		smp_ops = &core99_smp_ops;
 #ifdef CONFIG_PPC32
 	else
@@ -635,7 +635,7 @@ static void __init pmac_init_early(void)
 	/* Setup interrupt mapping options */
 	ppc64_interrupt_controller = IC_OPEN_PIC;
 
-	iommu_init_early_u3();
+	iommu_init_early_dart();
 #endif
 }
 
@@ -711,7 +711,7 @@ static int __init pmac_probe(int platform)
 	 * occupies having to be broken up so the DART itself is not
 	 * part of the cacheable linar mapping
 	 */
-	alloc_u3_dart_table();
+	alloc_dart_table();
 #endif
 
 #ifdef CONFIG_PMAC_SMU
@@ -733,10 +733,11 @@ static int pmac_pci_probe_mode(struct pci_bus *bus)
 	struct device_node *node = bus->sysdata;
 
 	/* We need to use normal PCI probing for the AGP bus,
-	   since the device for the AGP bridge isn't in the tree. */
-	if (bus->self == NULL && device_is_compatible(node, "u3-agp"))
+	 * since the device for the AGP bridge isn't in the tree.
+	 */
+	if (bus->self == NULL && (device_is_compatible(node, "u3-agp") ||
+				  device_is_compatible(node, "u4-pcie")))
 		return PCI_PROBE_NORMAL;
-
 	return PCI_PROBE_DEVTREE;
 }
 #endif
diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index 862f1e985c19..df01bb8feb16 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -361,7 +361,6 @@ static void __init psurge_dual_sync_tb(int cpu_nr)
 	set_dec(tb_ticks_per_jiffy);
 	/* XXX fixme */
 	set_tb(0, 0);
-	last_jiffy_stamp(cpu_nr) = 0;
 
 	if (cpu_nr > 0) {
 		mb();
@@ -429,15 +428,62 @@ struct smp_ops_t psurge_smp_ops = {
 };
 #endif /* CONFIG_PPC32 - actually powersurge support */
 
+/*
+ * Core 99 and later support
+ */
+
+static void (*pmac_tb_freeze)(int freeze);
+static unsigned long timebase;
+static int tb_req;
+
+static void smp_core99_give_timebase(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	while(!tb_req)
+		barrier();
+	tb_req = 0;
+	(*pmac_tb_freeze)(1);
+	mb();
+	timebase = get_tb();
+	mb();
+	while (timebase)
+		barrier();
+	mb();
+	(*pmac_tb_freeze)(0);
+	mb();
+
+	local_irq_restore(flags);
+}
+
+
+static void __devinit smp_core99_take_timebase(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	tb_req = 1;
+	mb();
+	while (!timebase)
+		barrier();
+	mb();
+	set_tb(timebase >> 32, timebase & 0xffffffff);
+	timebase = 0;
+	mb();
+	set_dec(tb_ticks_per_jiffy/2);
+
+	local_irq_restore(flags);
+}
+
 #ifdef CONFIG_PPC64
 /*
  * G5s enable/disable the timebase via an i2c-connected clock chip.
  */
 static struct device_node *pmac_tb_clock_chip_host;
 static u8 pmac_tb_pulsar_addr;
-static void (*pmac_tb_freeze)(int freeze);
-static DEFINE_SPINLOCK(timebase_lock);
-static unsigned long timebase;
 
 static void smp_core99_cypress_tb_freeze(int freeze)
 {
@@ -447,7 +493,8 @@ static void smp_core99_cypress_tb_freeze(int freeze)
 	/* Strangely, the device-tree says address is 0xd2, but darwin
 	 * accesses 0xd0 ...
 	 */
-	pmac_low_i2c_setmode(pmac_tb_clock_chip_host, pmac_low_i2c_mode_combined);
+	pmac_low_i2c_setmode(pmac_tb_clock_chip_host,
+			     pmac_low_i2c_mode_combined);
 	rc = pmac_low_i2c_xfer(pmac_tb_clock_chip_host,
 			       0xd0 | pmac_low_i2c_read,
 			       0x81, &data, 1);
@@ -475,7 +522,8 @@ static void smp_core99_pulsar_tb_freeze(int freeze)
 	u8 data;
 	int rc;
 
-	pmac_low_i2c_setmode(pmac_tb_clock_chip_host, pmac_low_i2c_mode_combined);
+	pmac_low_i2c_setmode(pmac_tb_clock_chip_host,
+			     pmac_low_i2c_mode_combined);
 	rc = pmac_low_i2c_xfer(pmac_tb_clock_chip_host,
 			       pmac_tb_pulsar_addr | pmac_low_i2c_read,
 			       0x2e, &data, 1);
@@ -496,54 +544,14 @@ static void smp_core99_pulsar_tb_freeze(int freeze)
 	}
 }
 
-
-static void smp_core99_give_timebase(void)
-{
-	/* Open i2c bus for synchronous access */
-	if (pmac_low_i2c_open(pmac_tb_clock_chip_host, 0))
-		panic("Can't open i2c for TB sync !\n");
-
-	spin_lock(&timebase_lock);
-	(*pmac_tb_freeze)(1);
-	mb();
-	timebase = get_tb();
-	spin_unlock(&timebase_lock);
-
-	while (timebase)
-		barrier();
-
-	spin_lock(&timebase_lock);
-	(*pmac_tb_freeze)(0);
-	spin_unlock(&timebase_lock);
-
-	/* Close i2c bus */
-	pmac_low_i2c_close(pmac_tb_clock_chip_host);
-}
-
-
-static void __devinit smp_core99_take_timebase(void)
-{
-	while (!timebase)
-		barrier();
-	spin_lock(&timebase_lock);
-	set_tb(timebase >> 32, timebase & 0xffffffff);
-	timebase = 0;
-	spin_unlock(&timebase_lock);
-}
-
-static void __init smp_core99_setup(int ncpus)
+static void __init smp_core99_setup_i2c_hwsync(int ncpus)
 {
 	struct device_node *cc = NULL;	
 	struct device_node *p;
+	const char *name = NULL;
 	u32 *reg;
 	int ok;
 
-	/* HW sync only on these platforms */
-	if (!machine_is_compatible("PowerMac7,2") &&
-	    !machine_is_compatible("PowerMac7,3") &&
-	    !machine_is_compatible("RackMac3,1"))
-		return;
-
 	/* Look for the clock chip */
 	while ((cc = of_find_node_by_name(cc, "i2c-hwclock")) != NULL) {
 		p = of_get_parent(cc);
@@ -561,114 +569,64 @@ static void __init smp_core99_setup(int ncpus)
 			if (device_is_compatible(cc, "pulsar-legacy-slewing")) {
 				pmac_tb_freeze = smp_core99_pulsar_tb_freeze;
 				pmac_tb_pulsar_addr = 0xd2;
-				printk(KERN_INFO "Timebase clock is Pulsar chip\n");
+				name = "Pulsar";
 			} else if (device_is_compatible(cc, "cy28508")) {
 				pmac_tb_freeze = smp_core99_cypress_tb_freeze;
-				printk(KERN_INFO "Timebase clock is Cypress chip\n");
+				name = "Cypress";
 			}
 			break;
 		case 0xd4:
 			pmac_tb_freeze = smp_core99_pulsar_tb_freeze;
 			pmac_tb_pulsar_addr = 0xd4;
-			printk(KERN_INFO "Timebase clock is Pulsar chip\n");
+			name = "Pulsar";
 			break;
 		}
-		if (pmac_tb_freeze != NULL) {
-			pmac_tb_clock_chip_host = of_get_parent(cc);
-			of_node_put(cc);
+		if (pmac_tb_freeze != NULL)
 			break;
-		}
 	}
-	if (pmac_tb_freeze == NULL) {
-		smp_ops->give_timebase = smp_generic_give_timebase;
-		smp_ops->take_timebase = smp_generic_take_timebase;
+	if (pmac_tb_freeze != NULL) {
+		struct device_node *p = of_get_parent(cc);
+		of_node_put(cc);
+		while(p && strcmp(p->type, "i2c")) {
+			cc = of_get_parent(p);
+			of_node_put(p);
+			p = cc;
+		}
+		if (p == NULL)
+			goto no_i2c_sync;
+		/* Open i2c bus for synchronous access */
+		if (pmac_low_i2c_open(p, 0)) {
+			printk(KERN_ERR "Failed top open i2c bus %s for clock"
+			       " sync, fallback to software sync !\n",
+			       p->full_name);
+			of_node_put(p);
+			goto no_i2c_sync;
+		}
+		pmac_tb_clock_chip_host = p;
+		printk(KERN_INFO "Processor timebase sync using %s i2c clock\n",
+		       name);
+		return;
 	}
+ no_i2c_sync:
+	pmac_tb_freeze = NULL;
 }
 
-/* nothing to do here, caches are already set up by service processor */
-static inline void __devinit core99_init_caches(int cpu)
-{
-}
+#endif /* CONFIG_PPC64 */
 
-#else /* CONFIG_PPC64 */
 
 /*
- * SMP G4 powermacs use a GPIO to enable/disable the timebase.
+ * SMP G4 and newer G5 use a GPIO to enable/disable the timebase.
  */
 
 static unsigned int core99_tb_gpio;	/* Timebase freeze GPIO */
 
-static unsigned int pri_tb_hi, pri_tb_lo;
-static unsigned int pri_tb_stamp;
-
-/* not __init, called in sleep/wakeup code */
-void smp_core99_give_timebase(void)
+static void smp_core99_gpio_tb_freeze(int freeze)
 {
-	unsigned long flags;
-	unsigned int t;
-
-	/* wait for the secondary to be in take_timebase */
-	for (t = 100000; t > 0 && !sec_tb_reset; --t)
-		udelay(10);
-	if (!sec_tb_reset) {
-		printk(KERN_WARNING "Timeout waiting sync on second CPU\n");
-		return;
-	}
-
-	/* freeze the timebase and read it */
-	/* disable interrupts so the timebase is disabled for the
-	   shortest possible time */
-	local_irq_save(flags);
-	pmac_call_feature(PMAC_FTR_WRITE_GPIO, NULL, core99_tb_gpio, 4);
+	if (freeze)
+		pmac_call_feature(PMAC_FTR_WRITE_GPIO, NULL, core99_tb_gpio, 4);
+	else
+		pmac_call_feature(PMAC_FTR_WRITE_GPIO, NULL, core99_tb_gpio, 0);
 	pmac_call_feature(PMAC_FTR_READ_GPIO, NULL, core99_tb_gpio, 0);
-	mb();
-	pri_tb_hi = get_tbu();
-	pri_tb_lo = get_tbl();
-	pri_tb_stamp = last_jiffy_stamp(smp_processor_id());
-	mb();
-
-	/* tell the secondary we're ready */
-	sec_tb_reset = 2;
-	mb();
-
-	/* wait for the secondary to have taken it */
-	/* note: can't use udelay here, since it needs the timebase running */
-	for (t = 10000000; t > 0 && sec_tb_reset; --t)
-		barrier();
-	if (sec_tb_reset)
-		/* XXX BUG_ON here? */
-		printk(KERN_WARNING "Timeout waiting sync(2) on second CPU\n");
-
-	/* Now, restart the timebase by leaving the GPIO to an open collector */
-       	pmac_call_feature(PMAC_FTR_WRITE_GPIO, NULL, core99_tb_gpio, 0);
-        pmac_call_feature(PMAC_FTR_READ_GPIO, NULL, core99_tb_gpio, 0);
-	local_irq_restore(flags);
-}
-
-/* not __init, called in sleep/wakeup code */
-void smp_core99_take_timebase(void)
-{
-	unsigned long flags;
-
-	/* tell the primary we're here */
-	sec_tb_reset = 1;
-	mb();
-
-	/* wait for the primary to set pri_tb_hi/lo */
-	while (sec_tb_reset < 2)
-		mb();
-
-	/* set our stuff the same as the primary */
-	local_irq_save(flags);
-	set_dec(1);
-	set_tb(pri_tb_hi, pri_tb_lo);
-	last_jiffy_stamp(smp_processor_id()) = pri_tb_stamp;
-	mb();
-
-	/* tell the primary we're done */
-       	sec_tb_reset = 0;
-	mb();
-	local_irq_restore(flags);
 }
 
 /* L2 and L3 cache settings to pass from CPU0 to CPU1 on G4 cpus */
@@ -677,6 +635,7 @@ volatile static long int core99_l3_cache;
 
 static void __devinit core99_init_caches(int cpu)
 {
+#ifndef CONFIG_PPC64
 	if (!cpu_has_feature(CPU_FTR_L2CR))
 		return;
 
@@ -702,30 +661,80 @@ static void __devinit core99_init_caches(int cpu)
 		_set_L3CR(core99_l3_cache);
 		printk("CPU%d: L3CR set to %lx\n", cpu, core99_l3_cache);
 	}
+#endif /* !CONFIG_PPC64 */
 }
 
 static void __init smp_core99_setup(int ncpus)
 {
-	struct device_node *cpu;
-	u32 *tbprop = NULL;
-	int i;
+#ifdef CONFIG_PPC64
 
-	core99_tb_gpio = KL_GPIO_TB_ENABLE;	/* default value */
-	cpu = of_find_node_by_type(NULL, "cpu");
-	if (cpu != NULL) {
-		tbprop = (u32 *)get_property(cpu, "timebase-enable", NULL);
-		if (tbprop)
-			core99_tb_gpio = *tbprop;
-		of_node_put(cpu);
+	/* i2c based HW sync on some G5s */
+	if (machine_is_compatible("PowerMac7,2") ||
+	    machine_is_compatible("PowerMac7,3") ||
+	    machine_is_compatible("RackMac3,1"))
+		smp_core99_setup_i2c_hwsync(ncpus);
+
+	/* GPIO based HW sync on recent G5s */
+	if (pmac_tb_freeze == NULL) {
+		struct device_node *np =
+			of_find_node_by_name(NULL, "timebase-enable");
+		u32 *reg = (u32 *)get_property(np, "reg", NULL);
+
+		if (np && reg && !strcmp(np->type, "gpio")) {
+			core99_tb_gpio = *reg;
+			if (core99_tb_gpio < 0x50)
+				core99_tb_gpio += 0x50;
+			pmac_tb_freeze = smp_core99_gpio_tb_freeze;
+			printk(KERN_INFO "Processor timebase sync using"
+			       " GPIO 0x%02x\n", core99_tb_gpio);
+		}
 	}
 
-	/* XXX should get this from reg properties */
-	for (i = 1; i < ncpus; ++i)
-		smp_hw_index[i] = i;
-	powersave_nap = 0;
-}
+#else /* CONFIG_PPC64 */
+
+	/* GPIO based HW sync on ppc32 Core99 */
+	if (pmac_tb_freeze == NULL && !machine_is_compatible("MacRISC4")) {
+		struct device_node *cpu;
+		u32 *tbprop = NULL;
+
+		core99_tb_gpio = KL_GPIO_TB_ENABLE;	/* default value */
+		cpu = of_find_node_by_type(NULL, "cpu");
+		if (cpu != NULL) {
+			tbprop = (u32 *)get_property(cpu, "timebase-enable",
+						     NULL);
+			if (tbprop)
+				core99_tb_gpio = *tbprop;
+			of_node_put(cpu);
+		}
+		pmac_tb_freeze = smp_core99_gpio_tb_freeze;
+		printk(KERN_INFO "Processor timebase sync using"
+		       " GPIO 0x%02x\n", core99_tb_gpio);
+	}
+
+#endif /* CONFIG_PPC64 */
+
+	/* No timebase sync, fallback to software */
+	if (pmac_tb_freeze == NULL) {
+		smp_ops->give_timebase = smp_generic_give_timebase;
+		smp_ops->take_timebase = smp_generic_take_timebase;
+		printk(KERN_INFO "Processor timebase sync using software\n");
+	}
+
+#ifndef CONFIG_PPC64
+	{
+		int i;
+
+		/* XXX should get this from reg properties */
+		for (i = 1; i < ncpus; ++i)
+			smp_hw_index[i] = i;
+	}
 #endif
 
+	/* 32 bits SMP can't NAP */
+	if (!machine_is_compatible("MacRISC4"))
+		powersave_nap = 0;
+}
+
 static int __init smp_core99_probe(void)
 {
 	struct device_node *cpus;
@@ -803,17 +812,25 @@ static void __devinit smp_core99_setup_cpu(int cpu_nr)
 	mpic_setup_this_cpu();
 
 	if (cpu_nr == 0) {
-#ifdef CONFIG_POWER4
+#ifdef CONFIG_PPC64
 		extern void g5_phy_disable_cpu1(void);
 
+		/* Close i2c bus if it was used for tb sync */
+		if (pmac_tb_clock_chip_host) {
+			pmac_low_i2c_close(pmac_tb_clock_chip_host);
+			pmac_tb_clock_chip_host	= NULL;
+		}
+
 		/* If we didn't start the second CPU, we must take
 		 * it off the bus
 		 */
 		if (machine_is_compatible("MacRISC4") &&
 		    num_online_cpus() < 2)		
 			g5_phy_disable_cpu1();
-#endif /* CONFIG_POWER4 */
-		if (ppc_md.progress) ppc_md.progress("core99_setup_cpu 0 done", 0x349);
+#endif /* CONFIG_PPC64 */
+
+		if (ppc_md.progress)
+			ppc_md.progress("core99_setup_cpu 0 done", 0x349);
 	}
 }
 
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index b3e3636a57b0..14b9abde2d27 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -4,6 +4,6 @@ obj-$(CONFIG_PPC_I8259)		+= i8259.o
 obj-$(CONFIG_PPC_MPC106)	+= grackle.o
 obj-$(CONFIG_BOOKE)		+= dcr.o
 obj-$(CONFIG_40x)		+= dcr.o
-obj-$(CONFIG_U3_DART)		+= u3_iommu.o
+obj-$(CONFIG_U3_DART)		+= dart_iommu.o
 obj-$(CONFIG_MMIO_NVRAM)	+= mmio_nvram.o
 obj-$(CONFIG_83xx)		+= ipic.o
diff --git a/arch/powerpc/sysdev/dart.h b/arch/powerpc/sysdev/dart.h
index 33ed9ed7fc1e..c2d05763ccbe 100644
--- a/arch/powerpc/sysdev/dart.h
+++ b/arch/powerpc/sysdev/dart.h
@@ -20,29 +20,44 @@
 #define _POWERPC_SYSDEV_DART_H
 
 
-/* physical base of DART registers */
-#define DART_BASE        0xf8033000UL
-
 /* Offset from base to control register */
-#define DARTCNTL   0
+#define DART_CNTL	0
+
 /* Offset from base to exception register */
-#define DARTEXCP   0x10
+#define DART_EXCP_U3	0x10
 /* Offset from base to TLB tag registers */
-#define DARTTAG    0x1000
+#define DART_TAGS_U3	0x1000
 
+/* U4 registers */
+#define DART_BASE_U4	0x10
+#define DART_SIZE_U4	0x20
+#define DART_EXCP_U4	0x30
+#define DART_TAGS_U4	0x1000
 
 /* Control Register fields */
 
-/* base address of table (pfn) */
-#define DARTCNTL_BASE_MASK    0xfffff
-#define DARTCNTL_BASE_SHIFT   12
+/* U3 registers */
+#define DART_CNTL_U3_BASE_MASK	0xfffff
+#define DART_CNTL_U3_BASE_SHIFT	12
+#define DART_CNTL_U3_FLUSHTLB	0x400
+#define DART_CNTL_U3_ENABLE	0x200
+#define DART_CNTL_U3_SIZE_MASK	0x1ff
+#define DART_CNTL_U3_SIZE_SHIFT	0
+
+/* U4 registers */
+#define DART_BASE_U4_BASE_MASK	0xffffff
+#define DART_BASE_U4_BASE_SHIFT	0
+#define DART_CNTL_U4_FLUSHTLB	0x20000000
+#define DART_CNTL_U4_ENABLE	0x80000000
+#define DART_SIZE_U4_SIZE_MASK	0x1fff
+#define DART_SIZE_U4_SIZE_SHIFT	0
+
+#define DART_REG(r)	(dart + ((r) >> 2))
+#define DART_IN(r)	(in_be32(DART_REG(r)))
+#define DART_OUT(r,v)	(out_be32(DART_REG(r), (v)))
 
-#define DARTCNTL_FLUSHTLB     0x400
-#define DARTCNTL_ENABLE       0x200
 
 /* size of table in pages */
-#define DARTCNTL_SIZE_MASK    0x1ff
-#define DARTCNTL_SIZE_SHIFT   0
 
 
 /* DART table fields */
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
new file mode 100644
index 000000000000..df0dbdee762a
--- /dev/null
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -0,0 +1,350 @@
+/*
+ * arch/powerpc/sysdev/dart_iommu.c
+ *
+ * Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation
+ * Copyright (C) 2005 Benjamin Herrenschmidt <benh@kernel.crashing.org>,
+ *                    IBM Corporation
+ *
+ * Based on pSeries_iommu.c:
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
+ * Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation
+ *
+ * Dynamic DMA mapping support, Apple U3, U4 & IBM CPC925 "DART" iommu.
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/iommu.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/abs_addr.h>
+#include <asm/cacheflush.h>
+#include <asm/lmb.h>
+#include <asm/ppc-pci.h>
+
+#include "dart.h"
+
+extern int iommu_force_on;
+
+/* Physical base address and size of the DART table */
+unsigned long dart_tablebase; /* exported to htab_initialize */
+static unsigned long dart_tablesize;
+
+/* Virtual base address of the DART table */
+static u32 *dart_vbase;
+
+/* Mapped base address for the dart */
+static unsigned int *__iomem dart;
+
+/* Dummy val that entries are set to when unused */
+static unsigned int dart_emptyval;
+
+static struct iommu_table iommu_table_dart;
+static int iommu_table_dart_inited;
+static int dart_dirty;
+static int dart_is_u4;
+
+#define DBG(...)
+
+static inline void dart_tlb_invalidate_all(void)
+{
+	unsigned long l = 0;
+	unsigned int reg, inv_bit;
+	unsigned long limit;
+
+	DBG("dart: flush\n");
+
+	/* To invalidate the DART, set the DARTCNTL_FLUSHTLB bit in the
+	 * control register and wait for it to clear.
+	 *
+	 * Gotcha: Sometimes, the DART won't detect that the bit gets
+	 * set. If so, clear it and set it again.
+	 */
+
+	limit = 0;
+
+	inv_bit = dart_is_u4 ? DART_CNTL_U4_FLUSHTLB : DART_CNTL_U3_FLUSHTLB;
+retry:
+	l = 0;
+	reg = DART_IN(DART_CNTL);
+	reg |= inv_bit;
+	DART_OUT(DART_CNTL, reg);
+
+	while ((DART_IN(DART_CNTL) & inv_bit) && l < (1L << limit))
+		l++;
+	if (l == (1L << limit)) {
+		if (limit < 4) {
+			limit++;
+		        reg = DART_IN(DART_CNTL);
+		        reg &= ~inv_bit;
+			DART_OUT(DART_CNTL, reg);
+			goto retry;
+		} else
+			panic("DART: TLB did not flush after waiting a long "
+			      "time. Buggy U3 ?");
+	}
+}
+
+static void dart_flush(struct iommu_table *tbl)
+{
+	if (dart_dirty)
+		dart_tlb_invalidate_all();
+	dart_dirty = 0;
+}
+
+static void dart_build(struct iommu_table *tbl, long index,
+		       long npages, unsigned long uaddr,
+		       enum dma_data_direction direction)
+{
+	unsigned int *dp;
+	unsigned int rpn;
+
+	DBG("dart: build at: %lx, %lx, addr: %x\n", index, npages, uaddr);
+
+	index <<= DART_PAGE_FACTOR;
+	npages <<= DART_PAGE_FACTOR;
+
+	dp = ((unsigned int*)tbl->it_base) + index;
+
+	/* On U3, all memory is contigous, so we can move this
+	 * out of the loop.
+	 */
+	while (npages--) {
+		rpn = virt_to_abs(uaddr) >> DART_PAGE_SHIFT;
+
+		*(dp++) = DARTMAP_VALID | (rpn & DARTMAP_RPNMASK);
+
+		rpn++;
+		uaddr += DART_PAGE_SIZE;
+	}
+
+	dart_dirty = 1;
+}
+
+
+static void dart_free(struct iommu_table *tbl, long index, long npages)
+{
+	unsigned int *dp;
+
+	/* We don't worry about flushing the TLB cache. The only drawback of
+	 * not doing it is that we won't catch buggy device drivers doing
+	 * bad DMAs, but then no 32-bit architecture ever does either.
+	 */
+
+	DBG("dart: free at: %lx, %lx\n", index, npages);
+
+	index <<= DART_PAGE_FACTOR;
+	npages <<= DART_PAGE_FACTOR;
+
+	dp  = ((unsigned int *)tbl->it_base) + index;
+
+	while (npages--)
+		*(dp++) = dart_emptyval;
+}
+
+
+static int dart_init(struct device_node *dart_node)
+{
+	unsigned int i;
+	unsigned long tmp, base, size;
+	struct resource r;
+
+	if (dart_tablebase == 0 || dart_tablesize == 0) {
+		printk(KERN_INFO "DART: table not allocated, using "
+		       "direct DMA\n");
+		return -ENODEV;
+	}
+
+	if (of_address_to_resource(dart_node, 0, &r))
+		panic("DART: can't get register base ! ");
+
+	/* Make sure nothing from the DART range remains in the CPU cache
+	 * from a previous mapping that existed before the kernel took
+	 * over
+	 */
+	flush_dcache_phys_range(dart_tablebase,
+				dart_tablebase + dart_tablesize);
+
+	/* Allocate a spare page to map all invalid DART pages. We need to do
+	 * that to work around what looks like a problem with the HT bridge
+	 * prefetching into invalid pages and corrupting data
+	 */
+	tmp = lmb_alloc(DART_PAGE_SIZE, DART_PAGE_SIZE);
+	if (!tmp)
+		panic("DART: Cannot allocate spare page!");
+	dart_emptyval = DARTMAP_VALID | ((tmp >> DART_PAGE_SHIFT) &
+					 DARTMAP_RPNMASK);
+
+	/* Map in DART registers */
+	dart = ioremap(r.start, r.end - r.start + 1);
+	if (dart == NULL)
+		panic("DART: Cannot map registers!");
+
+	/* Map in DART table */
+	dart_vbase = ioremap(virt_to_abs(dart_tablebase), dart_tablesize);
+
+	/* Fill initial table */
+	for (i = 0; i < dart_tablesize/4; i++)
+		dart_vbase[i] = dart_emptyval;
+
+	/* Initialize DART with table base and enable it. */
+	base = dart_tablebase >> DART_PAGE_SHIFT;
+	size = dart_tablesize >> DART_PAGE_SHIFT;
+	if (dart_is_u4) {
+		BUG_ON(size & ~DART_SIZE_U4_SIZE_MASK);
+		DART_OUT(DART_BASE_U4, base);
+		DART_OUT(DART_SIZE_U4, size);
+		DART_OUT(DART_CNTL, DART_CNTL_U4_ENABLE);
+	} else {
+		BUG_ON(size & ~DART_CNTL_U3_SIZE_MASK);
+		DART_OUT(DART_CNTL,
+			 DART_CNTL_U3_ENABLE |
+			 (base << DART_CNTL_U3_BASE_SHIFT) |
+			 (size << DART_CNTL_U3_SIZE_SHIFT));
+	}
+
+	/* Invalidate DART to get rid of possible stale TLBs */
+	dart_tlb_invalidate_all();
+
+	printk(KERN_INFO "DART IOMMU initialized for %s type chipset\n",
+	       dart_is_u4 ? "U4" : "U3");
+
+	return 0;
+}
+
+static void iommu_table_dart_setup(void)
+{
+	iommu_table_dart.it_busno = 0;
+	iommu_table_dart.it_offset = 0;
+	/* it_size is in number of entries */
+	iommu_table_dart.it_size = (dart_tablesize / sizeof(u32)) >> DART_PAGE_FACTOR;
+
+	/* Initialize the common IOMMU code */
+	iommu_table_dart.it_base = (unsigned long)dart_vbase;
+	iommu_table_dart.it_index = 0;
+	iommu_table_dart.it_blocksize = 1;
+	iommu_init_table(&iommu_table_dart);
+
+	/* Reserve the last page of the DART to avoid possible prefetch
+	 * past the DART mapped area
+	 */
+	set_bit(iommu_table_dart.it_size - 1, iommu_table_dart.it_map);
+}
+
+static void iommu_dev_setup_dart(struct pci_dev *dev)
+{
+	struct device_node *dn;
+
+	/* We only have one iommu table on the mac for now, which makes
+	 * things simple. Setup all PCI devices to point to this table
+	 *
+	 * We must use pci_device_to_OF_node() to make sure that
+	 * we get the real "final" pointer to the device in the
+	 * pci_dev sysdata and not the temporary PHB one
+	 */
+	dn = pci_device_to_OF_node(dev);
+
+	if (dn)
+		PCI_DN(dn)->iommu_table = &iommu_table_dart;
+}
+
+static void iommu_bus_setup_dart(struct pci_bus *bus)
+{
+	struct device_node *dn;
+
+	if (!iommu_table_dart_inited) {
+		iommu_table_dart_inited = 1;
+		iommu_table_dart_setup();
+	}
+
+	dn = pci_bus_to_OF_node(bus);
+
+	if (dn)
+		PCI_DN(dn)->iommu_table = &iommu_table_dart;
+}
+
+static void iommu_dev_setup_null(struct pci_dev *dev) { }
+static void iommu_bus_setup_null(struct pci_bus *bus) { }
+
+void iommu_init_early_dart(void)
+{
+	struct device_node *dn;
+
+	/* Find the DART in the device-tree */
+	dn = of_find_compatible_node(NULL, "dart", "u3-dart");
+	if (dn == NULL) {
+		dn = of_find_compatible_node(NULL, "dart", "u4-dart");
+		if (dn == NULL)
+			goto bail;
+		dart_is_u4 = 1;
+	}
+
+	/* Setup low level TCE operations for the core IOMMU code */
+	ppc_md.tce_build = dart_build;
+	ppc_md.tce_free  = dart_free;
+	ppc_md.tce_flush = dart_flush;
+
+	/* Initialize the DART HW */
+	if (dart_init(dn) == 0) {
+		ppc_md.iommu_dev_setup = iommu_dev_setup_dart;
+		ppc_md.iommu_bus_setup = iommu_bus_setup_dart;
+
+		/* Setup pci_dma ops */
+		pci_iommu_init();
+
+		return;
+	}
+
+ bail:
+	/* If init failed, use direct iommu and null setup functions */
+	ppc_md.iommu_dev_setup = iommu_dev_setup_null;
+	ppc_md.iommu_bus_setup = iommu_bus_setup_null;
+
+	/* Setup pci_dma ops */
+	pci_direct_iommu_init();
+}
+
+
+void __init alloc_dart_table(void)
+{
+	/* Only reserve DART space if machine has more than 2GB of RAM
+	 * or if requested with iommu=on on cmdline.
+	 */
+	if (lmb_end_of_DRAM() <= 0x80000000ull && !iommu_force_on)
+		return;
+
+	/* 512 pages (2MB) is max DART tablesize. */
+	dart_tablesize = 1UL << 21;
+	/* 16MB (1 << 24) alignment. We allocate a full 16Mb chuck since we
+	 * will blow up an entire large page anyway in the kernel mapping
+	 */
+	dart_tablebase = (unsigned long)
+		abs_to_virt(lmb_alloc_base(1UL<<24, 1UL<<24, 0x80000000L));
+
+	printk(KERN_INFO "DART table allocated at: %lx\n", dart_tablebase);
+}
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index 9513ea78e6c1..4f26304d0263 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -13,6 +13,9 @@
  */
 
 #undef DEBUG
+#undef DEBUG_IPI
+#undef DEBUG_IRQ
+#undef DEBUG_LOW
 
 #include <linux/config.h>
 #include <linux/types.h>
@@ -168,35 +171,86 @@ static void __init mpic_test_broken_ipi(struct mpic *mpic)
 /* Test if an interrupt is sourced from HyperTransport (used on broken U3s)
  * to force the edge setting on the MPIC and do the ack workaround.
  */
-static inline int mpic_is_ht_interrupt(struct mpic *mpic, unsigned int source_no)
+static inline int mpic_is_ht_interrupt(struct mpic *mpic, unsigned int source)
 {
-	if (source_no >= 128 || !mpic->fixups)
+	if (source >= 128 || !mpic->fixups)
 		return 0;
-	return mpic->fixups[source_no].base != NULL;
+	return mpic->fixups[source].base != NULL;
 }
 
 
-static inline void mpic_apic_end_irq(struct mpic *mpic, unsigned int source_no)
+static inline void mpic_ht_end_irq(struct mpic *mpic, unsigned int source)
 {
-	struct mpic_irq_fixup *fixup = &mpic->fixups[source_no];
+	struct mpic_irq_fixup *fixup = &mpic->fixups[source];
 
-	spin_lock(&mpic->fixup_lock);
-	writeb(0x11 + 2 * fixup->irq, fixup->base + 2);
-	writel(fixup->data, fixup->base + 4);
-	spin_unlock(&mpic->fixup_lock);
+	if (fixup->applebase) {
+		unsigned int soff = (fixup->index >> 3) & ~3;
+		unsigned int mask = 1U << (fixup->index & 0x1f);
+		writel(mask, fixup->applebase + soff);
+	} else {
+		spin_lock(&mpic->fixup_lock);
+		writeb(0x11 + 2 * fixup->index, fixup->base + 2);
+		writel(fixup->data, fixup->base + 4);
+		spin_unlock(&mpic->fixup_lock);
+	}
 }
 
+static void mpic_startup_ht_interrupt(struct mpic *mpic, unsigned int source,
+				      unsigned int irqflags)
+{
+	struct mpic_irq_fixup *fixup = &mpic->fixups[source];
+	unsigned long flags;
+	u32 tmp;
+
+	if (fixup->base == NULL)
+		return;
+
+	DBG("startup_ht_interrupt(%u, %u) index: %d\n",
+	    source, irqflags, fixup->index);
+	spin_lock_irqsave(&mpic->fixup_lock, flags);
+	/* Enable and configure */
+	writeb(0x10 + 2 * fixup->index, fixup->base + 2);
+	tmp = readl(fixup->base + 4);
+	tmp &= ~(0x23U);
+	if (irqflags & IRQ_LEVEL)
+		tmp |= 0x22;
+	writel(tmp, fixup->base + 4);
+	spin_unlock_irqrestore(&mpic->fixup_lock, flags);
+}
+
+static void mpic_shutdown_ht_interrupt(struct mpic *mpic, unsigned int source,
+				       unsigned int irqflags)
+{
+	struct mpic_irq_fixup *fixup = &mpic->fixups[source];
+	unsigned long flags;
+	u32 tmp;
+
+	if (fixup->base == NULL)
+		return;
+
+	DBG("shutdown_ht_interrupt(%u, %u)\n", source, irqflags);
+
+	/* Disable */
+	spin_lock_irqsave(&mpic->fixup_lock, flags);
+	writeb(0x10 + 2 * fixup->index, fixup->base + 2);
+	tmp = readl(fixup->base + 4);
+	tmp &= ~1U;
+	writel(tmp, fixup->base + 4);
+	spin_unlock_irqrestore(&mpic->fixup_lock, flags);
+}
 
-static void __init mpic_scan_ioapic(struct mpic *mpic, u8 __iomem *devbase)
+static void __init mpic_scan_ht_pic(struct mpic *mpic, u8 __iomem *devbase,
+				    unsigned int devfn, u32 vdid)
 {
 	int i, irq, n;
+	u8 __iomem *base;
 	u32 tmp;
 	u8 pos;
 
-	for (pos = readb(devbase + 0x34); pos; pos = readb(devbase + pos + 1)) {
-		u8 id = readb(devbase + pos);
-
-		if (id == 0x08) {
+	for (pos = readb(devbase + PCI_CAPABILITY_LIST); pos != 0;
+	     pos = readb(devbase + pos + PCI_CAP_LIST_NEXT)) {
+		u8 id = readb(devbase + pos + PCI_CAP_LIST_ID);
+		if (id == PCI_CAP_ID_HT_IRQCONF) {
 			id = readb(devbase + pos + 3);
 			if (id == 0x80)
 				break;
@@ -205,33 +259,41 @@ static void __init mpic_scan_ioapic(struct mpic *mpic, u8 __iomem *devbase)
 	if (pos == 0)
 		return;
 
-	printk(KERN_INFO "mpic:    - Workarounds @ %p, pos = 0x%02x\n", devbase, pos);
+	base = devbase + pos;
+	writeb(0x01, base + 2);
+	n = (readl(base + 4) >> 16) & 0xff;
 
-	devbase += pos;
-
-	writeb(0x01, devbase + 2);
-	n = (readl(devbase + 4) >> 16) & 0xff;
+	printk(KERN_INFO "mpic:   - HT:%02x.%x [0x%02x] vendor %04x device %04x"
+	       " has %d irqs\n",
+	       devfn >> 3, devfn & 0x7, pos, vdid & 0xffff, vdid >> 16, n + 1);
 
 	for (i = 0; i <= n; i++) {
-		writeb(0x10 + 2 * i, devbase + 2);
-		tmp = readl(devbase + 4);
-		if ((tmp & 0x21) != 0x20)
-			continue;
+		writeb(0x10 + 2 * i, base + 2);
+		tmp = readl(base + 4);
 		irq = (tmp >> 16) & 0xff;
-		mpic->fixups[irq].irq = i;
-		mpic->fixups[irq].base = devbase;
-		writeb(0x11 + 2 * i, devbase + 2);
-		mpic->fixups[irq].data = readl(devbase + 4) | 0x80000000;
+		DBG("HT PIC index 0x%x, irq 0x%x, tmp: %08x\n", i, irq, tmp);
+		/* mask it , will be unmasked later */
+		tmp |= 0x1;
+		writel(tmp, base + 4);
+		mpic->fixups[irq].index = i;
+		mpic->fixups[irq].base = base;
+		/* Apple HT PIC has a non-standard way of doing EOIs */
+		if ((vdid & 0xffff) == 0x106b)
+			mpic->fixups[irq].applebase = devbase + 0x60;
+		else
+			mpic->fixups[irq].applebase = NULL;
+		writeb(0x11 + 2 * i, base + 2);
+		mpic->fixups[irq].data = readl(base + 4) | 0x80000000;
 	}
 }
  
 
-static void __init mpic_scan_ioapics(struct mpic *mpic)
+static void __init mpic_scan_ht_pics(struct mpic *mpic)
 {
 	unsigned int devfn;
 	u8 __iomem *cfgspace;
 
-	printk(KERN_INFO "mpic: Setting up IO-APICs workarounds for U3\n");
+	printk(KERN_INFO "mpic: Setting up HT PICs workarounds for U3/U4\n");
 
 	/* Allocate fixups array */
 	mpic->fixups = alloc_bootmem(128 * sizeof(struct mpic_irq_fixup));
@@ -247,13 +309,14 @@ static void __init mpic_scan_ioapics(struct mpic *mpic)
 	cfgspace = ioremap(0xf2000000, 0x10000);
 	BUG_ON(cfgspace == NULL);
 
-	/* Now we scan all slots. We do a very quick scan, we read the header type,
-	 * vendor ID and device ID only, that's plenty enough
+	/* Now we scan all slots. We do a very quick scan, we read the header
+	 * type, vendor ID and device ID only, that's plenty enough
 	 */
 	for (devfn = 0; devfn < 0x100; devfn++) {
 		u8 __iomem *devbase = cfgspace + (devfn << 8);
 		u8 hdr_type = readb(devbase + PCI_HEADER_TYPE);
 		u32 l = readl(devbase + PCI_VENDOR_ID);
+		u16 s;
 
 		DBG("devfn %x, l: %x\n", devfn, l);
 
@@ -261,8 +324,12 @@ static void __init mpic_scan_ioapics(struct mpic *mpic)
 		if (l == 0xffffffff || l == 0x00000000 ||
 		    l == 0x0000ffff || l == 0xffff0000)
 			goto next;
+		/* Check if is supports capability lists */
+		s = readw(devbase + PCI_STATUS);
+		if (!(s & PCI_STATUS_CAP_LIST))
+			goto next;
 
-		mpic_scan_ioapic(mpic, devbase);
+		mpic_scan_ht_pic(mpic, devbase, devfn, l);
 
 	next:
 		/* next device, if function 0 */
@@ -363,6 +430,31 @@ static void mpic_enable_irq(unsigned int irq)
 			break;
 		}
 	} while(mpic_irq_read(src, MPIC_IRQ_VECTOR_PRI) & MPIC_VECPRI_MASK);	
+
+#ifdef CONFIG_MPIC_BROKEN_U3
+	if (mpic->flags & MPIC_BROKEN_U3) {
+		unsigned int src = irq - mpic->irq_offset;
+		if (mpic_is_ht_interrupt(mpic, src) &&
+		    (irq_desc[irq].status & IRQ_LEVEL))
+			mpic_ht_end_irq(mpic, src);
+	}
+#endif /* CONFIG_MPIC_BROKEN_U3 */
+}
+
+static unsigned int mpic_startup_irq(unsigned int irq)
+{
+#ifdef CONFIG_MPIC_BROKEN_U3
+	struct mpic *mpic = mpic_from_irq(irq);
+	unsigned int src = irq - mpic->irq_offset;
+
+	if (mpic_is_ht_interrupt(mpic, src))
+		mpic_startup_ht_interrupt(mpic, src, irq_desc[irq].status);
+
+#endif /* CONFIG_MPIC_BROKEN_U3 */
+
+	mpic_enable_irq(irq);
+
+	return 0;
 }
 
 static void mpic_disable_irq(unsigned int irq)
@@ -386,12 +478,27 @@ static void mpic_disable_irq(unsigned int irq)
 	} while(!(mpic_irq_read(src, MPIC_IRQ_VECTOR_PRI) & MPIC_VECPRI_MASK));
 }
 
+static void mpic_shutdown_irq(unsigned int irq)
+{
+#ifdef CONFIG_MPIC_BROKEN_U3
+	struct mpic *mpic = mpic_from_irq(irq);
+	unsigned int src = irq - mpic->irq_offset;
+
+	if (mpic_is_ht_interrupt(mpic, src))
+		mpic_shutdown_ht_interrupt(mpic, src, irq_desc[irq].status);
+
+#endif /* CONFIG_MPIC_BROKEN_U3 */
+
+	mpic_disable_irq(irq);
+}
+
 static void mpic_end_irq(unsigned int irq)
 {
 	struct mpic *mpic = mpic_from_irq(irq);
 
+#ifdef DEBUG_IRQ
 	DBG("%s: end_irq: %d\n", mpic->name, irq);
-
+#endif
 	/* We always EOI on end_irq() even for edge interrupts since that
 	 * should only lower the priority, the MPIC should have properly
 	 * latched another edge interrupt coming in anyway
@@ -400,8 +507,9 @@ static void mpic_end_irq(unsigned int irq)
 #ifdef CONFIG_MPIC_BROKEN_U3
 	if (mpic->flags & MPIC_BROKEN_U3) {
 		unsigned int src = irq - mpic->irq_offset;
-		if (mpic_is_ht_interrupt(mpic, src))
-			mpic_apic_end_irq(mpic, src);
+		if (mpic_is_ht_interrupt(mpic, src) &&
+		    (irq_desc[irq].status & IRQ_LEVEL))
+			mpic_ht_end_irq(mpic, src);
 	}
 #endif /* CONFIG_MPIC_BROKEN_U3 */
 
@@ -482,6 +590,8 @@ struct mpic * __init mpic_alloc(unsigned long phys_addr,
 	mpic->name = name;
 
 	mpic->hc_irq.typename = name;
+	mpic->hc_irq.startup = mpic_startup_irq;
+	mpic->hc_irq.shutdown = mpic_shutdown_irq;
 	mpic->hc_irq.enable = mpic_enable_irq;
 	mpic->hc_irq.disable = mpic_disable_irq;
 	mpic->hc_irq.end = mpic_end_irq;
@@ -650,10 +760,10 @@ void __init mpic_init(struct mpic *mpic)
 		mpic->irq_count = mpic->num_sources;
 
 #ifdef CONFIG_MPIC_BROKEN_U3
-	/* Do the ioapic fixups on U3 broken mpic */
+	/* Do the HT PIC fixups on U3 broken mpic */
 	DBG("MPIC flags: %x\n", mpic->flags);
 	if ((mpic->flags & MPIC_BROKEN_U3) && (mpic->flags & MPIC_PRIMARY))
-		mpic_scan_ioapics(mpic);
+		mpic_scan_ht_pics(mpic);
 #endif /* CONFIG_MPIC_BROKEN_U3 */
 
 	for (i = 0; i < mpic->num_sources; i++) {
@@ -840,7 +950,9 @@ void mpic_send_ipi(unsigned int ipi_no, unsigned int cpu_mask)
 
 	BUG_ON(mpic == NULL);
 
+#ifdef DEBUG_IPI
 	DBG("%s: send_ipi(ipi_no: %d)\n", mpic->name, ipi_no);
+#endif
 
 	mpic_cpu_write(MPIC_CPU_IPI_DISPATCH_0 + ipi_no * 0x10,
 		       mpic_physmask(cpu_mask & cpus_addr(cpu_online_map)[0]));
@@ -851,19 +963,28 @@ int mpic_get_one_irq(struct mpic *mpic, struct pt_regs *regs)
 	u32 irq;
 
 	irq = mpic_cpu_read(MPIC_CPU_INTACK) & MPIC_VECPRI_VECTOR_MASK;
+#ifdef DEBUG_LOW
 	DBG("%s: get_one_irq(): %d\n", mpic->name, irq);
-
+#endif
 	if (mpic->cascade && irq == mpic->cascade_vec) {
+#ifdef DEBUG_LOW
 		DBG("%s: cascading ...\n", mpic->name);
+#endif
 		irq = mpic->cascade(regs, mpic->cascade_data);
 		mpic_eoi(mpic);
 		return irq;
 	}
 	if (unlikely(irq == MPIC_VEC_SPURRIOUS))
 		return -1;
-	if (irq < MPIC_VEC_IPI_0) 
+	if (irq < MPIC_VEC_IPI_0) {
+#ifdef DEBUG_IRQ
+		DBG("%s: irq %d\n", mpic->name, irq + mpic->irq_offset);
+#endif
 		return irq + mpic->irq_offset;
+	}
+#ifdef DEBUG_IPI
        	DBG("%s: ipi %d !\n", mpic->name, irq - MPIC_VEC_IPI_0);
+#endif
 	return irq - MPIC_VEC_IPI_0 + mpic->ipi_offset;
 }
 
diff --git a/arch/powerpc/sysdev/u3_iommu.c b/arch/powerpc/sysdev/u3_iommu.c
deleted file mode 100644
index 5c1a26a6d00c..000000000000
--- a/arch/powerpc/sysdev/u3_iommu.c
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * arch/powerpc/sysdev/u3_iommu.c
- *
- * Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation
- *
- * Based on pSeries_iommu.c:
- * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
- * Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation
- *
- * Dynamic DMA mapping support, Apple U3 & IBM CPC925 "DART" iommu.
- *
- * 
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#include <linux/config.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/string.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/vmalloc.h>
-#include <asm/io.h>
-#include <asm/prom.h>
-#include <asm/iommu.h>
-#include <asm/pci-bridge.h>
-#include <asm/machdep.h>
-#include <asm/abs_addr.h>
-#include <asm/cacheflush.h>
-#include <asm/lmb.h>
-#include <asm/ppc-pci.h>
-
-#include "dart.h"
-
-extern int iommu_force_on;
-
-/* Physical base address and size of the DART table */
-unsigned long dart_tablebase; /* exported to htab_initialize */
-static unsigned long dart_tablesize;
-
-/* Virtual base address of the DART table */
-static u32 *dart_vbase;
-
-/* Mapped base address for the dart */
-static unsigned int *dart; 
-
-/* Dummy val that entries are set to when unused */
-static unsigned int dart_emptyval;
-
-static struct iommu_table iommu_table_u3;
-static int iommu_table_u3_inited;
-static int dart_dirty;
-
-#define DBG(...)
-
-static inline void dart_tlb_invalidate_all(void)
-{
-	unsigned long l = 0;
-	unsigned int reg;
-	unsigned long limit;
-
-	DBG("dart: flush\n");
-
-	/* To invalidate the DART, set the DARTCNTL_FLUSHTLB bit in the
-	 * control register and wait for it to clear.
-	 *
-	 * Gotcha: Sometimes, the DART won't detect that the bit gets
-	 * set. If so, clear it and set it again.
-	 */ 
-
-	limit = 0;
-
-retry:
-	reg = in_be32((unsigned int *)dart+DARTCNTL);
-	reg |= DARTCNTL_FLUSHTLB;
-	out_be32((unsigned int *)dart+DARTCNTL, reg);
-
-	l = 0;
-	while ((in_be32((unsigned int *)dart+DARTCNTL) & DARTCNTL_FLUSHTLB) &&
-		l < (1L<<limit)) {
-		l++;
-	}
-	if (l == (1L<<limit)) {
-		if (limit < 4) {
-			limit++;
-		        reg = in_be32((unsigned int *)dart+DARTCNTL);
-		        reg &= ~DARTCNTL_FLUSHTLB;
-		        out_be32((unsigned int *)dart+DARTCNTL, reg);
-			goto retry;
-		} else
-			panic("U3-DART: TLB did not flush after waiting a long "
-			      "time. Buggy U3 ?");
-	}
-}
-
-static void dart_flush(struct iommu_table *tbl)
-{
-	if (dart_dirty)
-		dart_tlb_invalidate_all();
-	dart_dirty = 0;
-}
-
-static void dart_build(struct iommu_table *tbl, long index, 
-		       long npages, unsigned long uaddr,
-		       enum dma_data_direction direction)
-{
-	unsigned int *dp;
-	unsigned int rpn;
-
-	DBG("dart: build at: %lx, %lx, addr: %x\n", index, npages, uaddr);
-
-	index <<= DART_PAGE_FACTOR;
-	npages <<= DART_PAGE_FACTOR;
-
-	dp = ((unsigned int*)tbl->it_base) + index;
-	
-	/* On U3, all memory is contigous, so we can move this
-	 * out of the loop.
-	 */
-	while (npages--) {
-		rpn = virt_to_abs(uaddr) >> DART_PAGE_SHIFT;
-
-		*(dp++) = DARTMAP_VALID | (rpn & DARTMAP_RPNMASK);
-
-		rpn++;
-		uaddr += DART_PAGE_SIZE;
-	}
-
-	dart_dirty = 1;
-}
-
-
-static void dart_free(struct iommu_table *tbl, long index, long npages)
-{
-	unsigned int *dp;
-	
-	/* We don't worry about flushing the TLB cache. The only drawback of
-	 * not doing it is that we won't catch buggy device drivers doing
-	 * bad DMAs, but then no 32-bit architecture ever does either.
-	 */
-
-	DBG("dart: free at: %lx, %lx\n", index, npages);
-
-	index <<= DART_PAGE_FACTOR;
-	npages <<= DART_PAGE_FACTOR;
-
-	dp  = ((unsigned int *)tbl->it_base) + index;
-		
-	while (npages--)
-		*(dp++) = dart_emptyval;
-}
-
-
-static int dart_init(struct device_node *dart_node)
-{
-	unsigned int regword;
-	unsigned int i;
-	unsigned long tmp;
-
-	if (dart_tablebase == 0 || dart_tablesize == 0) {
-		printk(KERN_INFO "U3-DART: table not allocated, using direct DMA\n");
-		return -ENODEV;
-	}
-
-	/* Make sure nothing from the DART range remains in the CPU cache
-	 * from a previous mapping that existed before the kernel took
-	 * over
-	 */
-	flush_dcache_phys_range(dart_tablebase, dart_tablebase + dart_tablesize);
-
-	/* Allocate a spare page to map all invalid DART pages. We need to do
-	 * that to work around what looks like a problem with the HT bridge
-	 * prefetching into invalid pages and corrupting data
-	 */
-	tmp = lmb_alloc(DART_PAGE_SIZE, DART_PAGE_SIZE);
-	if (!tmp)
-		panic("U3-DART: Cannot allocate spare page!");
-	dart_emptyval = DARTMAP_VALID | ((tmp >> DART_PAGE_SHIFT) & DARTMAP_RPNMASK);
-
-	/* Map in DART registers. FIXME: Use device node to get base address */
-	dart = ioremap(DART_BASE, 0x7000);
-	if (dart == NULL)
-		panic("U3-DART: Cannot map registers!");
-
-	/* Set initial control register contents: table base, 
-	 * table size and enable bit
-	 */
-	regword = DARTCNTL_ENABLE | 
-		((dart_tablebase >> DART_PAGE_SHIFT) << DARTCNTL_BASE_SHIFT) |
-		(((dart_tablesize >> DART_PAGE_SHIFT) & DARTCNTL_SIZE_MASK)
-				 << DARTCNTL_SIZE_SHIFT);
-	dart_vbase = ioremap(virt_to_abs(dart_tablebase), dart_tablesize);
-
-	/* Fill initial table */
-	for (i = 0; i < dart_tablesize/4; i++)
-		dart_vbase[i] = dart_emptyval;
-
-	/* Initialize DART with table base and enable it. */
-	out_be32((unsigned int *)dart, regword);
-
-	/* Invalidate DART to get rid of possible stale TLBs */
-	dart_tlb_invalidate_all();
-
-	printk(KERN_INFO "U3/CPC925 DART IOMMU initialized\n");
-
-	return 0;
-}
-
-static void iommu_table_u3_setup(void)
-{
-	iommu_table_u3.it_busno = 0;
-	iommu_table_u3.it_offset = 0;
-	/* it_size is in number of entries */
-	iommu_table_u3.it_size = (dart_tablesize / sizeof(u32)) >> DART_PAGE_FACTOR;
-
-	/* Initialize the common IOMMU code */
-	iommu_table_u3.it_base = (unsigned long)dart_vbase;
-	iommu_table_u3.it_index = 0;
-	iommu_table_u3.it_blocksize = 1;
-	iommu_init_table(&iommu_table_u3);
-
-	/* Reserve the last page of the DART to avoid possible prefetch
-	 * past the DART mapped area
-	 */
-	set_bit(iommu_table_u3.it_size - 1, iommu_table_u3.it_map);
-}
-
-static void iommu_dev_setup_u3(struct pci_dev *dev)
-{
-	struct device_node *dn;
-
-	/* We only have one iommu table on the mac for now, which makes
-	 * things simple. Setup all PCI devices to point to this table
-	 *
-	 * We must use pci_device_to_OF_node() to make sure that
-	 * we get the real "final" pointer to the device in the
-	 * pci_dev sysdata and not the temporary PHB one
-	 */
-	dn = pci_device_to_OF_node(dev);
-
-	if (dn)
-		PCI_DN(dn)->iommu_table = &iommu_table_u3;
-}
-
-static void iommu_bus_setup_u3(struct pci_bus *bus)
-{
-	struct device_node *dn;
-
-	if (!iommu_table_u3_inited) {
-		iommu_table_u3_inited = 1;
-		iommu_table_u3_setup();
-	}
-
-	dn = pci_bus_to_OF_node(bus);
-
-	if (dn)
-		PCI_DN(dn)->iommu_table = &iommu_table_u3;
-}
-
-static void iommu_dev_setup_null(struct pci_dev *dev) { }
-static void iommu_bus_setup_null(struct pci_bus *bus) { }
-
-void iommu_init_early_u3(void)
-{
-	struct device_node *dn;
-
-	/* Find the DART in the device-tree */
-	dn = of_find_compatible_node(NULL, "dart", "u3-dart");
-	if (dn == NULL)
-		return;
-
-	/* Setup low level TCE operations for the core IOMMU code */
-	ppc_md.tce_build = dart_build;
-	ppc_md.tce_free  = dart_free;
-	ppc_md.tce_flush = dart_flush;
-
-	/* Initialize the DART HW */
-	if (dart_init(dn)) {
-		/* If init failed, use direct iommu and null setup functions */
-		ppc_md.iommu_dev_setup = iommu_dev_setup_null;
-		ppc_md.iommu_bus_setup = iommu_bus_setup_null;
-
-		/* Setup pci_dma ops */
-		pci_direct_iommu_init();
-	} else {
-		ppc_md.iommu_dev_setup = iommu_dev_setup_u3;
-		ppc_md.iommu_bus_setup = iommu_bus_setup_u3;
-
-		/* Setup pci_dma ops */
-		pci_iommu_init();
-	}
-}
-
-
-void __init alloc_u3_dart_table(void)
-{
-	/* Only reserve DART space if machine has more than 2GB of RAM
-	 * or if requested with iommu=on on cmdline.
-	 */
-	if (lmb_end_of_DRAM() <= 0x80000000ull && !iommu_force_on)
-		return;
-
-	/* 512 pages (2MB) is max DART tablesize. */
-	dart_tablesize = 1UL << 21;
-	/* 16MB (1 << 24) alignment. We allocate a full 16Mb chuck since we
-	 * will blow up an entire large page anyway in the kernel mapping
-	 */
-	dart_tablebase = (unsigned long)
-		abs_to_virt(lmb_alloc_base(1UL<<24, 1UL<<24, 0x80000000L));
-
-	printk(KERN_INFO "U3-DART allocated at: %lx\n", dart_tablebase);
-}
diff --git a/drivers/ide/ppc/pmac.c b/drivers/ide/ppc/pmac.c
index a8d3bc0a9c5c..5013b1285e22 100644
--- a/drivers/ide/ppc/pmac.c
+++ b/drivers/ide/ppc/pmac.c
@@ -1686,7 +1686,7 @@ pmac_ide_probe(void)
 #else
 	macio_register_driver(&pmac_ide_macio_driver);
 	pci_register_driver(&pmac_ide_pci_driver);
-#endif	
+#endif
 }
 
 #ifdef CONFIG_BLK_DEV_IDEDMA_PMAC
diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c
index e8378274d710..96226116a646 100644
--- a/drivers/macintosh/smu.c
+++ b/drivers/macintosh/smu.c
@@ -53,7 +53,7 @@
 #undef DEBUG_SMU
 
 #ifdef DEBUG_SMU
-#define DPRINTK(fmt, args...) do { udbg_printf(KERN_DEBUG fmt , ##args); } while (0)
+#define DPRINTK(fmt, args...) do { printk(KERN_DEBUG fmt , ##args); } while (0)
 #else
 #define DPRINTK(fmt, args...) do { } while (0)
 #endif
@@ -909,10 +909,13 @@ static struct smu_sdbp_header *smu_create_sdb_partition(int id)
 	struct property *prop;
 
 	/* First query the partition info */
+	DPRINTK("SMU: Query partition infos ... (irq=%d)\n", smu->db_irq);
 	smu_queue_simple(&cmd, SMU_CMD_PARTITION_COMMAND, 2,
 			 smu_done_complete, &comp,
 			 SMU_CMD_PARTITION_LATEST, id);
 	wait_for_completion(&comp);
+	DPRINTK("SMU: done, status: %d, reply_len: %d\n",
+		cmd.cmd.status, cmd.cmd.reply_len);
 
 	/* Partition doesn't exist (or other error) */
 	if (cmd.cmd.status != 0 || cmd.cmd.reply_len != 6)
@@ -975,6 +978,8 @@ struct smu_sdbp_header *__smu_get_sdb_partition(int id, unsigned int *size,
 
 	sprintf(pname, "sdb-partition-%02x", id);
 
+	DPRINTK("smu_get_sdb_partition(%02x)\n", id);
+
 	if (interruptible) {
 		int rc;
 		rc = down_interruptible(&smu_part_access);
@@ -986,6 +991,7 @@ struct smu_sdbp_header *__smu_get_sdb_partition(int id, unsigned int *size,
 	part = (struct smu_sdbp_header *)get_property(smu->of_node,
 						      pname, size);
 	if (part == NULL) {
+		DPRINTK("trying to extract from SMU ...\n");
 		part = smu_create_sdb_partition(id);
 		if (part != NULL && size)
 			*size = part->len << 2;
diff --git a/include/asm-powerpc/iommu.h b/include/asm-powerpc/iommu.h
index f89f06050893..59f062668997 100644
--- a/include/asm-powerpc/iommu.h
+++ b/include/asm-powerpc/iommu.h
@@ -56,7 +56,7 @@ struct device_node;
 
 /* Walks all buses and creates iommu tables */
 extern void iommu_setup_pSeries(void);
-extern void iommu_setup_u3(void);
+extern void iommu_setup_dart(void);
 
 /* Frees table for an individual device node */
 extern void iommu_free_table(struct device_node *dn);
@@ -104,7 +104,7 @@ extern void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle,
 
 extern void iommu_init_early_pSeries(void);
 extern void iommu_init_early_iSeries(void);
-extern void iommu_init_early_u3(void);
+extern void iommu_init_early_dart(void);
 
 #ifdef CONFIG_PCI
 extern void pci_iommu_init(void);
@@ -113,6 +113,6 @@ extern void pci_direct_iommu_init(void);
 static inline void pci_iommu_init(void) { }
 #endif
 
-extern void alloc_u3_dart_table(void);
+extern void alloc_dart_table(void);
 
 #endif /* _ASM_IOMMU_H */
diff --git a/include/asm-powerpc/mpic.h b/include/asm-powerpc/mpic.h
index 6ce27e1b5646..bf7e71793205 100644
--- a/include/asm-powerpc/mpic.h
+++ b/include/asm-powerpc/mpic.h
@@ -117,8 +117,9 @@ typedef int (*mpic_cascade_t)(struct pt_regs *regs, void *data);
 struct mpic_irq_fixup
 {
 	u8 __iomem	*base;
+	u8 __iomem	*applebase;
 	u32		data;
-	unsigned int	irq;
+	unsigned int	index;
 };
 #endif /* CONFIG_MPIC_BROKEN_U3 */
 
diff --git a/include/asm-powerpc/pmac_feature.h b/include/asm-powerpc/pmac_feature.h
index e9683bcff19b..f6997ed5179e 100644
--- a/include/asm-powerpc/pmac_feature.h
+++ b/include/asm-powerpc/pmac_feature.h
@@ -121,6 +121,7 @@
 #define PMAC_TYPE_IMAC_G5		0x152	/* iMac G5 */
 #define PMAC_TYPE_XSERVE_G5		0x153	/* Xserve G5 */
 #define PMAC_TYPE_UNKNOWN_K2		0x19f	/* Any other K2 based */
+#define PMAC_TYPE_UNKNOWN_SHASTA       	0x19e	/* Any other Shasta based */
 
 /*
  * Motherboard flags
@@ -341,6 +342,7 @@ enum {
 	macio_pangea,
 	macio_intrepid,
 	macio_keylargo2,
+	macio_shasta,
 };
 
 struct macio_chip
diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index e2a089b051ed..d27a78b71297 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -196,6 +196,7 @@
 #define  PCI_CAP_ID_MSI		0x05	/* Message Signalled Interrupts */
 #define  PCI_CAP_ID_CHSWP	0x06	/* CompactPCI HotSwap */
 #define  PCI_CAP_ID_PCIX	0x07	/* PCI-X */
+#define  PCI_CAP_ID_HT_IRQCONF	0x08	/* HyperTransport IRQ Configuration */
 #define  PCI_CAP_ID_SHPC 	0x0C	/* PCI Standard Hot-Plug Controller */
 #define  PCI_CAP_ID_EXP 	0x10	/* PCI Express */
 #define  PCI_CAP_ID_MSIX	0x11	/* MSI-X */
-- 
cgit v1.2.3-71-gd317


From b792de39d892e06b18ddea85be076bae123d6bf6 Mon Sep 17 00:00:00 2001
From: Olaf Hering <olh@suse.de>
Date: Sun, 8 Jan 2006 01:00:32 -0800
Subject: [PATCH] Fix compilation with CONFIG_MEMORY_HOTPLUG=y and gcc41.

Fix compilation with CONFIG_MEMORY_HOTPLUG=y and gcc41.
Also remove unneeded declations, add a public function.

drivers/base/memory.c:53: error: static declaration of 'register_memory_notifier' follows non-static declaration
include/linux/memory.h:85: error: previous declaration of 'register_memory_notifier' was here
drivers/base/memory.c:58: error: static declaration of 'unregister_memory_notifier' follows non-static declaration
include/linux/memory.h:86: error: previous declaration of 'unregister_memory_notifier' was here
drivers/base/memory.c:68: error: static declaration of 'register_memory' follows non-static declaration
include/linux/memory.h:73: error: previous declaration of 'register_memory' was here

Signed-off-by: Olaf Hering <olh@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/memory.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory.h b/include/linux/memory.h
index dc4081b6f161..e251dc43d0f5 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -70,21 +70,15 @@ static inline void unregister_memory_notifier(struct notifier_block *nb)
 {
 }
 #else
-extern int register_memory(struct memory_block *, struct mem_section *section, struct node *);
 extern int register_new_memory(struct mem_section *);
 extern int unregister_memory_section(struct mem_section *);
 extern int memory_dev_init(void);
-extern int register_memory_notifier(struct notifier_block *nb);
-extern void unregister_memory_notifier(struct notifier_block *nb);
+extern int remove_memory_block(unsigned long, struct mem_section *, int);
 
 #define CONFIG_MEM_BLOCK_SIZE	(PAGES_PER_SECTION<<PAGE_SHIFT)
 
-extern int invalidate_phys_mapping(unsigned long, unsigned long);
 struct notifier_block;
 
-extern int register_memory_notifier(struct notifier_block *nb);
-extern void unregister_memory_notifier(struct notifier_block *nb);
-
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #define hotplug_memory_notifier(fn, pri) {			\
-- 
cgit v1.2.3-71-gd317


From f9f7500521b25dbf1aba476b81230489ad8e2c4b Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sun, 8 Jan 2006 01:00:33 -0800
Subject: [PATCH] slab: remove unused align parameter from alloc_percpu

__alloc_percpu and alloc_percpu both take an 'align' argument which is
completely ignored.  snmp6_mib_init() in net/ipv6/af_inet6.c attempts to use
it, but it will be ignored.  Therefore, remove the 'align' argument and fixup
the lone caller.

Signed-off-by: Matthew Dobson <colpatch@us.ibm.com>
Acked-by: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/percpu.h | 7 +++----
 mm/slab.c              | 3 +--
 net/ipv6/af_inet6.c    | 4 ++--
 3 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index fb8d2d24e4bb..20317d88deba 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -33,14 +33,14 @@ struct percpu_data {
         (__typeof__(ptr))__p->ptrs[(cpu)];	\
 })
 
-extern void *__alloc_percpu(size_t size, size_t align);
+extern void *__alloc_percpu(size_t size);
 extern void free_percpu(const void *);
 
 #else /* CONFIG_SMP */
 
 #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
 
-static inline void *__alloc_percpu(size_t size, size_t align)
+static inline void *__alloc_percpu(size_t size)
 {
 	void *ret = kmalloc(size, GFP_KERNEL);
 	if (ret)
@@ -55,7 +55,6 @@ static inline void free_percpu(const void *ptr)
 #endif /* CONFIG_SMP */
 
 /* Simple wrapper for the common case: zeros memory. */
-#define alloc_percpu(type) \
-	((type *)(__alloc_percpu(sizeof(type), __alignof__(type))))
+#define alloc_percpu(type)	((type *)(__alloc_percpu(sizeof(type))))
 
 #endif /* __LINUX_PERCPU_H */
diff --git a/mm/slab.c b/mm/slab.c
index e5ec26e0c460..eb70fddf2059 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2944,9 +2944,8 @@ EXPORT_SYMBOL(__kmalloc);
  * Objects should be dereferenced using the per_cpu_ptr macro only.
  *
  * @size: how many bytes of memory are required.
- * @align: the alignment, which can't be greater than SMP_CACHE_BYTES.
  */
-void *__alloc_percpu(size_t size, size_t align)
+void *__alloc_percpu(size_t size)
 {
 	int i;
 	struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 68afc53be662..25c3fe5005d9 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -689,11 +689,11 @@ snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
 	if (ptr == NULL)
 		return -EINVAL;
 
-	ptr[0] = __alloc_percpu(mibsize, mibalign);
+	ptr[0] = __alloc_percpu(mibsize);
 	if (!ptr[0])
 		goto err0;
 
-	ptr[1] = __alloc_percpu(mibsize, mibalign);
+	ptr[1] = __alloc_percpu(mibsize);
 	if (!ptr[1])
 		goto err1;
 
-- 
cgit v1.2.3-71-gd317


From 9d0243bca345d5ce25d3f4b74b7facb3a6df1232 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 8 Jan 2006 01:00:39 -0800
Subject: [PATCH] drop-pagecache

Add /proc/sys/vm/drop_caches.  When written to, this will cause the kernel to
discard as much pagecache and/or reclaimable slab objects as it can.  THis
operation requires root permissions.

It won't drop dirty data, so the user should run `sync' first.

Caveats:

a) Holds inode_lock for exorbitant amounts of time.

b) Needs to be taught about NUMA nodes: propagate these all the way through
   so the discarding can be controlled on a per-node basis.

This is a debugging feature: useful for getting consistent results between
filesystem benchmarks.  We could possibly put it under a config option, but
it's less than 300 bytes.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/filesystems/proc.txt | 17 ++++++++++
 Documentation/sysctl/vm.txt        |  3 +-
 fs/Makefile                        |  2 +-
 fs/drop_caches.c                   | 68 ++++++++++++++++++++++++++++++++++++++
 include/linux/mm.h                 |  7 ++++
 include/linux/sysctl.h             |  1 +
 kernel/sysctl.c                    | 10 ++++++
 mm/truncate.c                      |  1 -
 mm/vmscan.c                        |  3 +-
 9 files changed, 107 insertions(+), 5 deletions(-)
 create mode 100644 fs/drop_caches.c

(limited to 'include/linux')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index d4773565ea2f..a4dcf42c2fd9 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1302,6 +1302,23 @@ VM has token based thrashing control mechanism and uses the token to prevent
 unnecessary page faults in thrashing situation. The unit of the value is
 second. The value would be useful to tune thrashing behavior.
 
+drop_caches
+-----------
+
+Writing to this will cause the kernel to drop clean caches, dentries and
+inodes from memory, causing that memory to become free.
+
+To free pagecache:
+	echo 1 > /proc/sys/vm/drop_caches
+To free dentries and inodes:
+	echo 2 > /proc/sys/vm/drop_caches
+To free pagecache, dentries and inodes:
+	echo 3 > /proc/sys/vm/drop_caches
+
+As this is a non-destructive operation and dirty objects are not freeable, the
+user should run `sync' first.
+
+
 2.5 /proc/sys/dev - Device specific parameters
 ----------------------------------------------
 
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 2f1aae32a5d9..89ba1a42a17d 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -26,12 +26,13 @@ Currently, these files are in /proc/sys/vm:
 - min_free_kbytes
 - laptop_mode
 - block_dump
+- drop-caches
 
 ==============================================================
 
 dirty_ratio, dirty_background_ratio, dirty_expire_centisecs,
 dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode,
-block_dump, swap_token_timeout:
+block_dump, swap_token_timeout, drop-caches:
 
 See Documentation/filesystems/proc.txt
 
diff --git a/fs/Makefile b/fs/Makefile
index 73676111ebbe..35e9aec608e4 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -10,7 +10,7 @@ obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
 		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
-		ioprio.o pnode.o
+		ioprio.o pnode.o drop_caches.o
 
 obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
new file mode 100644
index 000000000000..4e4762389bdc
--- /dev/null
+++ b/fs/drop_caches.c
@@ -0,0 +1,68 @@
+/*
+ * Implement the manual drop-all-pagecache function
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/writeback.h>
+#include <linux/sysctl.h>
+#include <linux/gfp.h>
+
+/* A global variable is a bit ugly, but it keeps the code simple */
+int sysctl_drop_caches;
+
+static void drop_pagecache_sb(struct super_block *sb)
+{
+	struct inode *inode;
+
+	spin_lock(&inode_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		if (inode->i_state & (I_FREEING|I_WILL_FREE))
+			continue;
+		invalidate_inode_pages(inode->i_mapping);
+	}
+	spin_unlock(&inode_lock);
+}
+
+void drop_pagecache(void)
+{
+	struct super_block *sb;
+
+	spin_lock(&sb_lock);
+restart:
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+		down_read(&sb->s_umount);
+		if (sb->s_root)
+			drop_pagecache_sb(sb);
+		up_read(&sb->s_umount);
+		spin_lock(&sb_lock);
+		if (__put_super_and_need_restart(sb))
+			goto restart;
+	}
+	spin_unlock(&sb_lock);
+}
+
+void drop_slab(void)
+{
+	int nr_objects;
+
+	do {
+		nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
+	} while (nr_objects > 10);
+}
+
+int drop_caches_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (write) {
+		if (sysctl_drop_caches & 1)
+			drop_pagecache();
+		if (sysctl_drop_caches & 2)
+			drop_slab();
+	}
+	return 0;
+}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bc01fff3aa01..83c651f25188 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1036,5 +1036,12 @@ int in_gate_area_no_task(unsigned long addr);
 /* /proc/<pid>/oom_adj set to -17 protects from the oom-killer */
 #define OOM_DISABLE -17
 
+int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
+					void __user *, size_t *, loff_t *);
+int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
+			unsigned long lru_pages);
+void drop_pagecache(void);
+void drop_slab(void);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index a9b80fc7f0f3..4cd267fe87ec 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -180,6 +180,7 @@ enum
 	VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */
 	VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */
 	VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
+	VM_DROP_PAGECACHE=29,	/* int: nuke lots of pagecache */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a85047bb5739..8dcf6fd5b0f9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -68,6 +68,7 @@ extern int min_free_kbytes;
 extern int printk_ratelimit_jiffies;
 extern int printk_ratelimit_burst;
 extern int pid_max_min, pid_max_max;
+extern int sysctl_drop_caches;
 
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 int unknown_nmi_panic;
@@ -774,6 +775,15 @@ static ctl_table vm_table[] = {
 		.proc_handler	= &lowmem_reserve_ratio_sysctl_handler,
 		.strategy	= &sysctl_intvec,
 	},
+	{
+		.ctl_name	= VM_DROP_PAGECACHE,
+		.procname	= "drop_caches",
+		.data		= &sysctl_drop_caches,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= drop_caches_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+	},
 	{
 		.ctl_name	= VM_MIN_FREE_KBYTES,
 		.procname	= "min_free_kbytes",
diff --git a/mm/truncate.c b/mm/truncate.c
index 7dee32745901..b1a463d0fe71 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -249,7 +249,6 @@ unlock:
 				break;
 		}
 		pagevec_release(&pvec);
-		cond_resched();
 	}
 	return ret;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index be8235fb1939..428c5801d4b4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -180,8 +180,7 @@ EXPORT_SYMBOL(remove_shrinker);
  *
  * Returns the number of slab objects which we shrunk.
  */
-static int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
-			unsigned long lru_pages)
+int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
 {
 	struct shrinker *shrinker;
 	int ret = 0;
-- 
cgit v1.2.3-71-gd317


From 8ad4b1fb8205340dba16b63467bb23efc27264d6 Mon Sep 17 00:00:00 2001
From: Rohit Seth <rohit.seth@intel.com>
Date: Sun, 8 Jan 2006 01:00:40 -0800
Subject: [PATCH] Make high and batch sizes of per_cpu_pagelists configurable

As recently there has been lot of traffic on the right values for batch and
high water marks for per_cpu_pagelists.  This patch makes these two
variables configurable through /proc interface.

A new tunable /proc/sys/vm/percpu_pagelist_fraction is added.  This entry
controls the fraction of pages at most in each zone that are allocated for
each per cpu page list.  The min value for this is 8.  It means that we
don't allow more than 1/8th of pages in each zone to be allocated in any
single per_cpu_pagelist.

The batch value of each per cpu pagelist is also updated as a result.  It
is set to pcp->high/4.  The upper limit of batch is (PAGE_SHIFT * 8)

Signed-off-by: Rohit Seth <rohit.seth@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/sysctl/vm.txt | 17 ++++++++++++++++
 include/linux/mmzone.h      |  2 ++
 include/linux/sysctl.h      |  1 +
 kernel/sysctl.c             | 12 +++++++++++
 mm/page_alloc.c             | 49 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 81 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 89ba1a42a17d..6910c0136f8d 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -103,3 +103,20 @@ This is used to force the Linux VM to keep a minimum number
 of kilobytes free.  The VM uses this number to compute a pages_min
 value for each lowmem zone in the system.  Each lowmem zone gets 
 a number of reserved free pages based proportionally on its size.
+
+==============================================================
+
+percpu_pagelist_fraction
+
+This is the fraction of pages at most (high mark pcp->high) in each zone that
+are allocated for each per cpu page list.  The min value for this is 8.  It
+means that we don't allow more than 1/8th of pages in each zone to be
+allocated in any single per_cpu_pagelist.  This entry only changes the value
+of hot per cpu pagelists.  User can specify a number like 100 to allocate
+1/100th of each zone to each per cpu page list.
+
+The batch value of each per cpu pagelist is also updated as a result.  It is
+set to pcp->high/4.  The upper limit of batch is (PAGE_SHIFT * 8)
+
+The initial value is zero.  Kernel does not use this value at boot time to set
+the high water marks for each per cpu page list.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c34f4a2c62f8..2a89c132ba9c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -437,6 +437,8 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
 					void __user *, size_t *, loff_t *);
+int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *,
+					void __user *, size_t *, loff_t *);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 4cd267fe87ec..7f472127b7b5 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -181,6 +181,7 @@ enum
 	VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */
 	VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
 	VM_DROP_PAGECACHE=29,	/* int: nuke lots of pagecache */
+	VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8dcf6fd5b0f9..03b0598f2369 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -69,6 +69,7 @@ extern int printk_ratelimit_jiffies;
 extern int printk_ratelimit_burst;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
+extern int percpu_pagelist_fraction;
 
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 int unknown_nmi_panic;
@@ -79,6 +80,7 @@ extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
 static int minolduid;
+static int min_percpu_pagelist_fract = 8;
 
 static int ngroups_max = NGROUPS_MAX;
 
@@ -794,6 +796,16 @@ static ctl_table vm_table[] = {
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
+	{
+		.ctl_name	= VM_PERCPU_PAGELIST_FRACTION,
+		.procname	= "percpu_pagelist_fraction",
+		.data		= &percpu_pagelist_fraction,
+		.maxlen		= sizeof(percpu_pagelist_fraction),
+		.mode		= 0644,
+		.proc_handler	= &percpu_pagelist_fraction_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_percpu_pagelist_fract,
+	},
 #ifdef CONFIG_MMU
 	{
 		.ctl_name	= VM_MAX_MAP_COUNT,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5eeeadd9f66a..2c46f697e8ff 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,6 +53,7 @@ struct pglist_data *pgdat_list __read_mostly;
 unsigned long totalram_pages __read_mostly;
 unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
+int percpu_pagelist_fraction;
 
 static void fastcall free_hot_cold_page(struct page *page, int cold);
 
@@ -1831,6 +1832,24 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 	INIT_LIST_HEAD(&pcp->list);
 }
 
+/*
+ * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
+ * to the value high for the pageset p.
+ */
+
+static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+				unsigned long high)
+{
+	struct per_cpu_pages *pcp;
+
+	pcp = &p->pcp[0]; /* hot list */
+	pcp->high = high;
+	pcp->batch = max(1UL, high/4);
+	if ((high/4) > (PAGE_SHIFT * 8))
+		pcp->batch = PAGE_SHIFT * 8;
+}
+
+
 #ifdef CONFIG_NUMA
 /*
  * Boot pageset table. One per cpu which is going to be used for all
@@ -1868,6 +1887,10 @@ static int __devinit process_zones(int cpu)
 			goto bad;
 
 		setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+
+		if (percpu_pagelist_fraction)
+			setup_pagelist_highmark(zone_pcp(zone, cpu),
+			 	(zone->present_pages / percpu_pagelist_fraction));
 	}
 
 	return 0;
@@ -2567,6 +2590,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
 	return 0;
 }
 
+/*
+ * percpu_pagelist_fraction - changes the pcp->high for each zone on each
+ * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
+ * can have before it gets flushed back to buddy allocator.
+ */
+
+int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	unsigned int cpu;
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (!write || (ret == -EINVAL))
+		return ret;
+	for_each_zone(zone) {
+		for_each_online_cpu(cpu) {
+			unsigned long  high;
+			high = zone->present_pages / percpu_pagelist_fraction;
+			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+		}
+	}
+	return 0;
+}
+
 __initdata int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA
-- 
cgit v1.2.3-71-gd317


From 15316ba81aee6775d6079fb46c66c801989e7d10 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Sun, 8 Jan 2006 01:00:43 -0800
Subject: [PATCH] add schedule_on_each_cpu()

swap migration's isolate_lru_page() currently uses an IPI to notify other
processors that the lru caches need to be drained if the page cannot be
found on the LRU.  The IPI interrupt may interrupt a processor that is just
processing lru requests and cause a race condition.

This patch introduces a new function run_on_each_cpu() that uses the
keventd() to run the LRU draining on each processor.  Processors disable
preemption when dealing the LRU caches (these are per processor) and thus
executing LRU draining from another process is safe.

Thanks to Lee Schermerhorn <lee.schermerhorn@hp.com> for finding this race
condition.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/workqueue.h |  1 +
 kernel/workqueue.c        | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index ac39d04d027c..86b111300231 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -65,6 +65,7 @@ extern int FASTCALL(schedule_work(struct work_struct *work));
 extern int FASTCALL(schedule_delayed_work(struct work_struct *work, unsigned long delay));
 
 extern int schedule_delayed_work_on(int cpu, struct work_struct *work, unsigned long delay);
+extern int schedule_on_each_cpu(void (*func)(void *info), void *info);
 extern void flush_scheduled_work(void);
 extern int current_is_keventd(void);
 extern int keventd_up(void);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2bd5aee1c736..62d47220696a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -419,6 +419,25 @@ int schedule_delayed_work_on(int cpu,
 	return ret;
 }
 
+int schedule_on_each_cpu(void (*func) (void *info), void *info)
+{
+	int cpu;
+	struct work_struct *work;
+
+	work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL);
+
+	if (!work)
+		return -ENOMEM;
+	for_each_online_cpu(cpu) {
+		INIT_WORK(work + cpu, func, info);
+		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
+				work + cpu);
+	}
+	flush_workqueue(keventd_wq);
+	kfree(work);
+	return 0;
+}
+
 void flush_scheduled_work(void)
 {
 	flush_workqueue(keventd_wq);
-- 
cgit v1.2.3-71-gd317


From 21eac81f252fe31c3cf64b805a1e8652192f3a3b Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:00:45 -0800
Subject: [PATCH] Swap Migration V5: LRU operations

This is the start of the `swap migration' patch series.

Swap migration allows the moving of the physical location of pages between
nodes in a numa system while the process is running.  This means that the
virtual addresses that the process sees do not change.  However, the system
rearranges the physical location of those pages.

The main intent of page migration patches here is to reduce the latency of
memory access by moving pages near to the processor where the process
accessing that memory is running.

The patchset allows a process to manually relocate the node on which its
pages are located through the MF_MOVE and MF_MOVE_ALL options while
setting a new memory policy.

The pages of process can also be relocated from another process using the
sys_migrate_pages() function call.  Requires CAP_SYS_ADMIN.  The migrate_pages
function call takes two sets of nodes and moves pages of a process that are
located on the from nodes to the destination nodes.

Manual migration is very useful if for example the scheduler has relocated a
process to a processor on a distant node.  A batch scheduler or an
administrator can detect the situation and move the pages of the process
nearer to the new processor.

sys_migrate_pages() could be used on non-numa machines as well, to force all
of a particualr process's pages out to swap, if someone thinks that's useful.

Larger installations usually partition the system using cpusets into sections
of nodes.  Paul has equipped cpusets with the ability to move pages when a
task is moved to another cpuset.  This allows automatic control over locality
of a process.  If a task is moved to a new cpuset then also all its pages are
moved with it so that the performance of the process does not sink
dramatically (as is the case today).

Swap migration works by simply evicting the page.  The pages must be faulted
back in.  The pages are then typically reallocated by the system near the node
where the process is executing.

For swap migration the destination of the move is controlled by the allocation
policy.  Cpusets set the allocation policy before calling sys_migrate_pages()
in order to move the pages as intended.

No allocation policy changes are performed for sys_migrate_pages().  This
means that the pages may not faulted in to the specified nodes if no
allocation policy was set by other means.  The pages will just end up near the
node where the fault occurred.

There's another patch series in the pipeline which implements "direct
migration".

The direct migration patchset extends the migration functionality to avoid
going through swap.  The destination node of the relation is controllable
during the actual moving of pages.  The crutch of using the allocation policy
to relocate is not necessary and the pages are moved directly to the target.
Its also faster since swap is not used.

And sys_migrate_pages() can then move pages directly to the specified node.
Implement functions to isolate pages from the LRU and put them back later.

This patch:

An earlier implementation was provided by Hirokazu Takahashi
<taka@valinux.co.jp> and IWAMOTO Toshihiro <iwamoto@valinux.co.jp> for the
memory hotplug project.

From: Magnus

This breaks out isolate_lru_page() and putpack_lru_page().  Needed for swap
migration.

Signed-off-by: Magnus Damm <magnus.damm@gmail.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm_inline.h |  22 ++++++++++
 include/linux/swap.h      |   3 ++
 mm/vmscan.c               | 100 ++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 112 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 47762ca695a5..49cc68af01f8 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -38,3 +38,25 @@ del_page_from_lru(struct zone *zone, struct page *page)
 		zone->nr_inactive--;
 	}
 }
+
+/*
+ * Isolate one page from the LRU lists.
+ *
+ * - zone->lru_lock must be held
+ */
+static inline int __isolate_lru_page(struct page *page)
+{
+	if (unlikely(!TestClearPageLRU(page)))
+		return 0;
+
+	if (get_page_testone(page)) {
+		/*
+		 * It is being freed elsewhere
+		 */
+		__put_page(page);
+		SetPageLRU(page);
+		return -ENOENT;
+	}
+
+	return 1;
+}
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 556617bcf7ac..a49112536c02 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -175,6 +175,9 @@ extern int try_to_free_pages(struct zone **, gfp_t);
 extern int shrink_all_memory(int);
 extern int vm_swappiness;
 
+extern int isolate_lru_page(struct page *p);
+extern int putback_lru_pages(struct list_head *l);
+
 #ifdef CONFIG_MMU
 /* linux/mm/shmem.c */
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 428c5801d4b4..261a56ee11b6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -593,20 +593,18 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
-		if (!TestClearPageLRU(page))
-			BUG();
-		list_del(&page->lru);
-		if (get_page_testone(page)) {
-			/*
-			 * It is being freed elsewhere
-			 */
-			__put_page(page);
-			SetPageLRU(page);
-			list_add(&page->lru, src);
-			continue;
-		} else {
-			list_add(&page->lru, dst);
+		switch (__isolate_lru_page(page)) {
+		case 1:
+			/* Succeeded to isolate page */
+			list_move(&page->lru, dst);
 			nr_taken++;
+			break;
+		case -ENOENT:
+			/* Not possible to isolate */
+			list_move(&page->lru, src);
+			break;
+		default:
+			BUG();
 		}
 	}
 
@@ -614,6 +612,48 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 	return nr_taken;
 }
 
+static void lru_add_drain_per_cpu(void *dummy)
+{
+	lru_add_drain();
+}
+
+/*
+ * Isolate one page from the LRU lists and put it on the
+ * indicated list. Do necessary cache draining if the
+ * page is not on the LRU lists yet.
+ *
+ * Result:
+ *  0 = page not on LRU list
+ *  1 = page removed from LRU list and added to the specified list.
+ * -ENOENT = page is being freed elsewhere.
+ */
+int isolate_lru_page(struct page *page)
+{
+	int rc = 0;
+	struct zone *zone = page_zone(page);
+
+redo:
+	spin_lock_irq(&zone->lru_lock);
+	rc = __isolate_lru_page(page);
+	if (rc == 1) {
+		if (PageActive(page))
+			del_page_from_active_list(zone, page);
+		else
+			del_page_from_inactive_list(zone, page);
+	}
+	spin_unlock_irq(&zone->lru_lock);
+	if (rc == 0) {
+		/*
+		 * Maybe this page is still waiting for a cpu to drain it
+		 * from one of the lru lists?
+		 */
+		rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+		if (rc == 0 && PageLRU(page))
+			goto redo;
+	}
+	return rc;
+}
+
 /*
  * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
  */
@@ -679,6 +719,40 @@ done:
 	pagevec_release(&pvec);
 }
 
+static inline void move_to_lru(struct page *page)
+{
+	list_del(&page->lru);
+	if (PageActive(page)) {
+		/*
+		 * lru_cache_add_active checks that
+		 * the PG_active bit is off.
+		 */
+		ClearPageActive(page);
+		lru_cache_add_active(page);
+	} else {
+		lru_cache_add(page);
+	}
+	put_page(page);
+}
+
+/*
+ * Add isolated pages on the list back to the LRU
+ *
+ * returns the number of pages put back.
+ */
+int putback_lru_pages(struct list_head *l)
+{
+	struct page *page;
+	struct page *page2;
+	int count = 0;
+
+	list_for_each_entry_safe(page, page2, l, lru) {
+		move_to_lru(page);
+		count++;
+	}
+	return count;
+}
+
 /*
  * This moves pages from the active list to the inactive list.
  *
-- 
cgit v1.2.3-71-gd317


From 930d915252edda7042c944ed3c30194a2f9fe163 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:00:47 -0800
Subject: [PATCH] Swap Migration V5: PF_SWAPWRITE to allow writing to swap

Add PF_SWAPWRITE to control a processes permission to write to swap.

- Use PF_SWAPWRITE in may_write_to_queue() instead of checking for kswapd
  and pdflush

- Set PF_SWAPWRITE flag for kswapd and pdflush

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 1 +
 mm/pdflush.c          | 2 +-
 mm/vmscan.c           | 6 ++----
 3 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7da33619d5d0..a74662077d60 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -895,6 +895,7 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
 #define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
 #define PF_RANDOMIZE	0x00800000	/* randomize virtual address space */
+#define PF_SWAPWRITE	0x01000000	/* Allowed to write to swap */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 52822c98c489..c4b6d0afd736 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -90,7 +90,7 @@ struct pdflush_work {
 
 static int __pdflush(struct pdflush_work *my_work)
 {
-	current->flags |= PF_FLUSHER;
+	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
 	my_work->fn = NULL;
 	my_work->who = current;
 	INIT_LIST_HEAD(&my_work->list);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 261a56ee11b6..6c30a8c59795 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -268,9 +268,7 @@ static inline int is_page_cache_freeable(struct page *page)
 
 static int may_write_to_queue(struct backing_dev_info *bdi)
 {
-	if (current_is_kswapd())
-		return 1;
-	if (current_is_pdflush())	/* This is unlikely, but why not... */
+	if (current->flags & PF_SWAPWRITE)
 		return 1;
 	if (!bdi_write_congested(bdi))
 		return 1;
@@ -1299,7 +1297,7 @@ static int kswapd(void *p)
 	 * us from recursively trying to free more memory as we're
 	 * trying to free the first piece of memory in the first place).
 	 */
-	tsk->flags |= PF_MEMALLOC|PF_KSWAPD;
+	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
 
 	order = 0;
 	for ( ; ; ) {
-- 
cgit v1.2.3-71-gd317


From 49d2e9cc4544369635cd6f4ef6d5bb0f757079a7 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:00:48 -0800
Subject: [PATCH] Swap Migration V5: migrate_pages() function

This adds the basic page migration function with a minimal implementation that
only allows the eviction of pages to swap space.

Page eviction and migration may be useful to migrate pages, to suspend
programs or for remapping single pages (useful for faulty pages or pages with
soft ECC failures)

The process is as follows:

The function wanting to migrate pages must first build a list of pages to be
migrated or evicted and take them off the lru lists via isolate_lru_page().
isolate_lru_page determines that a page is freeable based on the LRU bit set.

Then the actual migration or swapout can happen by calling migrate_pages().

migrate_pages does its best to migrate or swapout the pages and does multiple
passes over the list.  Some pages may only be swappable if they are not dirty.
 migrate_pages may start writing out dirty pages in the initial passes over
the pages.  However, migrate_pages may not be able to migrate or evict all
pages for a variety of reasons.

The remaining pages may be returned to the LRU lists using putback_lru_pages().

Changelog V4->V5:
- Use the lru caches to return pages to the LRU

Changelog V3->V4:
- Restructure code so that applying patches to support full migration does
  require minimal changes. Rename swapout_pages() to migrate_pages().

Changelog V2->V3:
- Extract common code from shrink_list() and swapout_pages()

Signed-off-by: Mike Kravetz <kravetz@us.ibm.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: "Michael Kerrisk" <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h |   2 +
 mm/vmscan.c          | 214 +++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 182 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index a49112536c02..893096e67bdb 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -178,6 +178,8 @@ extern int vm_swappiness;
 extern int isolate_lru_page(struct page *p);
 extern int putback_lru_pages(struct list_head *l);
 
+extern int migrate_pages(struct list_head *l, struct list_head *t);
+
 #ifdef CONFIG_MMU
 /* linux/mm/shmem.c */
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6c30a8c59795..a537a7f16357 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -373,6 +373,43 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 	return PAGE_CLEAN;
 }
 
+static int remove_mapping(struct address_space *mapping, struct page *page)
+{
+	if (!mapping)
+		return 0;		/* truncate got there first */
+
+	write_lock_irq(&mapping->tree_lock);
+
+	/*
+	 * The non-racy check for busy page.  It is critical to check
+	 * PageDirty _after_ making sure that the page is freeable and
+	 * not in use by anybody. 	(pagecache + us == 2)
+	 */
+	if (unlikely(page_count(page) != 2))
+		goto cannot_free;
+	smp_rmb();
+	if (unlikely(PageDirty(page)))
+		goto cannot_free;
+
+	if (PageSwapCache(page)) {
+		swp_entry_t swap = { .val = page_private(page) };
+		__delete_from_swap_cache(page);
+		write_unlock_irq(&mapping->tree_lock);
+		swap_free(swap);
+		__put_page(page);	/* The pagecache ref */
+		return 1;
+	}
+
+	__remove_from_page_cache(page);
+	write_unlock_irq(&mapping->tree_lock);
+	__put_page(page);
+	return 1;
+
+cannot_free:
+	write_unlock_irq(&mapping->tree_lock);
+	return 0;
+}
+
 /*
  * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
  */
@@ -504,36 +541,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 				goto free_it;
 		}
 
-		if (!mapping)
-			goto keep_locked;	/* truncate got there first */
-
-		write_lock_irq(&mapping->tree_lock);
-
-		/*
-		 * The non-racy check for busy page.  It is critical to check
-		 * PageDirty _after_ making sure that the page is freeable and
-		 * not in use by anybody. 	(pagecache + us == 2)
-		 */
-		if (unlikely(page_count(page) != 2))
-			goto cannot_free;
-		smp_rmb();
-		if (unlikely(PageDirty(page)))
-			goto cannot_free;
-
-#ifdef CONFIG_SWAP
-		if (PageSwapCache(page)) {
-			swp_entry_t swap = { .val = page_private(page) };
-			__delete_from_swap_cache(page);
-			write_unlock_irq(&mapping->tree_lock);
-			swap_free(swap);
-			__put_page(page);	/* The pagecache ref */
-			goto free_it;
-		}
-#endif /* CONFIG_SWAP */
-
-		__remove_from_page_cache(page);
-		write_unlock_irq(&mapping->tree_lock);
-		__put_page(page);
+		if (!remove_mapping(mapping, page))
+			goto keep_locked;
 
 free_it:
 		unlock_page(page);
@@ -542,10 +551,6 @@ free_it:
 			__pagevec_release_nonlru(&freed_pvec);
 		continue;
 
-cannot_free:
-		write_unlock_irq(&mapping->tree_lock);
-		goto keep_locked;
-
 activate_locked:
 		SetPageActive(page);
 		pgactivate++;
@@ -563,6 +568,147 @@ keep:
 	return reclaimed;
 }
 
+/*
+ * swapout a single page
+ * page is locked upon entry, unlocked on exit
+ *
+ * return codes:
+ *	0 = complete
+ *	1 = retry
+ */
+static int swap_page(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+
+	if (page_mapped(page) && mapping)
+		if (try_to_unmap(page) != SWAP_SUCCESS)
+			goto unlock_retry;
+
+	if (PageDirty(page)) {
+		/* Page is dirty, try to write it out here */
+		switch(pageout(page, mapping)) {
+		case PAGE_KEEP:
+		case PAGE_ACTIVATE:
+			goto unlock_retry;
+
+		case PAGE_SUCCESS:
+			goto retry;
+
+		case PAGE_CLEAN:
+			; /* try to free the page below */
+		}
+	}
+
+	if (PagePrivate(page)) {
+		if (!try_to_release_page(page, GFP_KERNEL) ||
+		    (!mapping && page_count(page) == 1))
+			goto unlock_retry;
+	}
+
+	if (remove_mapping(mapping, page)) {
+		/* Success */
+		unlock_page(page);
+		return 0;
+	}
+
+unlock_retry:
+	unlock_page(page);
+
+retry:
+	return 1;
+}
+/*
+ * migrate_pages
+ *
+ * Two lists are passed to this function. The first list
+ * contains the pages isolated from the LRU to be migrated.
+ * The second list contains new pages that the pages isolated
+ * can be moved to. If the second list is NULL then all
+ * pages are swapped out.
+ *
+ * The function returns after 10 attempts or if no pages
+ * are movable anymore because t has become empty
+ * or no retryable pages exist anymore.
+ *
+ * SIMPLIFIED VERSION: This implementation of migrate_pages
+ * is only swapping out pages and never touches the second
+ * list. The direct migration patchset
+ * extends this function to avoid the use of swap.
+ */
+int migrate_pages(struct list_head *l, struct list_head *t)
+{
+	int retry;
+	LIST_HEAD(failed);
+	int nr_failed = 0;
+	int pass = 0;
+	struct page *page;
+	struct page *page2;
+	int swapwrite = current->flags & PF_SWAPWRITE;
+
+	if (!swapwrite)
+		current->flags |= PF_SWAPWRITE;
+
+redo:
+	retry = 0;
+
+	list_for_each_entry_safe(page, page2, l, lru) {
+		cond_resched();
+
+		/*
+		 * Skip locked pages during the first two passes to give the
+		 * functions holding the lock time to release the page. Later we use
+		 * lock_page to have a higher chance of acquiring the lock.
+		 */
+		if (pass > 2)
+			lock_page(page);
+		else
+			if (TestSetPageLocked(page))
+				goto retry_later;
+
+		/*
+		 * Only wait on writeback if we have already done a pass where
+		 * we we may have triggered writeouts for lots of pages.
+		 */
+		if (pass > 0)
+			wait_on_page_writeback(page);
+		else
+			if (PageWriteback(page)) {
+				unlock_page(page);
+				goto retry_later;
+			}
+
+#ifdef CONFIG_SWAP
+		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!add_to_swap(page)) {
+				unlock_page(page);
+				list_move(&page->lru, &failed);
+				nr_failed++;
+				continue;
+			}
+		}
+#endif /* CONFIG_SWAP */
+
+		/*
+		 * Page is properly locked and writeback is complete.
+		 * Try to migrate the page.
+		 */
+		if (swap_page(page)) {
+retry_later:
+			retry++;
+		}
+	}
+	if (retry && pass++ < 10)
+		goto redo;
+
+	if (!swapwrite)
+		current->flags &= ~PF_SWAPWRITE;
+
+	if (!list_empty(&failed))
+		list_splice(&failed, l);
+
+	return nr_failed + retry;
+}
+
 /*
  * zone->lru_lock is heavily contended.  Some of the functions that
  * shrink the lists perform better by taking out a batch of pages
-- 
cgit v1.2.3-71-gd317


From 7cbe34cf86c673503b177ff47cfa2c7030dabb50 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Sun, 8 Jan 2006 01:00:49 -0800
Subject: [PATCH] Swap Migration V5: Add CONFIG_MIGRATION for page migration
 support

Include page migration if the system is NUMA or having a memory model that
allows distinct areas of memory (SPARSEMEM, DISCONTIGMEM).

And:
- Only include lru_add_drain_per_cpu if building for an SMP system.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h |  2 ++
 mm/Kconfig           |  7 +++++++
 mm/vmscan.c          | 20 +++++++++++---------
 3 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 893096e67bdb..117add066f00 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -178,7 +178,9 @@ extern int vm_swappiness;
 extern int isolate_lru_page(struct page *p);
 extern int putback_lru_pages(struct list_head *l);
 
+#ifdef CONFIG_MIGRATION
 extern int migrate_pages(struct list_head *l, struct list_head *t);
+#endif
 
 #ifdef CONFIG_MMU
 /* linux/mm/shmem.c */
diff --git a/mm/Kconfig b/mm/Kconfig
index b3db11f137e0..a9cb80ae6409 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -132,3 +132,10 @@ config SPLIT_PTLOCK_CPUS
 	default "4096" if ARM && !CPU_CACHE_VIPT
 	default "4096" if PARISC && !PA20
 	default "4"
+
+#
+# support for page migration
+#
+config MIGRATION
+	def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
+	depends on SWAP
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a537a7f16357..58270aea669a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -568,6 +568,7 @@ keep:
 	return reclaimed;
 }
 
+#ifdef CONFIG_MIGRATION
 /*
  * swapout a single page
  * page is locked upon entry, unlocked on exit
@@ -656,8 +657,9 @@ redo:
 
 		/*
 		 * Skip locked pages during the first two passes to give the
-		 * functions holding the lock time to release the page. Later we use
-		 * lock_page to have a higher chance of acquiring the lock.
+		 * functions holding the lock time to release the page. Later we
+		 * use lock_page() to have a higher chance of acquiring the
+		 * lock.
 		 */
 		if (pass > 2)
 			lock_page(page);
@@ -669,15 +671,15 @@ redo:
 		 * Only wait on writeback if we have already done a pass where
 		 * we we may have triggered writeouts for lots of pages.
 		 */
-		if (pass > 0)
+		if (pass > 0) {
 			wait_on_page_writeback(page);
-		else
+		} else {
 			if (PageWriteback(page)) {
 				unlock_page(page);
 				goto retry_later;
 			}
+		}
 
-#ifdef CONFIG_SWAP
 		if (PageAnon(page) && !PageSwapCache(page)) {
 			if (!add_to_swap(page)) {
 				unlock_page(page);
@@ -686,16 +688,15 @@ redo:
 				continue;
 			}
 		}
-#endif /* CONFIG_SWAP */
 
 		/*
 		 * Page is properly locked and writeback is complete.
 		 * Try to migrate the page.
 		 */
-		if (swap_page(page)) {
+		if (!swap_page(page))
+			continue;
 retry_later:
-			retry++;
-		}
+		retry++;
 	}
 	if (retry && pass++ < 10)
 		goto redo;
@@ -708,6 +709,7 @@ retry_later:
 
 	return nr_failed + retry;
 }
+#endif
 
 /*
  * zone->lru_lock is heavily contended.  Some of the functions that
-- 
cgit v1.2.3-71-gd317


From dc9aa5b9d65fd11b1f5246b46ec610ee8b83c6dd Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:00:50 -0800
Subject: [PATCH] Swap Migration V5: MPOL_MF_MOVE interface

Add page migration support via swap to the NUMA policy layer

This patch adds page migration support to the NUMA policy layer.  An
additional flag MPOL_MF_MOVE is introduced for mbind.  If MPOL_MF_MOVE is
specified then pages that do not conform to the memory policy will be evicted
from memory.  When they get pages back in new pages will be allocated
following the numa policy.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h |   3 +
 mm/mempolicy.c            | 155 ++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 138 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index ed00b278cb93..05443a766cb8 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -22,6 +22,9 @@
 
 /* Flags for mbind */
 #define MPOL_MF_STRICT	(1<<0)	/* Verify existing pages in the mapping */
+#define MPOL_MF_MOVE	(1<<1)	/* Move pages owned by this process to conform to mapping */
+#define MPOL_MF_MOVE_ALL (1<<2)	/* Move every page to conform to mapping */
+#define MPOL_MF_INTERNAL (1<<3)	/* Internal flags start here */
 
 #ifdef __KERNEL__
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0f1d2b8a952b..9cc6d962831d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,14 @@
 #include <linux/init.h>
 #include <linux/compat.h>
 #include <linux/mempolicy.h>
+#include <linux/swap.h>
+
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
 
+/* Internal MPOL_MF_xxx flags */
+#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
+
 static kmem_cache_t *policy_cache;
 static kmem_cache_t *sn_cache;
 
@@ -174,9 +179,59 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 	return policy;
 }
 
+/* Check if we are the only process mapping the page in question */
+static inline int single_mm_mapping(struct mm_struct *mm,
+			struct address_space *mapping)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int rc = 1;
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
+		if (mm != vma->vm_mm) {
+			rc = 0;
+			goto out;
+		}
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+		if (mm != vma->vm_mm) {
+			rc = 0;
+			goto out;
+		}
+out:
+	spin_unlock(&mapping->i_mmap_lock);
+	return rc;
+}
+
+/*
+ * Add a page to be migrated to the pagelist
+ */
+static void migrate_page_add(struct vm_area_struct *vma,
+	struct page *page, struct list_head *pagelist, unsigned long flags)
+{
+	/*
+	 * Avoid migrating a page that is shared by others and not writable.
+	 */
+	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
+	    mapping_writably_mapped(page->mapping) ||
+	    single_mm_mapping(vma->vm_mm, page->mapping)) {
+		int rc = isolate_lru_page(page);
+
+		if (rc == 1)
+			list_add(&page->lru, pagelist);
+		/*
+		 * If the isolate attempt was not successful then we just
+		 * encountered an unswappable page. Something must be wrong.
+	 	 */
+		WARN_ON(rc == 0);
+	}
+}
+
 /* Ensure all existing pages follow the policy. */
 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pte_t *orig_pte;
 	pte_t *pte;
@@ -193,15 +248,21 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (!page)
 			continue;
 		nid = page_to_nid(page);
-		if (!node_isset(nid, *nodes))
-			break;
+		if (!node_isset(nid, *nodes)) {
+			if (pagelist)
+				migrate_page_add(vma, page, pagelist, flags);
+			else
+				break;
+		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(orig_pte, ptl);
 	return addr != end;
 }
 
 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -211,14 +272,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (check_pte_range(vma, pmd, addr, next, nodes))
+		if (check_pte_range(vma, pmd, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -228,14 +292,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (check_pmd_range(vma, pud, addr, next, nodes))
+		if (check_pmd_range(vma, pud, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pgd_range(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -245,16 +312,31 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (check_pud_range(vma, pgd, addr, next, nodes))
+		if (check_pud_range(vma, pgd, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
 
-/* Step 1: check the range */
+/* Check if a vma is migratable */
+static inline int vma_migratable(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & (
+		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
+		return 0;
+	return 1;
+}
+
+/*
+ * Check if all pages in a range are on a set of nodes.
+ * If pagelist != NULL then isolate pages from the LRU and
+ * put them on the pagelist.
+ */
 static struct vm_area_struct *
 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-	    nodemask_t *nodes, unsigned long flags)
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
@@ -264,17 +346,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		return ERR_PTR(-EFAULT);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
-		if (!vma->vm_next && vma->vm_end < end)
-			return ERR_PTR(-EFAULT);
-		if (prev && prev->vm_end < vma->vm_start)
-			return ERR_PTR(-EFAULT);
-		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
+		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+			if (!vma->vm_next && vma->vm_end < end)
+				return ERR_PTR(-EFAULT);
+			if (prev && prev->vm_end < vma->vm_start)
+				return ERR_PTR(-EFAULT);
+		}
+		if (!is_vm_hugetlb_page(vma) &&
+		    ((flags & MPOL_MF_STRICT) ||
+		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+				vma_migratable(vma)))) {
 			unsigned long endvma = vma->vm_end;
+
 			if (endvma > end)
 				endvma = end;
 			if (vma->vm_start > start)
 				start = vma->vm_start;
-			err = check_pgd_range(vma, start, endvma, nodes);
+			err = check_pgd_range(vma, start, endvma, nodes,
+						flags, pagelist);
 			if (err) {
 				first = ERR_PTR(err);
 				break;
@@ -348,33 +437,59 @@ long do_mbind(unsigned long start, unsigned long len,
 	struct mempolicy *new;
 	unsigned long end;
 	int err;
+	LIST_HEAD(pagelist);
 
-	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
+	if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
+	    || mode > MPOL_MAX)
 		return -EINVAL;
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
 	if (start & ~PAGE_MASK)
 		return -EINVAL;
+
 	if (mode == MPOL_DEFAULT)
 		flags &= ~MPOL_MF_STRICT;
+
 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 	end = start + len;
+
 	if (end < start)
 		return -EINVAL;
 	if (end == start)
 		return 0;
+
 	if (mpol_check_policy(mode, nmask))
 		return -EINVAL;
+
 	new = mpol_new(mode, nmask);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
+	/*
+	 * If we are using the default policy then operation
+	 * on discontinuous address spaces is okay after all
+	 */
+	if (!new)
+		flags |= MPOL_MF_DISCONTIG_OK;
+
 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 			mode,nodes_addr(nodes)[0]);
 
 	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, nmask, flags);
+	vma = check_range(mm, start, end, nmask, flags,
+	      (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL);
 	err = PTR_ERR(vma);
-	if (!IS_ERR(vma))
+	if (!IS_ERR(vma)) {
 		err = mbind_range(vma, start, end, new);
+		if (!list_empty(&pagelist))
+			migrate_pages(&pagelist, NULL);
+		if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT))
+			err = -EIO;
+	}
+	if (!list_empty(&pagelist))
+		putback_lru_pages(&pagelist);
+
 	up_write(&mm->mmap_sem);
 	mpol_free(new);
 	return err;
-- 
cgit v1.2.3-71-gd317


From 39743889aaf76725152f16aa90ca3c45f6d52da3 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:00:51 -0800
Subject: [PATCH] Swap Migration V5: sys_migrate_pages interface

sys_migrate_pages implementation using swap based page migration

This is the original API proposed by Ray Bryant in his posts during the first
half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org.

The intent of sys_migrate is to migrate memory of a process.  A process may
have migrated to another node.  Memory was allocated optimally for the prior
context.  sys_migrate_pages allows to shift the memory to the new node.

sys_migrate_pages is also useful if the processes available memory nodes have
changed through cpuset operations to manually move the processes memory.  Paul
Jackson is working on an automated mechanism that will allow an automatic
migration if the cpuset of a process is changed.  However, a user may decide
to manually control the migration.

This implementation is put into the policy layer since it uses concepts and
functions that are also needed for mbind and friends.  The patch also provides
a do_migrate_pages function that may be useful for cpusets to automatically
move memory.  sys_migrate_pages does not modify policies in contrast to Ray's
implementation.

The current code here is based on the swap based page migration capability and
thus is not able to preserve the physical layout relative to it containing
nodeset (which may be a cpuset).  When direct page migration becomes available
then the implementation needs to be changed to do a isomorphic move of pages
between different nodesets.  The current implementation simply evicts all
pages in source nodeset that are not in the target nodeset.

Patch supports ia64, i386 and x86_64.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/syscall_table.S |  1 +
 arch/ia64/kernel/entry.S         |  1 +
 arch/x86_64/ia32/ia32entry.S     |  1 +
 include/asm-i386/unistd.h        |  3 +-
 include/asm-ia64/unistd.h        |  3 +-
 include/asm-x86_64/ia32_unistd.h |  3 +-
 include/asm-x86_64/unistd.h      |  4 +-
 include/linux/mempolicy.h        |  3 ++
 include/linux/syscalls.h         |  2 +
 kernel/sys_ni.c                  |  1 +
 mm/mempolicy.c                   | 94 +++++++++++++++++++++++++++++++++++++++-
 11 files changed, 111 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index f7ba4acc20ec..6ff3e5243226 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -293,3 +293,4 @@ ENTRY(sys_call_table)
 	.long sys_inotify_init
 	.long sys_inotify_add_watch
 	.long sys_inotify_rm_watch
+	.long sys_migrate_pages
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index 0741b066b98f..7a6ffd613789 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1600,5 +1600,6 @@ sys_call_table:
 	data8 sys_inotify_init
 	data8 sys_inotify_add_watch
 	data8 sys_inotify_rm_watch
+	data8 sys_migrate_pages			// 1280
 
 	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index df0773c9bdbe..1f0ff5adc80e 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -643,6 +643,7 @@ ia32_sys_call_table:
 	.quad sys_inotify_init
 	.quad sys_inotify_add_watch
 	.quad sys_inotify_rm_watch
+	.quad sys_migrate_pages
 ia32_syscall_end:		
 	.rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
 		.quad ni_syscall
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fe38b9a96233..481c3c0ea720 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -299,8 +299,9 @@
 #define __NR_inotify_init	291
 #define __NR_inotify_add_watch	292
 #define __NR_inotify_rm_watch	293
+#define __NR_migrate_pages	294
 
-#define NR_syscalls 294
+#define NR_syscalls 295
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index 2bf543493cb8..962f9bd1bdff 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -269,12 +269,13 @@
 #define __NR_inotify_init		1277
 #define __NR_inotify_add_watch		1278
 #define __NR_inotify_rm_watch		1279
+#define __NR_migrate_pages		1280
 
 #ifdef __KERNEL__
 
 #include <linux/config.h>
 
-#define NR_syscalls			256 /* length of syscall table */
+#define NR_syscalls			270 /* length of syscall table */
 
 #define __ARCH_WANT_SYS_RT_SIGACTION
 
diff --git a/include/asm-x86_64/ia32_unistd.h b/include/asm-x86_64/ia32_unistd.h
index d5166ec3868d..e8843362a6cc 100644
--- a/include/asm-x86_64/ia32_unistd.h
+++ b/include/asm-x86_64/ia32_unistd.h
@@ -299,7 +299,8 @@
 #define __NR_ia32_inotify_init		291
 #define __NR_ia32_inotify_add_watch	292
 #define __NR_ia32_inotify_rm_watch	293
+#define __NR_ia32_migrate_pages		294
 
-#define IA32_NR_syscalls 294	/* must be > than biggest syscall! */
+#define IA32_NR_syscalls 295	/* must be > than biggest syscall! */
 
 #endif /* _ASM_X86_64_IA32_UNISTD_H_ */
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 2c42150bce0c..e6f896161c11 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -571,8 +571,10 @@ __SYSCALL(__NR_inotify_init, sys_inotify_init)
 __SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
 #define __NR_inotify_rm_watch	255
 __SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
+#define __NR_migrate_pages	256
+__SYSCALL(__NR_migrate_pages, sys_migrate_pages)
 
-#define __NR_syscall_max __NR_inotify_rm_watch
+#define __NR_syscall_max __NR_migrate_pages
 #ifndef __NO_STUBS
 
 /* user-visible error numbers are in the range -1 - -4095 */
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 05443a766cb8..3e61e829681d 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -162,6 +162,9 @@ static inline void check_highest_zone(int k)
 		policy_zone = k;
 }
 
+int do_migrate_pages(struct mm_struct *mm,
+	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
+
 #else
 
 struct mempolicy {};
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c7007b1db91d..e910d1a481df 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -511,5 +511,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio);
 asmlinkage long sys_ioprio_get(int which, int who);
 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 					unsigned long maxnode);
+asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
+			const unsigned long __user *from, const unsigned long __user *to);
 
 #endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1ab2370e2efa..7a8bc7f60d91 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -82,6 +82,7 @@ cond_syscall(compat_sys_socketcall);
 cond_syscall(sys_inotify_init);
 cond_syscall(sys_inotify_add_watch);
 cond_syscall(sys_inotify_rm_watch);
+cond_syscall(sys_migrate_pages);
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9cc6d962831d..20d5ad39fa41 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -614,12 +614,42 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	return err;
 }
 
+/*
+ * For now migrate_pages simply swaps out the pages from nodes that are in
+ * the source set but not in the target set. In the future, we would
+ * want a function that moves pages between the two nodesets in such
+ * a way as to preserve the physical layout as much as possible.
+ *
+ * Returns the number of page that could not be moved.
+ */
+int do_migrate_pages(struct mm_struct *mm,
+	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+	LIST_HEAD(pagelist);
+	int count = 0;
+	nodemask_t nodes;
+
+	nodes_andnot(nodes, *from_nodes, *to_nodes);
+	nodes_complement(nodes, nodes);
+
+	down_read(&mm->mmap_sem);
+	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
+			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+	if (!list_empty(&pagelist)) {
+		migrate_pages(&pagelist, NULL);
+		if (!list_empty(&pagelist))
+			count = putback_lru_pages(&pagelist);
+	}
+	up_read(&mm->mmap_sem);
+	return count;
+}
+
 /*
  * User space interface with variable sized bitmaps for nodelists.
  */
 
 /* Copy a node mask from user space. */
-static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
+static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 		     unsigned long maxnode)
 {
 	unsigned long k;
@@ -708,6 +738,68 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 	return do_set_mempolicy(mode, &nodes);
 }
 
+/* Macro needed until Paul implements this function in kernel/cpusets.c */
+#define cpuset_mems_allowed(task) node_online_map
+
+asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
+		const unsigned long __user *old_nodes,
+		const unsigned long __user *new_nodes)
+{
+	struct mm_struct *mm;
+	struct task_struct *task;
+	nodemask_t old;
+	nodemask_t new;
+	nodemask_t task_nodes;
+	int err;
+
+	err = get_nodes(&old, old_nodes, maxnode);
+	if (err)
+		return err;
+
+	err = get_nodes(&new, new_nodes, maxnode);
+	if (err)
+		return err;
+
+	/* Find the mm_struct */
+	read_lock(&tasklist_lock);
+	task = pid ? find_task_by_pid(pid) : current;
+	if (!task) {
+		read_unlock(&tasklist_lock);
+		return -ESRCH;
+	}
+	mm = get_task_mm(task);
+	read_unlock(&tasklist_lock);
+
+	if (!mm)
+		return -EINVAL;
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. The right exists if the process has administrative
+	 * capabilities, superuser priviledges or the same
+	 * userid as the target process.
+	 */
+	if ((current->euid != task->suid) && (current->euid != task->uid) &&
+	    (current->uid != task->suid) && (current->uid != task->uid) &&
+	    !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	task_nodes = cpuset_mems_allowed(task);
+	/* Is the user allowed to access the target nodes? */
+	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+out:
+	mmput(mm);
+	return err;
+}
+
+
 /* Retrieve NUMA policy */
 asmlinkage long sys_get_mempolicy(int __user *policy,
 				unsigned long __user *nmask,
-- 
cgit v1.2.3-71-gd317


From 8419c3181086c86664e8246bc997afc2e4ffba4f Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:00:52 -0800
Subject: [PATCH] SwapMig: CONFIG_MIGRATION fixes

Move move_to_lru, putback_lru_pages and isolate_lru in section surrounded by
CONFIG_MIGRATION saving some codesize for single processor kernels.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h |   3 +-
 mm/vmscan.c          | 152 +++++++++++++++++++++++++--------------------------
 2 files changed, 77 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 117add066f00..997d838f0e70 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -175,10 +175,9 @@ extern int try_to_free_pages(struct zone **, gfp_t);
 extern int shrink_all_memory(int);
 extern int vm_swappiness;
 
+#ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p);
 extern int putback_lru_pages(struct list_head *l);
-
-#ifdef CONFIG_MIGRATION
 extern int migrate_pages(struct list_head *l, struct list_head *t);
 #endif
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 58270aea669a..daed4a73b761 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -569,6 +569,40 @@ keep:
 }
 
 #ifdef CONFIG_MIGRATION
+static inline void move_to_lru(struct page *page)
+{
+	list_del(&page->lru);
+	if (PageActive(page)) {
+		/*
+		 * lru_cache_add_active checks that
+		 * the PG_active bit is off.
+		 */
+		ClearPageActive(page);
+		lru_cache_add_active(page);
+	} else {
+		lru_cache_add(page);
+	}
+	put_page(page);
+}
+
+/*
+ * Add isolated pages on the list back to the LRU
+ *
+ * returns the number of pages put back.
+ */
+int putback_lru_pages(struct list_head *l)
+{
+	struct page *page;
+	struct page *page2;
+	int count = 0;
+
+	list_for_each_entry_safe(page, page2, l, lru) {
+		move_to_lru(page);
+		count++;
+	}
+	return count;
+}
+
 /*
  * swapout a single page
  * page is locked upon entry, unlocked on exit
@@ -709,6 +743,48 @@ retry_later:
 
 	return nr_failed + retry;
 }
+
+static void lru_add_drain_per_cpu(void *dummy)
+{
+	lru_add_drain();
+}
+
+/*
+ * Isolate one page from the LRU lists and put it on the
+ * indicated list. Do necessary cache draining if the
+ * page is not on the LRU lists yet.
+ *
+ * Result:
+ *  0 = page not on LRU list
+ *  1 = page removed from LRU list and added to the specified list.
+ * -ENOENT = page is being freed elsewhere.
+ */
+int isolate_lru_page(struct page *page)
+{
+	int rc = 0;
+	struct zone *zone = page_zone(page);
+
+redo:
+	spin_lock_irq(&zone->lru_lock);
+	rc = __isolate_lru_page(page);
+	if (rc == 1) {
+		if (PageActive(page))
+			del_page_from_active_list(zone, page);
+		else
+			del_page_from_inactive_list(zone, page);
+	}
+	spin_unlock_irq(&zone->lru_lock);
+	if (rc == 0) {
+		/*
+		 * Maybe this page is still waiting for a cpu to drain it
+		 * from one of the lru lists?
+		 */
+		rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+		if (rc == 0 && PageLRU(page))
+			goto redo;
+	}
+	return rc;
+}
 #endif
 
 /*
@@ -758,48 +834,6 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 	return nr_taken;
 }
 
-static void lru_add_drain_per_cpu(void *dummy)
-{
-	lru_add_drain();
-}
-
-/*
- * Isolate one page from the LRU lists and put it on the
- * indicated list. Do necessary cache draining if the
- * page is not on the LRU lists yet.
- *
- * Result:
- *  0 = page not on LRU list
- *  1 = page removed from LRU list and added to the specified list.
- * -ENOENT = page is being freed elsewhere.
- */
-int isolate_lru_page(struct page *page)
-{
-	int rc = 0;
-	struct zone *zone = page_zone(page);
-
-redo:
-	spin_lock_irq(&zone->lru_lock);
-	rc = __isolate_lru_page(page);
-	if (rc == 1) {
-		if (PageActive(page))
-			del_page_from_active_list(zone, page);
-		else
-			del_page_from_inactive_list(zone, page);
-	}
-	spin_unlock_irq(&zone->lru_lock);
-	if (rc == 0) {
-		/*
-		 * Maybe this page is still waiting for a cpu to drain it
-		 * from one of the lru lists?
-		 */
-		rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
-		if (rc == 0 && PageLRU(page))
-			goto redo;
-	}
-	return rc;
-}
-
 /*
  * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
  */
@@ -865,40 +899,6 @@ done:
 	pagevec_release(&pvec);
 }
 
-static inline void move_to_lru(struct page *page)
-{
-	list_del(&page->lru);
-	if (PageActive(page)) {
-		/*
-		 * lru_cache_add_active checks that
-		 * the PG_active bit is off.
-		 */
-		ClearPageActive(page);
-		lru_cache_add_active(page);
-	} else {
-		lru_cache_add(page);
-	}
-	put_page(page);
-}
-
-/*
- * Add isolated pages on the list back to the LRU
- *
- * returns the number of pages put back.
- */
-int putback_lru_pages(struct list_head *l)
-{
-	struct page *page;
-	struct page *page2;
-	int count = 0;
-
-	list_for_each_entry_safe(page, page2, l, lru) {
-		move_to_lru(page);
-		count++;
-	}
-	return count;
-}
-
 /*
  * This moves pages from the active list to the inactive list.
  *
-- 
cgit v1.2.3-71-gd317


From 1480a540c98525640174a7eadd712378fcd6fd63 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:00:53 -0800
Subject: [PATCH] SwapMig: add_to_swap() avoid atomic allocations

Add gfp_mask to add_to_swap

add_to_swap does allocations with GFP_ATOMIC in order not to interfere with
swapping.  During migration we may have use add_to_swap extensively which may
lead to out of memory errors.

This patch makes add_to_swap take a parameter that specifies the gfp mask.
The page migration code can then make add_to_swap use GFP_KERNEL.

Signed-off-by: Hirokazu Takahashi <taka@valinux.co.jp>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h | 2 +-
 mm/swap_state.c      | 4 ++--
 mm/vmscan.c          | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 997d838f0e70..eb591eaad1b7 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -198,7 +198,7 @@ extern int rw_swap_page_sync(int, swp_entry_t, struct page *);
 extern struct address_space swapper_space;
 #define total_swapcache_pages  swapper_space.nrpages
 extern void show_swap_cache_info(void);
-extern int add_to_swap(struct page *);
+extern int add_to_swap(struct page *, gfp_t);
 extern void __delete_from_swap_cache(struct page *);
 extern void delete_from_swap_cache(struct page *);
 extern int move_to_swap_cache(struct page *, swp_entry_t);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index fc2aecb70a95..7b09ac503fec 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -141,7 +141,7 @@ void __delete_from_swap_cache(struct page *page)
  * Allocate swap space for the page and add the page to the
  * swap cache.  Caller needs to hold the page lock. 
  */
-int add_to_swap(struct page * page)
+int add_to_swap(struct page * page, gfp_t gfp_mask)
 {
 	swp_entry_t entry;
 	int err;
@@ -166,7 +166,7 @@ int add_to_swap(struct page * page)
 		 * Add it to the swap cache and mark it dirty
 		 */
 		err = __add_to_swap_cache(page, entry,
-				GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN);
+				gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
 
 		switch (err) {
 		case 0:				/* Success */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index daed4a73b761..5393b093a87b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -458,7 +458,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		 * Try to allocate it some swap space here.
 		 */
 		if (PageAnon(page) && !PageSwapCache(page)) {
-			if (!add_to_swap(page))
+			if (!add_to_swap(page, GFP_ATOMIC))
 				goto activate_locked;
 		}
 #endif /* CONFIG_SWAP */
@@ -715,7 +715,7 @@ redo:
 		}
 
 		if (PageAnon(page) && !PageSwapCache(page)) {
-			if (!add_to_swap(page)) {
+			if (!add_to_swap(page, GFP_KERNEL)) {
 				unlock_page(page);
 				list_move(&page->lru, &failed);
 				nr_failed++;
-- 
cgit v1.2.3-71-gd317


From d498471133ff1f9586a06820beaeebc575fe2814 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:00:55 -0800
Subject: [PATCH] SwapMig: Extend parameters for migrate_pages()

Extend the parameters of migrate_pages() to allow the caller control over the
fate of successfully migrated or impossible to migrate pages.

Swap migration and direct migration will have the same interface after this
patch so that patches can be independently applied to the policy layer and the
core migration code.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h |  3 ++-
 mm/mempolicy.c       | 27 ++++++++++++++++++++++-----
 mm/vmscan.c          | 17 ++++++++---------
 3 files changed, 32 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index eb591eaad1b7..389d1c382e20 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -178,7 +178,8 @@ extern int vm_swappiness;
 #ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p);
 extern int putback_lru_pages(struct list_head *l);
-extern int migrate_pages(struct list_head *l, struct list_head *t);
+extern int migrate_pages(struct list_head *l, struct list_head *t,
+		struct list_head *moved, struct list_head *failed);
 #endif
 
 #ifdef CONFIG_MMU
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 20d5ad39fa41..30bdafba52d8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -429,6 +429,19 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
 	return mpol_check_policy(mode, nodes);
 }
 
+static int swap_pages(struct list_head *pagelist)
+{
+	LIST_HEAD(moved);
+	LIST_HEAD(failed);
+	int n;
+
+	n = migrate_pages(pagelist, NULL, &moved, &failed);
+	putback_lru_pages(&failed);
+	putback_lru_pages(&moved);
+
+	return n;
+}
+
 long do_mbind(unsigned long start, unsigned long len,
 		unsigned long mode, nodemask_t *nmask, unsigned long flags)
 {
@@ -481,10 +494,13 @@ long do_mbind(unsigned long start, unsigned long len,
 	      (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL);
 	err = PTR_ERR(vma);
 	if (!IS_ERR(vma)) {
+		int nr_failed = 0;
+
 		err = mbind_range(vma, start, end, new);
 		if (!list_empty(&pagelist))
-			migrate_pages(&pagelist, NULL);
-		if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT))
+			nr_failed = swap_pages(&pagelist);
+
+		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
 	}
 	if (!list_empty(&pagelist))
@@ -635,11 +651,12 @@ int do_migrate_pages(struct mm_struct *mm,
 	down_read(&mm->mmap_sem);
 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+
 	if (!list_empty(&pagelist)) {
-		migrate_pages(&pagelist, NULL);
-		if (!list_empty(&pagelist))
-			count = putback_lru_pages(&pagelist);
+		count = swap_pages(&pagelist);
+		putback_lru_pages(&pagelist);
 	}
+
 	up_read(&mm->mmap_sem);
 	return count;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 73ba4046ed27..5eecb514ccea 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -670,10 +670,10 @@ retry:
  * list. The direct migration patchset
  * extends this function to avoid the use of swap.
  */
-int migrate_pages(struct list_head *l, struct list_head *t)
+int migrate_pages(struct list_head *from, struct list_head *to,
+		  struct list_head *moved, struct list_head *failed)
 {
 	int retry;
-	LIST_HEAD(failed);
 	int nr_failed = 0;
 	int pass = 0;
 	struct page *page;
@@ -686,12 +686,12 @@ int migrate_pages(struct list_head *l, struct list_head *t)
 redo:
 	retry = 0;
 
-	list_for_each_entry_safe(page, page2, l, lru) {
+	list_for_each_entry_safe(page, page2, from, lru) {
 		cond_resched();
 
 		if (page_count(page) == 1) {
 			/* page was freed from under us. So we are done. */
-			move_to_lru(page);
+			list_move(&page->lru, moved);
 			continue;
 		}
 		/*
@@ -722,7 +722,7 @@ redo:
 		if (PageAnon(page) && !PageSwapCache(page)) {
 			if (!add_to_swap(page, GFP_KERNEL)) {
 				unlock_page(page);
-				list_move(&page->lru, &failed);
+				list_move(&page->lru, failed);
 				nr_failed++;
 				continue;
 			}
@@ -732,8 +732,10 @@ redo:
 		 * Page is properly locked and writeback is complete.
 		 * Try to migrate the page.
 		 */
-		if (!swap_page(page))
+		if (!swap_page(page)) {
+			list_move(&page->lru, moved);
 			continue;
+		}
 retry_later:
 		retry++;
 	}
@@ -743,9 +745,6 @@ retry_later:
 	if (!swapwrite)
 		current->flags &= ~PF_SWAPWRITE;
 
-	if (!list_empty(&failed))
-		list_splice(&failed, l);
-
 	return nr_failed + retry;
 }
 
-- 
cgit v1.2.3-71-gd317


From 45b07ef31d1182d2cfde7711327e3afb268bb1ac Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:00:56 -0800
Subject: [PATCH] cpusets: swap migration interface

Add a boolean "memory_migrate" to each cpuset, represented by a file
containing "0" or "1" in each directory below /dev/cpuset.

It defaults to false (file contains "0").  It can be set true by writing
"1" to the file.

If true, then anytime that a task is attached to the cpuset so marked, the
pages of that task will be moved to that cpuset, preserving, to the extent
practical, the cpuset-relative placement of the pages.

Also anytime that a cpuset so marked has its memory placement changed (by
writing to its "mems" file), the tasks in that cpuset will have their pages
moved to the cpusets new nodes, preserving, to the extent practical, the
cpuset-relative placement of the moved pages.

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/cpusets.txt | 25 +++++++++++++++++++++++++
 include/linux/mempolicy.h |  7 +++++++
 kernel/cpuset.c           | 38 ++++++++++++++++++++++++++++++++++++--
 3 files changed, 68 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index a09a8eb80665..e2d9afc30d2d 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -192,6 +192,7 @@ containing the following files describing that cpuset:
 
  - cpus: list of CPUs in that cpuset
  - mems: list of Memory Nodes in that cpuset
+ - memory_migrate flag: if set, move pages to cpusets nodes
  - cpu_exclusive flag: is cpu placement exclusive?
  - mem_exclusive flag: is memory placement exclusive?
  - tasks: list of tasks (by pid) attached to that cpuset
@@ -277,6 +278,30 @@ rewritten to the 'tasks' file of its cpuset.  This is done to avoid
 impacting the scheduler code in the kernel with a check for changes
 in a tasks processor placement.
 
+Normally, once a page is allocated (given a physical page
+of main memory) then that page stays on whatever node it
+was allocated, so long as it remains allocated, even if the
+cpusets memory placement policy 'mems' subsequently changes.
+If the cpuset flag file 'memory_migrate' is set true, then when
+tasks are attached to that cpuset, any pages that task had
+allocated to it on nodes in its previous cpuset are migrated
+to the tasks new cpuset.  Depending on the implementation,
+this migration may either be done by swapping the page out,
+so that the next time the page is referenced, it will be paged
+into the tasks new cpuset, usually on the node where it was
+referenced, or this migration may be done by directly copying
+the pages from the tasks previous cpuset to the new cpuset,
+where possible to the same node, relative to the new cpuset,
+as the node that held the page, relative to the old cpuset.
+Also if 'memory_migrate' is set true, then if that cpusets
+'mems' file is modified, pages allocated to tasks in that
+cpuset, that were on nodes in the previous setting of 'mems',
+will be moved to nodes in the new setting of 'mems.'  Again,
+depending on the implementation, this might be done by swapping,
+or by direct copying.  In either case, pages that were not in
+the tasks prior cpuset, or in the cpusets prior 'mems' setting,
+will not be moved.
+
 There is an exception to the above.  If hotplug functionality is used
 to remove all the CPUs that are currently assigned to a cpuset,
 then the kernel will automatically update the cpus_allowed of all
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 3e61e829681d..66247eff24a0 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -235,6 +235,13 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 	return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER);
 }
 
+static inline int do_migrate_pages(struct mm_struct *mm,
+			const nodemask_t *from_nodes,
+			const nodemask_t *to_nodes, int flags)
+{
+	return 0;
+}
+
 static inline void check_highest_zone(int k)
 {
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7430640f9816..f63383e01ec7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -87,6 +87,7 @@ struct cpuset {
 typedef enum {
 	CS_CPU_EXCLUSIVE,
 	CS_MEM_EXCLUSIVE,
+	CS_MEMORY_MIGRATE,
 	CS_REMOVED,
 	CS_NOTIFY_ON_RELEASE
 } cpuset_flagbits_t;
@@ -112,6 +113,11 @@ static inline int notify_on_release(const struct cpuset *cs)
 	return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
 }
 
+static inline int is_memory_migrate(const struct cpuset *cs)
+{
+	return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+}
+
 /*
  * Increment this atomic integer everytime any cpuset changes its
  * mems_allowed value.  Users of cpusets can track this generation
@@ -602,16 +608,24 @@ static void refresh_mems(void)
 	if (current->cpuset_mems_generation != my_cpusets_mem_gen) {
 		struct cpuset *cs;
 		nodemask_t oldmem = current->mems_allowed;
+		int migrate;
 
 		down(&callback_sem);
 		task_lock(current);
 		cs = current->cpuset;
+		migrate = is_memory_migrate(cs);
 		guarantee_online_mems(cs, &current->mems_allowed);
 		current->cpuset_mems_generation = cs->mems_generation;
 		task_unlock(current);
 		up(&callback_sem);
-		if (!nodes_equal(oldmem, current->mems_allowed))
+		if (!nodes_equal(oldmem, current->mems_allowed)) {
 			numa_policy_rebind(&oldmem, &current->mems_allowed);
+			if (migrate) {
+				do_migrate_pages(current->mm, &oldmem,
+					&current->mems_allowed,
+					MPOL_MF_MOVE_ALL);
+			}
+		}
 	}
 }
 
@@ -795,7 +809,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
- *						CS_NOTIFY_ON_RELEASE)
+ *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE)
  * cs:	the cpuset to update
  * buf:	the buffer where we read the 0 or 1
  *
@@ -848,6 +862,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	struct task_struct *tsk;
 	struct cpuset *oldcs;
 	cpumask_t cpus;
+	nodemask_t from, to;
 
 	if (sscanf(pidbuf, "%d", &pid) != 1)
 		return -EIO;
@@ -893,7 +908,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	guarantee_online_cpus(cs, &cpus);
 	set_cpus_allowed(tsk, cpus);
 
+	from = oldcs->mems_allowed;
+	to = cs->mems_allowed;
+
 	up(&callback_sem);
+	if (is_memory_migrate(cs))
+		do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
 	put_task_struct(tsk);
 	if (atomic_dec_and_test(&oldcs->count))
 		check_for_release(oldcs, ppathbuf);
@@ -905,6 +925,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 typedef enum {
 	FILE_ROOT,
 	FILE_DIR,
+	FILE_MEMORY_MIGRATE,
 	FILE_CPULIST,
 	FILE_MEMLIST,
 	FILE_CPU_EXCLUSIVE,
@@ -960,6 +981,9 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	case FILE_NOTIFY_ON_RELEASE:
 		retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
 		break;
+	case FILE_MEMORY_MIGRATE:
+		retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
+		break;
 	case FILE_TASKLIST:
 		retval = attach_task(cs, buffer, &pathbuf);
 		break;
@@ -1060,6 +1084,9 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	case FILE_NOTIFY_ON_RELEASE:
 		*s++ = notify_on_release(cs) ? '1' : '0';
 		break;
+	case FILE_MEMORY_MIGRATE:
+		*s++ = is_memory_migrate(cs) ? '1' : '0';
+		break;
 	default:
 		retval = -EINVAL;
 		goto out;
@@ -1408,6 +1435,11 @@ static struct cftype cft_notify_on_release = {
 	.private = FILE_NOTIFY_ON_RELEASE,
 };
 
+static struct cftype cft_memory_migrate = {
+	.name = "memory_migrate",
+	.private = FILE_MEMORY_MIGRATE,
+};
+
 static int cpuset_populate_dir(struct dentry *cs_dentry)
 {
 	int err;
@@ -1422,6 +1454,8 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
 		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0)
+		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
 		return err;
 	return 0;
-- 
cgit v1.2.3-71-gd317


From 152194aaa6266d71dfee57882a23def339ef17a4 Mon Sep 17 00:00:00 2001
From: Avishay Traeger <atraeger@cs.sunysb.edu>
Date: Sun, 8 Jan 2006 01:00:58 -0800
Subject: [PATCH] set_page_count() macro safety

Fix set_page_count() macro to handle complex arguments.

Signed-off-by: Avishay Traeger <atraeger@cs.sunysb.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 83c651f25188..7ff54242c5d7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -308,7 +308,7 @@ struct page {
  */
 #define get_page_testone(p)	atomic_inc_and_test(&(p)->_count)
 
-#define set_page_count(p,v) 	atomic_set(&(p)->_count, v - 1)
+#define set_page_count(p,v) 	atomic_set(&(p)->_count, (v) - 1)
 #define __put_page(p)		atomic_dec(&(p)->_count)
 
 extern void FASTCALL(__page_cache_release(struct page *));
-- 
cgit v1.2.3-71-gd317


From 48fce3429df84a94766fbbc845fa8450d0715b48 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:01:03 -0800
Subject: [PATCH] mempolicies: unexport get_vma_policy()

Since the numa_maps functionality is now in mempolicy.c we no longer need to
export get_vma_policy().

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h | 3 ---
 mm/mempolicy.c            | 4 ++--
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 66247eff24a0..05fddd5bee5d 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -144,9 +144,6 @@ void mpol_free_shared_policy(struct shared_policy *p);
 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 					    unsigned long idx);
 
-struct mempolicy *get_vma_policy(struct task_struct *task,
-			struct vm_area_struct *vma, unsigned long addr);
-
 extern void numa_default_policy(void);
 extern void numa_policy_init(void);
 extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4c0510e9e7f6..4b077ec6c005 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -935,8 +935,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 #endif
 
 /* Return effective policy for a VMA */
-struct mempolicy *
-get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
+static struct mempolicy * get_vma_policy(struct task_struct *task,
+		struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = task->mempolicy;
 
-- 
cgit v1.2.3-71-gd317


From 22fc6eccbf4ce4eb6265e6ada7b50a7b9cc57d05 Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Sun, 8 Jan 2006 01:01:27 -0800
Subject: [PATCH] Change maxaligned_in_smp alignemnt macros to
 internodealigned_in_smp macros

____cacheline_maxaligned_in_smp is currently used to align critical structures
and avoid false sharing.  It uses per-arch L1_CACHE_SHIFT_MAX and people find
L1_CACHE_SHIFT_MAX useless.

However, we have been using ____cacheline_maxaligned_in_smp to align
structures on the internode cacheline size.  As per Andi's suggestion,
following patch kills ____cacheline_maxaligned_in_smp and introduces
INTERNODE_CACHE_SHIFT, which defaults to L1_CACHE_SHIFT for all arches.
Arches needing L3/Internode cacheline alignment can define
INTERNODE_CACHE_SHIFT in the arch asm/cache.h.  Patch replaces
____cacheline_maxaligned_in_smp with ____cacheline_internodealigned_in_smp

With this patch, L1_CACHE_SHIFT_MAX can be killed

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/init_task.c   |  2 +-
 arch/i386/kernel/irq.c         |  2 +-
 arch/x86_64/kernel/init_task.c |  2 +-
 include/linux/cache.h          | 17 +++++++++++++----
 include/linux/ide.h            |  2 +-
 include/linux/mmzone.h         |  4 ++--
 include/linux/rcupdate.h       |  2 +-
 kernel/rcupdate.c              |  4 ++--
 mm/sparse.c                    |  4 ++--
 9 files changed, 24 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c
index 9caa8e8db80c..cff95d10a4d8 100644
--- a/arch/i386/kernel/init_task.c
+++ b/arch/i386/kernel/init_task.c
@@ -42,5 +42,5 @@ EXPORT_SYMBOL(init_task);
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
  * no more per-task TSS's.
  */ 
-DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp = INIT_TSS;
+DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
 
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 1a201a932865..f3a9c78c4a24 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -19,7 +19,7 @@
 #include <linux/cpu.h>
 #include <linux/delay.h>
 
-DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_maxaligned_in_smp;
+DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
 EXPORT_PER_CPU_SYMBOL(irq_stat);
 
 #ifndef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c
index e0ba5c1043fd..ce31d904d601 100644
--- a/arch/x86_64/kernel/init_task.c
+++ b/arch/x86_64/kernel/init_task.c
@@ -44,6 +44,6 @@ EXPORT_SYMBOL(init_task);
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */ 
-DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp = INIT_TSS;
+DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
 
 #define ALIGN_TO_4K __attribute__((section(".data.init_task")))
diff --git a/include/linux/cache.h b/include/linux/cache.h
index 0b7ecf3af78a..ffe52210fc4f 100644
--- a/include/linux/cache.h
+++ b/include/linux/cache.h
@@ -45,12 +45,21 @@
 #endif /* CONFIG_SMP */
 #endif
 
-#if !defined(____cacheline_maxaligned_in_smp)
+/*
+ * The maximum alignment needed for some critical structures
+ * These could be inter-node cacheline sizes/L3 cacheline
+ * size etc.  Define this in asm/cache.h for your arch
+ */
+#ifndef INTERNODE_CACHE_SHIFT
+#define INTERNODE_CACHE_SHIFT L1_CACHE_SHIFT
+#endif
+
+#if !defined(____cacheline_internodealigned_in_smp)
 #if defined(CONFIG_SMP)
-#define ____cacheline_maxaligned_in_smp \
-	__attribute__((__aligned__(1 << (L1_CACHE_SHIFT_MAX))))
+#define ____cacheline_internodealigned_in_smp \
+	__attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT))))
 #else
-#define ____cacheline_maxaligned_in_smp
+#define ____cacheline_internodealigned_in_smp
 #endif
 #endif
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 7b6a6a58e465..4dd6694963c0 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -801,7 +801,7 @@ typedef struct hwif_s {
 	unsigned dma;
 
 	void (*led_act)(void *data, int rw);
-} ____cacheline_maxaligned_in_smp ide_hwif_t;
+} ____cacheline_internodealigned_in_smp ide_hwif_t;
 
 /*
  *  internal ide interrupt handler type
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2a89c132ba9c..7e4ae6ab1977 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -38,7 +38,7 @@ struct pglist_data;
 #if defined(CONFIG_SMP)
 struct zone_padding {
 	char x[0];
-} ____cacheline_maxaligned_in_smp;
+} ____cacheline_internodealigned_in_smp;
 #define ZONE_PADDING(name)	struct zone_padding name;
 #else
 #define ZONE_PADDING(name)
@@ -233,7 +233,7 @@ struct zone {
 	 * rarely used fields:
 	 */
 	char			*name;
-} ____cacheline_maxaligned_in_smp;
+} ____cacheline_internodealigned_in_smp;
 
 
 /*
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index a471f3bb713e..51747cd88d1a 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -65,7 +65,7 @@ struct rcu_ctrlblk {
 	long	cur;		/* Current batch number.                      */
 	long	completed;	/* Number of the last completed batch         */
 	int	next_pending;	/* Is the next batch already waiting?         */
-} ____cacheline_maxaligned_in_smp;
+} ____cacheline_internodealigned_in_smp;
 
 /* Is batch a before batch b ? */
 static inline int rcu_batch_before(long a, long b)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 48d3bce465b8..c9afc61240e4 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -61,9 +61,9 @@ struct rcu_state {
 	                              /* for current batch to proceed.        */
 };
 
-static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp =
+static struct rcu_state rcu_state ____cacheline_internodealigned_in_smp =
 	  {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
-static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp =
+static struct rcu_state rcu_bh_state ____cacheline_internodealigned_in_smp =
 	  {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
 
 DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
diff --git a/mm/sparse.c b/mm/sparse.c
index 72079b538e2d..0a51f36ba3a1 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -18,10 +18,10 @@
  */
 #ifdef CONFIG_SPARSEMEM_EXTREME
 struct mem_section *mem_section[NR_SECTION_ROOTS]
-	____cacheline_maxaligned_in_smp;
+	____cacheline_internodealigned_in_smp;
 #else
 struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
-	____cacheline_maxaligned_in_smp;
+	____cacheline_internodealigned_in_smp;
 #endif
 EXPORT_SYMBOL(mem_section);
 
-- 
cgit v1.2.3-71-gd317


From e56d090310d7625ecb43a1eeebd479f04affb48b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 8 Jan 2006 01:01:37 -0800
Subject: [PATCH] RCU signal handling

RCU tasklist_lock and RCU signal handling: send signals RCU-read-locked
instead of tasklist_lock read-locked.  This is a scalability improvement on
SMP and a preemption-latency improvement under PREEMPT_RCU.

Signed-off-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: William Irwin <wli@holomorphy.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c             |  4 +--
 include/linux/sched.h | 32 +++++++++++++++--
 kernel/exit.c         |  1 -
 kernel/fork.c         | 10 +++++-
 kernel/pid.c          | 22 ++++++------
 kernel/rcupdate.c     |  1 +
 kernel/sched.c        |  7 ++++
 kernel/signal.c       | 97 +++++++++++++++++++++++++++++++++++++++++++--------
 8 files changed, 143 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index e75a9548da8e..e9650cd22a3b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -760,7 +760,7 @@ no_thread_group:
 		spin_lock(&oldsighand->siglock);
 		spin_lock(&newsighand->siglock);
 
-		current->sighand = newsighand;
+		rcu_assign_pointer(current->sighand, newsighand);
 		recalc_sigpending();
 
 		spin_unlock(&newsighand->siglock);
@@ -768,7 +768,7 @@ no_thread_group:
 		write_unlock_irq(&tasklist_lock);
 
 		if (atomic_dec_and_test(&oldsighand->count))
-			kmem_cache_free(sighand_cachep, oldsighand);
+			sighand_free(oldsighand);
 	}
 
 	BUG_ON(!thread_group_leader(current));
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a74662077d60..a6af77e9b4cf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -34,6 +34,7 @@
 #include <linux/percpu.h>
 #include <linux/topology.h>
 #include <linux/seccomp.h>
+#include <linux/rcupdate.h>
 
 #include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
 
@@ -350,8 +351,16 @@ struct sighand_struct {
 	atomic_t		count;
 	struct k_sigaction	action[_NSIG];
 	spinlock_t		siglock;
+	struct rcu_head		rcu;
 };
 
+extern void sighand_free_cb(struct rcu_head *rhp);
+
+static inline void sighand_free(struct sighand_struct *sp)
+{
+	call_rcu(&sp->rcu, sighand_free_cb);
+}
+
 /*
  * NOTE! "signal_struct" does not have it's own
  * locking, because a shared signal_struct always
@@ -844,6 +853,7 @@ struct task_struct {
 	int cpuset_mems_generation;
 #endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
+	struct rcu_head rcu;
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
@@ -867,8 +877,26 @@ static inline int pid_alive(struct task_struct *p)
 extern void free_task(struct task_struct *tsk);
 extern void __put_task_struct(struct task_struct *tsk);
 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
-#define put_task_struct(tsk) \
-do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
+
+static inline int get_task_struct_rcu(struct task_struct *t)
+{
+	int oldusage;
+
+	do {
+		oldusage = atomic_read(&t->usage);
+		if (oldusage == 0)
+			return 0;
+	} while (cmpxchg(&t->usage.counter, oldusage, oldusage+1) != oldusage);
+	return 1;
+}
+
+extern void __put_task_struct_cb(struct rcu_head *rhp);
+
+static inline void put_task_struct(struct task_struct *t)
+{
+	if (atomic_dec_and_test(&t->usage))
+		call_rcu(&t->rcu, __put_task_struct_cb);
+}
 
 /*
  * Per process flags
diff --git a/kernel/exit.c b/kernel/exit.c
index ee515683b92d..c73a7eb26de3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,7 +72,6 @@ repeat:
 		__ptrace_unlink(p);
 	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
 	__exit_signal(p);
-	__exit_sighand(p);
 	/*
 	 * Note that the fastpath in sys_times depends on __exit_signal having
 	 * updated the counters before a task is removed from the tasklist of
diff --git a/kernel/fork.c b/kernel/fork.c
index fb8572a42297..7fe3adfa65cb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -743,6 +743,14 @@ int unshare_files(void)
 
 EXPORT_SYMBOL(unshare_files);
 
+void sighand_free_cb(struct rcu_head *rhp)
+{
+	struct sighand_struct *sp;
+
+	sp = container_of(rhp, struct sighand_struct, rcu);
+	kmem_cache_free(sighand_cachep, sp);
+}
+
 static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct sighand_struct *sig;
@@ -752,7 +760,7 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
 		return 0;
 	}
 	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
-	tsk->sighand = sig;
+	rcu_assign_pointer(tsk->sighand, sig);
 	if (!sig)
 		return -ENOMEM;
 	spin_lock_init(&sig->siglock);
diff --git a/kernel/pid.c b/kernel/pid.c
index edba31c681ac..1acc07246991 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -136,7 +136,7 @@ struct pid * fastcall find_pid(enum pid_type type, int nr)
 	struct hlist_node *elem;
 	struct pid *pid;
 
-	hlist_for_each_entry(pid, elem,
+	hlist_for_each_entry_rcu(pid, elem,
 			&pid_hash[type][pid_hashfn(nr)], pid_chain) {
 		if (pid->nr == nr)
 			return pid;
@@ -150,15 +150,15 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
 
 	task_pid = &task->pids[type];
 	pid = find_pid(type, nr);
+	task_pid->nr = nr;
 	if (pid == NULL) {
-		hlist_add_head(&task_pid->pid_chain,
-				&pid_hash[type][pid_hashfn(nr)]);
 		INIT_LIST_HEAD(&task_pid->pid_list);
+		hlist_add_head_rcu(&task_pid->pid_chain,
+				   &pid_hash[type][pid_hashfn(nr)]);
 	} else {
 		INIT_HLIST_NODE(&task_pid->pid_chain);
-		list_add_tail(&task_pid->pid_list, &pid->pid_list);
+		list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
 	}
-	task_pid->nr = nr;
 
 	return 0;
 }
@@ -170,20 +170,20 @@ static fastcall int __detach_pid(task_t *task, enum pid_type type)
 
 	pid = &task->pids[type];
 	if (!hlist_unhashed(&pid->pid_chain)) {
-		hlist_del(&pid->pid_chain);
 
-		if (list_empty(&pid->pid_list))
+		if (list_empty(&pid->pid_list)) {
 			nr = pid->nr;
-		else {
+			hlist_del_rcu(&pid->pid_chain);
+		} else {
 			pid_next = list_entry(pid->pid_list.next,
 						struct pid, pid_list);
 			/* insert next pid from pid_list to hash */
-			hlist_add_head(&pid_next->pid_chain,
-				&pid_hash[type][pid_hashfn(pid_next->nr)]);
+			hlist_replace_rcu(&pid->pid_chain,
+					  &pid_next->pid_chain);
 		}
 	}
 
-	list_del(&pid->pid_list);
+	list_del_rcu(&pid->pid_list);
 	pid->nr = 0;
 
 	return nr;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c9afc61240e4..0a669bd2f6d1 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/smp.h>
+#include <linux/rcupdate.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
 #include <asm/atomic.h>
diff --git a/kernel/sched.c b/kernel/sched.c
index 6f46c94cc29e..92733091154c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -176,6 +176,13 @@ static unsigned int task_timeslice(task_t *p)
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)	\
 				< (long long) (sd)->cache_hot_time)
 
+void __put_task_struct_cb(struct rcu_head *rhp)
+{
+	__put_task_struct(container_of(rhp, struct task_struct, rcu));
+}
+
+EXPORT_SYMBOL_GPL(__put_task_struct_cb);
+
 /*
  * These are the runqueue data structures:
  */
diff --git a/kernel/signal.c b/kernel/signal.c
index d7611f189ef7..64737c72dadd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -329,13 +329,20 @@ void __exit_sighand(struct task_struct *tsk)
 	/* Ok, we're done with the signal handlers */
 	tsk->sighand = NULL;
 	if (atomic_dec_and_test(&sighand->count))
-		kmem_cache_free(sighand_cachep, sighand);
+		sighand_free(sighand);
 }
 
 void exit_sighand(struct task_struct *tsk)
 {
 	write_lock_irq(&tasklist_lock);
-	__exit_sighand(tsk);
+	rcu_read_lock();
+	if (tsk->sighand != NULL) {
+		struct sighand_struct *sighand = rcu_dereference(tsk->sighand);
+		spin_lock(&sighand->siglock);
+		__exit_sighand(tsk);
+		spin_unlock(&sighand->siglock);
+	}
+	rcu_read_unlock();
 	write_unlock_irq(&tasklist_lock);
 }
 
@@ -345,12 +352,14 @@ void exit_sighand(struct task_struct *tsk)
 void __exit_signal(struct task_struct *tsk)
 {
 	struct signal_struct * sig = tsk->signal;
-	struct sighand_struct * sighand = tsk->sighand;
+	struct sighand_struct * sighand;
 
 	if (!sig)
 		BUG();
 	if (!atomic_read(&sig->count))
 		BUG();
+	rcu_read_lock();
+	sighand = rcu_dereference(tsk->sighand);
 	spin_lock(&sighand->siglock);
 	posix_cpu_timers_exit(tsk);
 	if (atomic_dec_and_test(&sig->count)) {
@@ -358,6 +367,7 @@ void __exit_signal(struct task_struct *tsk)
 		if (tsk == sig->curr_target)
 			sig->curr_target = next_thread(tsk);
 		tsk->signal = NULL;
+		__exit_sighand(tsk);
 		spin_unlock(&sighand->siglock);
 		flush_sigqueue(&sig->shared_pending);
 	} else {
@@ -389,9 +399,11 @@ void __exit_signal(struct task_struct *tsk)
 		sig->nvcsw += tsk->nvcsw;
 		sig->nivcsw += tsk->nivcsw;
 		sig->sched_time += tsk->sched_time;
+		__exit_sighand(tsk);
 		spin_unlock(&sighand->siglock);
 		sig = NULL;	/* Marker for below.  */
 	}
+	rcu_read_unlock();
 	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
 	flush_sigqueue(&tsk->pending);
 	if (sig) {
@@ -1080,18 +1092,28 @@ void zap_other_threads(struct task_struct *p)
 }
 
 /*
- * Must be called with the tasklist_lock held for reading!
+ * Must be called under rcu_read_lock() or with tasklist_lock read-held.
  */
 int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
 	unsigned long flags;
+	struct sighand_struct *sp;
 	int ret;
 
+retry:
 	ret = check_kill_permission(sig, info, p);
-	if (!ret && sig && p->sighand) {
-		spin_lock_irqsave(&p->sighand->siglock, flags);
+	if (!ret && sig && (sp = p->sighand)) {
+		if (!get_task_struct_rcu(p))
+			return -ESRCH;
+		spin_lock_irqsave(&sp->siglock, flags);
+		if (p->sighand != sp) {
+			spin_unlock_irqrestore(&sp->siglock, flags);
+			put_task_struct(p);
+			goto retry;
+		}
 		ret = __group_send_sig_info(sig, info, p);
-		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+		spin_unlock_irqrestore(&sp->siglock, flags);
+		put_task_struct(p);
 	}
 
 	return ret;
@@ -1136,14 +1158,21 @@ int
 kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 {
 	int error;
+	int acquired_tasklist_lock = 0;
 	struct task_struct *p;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
+	if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) {
+		read_lock(&tasklist_lock);
+		acquired_tasklist_lock = 1;
+	}
 	p = find_task_by_pid(pid);
 	error = -ESRCH;
 	if (p)
 		error = group_send_sig_info(sig, info, p);
-	read_unlock(&tasklist_lock);
+	if (unlikely(acquired_tasklist_lock))
+		read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	return error;
 }
 
@@ -1355,16 +1384,54 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 {
 	unsigned long flags;
 	int ret = 0;
+	struct sighand_struct *sh;
 
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
-	read_lock(&tasklist_lock);
+
+	/*
+	 * The rcu based delayed sighand destroy makes it possible to
+	 * run this without tasklist lock held. The task struct itself
+	 * cannot go away as create_timer did get_task_struct().
+	 *
+	 * We return -1, when the task is marked exiting, so
+	 * posix_timer_event can redirect it to the group leader
+	 */
+	rcu_read_lock();
 
 	if (unlikely(p->flags & PF_EXITING)) {
 		ret = -1;
 		goto out_err;
 	}
 
-	spin_lock_irqsave(&p->sighand->siglock, flags);
+retry:
+	sh = rcu_dereference(p->sighand);
+
+	spin_lock_irqsave(&sh->siglock, flags);
+	if (p->sighand != sh) {
+		/* We raced with exec() in a multithreaded process... */
+		spin_unlock_irqrestore(&sh->siglock, flags);
+		goto retry;
+	}
+
+	/*
+	 * We do the check here again to handle the following scenario:
+	 *
+	 * CPU 0		CPU 1
+	 * send_sigqueue
+	 * check PF_EXITING
+	 * interrupt		exit code running
+	 *			__exit_signal
+	 *			lock sighand->siglock
+	 *			unlock sighand->siglock
+	 * lock sh->siglock
+	 * add(tsk->pending) 	flush_sigqueue(tsk->pending)
+	 *
+	 */
+
+	if (unlikely(p->flags & PF_EXITING)) {
+		ret = -1;
+		goto out;
+	}
 
 	if (unlikely(!list_empty(&q->list))) {
 		/*
@@ -1388,9 +1455,9 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 		signal_wake_up(p, sig == SIGKILL);
 
 out:
-	spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	spin_unlock_irqrestore(&sh->siglock, flags);
 out_err:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	return ret;
 }
@@ -1402,7 +1469,9 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 	int ret = 0;
 
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+
 	read_lock(&tasklist_lock);
+	/* Since it_lock is held, p->sighand cannot be NULL. */
 	spin_lock_irqsave(&p->sighand->siglock, flags);
 	handle_stop_signal(sig, p);
 
@@ -1436,7 +1505,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 out:
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
 	read_unlock(&tasklist_lock);
-	return(ret);
+	return ret;
 }
 
 /*
-- 
cgit v1.2.3-71-gd317


From d4829cd5b4bd1ea58ba1bebad44d562f4027c290 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@us.ibm.com>
Date: Sun, 8 Jan 2006 01:01:39 -0800
Subject: [PATCH] remove get_task_struct_rcu()

The latest set of signal-RCU patches does not use get_task_struct_rcu().
Attached is a patch that removes it.

Signed-off-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a6af77e9b4cf..20bd70749104 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -878,18 +878,6 @@ extern void free_task(struct task_struct *tsk);
 extern void __put_task_struct(struct task_struct *tsk);
 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
 
-static inline int get_task_struct_rcu(struct task_struct *t)
-{
-	int oldusage;
-
-	do {
-		oldusage = atomic_read(&t->usage);
-		if (oldusage == 0)
-			return 0;
-	} while (cmpxchg(&t->usage.counter, oldusage, oldusage+1) != oldusage);
-	return 1;
-}
-
 extern void __put_task_struct_cb(struct rcu_head *rhp);
 
 static inline void put_task_struct(struct task_struct *t)
-- 
cgit v1.2.3-71-gd317


From 10cef6029502915bdb3cf0821d425cf9dc30c817 Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Sun, 8 Jan 2006 01:01:45 -0800
Subject: [PATCH] slob: introduce the SLOB allocator

configurable replacement for slab allocator

This adds a CONFIG_SLAB option under CONFIG_EMBEDDED.  When CONFIG_SLAB is
disabled, the kernel falls back to using the 'SLOB' allocator.

SLOB is a traditional K&R/UNIX allocator with a SLAB emulation layer,
similar to the original Linux kmalloc allocator that SLAB replaced.  It's
signicantly smaller code and is more memory efficient.  But like all
similar allocators, it scales poorly and suffers from fragmentation more
than SLAB, so it's only appropriate for small systems.

It's been tested extensively in the Linux-tiny tree.  I've also
stress-tested it with make -j 8 compiles on a 3G SMP+PREEMPT box (not
recommended).

Here's a comparison for otherwise identical builds, showing SLOB saving
nearly half a megabyte of RAM:

$ size vmlinux*
   text    data     bss     dec     hex filename
3336372  529360  190812 4056544  3de5e0 vmlinux-slab
3323208  527948  190684 4041840  3dac70 vmlinux-slob

$ size mm/{slab,slob}.o
   text    data     bss     dec     hex filename
  13221     752      48   14021    36c5 mm/slab.o
   1896      52       8    1956     7a4 mm/slob.o

/proc/meminfo:
                  SLAB          SLOB      delta
MemTotal:        27964 kB      27980 kB     +16 kB
MemFree:         24596 kB      25092 kB    +496 kB
Buffers:            36 kB         36 kB       0 kB
Cached:           1188 kB       1188 kB       0 kB
SwapCached:          0 kB          0 kB       0 kB
Active:            608 kB        600 kB      -8 kB
Inactive:          808 kB        812 kB      +4 kB
HighTotal:           0 kB          0 kB       0 kB
HighFree:            0 kB          0 kB       0 kB
LowTotal:        27964 kB      27980 kB     +16 kB
LowFree:         24596 kB      25092 kB    +496 kB
SwapTotal:           0 kB          0 kB       0 kB
SwapFree:            0 kB          0 kB       0 kB
Dirty:               4 kB         12 kB      +8 kB
Writeback:           0 kB          0 kB       0 kB
Mapped:            560 kB        556 kB      -4 kB
Slab:             1756 kB          0 kB   -1756 kB
CommitLimit:     13980 kB      13988 kB      +8 kB
Committed_AS:     4208 kB       4208 kB       0 kB
PageTables:         28 kB         28 kB       0 kB
VmallocTotal:  1007312 kB    1007312 kB       0 kB
VmallocUsed:        48 kB         48 kB       0 kB
VmallocChunk:  1007264 kB    1007264 kB       0 kB

(this work has been sponsored in part by CELF)

From: Ingo Molnar <mingo@elte.hu>

   Fix 32-bitness bugs in mm/slob.c.

Signed-off-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/proc_misc.c  |   4 +
 include/linux/slab.h |  35 +++++
 init/Kconfig         |  13 ++
 mm/Makefile          |   4 +-
 mm/slob.c            | 385 +++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 440 insertions(+), 1 deletion(-)
 create mode 100644 mm/slob.c

(limited to 'include/linux')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 5b6b0b6038a7..63bf6c00fa0c 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -323,6 +323,7 @@ static struct file_operations proc_modules_operations = {
 };
 #endif
 
+#ifdef CONFIG_SLAB
 extern struct seq_operations slabinfo_op;
 extern ssize_t slabinfo_write(struct file *, const char __user *, size_t, loff_t *);
 static int slabinfo_open(struct inode *inode, struct file *file)
@@ -336,6 +337,7 @@ static struct file_operations proc_slabinfo_operations = {
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 };
+#endif
 
 static int show_stat(struct seq_file *p, void *v)
 {
@@ -600,7 +602,9 @@ void __init proc_misc_init(void)
 	create_seq_entry("partitions", 0, &proc_partitions_operations);
 	create_seq_entry("stat", 0, &proc_stat_operations);
 	create_seq_entry("interrupts", 0, &proc_interrupts_operations);
+#ifdef CONFIG_SLAB
 	create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
+#endif
 	create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
 	create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations);
 	create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index d1ea4051b996..1fb77a9cc148 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -53,6 +53,8 @@ typedef struct kmem_cache kmem_cache_t;
 #define SLAB_CTOR_ATOMIC	0x002UL		/* tell constructor it can't sleep */
 #define	SLAB_CTOR_VERIFY	0x004UL		/* tell constructor it's a verify call */
 
+#ifndef CONFIG_SLOB
+
 /* prototypes */
 extern void __init kmem_cache_init(void);
 
@@ -134,6 +136,39 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 extern int FASTCALL(kmem_cache_reap(int));
 extern int FASTCALL(kmem_ptr_validate(kmem_cache_t *cachep, void *ptr));
 
+#else /* CONFIG_SLOB */
+
+/* SLOB allocator routines */
+
+void kmem_cache_init(void);
+struct kmem_cache *kmem_find_general_cachep(size_t, gfp_t gfpflags);
+struct kmem_cache *kmem_cache_create(const char *c, size_t, size_t,
+	unsigned long,
+	void (*)(void *, struct kmem_cache *, unsigned long),
+	void (*)(void *, struct kmem_cache *, unsigned long));
+int kmem_cache_destroy(struct kmem_cache *c);
+void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags);
+void kmem_cache_free(struct kmem_cache *c, void *b);
+const char *kmem_cache_name(struct kmem_cache *);
+void *kmalloc(size_t size, gfp_t flags);
+void *kzalloc(size_t size, gfp_t flags);
+void kfree(const void *m);
+unsigned int ksize(const void *m);
+unsigned int kmem_cache_size(struct kmem_cache *c);
+
+static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
+{
+	return kzalloc(n * size, flags);
+}
+
+#define kmem_cache_shrink(d) (0)
+#define kmem_cache_reap(a)
+#define kmem_ptr_validate(a, b) (0)
+#define kmem_cache_alloc_node(c, f, n) kmem_cache_alloc(c, f)
+#define kmalloc_node(s, f, n) kmalloc(s, f)
+
+#endif /* CONFIG_SLOB */
+
 /* System wide caches */
 extern kmem_cache_t	*vm_area_cachep;
 extern kmem_cache_t	*names_cachep;
diff --git a/init/Kconfig b/init/Kconfig
index ba42f3793a84..0c9932f9f06b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -380,6 +380,15 @@ config CC_ALIGN_JUMPS
 	  no dummy operations need be executed.
 	  Zero means use compiler's default.
 
+config SLAB
+	default y
+	bool "Use full SLAB allocator" if EMBEDDED
+	help
+	  Disabling this replaces the advanced SLAB allocator and
+	  kmalloc support with the drastically simpler SLOB allocator.
+	  SLOB is more space efficient but does not scale well and is
+	  more susceptible to fragmentation.
+
 endmenu		# General setup
 
 config TINY_SHMEM
@@ -391,6 +400,10 @@ config BASE_SMALL
 	default 0 if BASE_FULL
 	default 1 if !BASE_FULL
 
+config SLOB
+	default !SLAB
+	bool
+
 menu "Loadable module support"
 
 config MODULES
diff --git a/mm/Makefile b/mm/Makefile
index 74c85ddc9176..9aa03fa1dcc3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   page_alloc.o page-writeback.o pdflush.o \
-			   readahead.o slab.o swap.o truncate.o vmscan.o \
+			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
@@ -18,5 +18,7 @@ obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
+obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/slob.c b/mm/slob.c
new file mode 100644
index 000000000000..1c240c4b71d9
--- /dev/null
+++ b/mm/slob.c
@@ -0,0 +1,385 @@
+/*
+ * SLOB Allocator: Simple List Of Blocks
+ *
+ * Matt Mackall <mpm@selenic.com> 12/30/03
+ *
+ * How SLOB works:
+ *
+ * The core of SLOB is a traditional K&R style heap allocator, with
+ * support for returning aligned objects. The granularity of this
+ * allocator is 8 bytes on x86, though it's perhaps possible to reduce
+ * this to 4 if it's deemed worth the effort. The slob heap is a
+ * singly-linked list of pages from __get_free_page, grown on demand
+ * and allocation from the heap is currently first-fit.
+ *
+ * Above this is an implementation of kmalloc/kfree. Blocks returned
+ * from kmalloc are 8-byte aligned and prepended with a 8-byte header.
+ * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
+ * __get_free_pages directly so that it can return page-aligned blocks
+ * and keeps a linked list of such pages and their orders. These
+ * objects are detected in kfree() by their page alignment.
+ *
+ * SLAB is emulated on top of SLOB by simply calling constructors and
+ * destructors for every SLAB allocation. Objects are returned with
+ * the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is
+ * set, in which case the low-level allocator will fragment blocks to
+ * create the proper alignment. Again, objects of page-size or greater
+ * are allocated by calling __get_free_pages. As SLAB objects know
+ * their size, no separate size bookkeeping is necessary and there is
+ * essentially no allocation space overhead.
+ */
+
+#include <linux/config.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+
+struct slob_block {
+	int units;
+	struct slob_block *next;
+};
+typedef struct slob_block slob_t;
+
+#define SLOB_UNIT sizeof(slob_t)
+#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
+#define SLOB_ALIGN L1_CACHE_BYTES
+
+struct bigblock {
+	int order;
+	void *pages;
+	struct bigblock *next;
+};
+typedef struct bigblock bigblock_t;
+
+static slob_t arena = { .next = &arena, .units = 1 };
+static slob_t *slobfree = &arena;
+static bigblock_t *bigblocks;
+static DEFINE_SPINLOCK(slob_lock);
+static DEFINE_SPINLOCK(block_lock);
+
+static void slob_free(void *b, int size);
+
+static void *slob_alloc(size_t size, gfp_t gfp, int align)
+{
+	slob_t *prev, *cur, *aligned = 0;
+	int delta = 0, units = SLOB_UNITS(size);
+	unsigned long flags;
+
+	spin_lock_irqsave(&slob_lock, flags);
+	prev = slobfree;
+	for (cur = prev->next; ; prev = cur, cur = cur->next) {
+		if (align) {
+			aligned = (slob_t *)ALIGN((unsigned long)cur, align);
+			delta = aligned - cur;
+		}
+		if (cur->units >= units + delta) { /* room enough? */
+			if (delta) { /* need to fragment head to align? */
+				aligned->units = cur->units - delta;
+				aligned->next = cur->next;
+				cur->next = aligned;
+				cur->units = delta;
+				prev = cur;
+				cur = aligned;
+			}
+
+			if (cur->units == units) /* exact fit? */
+				prev->next = cur->next; /* unlink */
+			else { /* fragment */
+				prev->next = cur + units;
+				prev->next->units = cur->units - units;
+				prev->next->next = cur->next;
+				cur->units = units;
+			}
+
+			slobfree = prev;
+			spin_unlock_irqrestore(&slob_lock, flags);
+			return cur;
+		}
+		if (cur == slobfree) {
+			spin_unlock_irqrestore(&slob_lock, flags);
+
+			if (size == PAGE_SIZE) /* trying to shrink arena? */
+				return 0;
+
+			cur = (slob_t *)__get_free_page(gfp);
+			if (!cur)
+				return 0;
+
+			slob_free(cur, PAGE_SIZE);
+			spin_lock_irqsave(&slob_lock, flags);
+			cur = slobfree;
+		}
+	}
+}
+
+static void slob_free(void *block, int size)
+{
+	slob_t *cur, *b = (slob_t *)block;
+	unsigned long flags;
+
+	if (!block)
+		return;
+
+	if (size)
+		b->units = SLOB_UNITS(size);
+
+	/* Find reinsertion point */
+	spin_lock_irqsave(&slob_lock, flags);
+	for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next)
+		if (cur >= cur->next && (b > cur || b < cur->next))
+			break;
+
+	if (b + b->units == cur->next) {
+		b->units += cur->next->units;
+		b->next = cur->next->next;
+	} else
+		b->next = cur->next;
+
+	if (cur + cur->units == b) {
+		cur->units += b->units;
+		cur->next = b->next;
+	} else
+		cur->next = b;
+
+	slobfree = cur;
+
+	spin_unlock_irqrestore(&slob_lock, flags);
+}
+
+static int FASTCALL(find_order(int size));
+static int fastcall find_order(int size)
+{
+	int order = 0;
+	for ( ; size > 4096 ; size >>=1)
+		order++;
+	return order;
+}
+
+void *kmalloc(size_t size, gfp_t gfp)
+{
+	slob_t *m;
+	bigblock_t *bb;
+	unsigned long flags;
+
+	if (size < PAGE_SIZE - SLOB_UNIT) {
+		m = slob_alloc(size + SLOB_UNIT, gfp, 0);
+		return m ? (void *)(m + 1) : 0;
+	}
+
+	bb = slob_alloc(sizeof(bigblock_t), gfp, 0);
+	if (!bb)
+		return 0;
+
+	bb->order = find_order(size);
+	bb->pages = (void *)__get_free_pages(gfp, bb->order);
+
+	if (bb->pages) {
+		spin_lock_irqsave(&block_lock, flags);
+		bb->next = bigblocks;
+		bigblocks = bb;
+		spin_unlock_irqrestore(&block_lock, flags);
+		return bb->pages;
+	}
+
+	slob_free(bb, sizeof(bigblock_t));
+	return 0;
+}
+
+EXPORT_SYMBOL(kmalloc);
+
+void kfree(const void *block)
+{
+	bigblock_t *bb, **last = &bigblocks;
+	unsigned long flags;
+
+	if (!block)
+		return;
+
+	if (!((unsigned long)block & (PAGE_SIZE-1))) {
+		/* might be on the big block list */
+		spin_lock_irqsave(&block_lock, flags);
+		for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) {
+			if (bb->pages == block) {
+				*last = bb->next;
+				spin_unlock_irqrestore(&block_lock, flags);
+				free_pages((unsigned long)block, bb->order);
+				slob_free(bb, sizeof(bigblock_t));
+				return;
+			}
+		}
+		spin_unlock_irqrestore(&block_lock, flags);
+	}
+
+	slob_free((slob_t *)block - 1, 0);
+	return;
+}
+
+EXPORT_SYMBOL(kfree);
+
+unsigned int ksize(const void *block)
+{
+	bigblock_t *bb;
+	unsigned long flags;
+
+	if (!block)
+		return 0;
+
+	if (!((unsigned long)block & (PAGE_SIZE-1))) {
+		spin_lock_irqsave(&block_lock, flags);
+		for (bb = bigblocks; bb; bb = bb->next)
+			if (bb->pages == block) {
+				spin_unlock_irqrestore(&slob_lock, flags);
+				return PAGE_SIZE << bb->order;
+			}
+		spin_unlock_irqrestore(&block_lock, flags);
+	}
+
+	return ((slob_t *)block - 1)->units * SLOB_UNIT;
+}
+
+struct kmem_cache {
+	unsigned int size, align;
+	const char *name;
+	void (*ctor)(void *, struct kmem_cache *, unsigned long);
+	void (*dtor)(void *, struct kmem_cache *, unsigned long);
+};
+
+struct kmem_cache *kmem_cache_create(const char *name, size_t size,
+	size_t align, unsigned long flags,
+	void (*ctor)(void*, struct kmem_cache *, unsigned long),
+	void (*dtor)(void*, struct kmem_cache *, unsigned long))
+{
+	struct kmem_cache *c;
+
+	c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
+
+	if (c) {
+		c->name = name;
+		c->size = size;
+		c->ctor = ctor;
+		c->dtor = dtor;
+		/* ignore alignment unless it's forced */
+		c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
+		if (c->align < align)
+			c->align = align;
+	}
+
+	return c;
+}
+EXPORT_SYMBOL(kmem_cache_create);
+
+int kmem_cache_destroy(struct kmem_cache *c)
+{
+	slob_free(c, sizeof(struct kmem_cache));
+	return 0;
+}
+EXPORT_SYMBOL(kmem_cache_destroy);
+
+void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
+{
+	void *b;
+
+	if (c->size < PAGE_SIZE)
+		b = slob_alloc(c->size, flags, c->align);
+	else
+		b = (void *)__get_free_pages(flags, find_order(c->size));
+
+	if (c->ctor)
+		c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR);
+
+	return b;
+}
+EXPORT_SYMBOL(kmem_cache_alloc);
+
+void kmem_cache_free(struct kmem_cache *c, void *b)
+{
+	if (c->dtor)
+		c->dtor(b, c, 0);
+
+	if (c->size < PAGE_SIZE)
+		slob_free(b, c->size);
+	else
+		free_pages((unsigned long)b, find_order(c->size));
+}
+EXPORT_SYMBOL(kmem_cache_free);
+
+unsigned int kmem_cache_size(struct kmem_cache *c)
+{
+	return c->size;
+}
+EXPORT_SYMBOL(kmem_cache_size);
+
+const char *kmem_cache_name(struct kmem_cache *c)
+{
+	return c->name;
+}
+EXPORT_SYMBOL(kmem_cache_name);
+
+static struct timer_list slob_timer = TIMER_INITIALIZER(
+	(void (*)(unsigned long))kmem_cache_init, 0, 0);
+
+void kmem_cache_init(void)
+{
+	void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
+
+	if (p)
+		free_page((unsigned long)p);
+
+	mod_timer(&slob_timer, jiffies + HZ);
+}
+
+atomic_t slab_reclaim_pages = ATOMIC_INIT(0);
+EXPORT_SYMBOL(slab_reclaim_pages);
+
+#ifdef CONFIG_SMP
+
+void *__alloc_percpu(size_t size, size_t align)
+{
+	int i;
+	struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
+
+	if (!pdata)
+		return NULL;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
+		pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
+		if (!pdata->ptrs[i])
+			goto unwind_oom;
+		memset(pdata->ptrs[i], 0, size);
+	}
+
+	/* Catch derefs w/o wrappers */
+	return (void *) (~(unsigned long) pdata);
+
+unwind_oom:
+	while (--i >= 0) {
+		if (!cpu_possible(i))
+			continue;
+		kfree(pdata->ptrs[i]);
+	}
+	kfree(pdata);
+	return NULL;
+}
+EXPORT_SYMBOL(__alloc_percpu);
+
+void
+free_percpu(const void *objp)
+{
+	int i;
+	struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
+		kfree(p->ptrs[i]);
+	}
+	kfree(p);
+}
+EXPORT_SYMBOL(free_percpu);
+
+#endif
-- 
cgit v1.2.3-71-gd317


From 5966514db662fb24c9bb43226a80106bcffd51f8 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:01:47 -0800
Subject: [PATCH] cpuset: mempolicy one more nodemask conversion

Finish converting mm/mempolicy.c from bitmaps to nodemasks.  The previous
conversion had left one routine using bitmaps, since it involved a
corresponding change to kernel/cpuset.c

Fix that interface by replacing with a simple macro that calls nodes_subset(),
or if !CONFIG_CPUSET, returns (1).

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <christoph@lameter.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpuset.h |  5 +++--
 kernel/cpuset.c        | 10 ----------
 mm/mempolicy.c         |  5 ++---
 3 files changed, 5 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 6e2deef96b34..8b21786490ee 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -21,7 +21,8 @@ extern void cpuset_exit(struct task_struct *p);
 extern cpumask_t cpuset_cpus_allowed(const struct task_struct *p);
 void cpuset_init_current_mems_allowed(void);
 void cpuset_update_current_mems_allowed(void);
-void cpuset_restrict_to_mems_allowed(unsigned long *nodes);
+#define cpuset_nodes_subset_current_mems_allowed(nodes) \
+		nodes_subset((nodes), current->mems_allowed)
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
 extern int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask);
 extern int cpuset_excl_nodes_overlap(const struct task_struct *p);
@@ -42,7 +43,7 @@ static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p)
 
 static inline void cpuset_init_current_mems_allowed(void) {}
 static inline void cpuset_update_current_mems_allowed(void) {}
-static inline void cpuset_restrict_to_mems_allowed(unsigned long *nodes) {}
+#define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
 
 static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f63383e01ec7..6503c6da4c4f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1749,16 +1749,6 @@ done:
 		refresh_mems();
 }
 
-/**
- * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed
- * @nodes: pointer to a node bitmap that is and-ed with mems_allowed
- */
-void cpuset_restrict_to_mems_allowed(unsigned long *nodes)
-{
-	bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed),
-							MAX_NUMNODES);
-}
-
 /**
  * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed
  * @zl: the zonelist to be checked
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7051fe450e96..9dea2b8a7d48 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -387,10 +387,9 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
 	if (!nodes)
 		return 0;
 
-	/* Update current mems_allowed */
 	cpuset_update_current_mems_allowed();
-	/* Ignore nodes not set in current->mems_allowed */
-	cpuset_restrict_to_mems_allowed(nodes->bits);
+	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
+		return -EINVAL;
 	return mpol_check_policy(mode, nodes);
 }
 
-- 
cgit v1.2.3-71-gd317


From 3e0d98b9f1eb757fc98efc84e74e54a08308aa73 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:01:49 -0800
Subject: [PATCH] cpuset: memory pressure meter

Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.

This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.

This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.

This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure.  It's up to the batch
manager or other user code to decide what to do about it and take action.

==> Unless this feature is enabled by writing "1" to the special file
    /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
    code of __alloc_pages() for this metric reduces to simply noticing
    that the cpuset_memory_pressure_enabled flag is zero.  So only
    systems that enable this feature will compute the metric.

Why a per-cpuset, running average:

    Because this meter is per-cpuset, rather than per-task or mm, the
    system load imposed by a batch scheduler monitoring this metric is
    sharply reduced on large systems, because a scan of the tasklist can be
    avoided on each set of queries.

    Because this meter is a running average, instead of an accumulating
    counter, a batch scheduler can detect memory pressure with a single
    read, instead of having to read and accumulate results for a period of
    time.

    Because this meter is per-cpuset rather than per-task or mm, the
    batch scheduler can obtain the key information, memory pressure in a
    cpuset, with a single read, rather than having to query and accumulate
    results over all the (dynamically changing) set of tasks in the cpuset.

A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.

A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpuset.h |  11 +++
 kernel/cpuset.c        | 193 ++++++++++++++++++++++++++++++++++++++++++++++++-
 mm/page_alloc.c        |   1 +
 3 files changed, 203 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 8b21786490ee..736d73801cb6 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -26,6 +26,15 @@ void cpuset_update_current_mems_allowed(void);
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
 extern int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask);
 extern int cpuset_excl_nodes_overlap(const struct task_struct *p);
+
+#define cpuset_memory_pressure_bump() 				\
+	do {							\
+		if (cpuset_memory_pressure_enabled)		\
+			__cpuset_memory_pressure_bump();	\
+	} while (0)
+extern int cpuset_memory_pressure_enabled;
+extern void __cpuset_memory_pressure_bump(void);
+
 extern struct file_operations proc_cpuset_operations;
 extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 
@@ -60,6 +69,8 @@ static inline int cpuset_excl_nodes_overlap(const struct task_struct *p)
 	return 1;
 }
 
+static inline void cpuset_memory_pressure_bump(void) {}
+
 static inline char *cpuset_task_status_allowed(struct task_struct *task,
 							char *buffer)
 {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6503c6da4c4f..5a06fef669f8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -56,6 +56,15 @@
 
 #define CPUSET_SUPER_MAGIC 		0x27e0eb
 
+/* See "Frequency meter" comments, below. */
+
+struct fmeter {
+	int cnt;		/* unprocessed events count */
+	int val;		/* most recent output value */
+	time_t time;		/* clock (secs) when val computed */
+	spinlock_t lock;	/* guards read or write of above */
+};
+
 struct cpuset {
 	unsigned long flags;		/* "unsigned long" so bitops work */
 	cpumask_t cpus_allowed;		/* CPUs allowed to tasks in cpuset */
@@ -80,7 +89,9 @@ struct cpuset {
 	 * Copy of global cpuset_mems_generation as of the most
 	 * recent time this cpuset changed its mems_allowed.
 	 */
-	 int mems_generation;
+	int mems_generation;
+
+	struct fmeter fmeter;		/* memory_pressure filter */
 };
 
 /* bits in struct cpuset flags field */
@@ -149,7 +160,7 @@ static struct cpuset top_cpuset = {
 };
 
 static struct vfsmount *cpuset_mount;
-static struct super_block *cpuset_sb = NULL;
+static struct super_block *cpuset_sb;
 
 /*
  * We have two global cpuset semaphores below.  They can nest.
@@ -806,6 +817,19 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	return retval;
 }
 
+/*
+ * Call with manage_sem held.
+ */
+
+static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
+{
+	if (simple_strtoul(buf, NULL, 10) != 0)
+		cpuset_memory_pressure_enabled = 1;
+	else
+		cpuset_memory_pressure_enabled = 0;
+	return 0;
+}
+
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
@@ -847,6 +871,104 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 	return 0;
 }
 
+/*
+ * Frequency meter - How fast is some event occuring?
+ *
+ * These routines manage a digitally filtered, constant time based,
+ * event frequency meter.  There are four routines:
+ *   fmeter_init() - initialize a frequency meter.
+ *   fmeter_markevent() - called each time the event happens.
+ *   fmeter_getrate() - returns the recent rate of such events.
+ *   fmeter_update() - internal routine used to update fmeter.
+ *
+ * A common data structure is passed to each of these routines,
+ * which is used to keep track of the state required to manage the
+ * frequency meter and its digital filter.
+ *
+ * The filter works on the number of events marked per unit time.
+ * The filter is single-pole low-pass recursive (IIR).  The time unit
+ * is 1 second.  Arithmetic is done using 32-bit integers scaled to
+ * simulate 3 decimal digits of precision (multiplied by 1000).
+ *
+ * With an FM_COEF of 933, and a time base of 1 second, the filter
+ * has a half-life of 10 seconds, meaning that if the events quit
+ * happening, then the rate returned from the fmeter_getrate()
+ * will be cut in half each 10 seconds, until it converges to zero.
+ *
+ * It is not worth doing a real infinitely recursive filter.  If more
+ * than FM_MAXTICKS ticks have elapsed since the last filter event,
+ * just compute FM_MAXTICKS ticks worth, by which point the level
+ * will be stable.
+ *
+ * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
+ * arithmetic overflow in the fmeter_update() routine.
+ *
+ * Given the simple 32 bit integer arithmetic used, this meter works
+ * best for reporting rates between one per millisecond (msec) and
+ * one per 32 (approx) seconds.  At constant rates faster than one
+ * per msec it maxes out at values just under 1,000,000.  At constant
+ * rates between one per msec, and one per second it will stabilize
+ * to a value N*1000, where N is the rate of events per second.
+ * At constant rates between one per second and one per 32 seconds,
+ * it will be choppy, moving up on the seconds that have an event,
+ * and then decaying until the next event.  At rates slower than
+ * about one in 32 seconds, it decays all the way back to zero between
+ * each event.
+ */
+
+#define FM_COEF 933		/* coefficient for half-life of 10 secs */
+#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
+#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
+#define FM_SCALE 1000		/* faux fixed point scale */
+
+/* Initialize a frequency meter */
+static void fmeter_init(struct fmeter *fmp)
+{
+	fmp->cnt = 0;
+	fmp->val = 0;
+	fmp->time = 0;
+	spin_lock_init(&fmp->lock);
+}
+
+/* Internal meter update - process cnt events and update value */
+static void fmeter_update(struct fmeter *fmp)
+{
+	time_t now = get_seconds();
+	time_t ticks = now - fmp->time;
+
+	if (ticks == 0)
+		return;
+
+	ticks = min(FM_MAXTICKS, ticks);
+	while (ticks-- > 0)
+		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
+	fmp->time = now;
+
+	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
+	fmp->cnt = 0;
+}
+
+/* Process any previous ticks, then bump cnt by one (times scale). */
+static void fmeter_markevent(struct fmeter *fmp)
+{
+	spin_lock(&fmp->lock);
+	fmeter_update(fmp);
+	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
+	spin_unlock(&fmp->lock);
+}
+
+/* Process any previous ticks, then return current value. */
+static int fmeter_getrate(struct fmeter *fmp)
+{
+	int val;
+
+	spin_lock(&fmp->lock);
+	fmeter_update(fmp);
+	val = fmp->val;
+	spin_unlock(&fmp->lock);
+	return val;
+}
+
 /*
  * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly
  * writing the path of the old cpuset in 'ppathbuf' if it needs to be
@@ -931,6 +1053,8 @@ typedef enum {
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
 	FILE_NOTIFY_ON_RELEASE,
+	FILE_MEMORY_PRESSURE_ENABLED,
+	FILE_MEMORY_PRESSURE,
 	FILE_TASKLIST,
 } cpuset_filetype_t;
 
@@ -984,6 +1108,12 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	case FILE_MEMORY_MIGRATE:
 		retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
 		break;
+	case FILE_MEMORY_PRESSURE_ENABLED:
+		retval = update_memory_pressure_enabled(cs, buffer);
+		break;
+	case FILE_MEMORY_PRESSURE:
+		retval = -EACCES;
+		break;
 	case FILE_TASKLIST:
 		retval = attach_task(cs, buffer, &pathbuf);
 		break;
@@ -1087,6 +1217,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	case FILE_MEMORY_MIGRATE:
 		*s++ = is_memory_migrate(cs) ? '1' : '0';
 		break;
+	case FILE_MEMORY_PRESSURE_ENABLED:
+		*s++ = cpuset_memory_pressure_enabled ? '1' : '0';
+		break;
+	case FILE_MEMORY_PRESSURE:
+		s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
+		break;
 	default:
 		retval = -EINVAL;
 		goto out;
@@ -1440,6 +1576,16 @@ static struct cftype cft_memory_migrate = {
 	.private = FILE_MEMORY_MIGRATE,
 };
 
+static struct cftype cft_memory_pressure_enabled = {
+	.name = "memory_pressure_enabled",
+	.private = FILE_MEMORY_PRESSURE_ENABLED,
+};
+
+static struct cftype cft_memory_pressure = {
+	.name = "memory_pressure",
+	.private = FILE_MEMORY_PRESSURE,
+};
+
 static int cpuset_populate_dir(struct dentry *cs_dentry)
 {
 	int err;
@@ -1456,6 +1602,8 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0)
 		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
+		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
 		return err;
 	return 0;
@@ -1491,6 +1639,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	INIT_LIST_HEAD(&cs->children);
 	atomic_inc(&cpuset_mems_generation);
 	cs->mems_generation = atomic_read(&cpuset_mems_generation);
+	fmeter_init(&cs->fmeter);
 
 	cs->parent = parent;
 
@@ -1580,6 +1729,7 @@ int __init cpuset_init(void)
 	top_cpuset.cpus_allowed = CPU_MASK_ALL;
 	top_cpuset.mems_allowed = NODE_MASK_ALL;
 
+	fmeter_init(&top_cpuset.fmeter);
 	atomic_inc(&cpuset_mems_generation);
 	top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
 
@@ -1601,6 +1751,9 @@ int __init cpuset_init(void)
 	top_cpuset.dentry = root;
 	root->d_inode->i_op = &cpuset_dir_inode_operations;
 	err = cpuset_populate_dir(root);
+	/* memory_pressure_enabled is in root cpuset only */
+	if (err == 0)
+		err = cpuset_add_file(root, &cft_memory_pressure_enabled);
 out:
 	return err;
 }
@@ -1890,6 +2043,42 @@ done:
 	return overlap;
 }
 
+/*
+ * Collection of memory_pressure is suppressed unless
+ * this flag is enabled by writing "1" to the special
+ * cpuset file 'memory_pressure_enabled' in the root cpuset.
+ */
+
+int cpuset_memory_pressure_enabled;
+
+/**
+ * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+ *
+ * Keep a running average of the rate of synchronous (direct)
+ * page reclaim efforts initiated by tasks in each cpuset.
+ *
+ * This represents the rate at which some task in the cpuset
+ * ran low on memory on all nodes it was allowed to use, and
+ * had to enter the kernels page reclaim code in an effort to
+ * create more free memory by tossing clean pages or swapping
+ * or writing dirty pages.
+ *
+ * Display to user space in the per-cpuset read-only file
+ * "memory_pressure".  Value displayed is an integer
+ * representing the recent rate of entry into the synchronous
+ * (direct) page reclaim by any task attached to the cpuset.
+ **/
+
+void __cpuset_memory_pressure_bump(void)
+{
+	struct cpuset *cs;
+
+	task_lock(current);
+	cs = current->cpuset;
+	fmeter_markevent(&cs->fmeter);
+	task_unlock(current);
+}
+
 /*
  * proc_cpuset_show()
  *  - Print tasks cpuset path into seq_file.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ad3d0202cdef..e0e84924171b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -976,6 +976,7 @@ rebalance:
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
+	cpuset_memory_pressure_bump();
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
-- 
cgit v1.2.3-71-gd317


From cf2a473c4089aa41c26f653200673f5a4cc25047 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:01:54 -0800
Subject: [PATCH] cpuset: combine refresh_mems and update_mems

The important code paths through alloc_pages_current() and alloc_page_vma(),
by which most kernel page allocations go, both called
cpuset_update_current_mems_allowed(), which in turn called refresh_mems().
-Both- of these latter two routines did a tasklock, got the tasks cpuset
pointer, and checked for out of date cpuset->mems_generation.

That was a silly duplication of code and waste of CPU cycles on an important
code path.

Consolidated those two routines into a single routine, called
cpuset_update_task_memory_state(), since it updates more than just
mems_allowed.

Changed all callers of either routine to call the new consolidated routine.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpuset.h |  4 +--
 kernel/cpuset.c        | 95 ++++++++++++++++++++++----------------------------
 mm/mempolicy.c         | 10 +++---
 3 files changed, 48 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 736d73801cb6..1feebf16ab08 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -20,7 +20,7 @@ extern void cpuset_fork(struct task_struct *p);
 extern void cpuset_exit(struct task_struct *p);
 extern cpumask_t cpuset_cpus_allowed(const struct task_struct *p);
 void cpuset_init_current_mems_allowed(void);
-void cpuset_update_current_mems_allowed(void);
+void cpuset_update_task_memory_state(void);
 #define cpuset_nodes_subset_current_mems_allowed(nodes) \
 		nodes_subset((nodes), current->mems_allowed)
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
@@ -51,7 +51,7 @@ static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p)
 }
 
 static inline void cpuset_init_current_mems_allowed(void) {}
-static inline void cpuset_update_current_mems_allowed(void) {}
+static inline void cpuset_update_task_memory_state(void) {}
 #define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
 
 static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d9349cc48b95..e9917d71628a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -584,13 +584,26 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
 	BUG_ON(!nodes_intersects(*pmask, node_online_map));
 }
 
-/*
- * Refresh current tasks mems_allowed and mems_generation from current
- * tasks cpuset.
+/**
+ * cpuset_update_task_memory_state - update task memory placement
  *
- * Call without callback_sem or task_lock() held.  May be called with
- * or without manage_sem held.  Will acquire task_lock() and might
- * acquire callback_sem during call.
+ * If the current tasks cpusets mems_allowed changed behind our
+ * backs, update current->mems_allowed, mems_generation and task NUMA
+ * mempolicy to the new value.
+ *
+ * Task mempolicy is updated by rebinding it relative to the
+ * current->cpuset if a task has its memory placement changed.
+ * Do not call this routine if in_interrupt().
+ *
+ * Call without callback_sem or task_lock() held.  May be called
+ * with or without manage_sem held.  Except in early boot or
+ * an exiting task, when tsk->cpuset is NULL, this routine will
+ * acquire task_lock().  We don't need to use task_lock to guard
+ * against another task changing a non-NULL cpuset pointer to NULL,
+ * as that is only done by a task on itself, and if the current task
+ * is here, it is not simultaneously in the exit code NULL'ing its
+ * cpuset pointer.  This routine also might acquire callback_sem and
+ * current->mm->mmap_sem during call.
  *
  * The task_lock() is required to dereference current->cpuset safely.
  * Without it, we could pick up the pointer value of current->cpuset
@@ -605,32 +618,36 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
  * task has been modifying its cpuset.
  */
 
-static void refresh_mems(void)
+void cpuset_update_task_memory_state()
 {
 	int my_cpusets_mem_gen;
+	struct task_struct *tsk = current;
+	struct cpuset *cs = tsk->cpuset;
 
-	task_lock(current);
-	my_cpusets_mem_gen = current->cpuset->mems_generation;
-	task_unlock(current);
+	if (unlikely(!cs))
+		return;
+
+	task_lock(tsk);
+	my_cpusets_mem_gen = cs->mems_generation;
+	task_unlock(tsk);
 
-	if (current->cpuset_mems_generation != my_cpusets_mem_gen) {
-		struct cpuset *cs;
-		nodemask_t oldmem = current->mems_allowed;
+	if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
+		nodemask_t oldmem = tsk->mems_allowed;
 		int migrate;
 
 		down(&callback_sem);
-		task_lock(current);
-		cs = current->cpuset;
+		task_lock(tsk);
+		cs = tsk->cpuset;	/* Maybe changed when task not locked */
 		migrate = is_memory_migrate(cs);
-		guarantee_online_mems(cs, &current->mems_allowed);
-		current->cpuset_mems_generation = cs->mems_generation;
-		task_unlock(current);
+		guarantee_online_mems(cs, &tsk->mems_allowed);
+		tsk->cpuset_mems_generation = cs->mems_generation;
+		task_unlock(tsk);
 		up(&callback_sem);
-		if (!nodes_equal(oldmem, current->mems_allowed)) {
-			numa_policy_rebind(&oldmem, &current->mems_allowed);
+		numa_policy_rebind(&oldmem, &tsk->mems_allowed);
+		if (!nodes_equal(oldmem, tsk->mems_allowed)) {
 			if (migrate) {
-				do_migrate_pages(current->mm, &oldmem,
-					&current->mems_allowed,
+				do_migrate_pages(tsk->mm, &oldmem,
+					&tsk->mems_allowed,
 					MPOL_MF_MOVE_ALL);
 			}
 		}
@@ -1630,7 +1647,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 		return -ENOMEM;
 
 	down(&manage_sem);
-	refresh_mems();
+	cpuset_update_task_memory_state();
 	cs->flags = 0;
 	if (notify_on_release(parent))
 		set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1688,7 +1705,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	/* the vfs holds both inode->i_sem already */
 
 	down(&manage_sem);
-	refresh_mems();
+	cpuset_update_task_memory_state();
 	if (atomic_read(&cs->count) > 0) {
 		up(&manage_sem);
 		return -EBUSY;
@@ -1872,36 +1889,6 @@ void cpuset_init_current_mems_allowed(void)
 	current->mems_allowed = NODE_MASK_ALL;
 }
 
-/**
- * cpuset_update_current_mems_allowed - update mems parameters to new values
- *
- * If the current tasks cpusets mems_allowed changed behind our backs,
- * update current->mems_allowed and mems_generation to the new value.
- * Do not call this routine if in_interrupt().
- *
- * Call without callback_sem or task_lock() held.  May be called
- * with or without manage_sem held.  Unless exiting, it will acquire
- * task_lock().  Also might acquire callback_sem during call to
- * refresh_mems().
- */
-
-void cpuset_update_current_mems_allowed(void)
-{
-	struct cpuset *cs;
-	int need_to_refresh = 0;
-
-	task_lock(current);
-	cs = current->cpuset;
-	if (!cs)
-		goto done;
-	if (current->cpuset_mems_generation != cs->mems_generation)
-		need_to_refresh = 1;
-done:
-	task_unlock(current);
-	if (need_to_refresh)
-		refresh_mems();
-}
-
 /**
  * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed
  * @zl: the zonelist to be checked
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9dea2b8a7d48..515bfeee027e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -387,7 +387,7 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
 	if (!nodes)
 		return 0;
 
-	cpuset_update_current_mems_allowed();
+	cpuset_update_task_memory_state();
 	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 		return -EINVAL;
 	return mpol_check_policy(mode, nodes);
@@ -461,7 +461,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	struct vm_area_struct *vma = NULL;
 	struct mempolicy *pol = current->mempolicy;
 
-	cpuset_update_current_mems_allowed();
+	cpuset_update_task_memory_state();
 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 		return -EINVAL;
 	if (flags & MPOL_F_ADDR) {
@@ -1089,7 +1089,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 
-	cpuset_update_current_mems_allowed();
+	cpuset_update_task_memory_state();
 
 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
 		unsigned nid;
@@ -1115,7 +1115,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
  *	interrupt context and apply the current process NUMA policy.
  *	Returns NULL when no page can be allocated.
  *
- *	Don't call cpuset_update_current_mems_allowed() unless
+ *	Don't call cpuset_update_task_memory_state() unless
  *	1) it's ok to take cpuset_sem (can WAIT), and
  *	2) allocating for current task (not interrupt).
  */
@@ -1124,7 +1124,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 	struct mempolicy *pol = current->mempolicy;
 
 	if ((gfp & __GFP_WAIT) && !in_interrupt())
-		cpuset_update_current_mems_allowed();
+		cpuset_update_task_memory_state();
 	if (!pol || in_interrupt())
 		pol = &default_policy;
 	if (pol->policy == MPOL_INTERLEAVE)
-- 
cgit v1.2.3-71-gd317


From 909d75a3b77bdd8baa9429bad3b69a654d2954ce Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:01:55 -0800
Subject: [PATCH] cpuset: implement cpuset_mems_allowed

Provide a cpuset_mems_allowed() method, which the sys_migrate_pages() code
needed, to obtain the mems_allowed vector of a cpuset, and replaced the
workaround in sys_migrate_pages() to call this new method.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpuset.h |  8 +++++++-
 kernel/cpuset.c        | 29 ++++++++++++++++++++++++++---
 mm/mempolicy.c         |  3 ---
 3 files changed, 33 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 1feebf16ab08..37d2dd7ca3e9 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -18,7 +18,8 @@ extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_fork(struct task_struct *p);
 extern void cpuset_exit(struct task_struct *p);
-extern cpumask_t cpuset_cpus_allowed(const struct task_struct *p);
+extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
+extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 void cpuset_init_current_mems_allowed(void);
 void cpuset_update_task_memory_state(void);
 #define cpuset_nodes_subset_current_mems_allowed(nodes) \
@@ -50,6 +51,11 @@ static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p)
 	return cpu_possible_map;
 }
 
+static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
+{
+	return node_possible_map;
+}
+
 static inline void cpuset_init_current_mems_allowed(void) {}
 static inline void cpuset_update_task_memory_state(void) {}
 #define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e9917d71628a..0d0dbbd6560a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1871,14 +1871,14 @@ void cpuset_exit(struct task_struct *tsk)
  * tasks cpuset.
  **/
 
-cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
+cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
 {
 	cpumask_t mask;
 
 	down(&callback_sem);
-	task_lock((struct task_struct *)tsk);
+	task_lock(tsk);
 	guarantee_online_cpus(tsk->cpuset, &mask);
-	task_unlock((struct task_struct *)tsk);
+	task_unlock(tsk);
 	up(&callback_sem);
 
 	return mask;
@@ -1889,6 +1889,29 @@ void cpuset_init_current_mems_allowed(void)
 	current->mems_allowed = NODE_MASK_ALL;
 }
 
+/**
+ * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
+ *
+ * Description: Returns the nodemask_t mems_allowed of the cpuset
+ * attached to the specified @tsk.  Guaranteed to return some non-empty
+ * subset of node_online_map, even if this means going outside the
+ * tasks cpuset.
+ **/
+
+nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
+{
+	nodemask_t mask;
+
+	down(&callback_sem);
+	task_lock(tsk);
+	guarantee_online_mems(tsk->cpuset, &mask);
+	task_unlock(tsk);
+	up(&callback_sem);
+
+	return mask;
+}
+
 /**
  * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed
  * @zl: the zonelist to be checked
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 515bfeee027e..34d566ac147f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -772,9 +772,6 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 	return do_set_mempolicy(mode, &nodes);
 }
 
-/* Macro needed until Paul implements this function in kernel/cpusets.c */
-#define cpuset_mems_allowed(task) node_online_map
-
 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 		const unsigned long __user *old_nodes,
 		const unsigned long __user *new_nodes)
-- 
cgit v1.2.3-71-gd317


From 74cb21553f4bf244185b9bec4c26e4e3169ad55e Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:01:56 -0800
Subject: [PATCH] cpuset: numa_policy_rebind cleanup

Cleanup, reorganize and make more robust the mempolicy.c code to rebind
mempolicies relative to the containing cpuset after a tasks memory placement
changes.

The real motivator for this cleanup patch is to lay more groundwork for the
upcoming patch to correctly rebind NUMA mempolicies that are attached to vma's
after the containing cpuset memory placement changes.

NUMA mempolicies are constrained by the cpuset their task is a member of.
When either (1) a task is moved to a different cpuset, or (2) the 'mems'
mems_allowed of a cpuset is changed, then the NUMA mempolicies have embedded
node numbers (for MPOL_BIND, MPOL_INTERLEAVE and MPOL_PREFERRED) that need to
be recalculated, relative to their new cpuset placement.

The old code used an unreliable method of determining what was the old
mems_allowed constraining the mempolicy.  It just looked at the tasks
mems_allowed value.  This sort of worked with the present code, that just
rebinds the -task- mempolicy, and leaves any -vma- mempolicies broken,
referring to the old nodes.  But in an upcoming patch, the vma mempolicies
will be rebound as well.  Then the order in which the various task and vma
mempolicies are updated will no longer be deterministic, and one can no longer
count on the task->mems_allowed holding the old value for as long as needed.
It's not even clear if the current code was guaranteed to work reliably for
task mempolicies.

So I added a mems_allowed field to each mempolicy, stating exactly what
mems_allowed the policy is relative to, and updated synchronously and reliably
anytime that the mempolicy is rebound.

Also removed a useless wrapper routine, numa_policy_rebind(), and had its
caller, cpuset_update_task_memory_state(), call directly to the rewritten
policy_rebind() routine, and made that rebind routine extern instead of
static, and added a "mpol_" prefix to its name, making it
mpol_rebind_policy().

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h | 12 ++++++++++--
 kernel/cpuset.c           |  2 +-
 mm/mempolicy.c            | 31 +++++++++++++++++++------------
 3 files changed, 30 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 05fddd5bee5d..74357cb9bc7c 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -68,6 +68,7 @@ struct mempolicy {
 		nodemask_t	 nodes;		/* interleave */
 		/* undefined for default */
 	} v;
+	nodemask_t cpuset_mems_allowed;	/* mempolicy relative to these nodes */
 };
 
 /*
@@ -146,7 +147,9 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 
 extern void numa_default_policy(void);
 extern void numa_policy_init(void);
-extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new);
+extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
+extern void mpol_rebind_task(struct task_struct *tsk,
+					const nodemask_t *new);
 extern struct mempolicy default_policy;
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 		unsigned long addr);
@@ -221,7 +224,12 @@ static inline void numa_default_policy(void)
 {
 }
 
-static inline void numa_policy_rebind(const nodemask_t *old,
+static inline void mpol_rebind_policy(struct mempolicy *pol,
+					const nodemask_t *new)
+{
+}
+
+static inline void mpol_rebind_task(struct task_struct *tsk,
 					const nodemask_t *new)
 {
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 0d0dbbd6560a..8f764de3a9e7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -643,7 +643,7 @@ void cpuset_update_task_memory_state()
 		tsk->cpuset_mems_generation = cs->mems_generation;
 		task_unlock(tsk);
 		up(&callback_sem);
-		numa_policy_rebind(&oldmem, &tsk->mems_allowed);
+		mpol_rebind_task(tsk, &tsk->mems_allowed);
 		if (!nodes_equal(oldmem, tsk->mems_allowed)) {
 			if (migrate) {
 				do_migrate_pages(tsk->mm, &oldmem,
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 34d566ac147f..c39bd86f4ea0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -180,6 +180,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 		break;
 	}
 	policy->policy = mode;
+	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 	return policy;
 }
 
@@ -1411,25 +1412,31 @@ void numa_default_policy(void)
 }
 
 /* Migrate a policy to a different set of nodes */
-static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
-							const nodemask_t *new)
+void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 {
+	nodemask_t *mpolmask;
 	nodemask_t tmp;
 
 	if (!pol)
 		return;
+	mpolmask = &pol->cpuset_mems_allowed;
+	if (nodes_equal(*mpolmask, *newmask))
+		return;
 
 	switch (pol->policy) {
 	case MPOL_DEFAULT:
 		break;
 	case MPOL_INTERLEAVE:
-		nodes_remap(tmp, pol->v.nodes, *old, *new);
+		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
 		pol->v.nodes = tmp;
-		current->il_next = node_remap(current->il_next, *old, *new);
+		*mpolmask = *newmask;
+		current->il_next = node_remap(current->il_next,
+						*mpolmask, *newmask);
 		break;
 	case MPOL_PREFERRED:
 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
-								*old, *new);
+						*mpolmask, *newmask);
+		*mpolmask = *newmask;
 		break;
 	case MPOL_BIND: {
 		nodemask_t nodes;
@@ -1439,7 +1446,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
 		nodes_clear(nodes);
 		for (z = pol->v.zonelist->zones; *z; z++)
 			node_set((*z)->zone_pgdat->node_id, nodes);
-		nodes_remap(tmp, nodes, *old, *new);
+		nodes_remap(tmp, nodes, *mpolmask, *newmask);
 		nodes = tmp;
 
 		zonelist = bind_zonelist(&nodes);
@@ -1454,6 +1461,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
 			kfree(pol->v.zonelist);
 			pol->v.zonelist = zonelist;
 		}
+		*mpolmask = *newmask;
 		break;
 	}
 	default:
@@ -1463,14 +1471,13 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
 }
 
 /*
- * Someone moved this task to different nodes.  Fixup mempolicies.
- *
- * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
- * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
+ * Wrapper for mpol_rebind_policy() that just requires task
+ * pointer, and updates task mempolicy.
  */
-void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
+
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 {
-	rebind_policy(current->mempolicy, old, new);
+	mpol_rebind_policy(tsk->mempolicy, new);
 }
 
 /*
-- 
cgit v1.2.3-71-gd317


From 202f72d5d1b5c2c084f63ef996c736d208b447b5 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:01:57 -0800
Subject: [PATCH] cpuset: number_of_cpusets optimization

Easy little optimization hack to avoid actually having to call
cpuset_zone_allowed() and check mems_allowed, in the main page allocation
routine, __alloc_pages().  This saves several CPU cycles per page allocation
on systems not using cpusets.

A counter is updated each time a cpuset is created or removed, and whenever
there is only one cpuset in the system, it must be the root cpuset, which
contains all CPUs and all Memory Nodes.  In that case, when the counter is
one, all allocations are allowed.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpuset.h | 10 +++++++++-
 kernel/cpuset.c        | 12 +++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 37d2dd7ca3e9..34081c168af5 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -14,6 +14,8 @@
 
 #ifdef CONFIG_CPUSETS
 
+extern int number_of_cpusets;	/* How many cpusets are defined in system? */
+
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_fork(struct task_struct *p);
@@ -25,7 +27,13 @@ void cpuset_update_task_memory_state(void);
 #define cpuset_nodes_subset_current_mems_allowed(nodes) \
 		nodes_subset((nodes), current->mems_allowed)
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
-extern int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask);
+
+extern int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask);
+static int inline cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+	return number_of_cpusets <= 1 || __cpuset_zone_allowed(z, gfp_mask);
+}
+
 extern int cpuset_excl_nodes_overlap(const struct task_struct *p);
 
 #define cpuset_memory_pressure_bump() 				\
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8f764de3a9e7..6004719f26ee 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -56,6 +56,13 @@
 
 #define CPUSET_SUPER_MAGIC		0x27e0eb
 
+/*
+ * Tracks how many cpusets are currently defined in system.
+ * When there is only one cpuset (the root cpuset) we can
+ * short circuit some hooks.
+ */
+int number_of_cpusets;
+
 /* See "Frequency meter" comments, below. */
 
 struct fmeter {
@@ -1664,6 +1671,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 
 	down(&callback_sem);
 	list_add(&cs->sibling, &cs->parent->children);
+	number_of_cpusets++;
 	up(&callback_sem);
 
 	err = cpuset_create_dir(cs, name, mode);
@@ -1726,6 +1734,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	spin_unlock(&d->d_lock);
 	cpuset_d_remove_dir(d);
 	dput(d);
+	number_of_cpusets--;
 	up(&callback_sem);
 	if (list_empty(&parent->children))
 		check_for_release(parent, &pathbuf);
@@ -1769,6 +1778,7 @@ int __init cpuset_init(void)
 	root->d_inode->i_nlink++;
 	top_cpuset.dentry = root;
 	root->d_inode->i_op = &cpuset_dir_inode_operations;
+	number_of_cpusets = 1;
 	err = cpuset_populate_dir(root);
 	/* memory_pressure_enabled is in root cpuset only */
 	if (err == 0)
@@ -1982,7 +1992,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  *	GFP_USER     - only nodes in current tasks mems allowed ok.
  **/
 
-int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
 	int node;			/* node that zone z is on */
 	const struct cpuset *cs;	/* current cpuset ancestors */
-- 
cgit v1.2.3-71-gd317


From 4225399a66b315d4d1fb1cb61b75dda201c832e3 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:01:59 -0800
Subject: [PATCH] cpuset: rebind vma mempolicies fix

Fix more of longstanding bug in cpuset/mempolicy interaction.

NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset
to just the Memory Nodes allowed by that cpuset.  The kernel maintains
internal state for each mempolicy, tracking what nodes are used for the
MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies.

When a tasks cpuset memory placement changes, whether because the cpuset
changed, or because the task was attached to a different cpuset, then the
tasks mempolicies have to be rebound to the new cpuset placement, so as to
preserve the cpuset-relative numbering of the nodes in that policy.

An earlier fix handled such mempolicy rebinding for mempolicies attached to a
task.

This fix rebinds mempolicies attached to vma's (address ranges in a tasks
address space.) Due to the need to hold the task->mm->mmap_sem semaphore while
updating vma's, the rebinding of vma mempolicies has to be done when the
cpuset memory placement is changed, at which time mmap_sem can be safely
acquired.  The tasks mempolicy is rebound later, when the task next attempts
to allocate memory and notices that its task->cpuset_mems_generation is
out-of-date with its cpusets mems_generation.

Because walking the tasklist to find all tasks attached to a changing cpuset
requires holding tasklist_lock, a spinlock, one cannot update the vma's of the
affected tasks while doing the tasklist scan.  In general, one cannot acquire
a semaphore (which can sleep) while already holding a spinlock (such as
tasklist_lock).  So a list of mm references has to be built up during the
tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem
acquired, and the vma's in that mm rebound.

Once the tasklist lock is dropped, affected tasks may fork new tasks, before
their mm's are rebound.  A kernel global 'cpuset_being_rebound' is set to
point to the cpuset being rebound (there can only be one; cpuset modifications
are done under a global 'manage_sem' semaphore), and the mpol_copy code that
is used to copy a tasks mempolicies during fork catches such forking tasks,
and ensures their children are also rebound.

When a task is moved to a different cpuset, it is easier, as there is only one
task involved.  It's mm->vma's are scanned, using the same
mpol_rebind_policy() as used above.

It may happen that both the mpol_copy hook and the update done via the
tasklist scan update the same mm twice.  This is ok, as the mempolicies of
each vma in an mm keep track of what mems_allowed they are relative to, and
safely no-op a second request to rebind to the same nodes.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h | 18 ++++++++++
 kernel/cpuset.c           | 90 +++++++++++++++++++++++++++++++++++++++++++++++
 mm/mempolicy.c            | 29 +++++++++++++++
 3 files changed, 137 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 74357cb9bc7c..c7ac77e873b3 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -150,6 +150,16 @@ extern void numa_policy_init(void);
 extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
 extern void mpol_rebind_task(struct task_struct *tsk,
 					const nodemask_t *new);
+extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
+#define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))
+
+#ifdef CONFIG_CPUSET
+#define current_cpuset_is_being_rebound() \
+				(cpuset_being_rebound == current->cpuset)
+#else
+#define current_cpuset_is_being_rebound() 0
+#endif
+
 extern struct mempolicy default_policy;
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 		unsigned long addr);
@@ -165,6 +175,8 @@ static inline void check_highest_zone(int k)
 int do_migrate_pages(struct mm_struct *mm,
 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
 
+extern void *cpuset_being_rebound;	/* Trigger mpol_copy vma rebind */
+
 #else
 
 struct mempolicy {};
@@ -234,6 +246,12 @@ static inline void mpol_rebind_task(struct task_struct *tsk,
 {
 }
 
+static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
+{
+}
+
+#define set_cpuset_being_rebound(x) do {} while (0)
+
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 		unsigned long addr)
 {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6004719f26ee..19f87565be17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -812,12 +812,24 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 }
 
 /*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset.  Needs to validate the request, update the
+ * cpusets mems_allowed and mems_generation, and for each
+ * task in the cpuset, rebind any vma mempolicies.
+ *
  * Call with manage_sem held.  May take callback_sem during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such tasks mm->mmap_sem, scan its vma's and rebind
+ * their mempolicies to the cpusets new mems_allowed.
  */
 
 static int update_nodemask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
+	struct task_struct *g, *p;
+	struct mm_struct **mmarray;
+	int i, n, ntasks;
+	int fudge;
 	int retval;
 
 	trialcs = *cs;
@@ -839,6 +851,76 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	cs->mems_generation = atomic_read(&cpuset_mems_generation);
 	up(&callback_sem);
 
+	set_cpuset_being_rebound(cs);		/* causes mpol_copy() rebind */
+
+	fudge = 10;				/* spare mmarray[] slots */
+	fudge += cpus_weight(cs->cpus_allowed);	/* imagine one fork-bomb/cpu */
+	retval = -ENOMEM;
+
+	/*
+	 * Allocate mmarray[] to hold mm reference for each task
+	 * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
+	 * tasklist_lock.  We could use GFP_ATOMIC, but with a
+	 * few more lines of code, we can retry until we get a big
+	 * enough mmarray[] w/o using GFP_ATOMIC.
+	 */
+	while (1) {
+		ntasks = atomic_read(&cs->count);	/* guess */
+		ntasks += fudge;
+		mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
+		if (!mmarray)
+			goto done;
+		write_lock_irq(&tasklist_lock);		/* block fork */
+		if (atomic_read(&cs->count) <= ntasks)
+			break;				/* got enough */
+		write_unlock_irq(&tasklist_lock);	/* try again */
+		kfree(mmarray);
+	}
+
+	n = 0;
+
+	/* Load up mmarray[] with mm reference for each task in cpuset. */
+	do_each_thread(g, p) {
+		struct mm_struct *mm;
+
+		if (n >= ntasks) {
+			printk(KERN_WARNING
+				"Cpuset mempolicy rebind incomplete.\n");
+			continue;
+		}
+		if (p->cpuset != cs)
+			continue;
+		mm = get_task_mm(p);
+		if (!mm)
+			continue;
+		mmarray[n++] = mm;
+	} while_each_thread(g, p);
+	write_unlock_irq(&tasklist_lock);
+
+	/*
+	 * Now that we've dropped the tasklist spinlock, we can
+	 * rebind the vma mempolicies of each mm in mmarray[] to their
+	 * new cpuset, and release that mm.  The mpol_rebind_mm()
+	 * call takes mmap_sem, which we couldn't take while holding
+	 * tasklist_lock.  Forks can happen again now - the mpol_copy()
+	 * cpuset_being_rebound check will catch such forks, and rebind
+	 * their vma mempolicies too.  Because we still hold the global
+	 * cpuset manage_sem, we know that no other rebind effort will
+	 * be contending for the global variable cpuset_being_rebound.
+	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
+	 * is idempotent.
+	 */
+	for (i = 0; i < n; i++) {
+		struct mm_struct *mm = mmarray[i];
+
+		mpol_rebind_mm(mm, &cs->mems_allowed);
+		mmput(mm);
+	}
+
+	/* We're done rebinding vma's to this cpusets new mems_allowed. */
+	kfree(mmarray);
+	set_cpuset_being_rebound(NULL);
+	retval = 0;
 done:
 	return retval;
 }
@@ -1011,6 +1093,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	struct cpuset *oldcs;
 	cpumask_t cpus;
 	nodemask_t from, to;
+	struct mm_struct *mm;
 
 	if (sscanf(pidbuf, "%d", &pid) != 1)
 		return -EIO;
@@ -1060,6 +1143,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	to = cs->mems_allowed;
 
 	up(&callback_sem);
+
+	mm = get_task_mm(tsk);
+	if (mm) {
+		mpol_rebind_mm(mm, &to);
+		mmput(mm);
+	}
+
 	if (is_memory_migrate(cs))
 		do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
 	put_task_struct(tsk);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c39bd86f4ea0..1850d0aef4ac 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1131,6 +1131,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 }
 EXPORT_SYMBOL(alloc_pages_current);
 
+/*
+ * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
+ * rebinds the mempolicy its copying by calling mpol_rebind_policy()
+ * with the mems_allowed returned by cpuset_mems_allowed().  This
+ * keeps mempolicies cpuset relative after its cpuset moves.  See
+ * further kernel/cpuset.c update_nodemask().
+ */
+void *cpuset_being_rebound;
+
 /* Slow path of a mempolicy copy */
 struct mempolicy *__mpol_copy(struct mempolicy *old)
 {
@@ -1138,6 +1147,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 
 	if (!new)
 		return ERR_PTR(-ENOMEM);
+	if (current_cpuset_is_being_rebound()) {
+		nodemask_t mems = cpuset_mems_allowed(current);
+		mpol_rebind_policy(old, &mems);
+	}
 	*new = *old;
 	atomic_set(&new->refcnt, 1);
 	if (new->policy == MPOL_BIND) {
@@ -1480,6 +1493,22 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 	mpol_rebind_policy(tsk->mempolicy, new);
 }
 
+/*
+ * Rebind each vma in mm to new nodemask.
+ *
+ * Call holding a reference to mm.  Takes mm->mmap_sem during call.
+ */
+
+void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
+{
+	struct vm_area_struct *vma;
+
+	down_write(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		mpol_rebind_policy(vma->vm_policy, new);
+	up_write(&mm->mmap_sem);
+}
+
 /*
  * Display pages allocated per node and memory policy via /proc.
  */
-- 
cgit v1.2.3-71-gd317


From c417f0242ebe578924a30d4e53d35b5059fed4e7 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:02:01 -0800
Subject: [PATCH] cpuset: remove test for null cpuset from alloc code path

Remove a couple of more lines of code from the cpuset hooks in the page
allocation code path.

There was a check for a NULL cpuset pointer in the routine
cpuset_update_task_memory_state() that was only needed during system boot,
after the memory subsystem was initialized, before the cpuset subsystem was
initialized, to catch a NULL task->cpuset pointer.

Add a cpuset_init_early() routine, just before the mem_init() call in
init/main.c, that sets up just enough of the init tasks cpuset structure to
render cpuset_update_task_memory_state() calls harmless.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpuset.h |  2 ++
 init/main.c            |  1 +
 kernel/cpuset.c        | 22 ++++++++++++++++------
 3 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 34081c168af5..c472f972bd6d 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -16,6 +16,7 @@
 
 extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 
+extern int cpuset_init_early(void);
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_fork(struct task_struct *p);
@@ -49,6 +50,7 @@ extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 
 #else /* !CONFIG_CPUSETS */
 
+static inline int cpuset_init_early(void) { return 0; }
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 static inline void cpuset_fork(struct task_struct *p) {}
diff --git a/init/main.c b/init/main.c
index 2ed3638deec7..afe5eb84ad52 100644
--- a/init/main.c
+++ b/init/main.c
@@ -512,6 +512,7 @@ asmlinkage void __init start_kernel(void)
 	}
 #endif
 	vfs_caches_init_early();
+	cpuset_init_early();
 	mem_init();
 	kmem_cache_init();
 	setup_per_cpu_pageset();
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cf8203a5fa71..fc949e4a625c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -603,9 +603,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
  * Do not call this routine if in_interrupt().
  *
  * Call without callback_sem or task_lock() held.  May be called
- * with or without manage_sem held.  Except in early boot or
- * an exiting task, when tsk->cpuset is NULL, this routine will
- * acquire task_lock().  We don't need to use task_lock to guard
+ * with or without manage_sem held.  Doesn't need task_lock to guard
  * against another task changing a non-NULL cpuset pointer to NULL,
  * as that is only done by a task on itself, and if the current task
  * is here, it is not simultaneously in the exit code NULL'ing its
@@ -631,9 +629,6 @@ void cpuset_update_task_memory_state()
 	struct task_struct *tsk = current;
 	struct cpuset *cs = tsk->cpuset;
 
-	if (unlikely(!cs))
-		return;
-
 	task_lock(tsk);
 	my_cpusets_mem_gen = cs->mems_generation;
 	task_unlock(tsk);
@@ -1836,6 +1831,21 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	return 0;
 }
 
+/*
+ * cpuset_init_early - just enough so that the calls to
+ * cpuset_update_task_memory_state() in early init code
+ * are harmless.
+ */
+
+int __init cpuset_init_early(void)
+{
+	struct task_struct *tsk = current;
+
+	tsk->cpuset = &top_cpuset;
+	tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation);
+	return 0;
+}
+
 /**
  * cpuset_init - initialize cpusets at system boot
  *
-- 
cgit v1.2.3-71-gd317


From de25968cc87cc5b76d09de8b4cbddc8f24fcf5f7 Mon Sep 17 00:00:00 2001
From: Tim Schmielau <tim@physik3.uni-rostock.de>
Date: Sun, 8 Jan 2006 01:02:05 -0800
Subject: [PATCH] fix more missing includes

Include fixes for 2.6.14-git11.  Should allow to remove sched.h from
module.h on i386, x86_64, arm, ia64, ppc, ppc64, and s390.  Probably more
to come since I haven't yet checked the other archs.

Signed-off-by: Tim Schmielau <tim@physik3.uni-rostock.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/common/scoop.c                   | 1 +
 arch/arm/mach-realview/localtimer.c       | 1 +
 arch/mips/sgi-ip27/ip27-berr.c            | 1 +
 drivers/char/agp/sworks-agp.c             | 1 +
 drivers/infiniband/hw/mthca/mthca_dev.h   | 1 +
 drivers/infiniband/ulp/srp/ib_srp.c       | 1 +
 drivers/macintosh/windfarm_smu_controls.c | 1 +
 drivers/macintosh/windfarm_smu_sensors.c  | 1 +
 drivers/mtd/onenand/generic.c             | 1 +
 drivers/mtd/rfd_ftl.c                     | 1 +
 drivers/pci/hotplug/pciehp.h              | 1 +
 drivers/pci/hotplug/pciehp_hpc.c          | 3 +++
 drivers/rapidio/rio-scan.c                | 2 ++
 drivers/rapidio/rio-sysfs.c               | 1 +
 drivers/rapidio/rio.c                     | 1 +
 drivers/usb/host/ohci-au1xxx.c            | 1 +
 drivers/usb/host/ohci-lh7a404.c           | 1 +
 drivers/usb/host/ohci-ppc-soc.c           | 1 +
 include/linux/rio_drv.h                   | 1 +
 19 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/arch/arm/common/scoop.c b/arch/arm/common/scoop.c
index b6de43e73699..a2dfe0b0f1ec 100644
--- a/arch/arm/common/scoop.c
+++ b/arch/arm/common/scoop.c
@@ -13,6 +13,7 @@
 
 #include <linux/device.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <linux/platform_device.h>
 #include <asm/io.h>
 #include <asm/hardware/scoop.h>
diff --git a/arch/arm/mach-realview/localtimer.c b/arch/arm/mach-realview/localtimer.c
index c9d7c596b200..caf6b8bb6c95 100644
--- a/arch/arm/mach-realview/localtimer.c
+++ b/arch/arm/mach-realview/localtimer.c
@@ -13,6 +13,7 @@
 #include <linux/delay.h>
 #include <linux/device.h>
 #include <linux/smp.h>
+#include <linux/jiffies.h>
 
 #include <asm/mach/time.h>
 #include <asm/hardware/arm_twd.h>
diff --git a/arch/mips/sgi-ip27/ip27-berr.c b/arch/mips/sgi-ip27/ip27-berr.c
index 07631a97670b..ce907eda221b 100644
--- a/arch/mips/sgi-ip27/ip27-berr.c
+++ b/arch/mips/sgi-ip27/ip27-berr.c
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/signal.h>	/* for SIGBUS */
+#include <linux/sched.h>	/* schow_regs(), force_sig() */
 
 #include <asm/module.h>
 #include <asm/sn/addrs.h>
diff --git a/drivers/char/agp/sworks-agp.c b/drivers/char/agp/sworks-agp.c
index 3f8f7fa6b0ff..268f78d926d3 100644
--- a/drivers/char/agp/sworks-agp.c
+++ b/drivers/char/agp/sworks-agp.c
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/jiffies.h>
 #include <linux/agp_backend.h>
 #include "agp.h"
 
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
index 497ff794ef6a..795b379260bf 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -43,6 +43,7 @@
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
+#include <linux/timer.h>
 #include <asm/semaphore.h>
 
 #include "mthca_provider.h"
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index ee9fe226ae99..dd488d3cffa9 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -39,6 +39,7 @@
 #include <linux/string.h>
 #include <linux/parser.h>
 #include <linux/random.h>
+#include <linux/jiffies.h>
 
 #include <asm/atomic.h>
 
diff --git a/drivers/macintosh/windfarm_smu_controls.c b/drivers/macintosh/windfarm_smu_controls.c
index 2c3158c81ff2..4d811600bdab 100644
--- a/drivers/macintosh/windfarm_smu_controls.c
+++ b/drivers/macintosh/windfarm_smu_controls.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/wait.h>
+#include <linux/completion.h>
 #include <asm/prom.h>
 #include <asm/machdep.h>
 #include <asm/io.h>
diff --git a/drivers/macintosh/windfarm_smu_sensors.c b/drivers/macintosh/windfarm_smu_sensors.c
index b558cc209d49..1a00d9c75a23 100644
--- a/drivers/macintosh/windfarm_smu_sensors.c
+++ b/drivers/macintosh/windfarm_smu_sensors.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/wait.h>
+#include <linux/completion.h>
 #include <asm/prom.h>
 #include <asm/machdep.h>
 #include <asm/io.h>
diff --git a/drivers/mtd/onenand/generic.c b/drivers/mtd/onenand/generic.c
index 45c077d0f063..af06a80f44de 100644
--- a/drivers/mtd/onenand/generic.c
+++ b/drivers/mtd/onenand/generic.c
@@ -14,6 +14,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/platform_device.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/onenand.h>
diff --git a/drivers/mtd/rfd_ftl.c b/drivers/mtd/rfd_ftl.c
index 20ce212638fc..a3e00a4635a5 100644
--- a/drivers/mtd/rfd_ftl.c
+++ b/drivers/mtd/rfd_ftl.c
@@ -18,6 +18,7 @@
 #include <linux/mtd/blktrans.h>
 #include <linux/mtd/mtd.h>
 #include <linux/vmalloc.h>
+#include <linux/slab.h>
 #include <linux/jiffies.h>
 
 #include <asm/types.h>
diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h
index 6a61b9f286e1..0aac6a61337d 100644
--- a/drivers/pci/hotplug/pciehp.h
+++ b/drivers/pci/hotplug/pciehp.h
@@ -32,6 +32,7 @@
 #include <linux/types.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
+#include <linux/sched.h>		/* signal_pending() */
 #include <linux/pcieport_if.h>
 #include "pci_hotplug.h"
 
diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 0b8b26beb163..ac1e495c314e 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -30,6 +30,9 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/types.h>
+#include <linux/signal.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
 #include <linux/pci.h>
 #include <linux/interrupt.h>
 
diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c
index 4f7ed4bd3be9..94e30fe4b8f3 100644
--- a/drivers/rapidio/rio-scan.c
+++ b/drivers/rapidio/rio-scan.c
@@ -24,6 +24,8 @@
 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/timer.h>
+#include <linux/jiffies.h>
+#include <linux/slab.h>
 
 #include "rio.h"
 
diff --git a/drivers/rapidio/rio-sysfs.c b/drivers/rapidio/rio-sysfs.c
index 30a11436e241..bef9316e95df 100644
--- a/drivers/rapidio/rio-sysfs.c
+++ b/drivers/rapidio/rio-sysfs.c
@@ -15,6 +15,7 @@
 #include <linux/rio.h>
 #include <linux/rio_drv.h>
 #include <linux/stat.h>
+#include <linux/sched.h>	/* for capable() */
 
 #include "rio.h"
 
diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c
index 3ca1011ceaac..5e382470faa2 100644
--- a/drivers/rapidio/rio.c
+++ b/drivers/rapidio/rio.c
@@ -23,6 +23,7 @@
 #include <linux/rio_regs.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/slab.h>
 
 #include "rio.h"
 
diff --git a/drivers/usb/host/ohci-au1xxx.c b/drivers/usb/host/ohci-au1xxx.c
index d9cf3b327d96..77cd6ac07e3c 100644
--- a/drivers/usb/host/ohci-au1xxx.c
+++ b/drivers/usb/host/ohci-au1xxx.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/platform_device.h>
+#include <linux/signal.h>
 
 #include <asm/mach-au1x00/au1000.h>
 
diff --git a/drivers/usb/host/ohci-lh7a404.c b/drivers/usb/host/ohci-lh7a404.c
index 3959ccc88332..0020ed7a39d0 100644
--- a/drivers/usb/host/ohci-lh7a404.c
+++ b/drivers/usb/host/ohci-lh7a404.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/platform_device.h>
+#include <linux/signal.h>
 
 #include <asm/hardware.h>
 
diff --git a/drivers/usb/host/ohci-ppc-soc.c b/drivers/usb/host/ohci-ppc-soc.c
index 2ec6a78bd65e..b2a8dfa48870 100644
--- a/drivers/usb/host/ohci-ppc-soc.c
+++ b/drivers/usb/host/ohci-ppc-soc.c
@@ -15,6 +15,7 @@
  */
 
 #include <linux/platform_device.h>
+#include <linux/signal.h>
 
 /* configure so an HC device and id are always provided */
 /* always called with process context; sleeping is OK */
diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h
index 3bd7cce19e26..157d7e3236b5 100644
--- a/include/linux/rio_drv.h
+++ b/include/linux/rio_drv.h
@@ -21,6 +21,7 @@
 #include <linux/list.h>
 #include <linux/errno.h>
 #include <linux/device.h>
+#include <linux/string.h>
 #include <linux/rio.h>
 
 extern int __rio_local_read_config_32(struct rio_mport *port, u32 offset,
-- 
cgit v1.2.3-71-gd317


From 705b6c7b34f2621f95f606d0e683daa10cdb8eb9 Mon Sep 17 00:00:00 2001
From: Paul Fulghum <paulkf@microgate.com>
Date: Sun, 8 Jan 2006 01:02:06 -0800
Subject: [PATCH] new driver synclink_gt

New character device driver for the SyncLink GT and SyncLink AC families of
synchronous and asynchronous serial adapters

Signed-off-by: Paul Fulghum <paulkf@microgate.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/Kconfig       |    8 +
 drivers/char/Makefile      |    1 +
 drivers/char/synclink_gt.c | 4501 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/synclink.h   |    9 +-
 4 files changed, 4518 insertions(+), 1 deletion(-)
 create mode 100644 drivers/char/synclink_gt.c

(limited to 'include/linux')

diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index b1b34edcd70c..dd7e6901c575 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -220,6 +220,14 @@ config SYNCLINKMP
 	  The module will be called synclinkmp.  If you want to do that, say M
 	  here.
 
+config SYNCLINK_GT
+	tristate "SyncLink GT/AC support"
+	depends on SERIAL_NONSTANDARD
+	help
+	  Support for SyncLink GT and SyncLink AC families of
+	  synchronous and asynchronous serial adapters
+	  manufactured by Microgate Systems, Ltd. (www.microgate.com)
+
 config N_HDLC
 	tristate "HDLC line discipline support"
 	depends on SERIAL_NONSTANDARD
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index 4aeae687e88a..d973d14d8f7f 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_RISCOM8)		+= riscom8.o
 obj-$(CONFIG_ISI)		+= isicom.o
 obj-$(CONFIG_SYNCLINK)		+= synclink.o
 obj-$(CONFIG_SYNCLINKMP)	+= synclinkmp.o
+obj-$(CONFIG_SYNCLINK_GT)	+= synclink_gt.o
 obj-$(CONFIG_N_HDLC)		+= n_hdlc.o
 obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o
 obj-$(CONFIG_SX)		+= sx.o generic_serial.o
diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c
new file mode 100644
index 000000000000..2b9cde94e2f7
--- /dev/null
+++ b/drivers/char/synclink_gt.c
@@ -0,0 +1,4501 @@
+/*
+ * $Id: synclink_gt.c,v 4.20 2005/11/08 19:51:55 paulkf Exp $
+ *
+ * Device driver for Microgate SyncLink GT serial adapters.
+ *
+ * written by Paul Fulghum for Microgate Corporation
+ * paulkf@microgate.com
+ *
+ * Microgate and SyncLink are trademarks of Microgate Corporation
+ *
+ * This code is released under the GNU General Public License (GPL)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * DEBUG OUTPUT DEFINITIONS
+ *
+ * uncomment lines below to enable specific types of debug output
+ *
+ * DBGINFO   information - most verbose output
+ * DBGERR    serious errors
+ * DBGBH     bottom half service routine debugging
+ * DBGISR    interrupt service routine debugging
+ * DBGDATA   output receive and transmit data
+ * DBGTBUF   output transmit DMA buffers and registers
+ * DBGRBUF   output receive DMA buffers and registers
+ */
+
+#define DBGINFO(fmt) if (debug_level >= DEBUG_LEVEL_INFO) printk fmt
+#define DBGERR(fmt) if (debug_level >= DEBUG_LEVEL_ERROR) printk fmt
+#define DBGBH(fmt) if (debug_level >= DEBUG_LEVEL_BH) printk fmt
+#define DBGISR(fmt) if (debug_level >= DEBUG_LEVEL_ISR) printk fmt
+#define DBGDATA(info, buf, size, label) if (debug_level >= DEBUG_LEVEL_DATA) trace_block((info), (buf), (size), (label))
+//#define DBGTBUF(info) dump_tbufs(info)
+//#define DBGRBUF(info) dump_rbufs(info)
+
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <linux/serial.h>
+#include <linux/major.h>
+#include <linux/string.h>
+#include <linux/fcntl.h>
+#include <linux/ptrace.h>
+#include <linux/ioport.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/ioctl.h>
+#include <linux/termios.h>
+#include <linux/bitops.h>
+#include <linux/workqueue.h>
+#include <linux/hdlc.h>
+
+#include <asm/serial.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/dma.h>
+#include <asm/types.h>
+#include <asm/uaccess.h>
+
+#include "linux/synclink.h"
+
+#ifdef CONFIG_HDLC_MODULE
+#define CONFIG_HDLC 1
+#endif
+
+/*
+ * module identification
+ */
+static char *driver_name     = "SyncLink GT";
+static char *driver_version  = "$Revision: 4.20 $";
+static char *tty_driver_name = "synclink_gt";
+static char *tty_dev_prefix  = "ttySLG";
+MODULE_LICENSE("GPL");
+#define MGSL_MAGIC 0x5401
+#define MAX_DEVICES 12
+
+static struct pci_device_id pci_table[] = {
+	{PCI_VENDOR_ID_MICROGATE, SYNCLINK_GT_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID,},
+	{PCI_VENDOR_ID_MICROGATE, SYNCLINK_GT4_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID,},
+	{PCI_VENDOR_ID_MICROGATE, SYNCLINK_AC_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID,},
+	{0,}, /* terminate list */
+};
+MODULE_DEVICE_TABLE(pci, pci_table);
+
+static int  init_one(struct pci_dev *dev,const struct pci_device_id *ent);
+static void remove_one(struct pci_dev *dev);
+static struct pci_driver pci_driver = {
+	.name		= "synclink_gt",
+	.id_table	= pci_table,
+	.probe		= init_one,
+	.remove		= __devexit_p(remove_one),
+};
+
+static int pci_registered;
+
+/*
+ * module configuration and status
+ */
+static struct slgt_info *slgt_device_list;
+static int slgt_device_count;
+
+static int ttymajor;
+static int debug_level;
+static int maxframe[MAX_DEVICES];
+static int dosyncppp[MAX_DEVICES];
+
+module_param(ttymajor, int, 0);
+module_param(debug_level, int, 0);
+module_param_array(maxframe, int, NULL, 0);
+module_param_array(dosyncppp, int, NULL, 0);
+
+MODULE_PARM_DESC(ttymajor, "TTY major device number override: 0=auto assigned");
+MODULE_PARM_DESC(debug_level, "Debug syslog output: 0=disabled, 1 to 5=increasing detail");
+MODULE_PARM_DESC(maxframe, "Maximum frame size used by device (4096 to 65535)");
+MODULE_PARM_DESC(dosyncppp, "Enable synchronous net device, 0=disable 1=enable");
+
+/*
+ * tty support and callbacks
+ */
+#define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK))
+
+static struct tty_driver *serial_driver;
+
+static int  open(struct tty_struct *tty, struct file * filp);
+static void close(struct tty_struct *tty, struct file * filp);
+static void hangup(struct tty_struct *tty);
+static void set_termios(struct tty_struct *tty, struct termios *old_termios);
+
+static int  write(struct tty_struct *tty, const unsigned char *buf, int count);
+static void put_char(struct tty_struct *tty, unsigned char ch);
+static void send_xchar(struct tty_struct *tty, char ch);
+static void wait_until_sent(struct tty_struct *tty, int timeout);
+static int  write_room(struct tty_struct *tty);
+static void flush_chars(struct tty_struct *tty);
+static void flush_buffer(struct tty_struct *tty);
+static void tx_hold(struct tty_struct *tty);
+static void tx_release(struct tty_struct *tty);
+
+static int  ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg);
+static int  read_proc(char *page, char **start, off_t off, int count,int *eof, void *data);
+static int  chars_in_buffer(struct tty_struct *tty);
+static void throttle(struct tty_struct * tty);
+static void unthrottle(struct tty_struct * tty);
+static void set_break(struct tty_struct *tty, int break_state);
+
+/*
+ * generic HDLC support and callbacks
+ */
+#ifdef CONFIG_HDLC
+#define dev_to_port(D) (dev_to_hdlc(D)->priv)
+static void hdlcdev_tx_done(struct slgt_info *info);
+static void hdlcdev_rx(struct slgt_info *info, char *buf, int size);
+static int  hdlcdev_init(struct slgt_info *info);
+static void hdlcdev_exit(struct slgt_info *info);
+#endif
+
+
+/*
+ * device specific structures, macros and functions
+ */
+
+#define SLGT_MAX_PORTS 4
+#define SLGT_REG_SIZE  256
+
+/*
+ * DMA buffer descriptor and access macros
+ */
+struct slgt_desc
+{
+	unsigned short count;
+	unsigned short status;
+	unsigned int pbuf;  /* physical address of data buffer */
+	unsigned int next;  /* physical address of next descriptor */
+
+	/* driver book keeping */
+	char *buf;          /* virtual  address of data buffer */
+    	unsigned int pdesc; /* physical address of this descriptor */
+	dma_addr_t buf_dma_addr;
+};
+
+#define set_desc_buffer(a,b) (a).pbuf = cpu_to_le32((unsigned int)(b))
+#define set_desc_next(a,b) (a).next   = cpu_to_le32((unsigned int)(b))
+#define set_desc_count(a,b)(a).count  = cpu_to_le16((unsigned short)(b))
+#define set_desc_eof(a,b)  (a).status = cpu_to_le16((b) ? (le16_to_cpu((a).status) | BIT0) : (le16_to_cpu((a).status) & ~BIT0))
+#define desc_count(a)      (le16_to_cpu((a).count))
+#define desc_status(a)     (le16_to_cpu((a).status))
+#define desc_complete(a)   (le16_to_cpu((a).status) & BIT15)
+#define desc_eof(a)        (le16_to_cpu((a).status) & BIT2)
+#define desc_crc_error(a)  (le16_to_cpu((a).status) & BIT1)
+#define desc_abort(a)      (le16_to_cpu((a).status) & BIT0)
+#define desc_residue(a)    ((le16_to_cpu((a).status) & 0x38) >> 3)
+
+struct _input_signal_events {
+	int ri_up;
+	int ri_down;
+	int dsr_up;
+	int dsr_down;
+	int dcd_up;
+	int dcd_down;
+	int cts_up;
+	int cts_down;
+};
+
+/*
+ * device instance data structure
+ */
+struct slgt_info {
+	void *if_ptr;		/* General purpose pointer (used by SPPP) */
+
+	struct slgt_info *next_device;	/* device list link */
+
+	int magic;
+	int flags;
+
+	char device_name[25];
+	struct pci_dev *pdev;
+
+	int port_count;  /* count of ports on adapter */
+	int adapter_num; /* adapter instance number */
+	int port_num;    /* port instance number */
+
+	/* array of pointers to port contexts on this adapter */
+	struct slgt_info *port_array[SLGT_MAX_PORTS];
+
+	int			count;		/* count of opens */
+	int			line;		/* tty line instance number */
+	unsigned short		close_delay;
+	unsigned short		closing_wait;	/* time to wait before closing */
+
+	struct mgsl_icount	icount;
+
+	struct tty_struct 	*tty;
+	int			timeout;
+	int			x_char;		/* xon/xoff character */
+	int			blocked_open;	/* # of blocked opens */
+	unsigned int		read_status_mask;
+	unsigned int 		ignore_status_mask;
+
+	wait_queue_head_t	open_wait;
+	wait_queue_head_t	close_wait;
+
+	wait_queue_head_t	status_event_wait_q;
+	wait_queue_head_t	event_wait_q;
+	struct timer_list	tx_timer;
+	struct timer_list	rx_timer;
+
+	spinlock_t lock;	/* spinlock for synchronizing with ISR */
+
+	struct work_struct task;
+	u32 pending_bh;
+	int bh_requested;
+	int bh_running;
+
+	int isr_overflow;
+	int irq_requested;	/* nonzero if IRQ requested */
+	int irq_occurred;	/* for diagnostics use */
+
+	/* device configuration */
+
+	unsigned int bus_type;
+	unsigned int irq_level;
+	unsigned long irq_flags;
+
+	unsigned char __iomem * reg_addr;  /* memory mapped registers address */
+	u32 phys_reg_addr;
+	u32 reg_offset;
+	int reg_addr_requested;
+
+	MGSL_PARAMS params;       /* communications parameters */
+	u32 idle_mode;
+	u32 max_frame_size;       /* as set by device config */
+
+	unsigned int raw_rx_size;
+	unsigned int if_mode;
+
+	/* device status */
+
+	int rx_enabled;
+	int rx_restart;
+
+	int tx_enabled;
+	int tx_active;
+
+	unsigned char signals;    /* serial signal states */
+	unsigned int init_error;  /* initialization error */
+
+	unsigned char *tx_buf;
+	int tx_count;
+
+	char flag_buf[MAX_ASYNC_BUFFER_SIZE];
+	char char_buf[MAX_ASYNC_BUFFER_SIZE];
+	BOOLEAN drop_rts_on_tx_done;
+	struct	_input_signal_events	input_signal_events;
+
+	int dcd_chkcount;	/* check counts to prevent */
+	int cts_chkcount;	/* too many IRQs if a signal */
+	int dsr_chkcount;	/* is floating */
+	int ri_chkcount;
+
+	char *bufs;		/* virtual address of DMA buffer lists */
+	dma_addr_t bufs_dma_addr; /* physical address of buffer descriptors */
+
+	unsigned int rbuf_count;
+	struct slgt_desc *rbufs;
+	unsigned int rbuf_current;
+	unsigned int rbuf_index;
+
+	unsigned int tbuf_count;
+	struct slgt_desc *tbufs;
+	unsigned int tbuf_current;
+	unsigned int tbuf_start;
+
+	unsigned char *tmp_rbuf;
+	unsigned int tmp_rbuf_count;
+
+	/* SPPP/Cisco HDLC device parts */
+
+	int netcount;
+	int dosyncppp;
+	spinlock_t netlock;
+#ifdef CONFIG_HDLC
+	struct net_device *netdev;
+#endif
+
+};
+
+static MGSL_PARAMS default_params = {
+	.mode            = MGSL_MODE_HDLC,
+	.loopback        = 0,
+	.flags           = HDLC_FLAG_UNDERRUN_ABORT15,
+	.encoding        = HDLC_ENCODING_NRZI_SPACE,
+	.clock_speed     = 0,
+	.addr_filter     = 0xff,
+	.crc_type        = HDLC_CRC_16_CCITT,
+	.preamble_length = HDLC_PREAMBLE_LENGTH_8BITS,
+	.preamble        = HDLC_PREAMBLE_PATTERN_NONE,
+	.data_rate       = 9600,
+	.data_bits       = 8,
+	.stop_bits       = 1,
+	.parity          = ASYNC_PARITY_NONE
+};
+
+
+#define BH_RECEIVE  1
+#define BH_TRANSMIT 2
+#define BH_STATUS   4
+#define IO_PIN_SHUTDOWN_LIMIT 100
+
+#define DMABUFSIZE 256
+#define DESC_LIST_SIZE 4096
+
+#define MASK_PARITY  BIT1
+#define MASK_FRAMING BIT2
+#define MASK_BREAK   BIT3
+#define MASK_OVERRUN BIT4
+
+#define GSR   0x00 /* global status */
+#define TDR   0x80 /* tx data */
+#define RDR   0x80 /* rx data */
+#define TCR   0x82 /* tx control */
+#define TIR   0x84 /* tx idle */
+#define TPR   0x85 /* tx preamble */
+#define RCR   0x86 /* rx control */
+#define VCR   0x88 /* V.24 control */
+#define CCR   0x89 /* clock control */
+#define BDR   0x8a /* baud divisor */
+#define SCR   0x8c /* serial control */
+#define SSR   0x8e /* serial status */
+#define RDCSR 0x90 /* rx DMA control/status */
+#define TDCSR 0x94 /* tx DMA control/status */
+#define RDDAR 0x98 /* rx DMA descriptor address */
+#define TDDAR 0x9c /* tx DMA descriptor address */
+
+#define RXIDLE      BIT14
+#define RXBREAK     BIT14
+#define IRQ_TXDATA  BIT13
+#define IRQ_TXIDLE  BIT12
+#define IRQ_TXUNDER BIT11 /* HDLC */
+#define IRQ_RXDATA  BIT10
+#define IRQ_RXIDLE  BIT9  /* HDLC */
+#define IRQ_RXBREAK BIT9  /* async */
+#define IRQ_RXOVER  BIT8
+#define IRQ_DSR     BIT7
+#define IRQ_CTS     BIT6
+#define IRQ_DCD     BIT5
+#define IRQ_RI      BIT4
+#define IRQ_ALL     0x3ff0
+#define IRQ_MASTER  BIT0
+
+#define slgt_irq_on(info, mask) \
+	wr_reg16((info), SCR, (unsigned short)(rd_reg16((info), SCR) | (mask)))
+#define slgt_irq_off(info, mask) \
+	wr_reg16((info), SCR, (unsigned short)(rd_reg16((info), SCR) & ~(mask)))
+
+static __u8  rd_reg8(struct slgt_info *info, unsigned int addr);
+static void  wr_reg8(struct slgt_info *info, unsigned int addr, __u8 value);
+static __u16 rd_reg16(struct slgt_info *info, unsigned int addr);
+static void  wr_reg16(struct slgt_info *info, unsigned int addr, __u16 value);
+static __u32 rd_reg32(struct slgt_info *info, unsigned int addr);
+static void  wr_reg32(struct slgt_info *info, unsigned int addr, __u32 value);
+
+static void  msc_set_vcr(struct slgt_info *info);
+
+static int  startup(struct slgt_info *info);
+static int  block_til_ready(struct tty_struct *tty, struct file * filp,struct slgt_info *info);
+static void shutdown(struct slgt_info *info);
+static void program_hw(struct slgt_info *info);
+static void change_params(struct slgt_info *info);
+
+static int  register_test(struct slgt_info *info);
+static int  irq_test(struct slgt_info *info);
+static int  loopback_test(struct slgt_info *info);
+static int  adapter_test(struct slgt_info *info);
+
+static void reset_adapter(struct slgt_info *info);
+static void reset_port(struct slgt_info *info);
+static void async_mode(struct slgt_info *info);
+static void hdlc_mode(struct slgt_info *info);
+
+static void rx_stop(struct slgt_info *info);
+static void rx_start(struct slgt_info *info);
+static void reset_rbufs(struct slgt_info *info);
+static void free_rbufs(struct slgt_info *info, unsigned int first, unsigned int last);
+static void rdma_reset(struct slgt_info *info);
+static int  rx_get_frame(struct slgt_info *info);
+static int  rx_get_buf(struct slgt_info *info);
+
+static void tx_start(struct slgt_info *info);
+static void tx_stop(struct slgt_info *info);
+static void tx_set_idle(struct slgt_info *info);
+static unsigned int free_tbuf_count(struct slgt_info *info);
+static void reset_tbufs(struct slgt_info *info);
+static void tdma_reset(struct slgt_info *info);
+static void tx_load(struct slgt_info *info, const char *buf, unsigned int count);
+
+static void get_signals(struct slgt_info *info);
+static void set_signals(struct slgt_info *info);
+static void enable_loopback(struct slgt_info *info);
+static void set_rate(struct slgt_info *info, u32 data_rate);
+
+static int  bh_action(struct slgt_info *info);
+static void bh_handler(void* context);
+static void bh_transmit(struct slgt_info *info);
+static void isr_serial(struct slgt_info *info);
+static void isr_rdma(struct slgt_info *info);
+static void isr_txeom(struct slgt_info *info, unsigned short status);
+static void isr_tdma(struct slgt_info *info);
+static irqreturn_t slgt_interrupt(int irq, void *dev_id, struct pt_regs * regs);
+
+static int  alloc_dma_bufs(struct slgt_info *info);
+static void free_dma_bufs(struct slgt_info *info);
+static int  alloc_desc(struct slgt_info *info);
+static void free_desc(struct slgt_info *info);
+static int  alloc_bufs(struct slgt_info *info, struct slgt_desc *bufs, int count);
+static void free_bufs(struct slgt_info *info, struct slgt_desc *bufs, int count);
+
+static int  alloc_tmp_rbuf(struct slgt_info *info);
+static void free_tmp_rbuf(struct slgt_info *info);
+
+static void tx_timeout(unsigned long context);
+static void rx_timeout(unsigned long context);
+
+/*
+ * ioctl handlers
+ */
+static int  get_stats(struct slgt_info *info, struct mgsl_icount __user *user_icount);
+static int  get_params(struct slgt_info *info, MGSL_PARAMS __user *params);
+static int  set_params(struct slgt_info *info, MGSL_PARAMS __user *params);
+static int  get_txidle(struct slgt_info *info, int __user *idle_mode);
+static int  set_txidle(struct slgt_info *info, int idle_mode);
+static int  tx_enable(struct slgt_info *info, int enable);
+static int  tx_abort(struct slgt_info *info);
+static int  rx_enable(struct slgt_info *info, int enable);
+static int  modem_input_wait(struct slgt_info *info,int arg);
+static int  wait_mgsl_event(struct slgt_info *info, int __user *mask_ptr);
+static int  tiocmget(struct tty_struct *tty, struct file *file);
+static int  tiocmset(struct tty_struct *tty, struct file *file,
+		     unsigned int set, unsigned int clear);
+static void set_break(struct tty_struct *tty, int break_state);
+static int  get_interface(struct slgt_info *info, int __user *if_mode);
+static int  set_interface(struct slgt_info *info, int if_mode);
+
+/*
+ * driver functions
+ */
+static void add_device(struct slgt_info *info);
+static void device_init(int adapter_num, struct pci_dev *pdev);
+static int  claim_resources(struct slgt_info *info);
+static void release_resources(struct slgt_info *info);
+
+/*
+ * DEBUG OUTPUT CODE
+ */
+#ifndef DBGINFO
+#define DBGINFO(fmt)
+#endif
+#ifndef DBGERR
+#define DBGERR(fmt)
+#endif
+#ifndef DBGBH
+#define DBGBH(fmt)
+#endif
+#ifndef DBGISR
+#define DBGISR(fmt)
+#endif
+
+#ifdef DBGDATA
+static void trace_block(struct slgt_info *info, const char *data, int count, const char *label)
+{
+	int i;
+	int linecount;
+	printk("%s %s data:\n",info->device_name, label);
+	while(count) {
+		linecount = (count > 16) ? 16 : count;
+		for(i=0; i < linecount; i++)
+			printk("%02X ",(unsigned char)data[i]);
+		for(;i<17;i++)
+			printk("   ");
+		for(i=0;i<linecount;i++) {
+			if (data[i]>=040 && data[i]<=0176)
+				printk("%c",data[i]);
+			else
+				printk(".");
+		}
+		printk("\n");
+		data  += linecount;
+		count -= linecount;
+	}
+}
+#else
+#define DBGDATA(info, buf, size, label)
+#endif
+
+#ifdef DBGTBUF
+static void dump_tbufs(struct slgt_info *info)
+{
+	int i;
+	printk("tbuf_current=%d\n", info->tbuf_current);
+	for (i=0 ; i < info->tbuf_count ; i++) {
+		printk("%d: count=%04X status=%04X\n",
+			i, le16_to_cpu(info->tbufs[i].count), le16_to_cpu(info->tbufs[i].status));
+	}
+}
+#else
+#define DBGTBUF(info)
+#endif
+
+#ifdef DBGRBUF
+static void dump_rbufs(struct slgt_info *info)
+{
+	int i;
+	printk("rbuf_current=%d\n", info->rbuf_current);
+	for (i=0 ; i < info->rbuf_count ; i++) {
+		printk("%d: count=%04X status=%04X\n",
+			i, le16_to_cpu(info->rbufs[i].count), le16_to_cpu(info->rbufs[i].status));
+	}
+}
+#else
+#define DBGRBUF(info)
+#endif
+
+static inline int sanity_check(struct slgt_info *info, char *devname, const char *name)
+{
+#ifdef SANITY_CHECK
+	if (!info) {
+		printk("null struct slgt_info for (%s) in %s\n", devname, name);
+		return 1;
+	}
+	if (info->magic != MGSL_MAGIC) {
+		printk("bad magic number struct slgt_info (%s) in %s\n", devname, name);
+		return 1;
+	}
+#else
+	if (!info)
+		return 1;
+#endif
+	return 0;
+}
+
+/**
+ * line discipline callback wrappers
+ *
+ * The wrappers maintain line discipline references
+ * while calling into the line discipline.
+ *
+ * ldisc_receive_buf  - pass receive data to line discipline
+ */
+static void ldisc_receive_buf(struct tty_struct *tty,
+			      const __u8 *data, char *flags, int count)
+{
+	struct tty_ldisc *ld;
+	if (!tty)
+		return;
+	ld = tty_ldisc_ref(tty);
+	if (ld) {
+		if (ld->receive_buf)
+			ld->receive_buf(tty, data, flags, count);
+		tty_ldisc_deref(ld);
+	}
+}
+
+/* tty callbacks */
+
+static int open(struct tty_struct *tty, struct file *filp)
+{
+	struct slgt_info *info;
+	int retval, line;
+	unsigned long flags;
+
+	line = tty->index;
+	if ((line < 0) || (line >= slgt_device_count)) {
+		DBGERR(("%s: open with invalid line #%d.\n", driver_name, line));
+		return -ENODEV;
+	}
+
+	info = slgt_device_list;
+	while(info && info->line != line)
+		info = info->next_device;
+	if (sanity_check(info, tty->name, "open"))
+		return -ENODEV;
+	if (info->init_error) {
+		DBGERR(("%s init error=%d\n", info->device_name, info->init_error));
+		return -ENODEV;
+	}
+
+	tty->driver_data = info;
+	info->tty = tty;
+
+	DBGINFO(("%s open, old ref count = %d\n", info->device_name, info->count));
+
+	/* If port is closing, signal caller to try again */
+	if (tty_hung_up_p(filp) || info->flags & ASYNC_CLOSING){
+		if (info->flags & ASYNC_CLOSING)
+			interruptible_sleep_on(&info->close_wait);
+		retval = ((info->flags & ASYNC_HUP_NOTIFY) ?
+			-EAGAIN : -ERESTARTSYS);
+		goto cleanup;
+	}
+
+	info->tty->low_latency = (info->flags & ASYNC_LOW_LATENCY) ? 1 : 0;
+
+	spin_lock_irqsave(&info->netlock, flags);
+	if (info->netcount) {
+		retval = -EBUSY;
+		spin_unlock_irqrestore(&info->netlock, flags);
+		goto cleanup;
+	}
+	info->count++;
+	spin_unlock_irqrestore(&info->netlock, flags);
+
+	if (info->count == 1) {
+		/* 1st open on this device, init hardware */
+		retval = startup(info);
+		if (retval < 0)
+			goto cleanup;
+	}
+
+	retval = block_til_ready(tty, filp, info);
+	if (retval) {
+		DBGINFO(("%s block_til_ready rc=%d\n", info->device_name, retval));
+		goto cleanup;
+	}
+
+	retval = 0;
+
+cleanup:
+	if (retval) {
+		if (tty->count == 1)
+			info->tty = NULL; /* tty layer will release tty struct */
+		if(info->count)
+			info->count--;
+	}
+
+	DBGINFO(("%s open rc=%d\n", info->device_name, retval));
+	return retval;
+}
+
+static void close(struct tty_struct *tty, struct file *filp)
+{
+	struct slgt_info *info = tty->driver_data;
+
+	if (sanity_check(info, tty->name, "close"))
+		return;
+	DBGINFO(("%s close entry, count=%d\n", info->device_name, info->count));
+
+	if (!info->count)
+		return;
+
+	if (tty_hung_up_p(filp))
+		goto cleanup;
+
+	if ((tty->count == 1) && (info->count != 1)) {
+		/*
+		 * tty->count is 1 and the tty structure will be freed.
+		 * info->count should be one in this case.
+		 * if it's not, correct it so that the port is shutdown.
+		 */
+		DBGERR(("%s close: bad refcount; tty->count=1, "
+		       "info->count=%d\n", info->device_name, info->count));
+		info->count = 1;
+	}
+
+	info->count--;
+
+	/* if at least one open remaining, leave hardware active */
+	if (info->count)
+		goto cleanup;
+
+	info->flags |= ASYNC_CLOSING;
+
+	/* set tty->closing to notify line discipline to
+	 * only process XON/XOFF characters. Only the N_TTY
+	 * discipline appears to use this (ppp does not).
+	 */
+	tty->closing = 1;
+
+	/* wait for transmit data to clear all layers */
+
+	if (info->closing_wait != ASYNC_CLOSING_WAIT_NONE) {
+		DBGINFO(("%s call tty_wait_until_sent\n", info->device_name));
+		tty_wait_until_sent(tty, info->closing_wait);
+	}
+
+ 	if (info->flags & ASYNC_INITIALIZED)
+ 		wait_until_sent(tty, info->timeout);
+	if (tty->driver->flush_buffer)
+		tty->driver->flush_buffer(tty);
+	tty_ldisc_flush(tty);
+
+	shutdown(info);
+
+	tty->closing = 0;
+	info->tty = NULL;
+
+	if (info->blocked_open) {
+		if (info->close_delay) {
+			msleep_interruptible(jiffies_to_msecs(info->close_delay));
+		}
+		wake_up_interruptible(&info->open_wait);
+	}
+
+	info->flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING);
+
+	wake_up_interruptible(&info->close_wait);
+
+cleanup:
+	DBGINFO(("%s close exit, count=%d\n", tty->driver->name, info->count));
+}
+
+static void hangup(struct tty_struct *tty)
+{
+	struct slgt_info *info = tty->driver_data;
+
+	if (sanity_check(info, tty->name, "hangup"))
+		return;
+	DBGINFO(("%s hangup\n", info->device_name));
+
+	flush_buffer(tty);
+	shutdown(info);
+
+	info->count = 0;
+	info->flags &= ~ASYNC_NORMAL_ACTIVE;
+	info->tty = NULL;
+
+	wake_up_interruptible(&info->open_wait);
+}
+
+static void set_termios(struct tty_struct *tty, struct termios *old_termios)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	DBGINFO(("%s set_termios\n", tty->driver->name));
+
+	/* just return if nothing has changed */
+	if ((tty->termios->c_cflag == old_termios->c_cflag)
+	    && (RELEVANT_IFLAG(tty->termios->c_iflag)
+		== RELEVANT_IFLAG(old_termios->c_iflag)))
+		return;
+
+	change_params(info);
+
+	/* Handle transition to B0 status */
+	if (old_termios->c_cflag & CBAUD &&
+	    !(tty->termios->c_cflag & CBAUD)) {
+		info->signals &= ~(SerialSignal_RTS + SerialSignal_DTR);
+		spin_lock_irqsave(&info->lock,flags);
+		set_signals(info);
+		spin_unlock_irqrestore(&info->lock,flags);
+	}
+
+	/* Handle transition away from B0 status */
+	if (!(old_termios->c_cflag & CBAUD) &&
+	    tty->termios->c_cflag & CBAUD) {
+		info->signals |= SerialSignal_DTR;
+ 		if (!(tty->termios->c_cflag & CRTSCTS) ||
+ 		    !test_bit(TTY_THROTTLED, &tty->flags)) {
+			info->signals |= SerialSignal_RTS;
+ 		}
+		spin_lock_irqsave(&info->lock,flags);
+	 	set_signals(info);
+		spin_unlock_irqrestore(&info->lock,flags);
+	}
+
+	/* Handle turning off CRTSCTS */
+	if (old_termios->c_cflag & CRTSCTS &&
+	    !(tty->termios->c_cflag & CRTSCTS)) {
+		tty->hw_stopped = 0;
+		tx_release(tty);
+	}
+}
+
+static int write(struct tty_struct *tty,
+		 const unsigned char *buf, int count)
+{
+	int ret = 0;
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "write"))
+		goto cleanup;
+	DBGINFO(("%s write count=%d\n", info->device_name, count));
+
+	if (!tty || !info->tx_buf)
+		goto cleanup;
+
+	if (count > info->max_frame_size) {
+		ret = -EIO;
+		goto cleanup;
+	}
+
+	if (!count)
+		goto cleanup;
+
+	if (info->params.mode == MGSL_MODE_RAW) {
+		unsigned int bufs_needed = (count/DMABUFSIZE);
+		unsigned int bufs_free = free_tbuf_count(info);
+		if (count % DMABUFSIZE)
+			++bufs_needed;
+		if (bufs_needed > bufs_free)
+			goto cleanup;
+	} else {
+		if (info->tx_active)
+			goto cleanup;
+		if (info->tx_count) {
+			/* send accumulated data from send_char() calls */
+			/* as frame and wait before accepting more data. */
+			tx_load(info, info->tx_buf, info->tx_count);
+			goto start;
+		}
+	}
+
+	ret = info->tx_count = count;
+	tx_load(info, buf, count);
+	goto start;
+
+start:
+ 	if (info->tx_count && !tty->stopped && !tty->hw_stopped) {
+		spin_lock_irqsave(&info->lock,flags);
+		if (!info->tx_active)
+		 	tx_start(info);
+		spin_unlock_irqrestore(&info->lock,flags);
+ 	}
+
+cleanup:
+	DBGINFO(("%s write rc=%d\n", info->device_name, ret));
+	return ret;
+}
+
+static void put_char(struct tty_struct *tty, unsigned char ch)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "put_char"))
+		return;
+	DBGINFO(("%s put_char(%d)\n", info->device_name, ch));
+	if (!tty || !info->tx_buf)
+		return;
+	spin_lock_irqsave(&info->lock,flags);
+	if (!info->tx_active && (info->tx_count < info->max_frame_size))
+		info->tx_buf[info->tx_count++] = ch;
+	spin_unlock_irqrestore(&info->lock,flags);
+}
+
+static void send_xchar(struct tty_struct *tty, char ch)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "send_xchar"))
+		return;
+	DBGINFO(("%s send_xchar(%d)\n", info->device_name, ch));
+	info->x_char = ch;
+	if (ch) {
+		spin_lock_irqsave(&info->lock,flags);
+		if (!info->tx_enabled)
+		 	tx_start(info);
+		spin_unlock_irqrestore(&info->lock,flags);
+	}
+}
+
+static void wait_until_sent(struct tty_struct *tty, int timeout)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long orig_jiffies, char_time;
+
+	if (!info )
+		return;
+	if (sanity_check(info, tty->name, "wait_until_sent"))
+		return;
+	DBGINFO(("%s wait_until_sent entry\n", info->device_name));
+	if (!(info->flags & ASYNC_INITIALIZED))
+		goto exit;
+
+	orig_jiffies = jiffies;
+
+	/* Set check interval to 1/5 of estimated time to
+	 * send a character, and make it at least 1. The check
+	 * interval should also be less than the timeout.
+	 * Note: use tight timings here to satisfy the NIST-PCTS.
+	 */
+
+	if (info->params.data_rate) {
+	       	char_time = info->timeout/(32 * 5);
+		if (!char_time)
+			char_time++;
+	} else
+		char_time = 1;
+
+	if (timeout)
+		char_time = min_t(unsigned long, char_time, timeout);
+
+	while (info->tx_active) {
+		msleep_interruptible(jiffies_to_msecs(char_time));
+		if (signal_pending(current))
+			break;
+		if (timeout && time_after(jiffies, orig_jiffies + timeout))
+			break;
+	}
+
+exit:
+	DBGINFO(("%s wait_until_sent exit\n", info->device_name));
+}
+
+static int write_room(struct tty_struct *tty)
+{
+	struct slgt_info *info = tty->driver_data;
+	int ret;
+
+	if (sanity_check(info, tty->name, "write_room"))
+		return 0;
+	ret = (info->tx_active) ? 0 : HDLC_MAX_FRAME_SIZE;
+	DBGINFO(("%s write_room=%d\n", info->device_name, ret));
+	return ret;
+}
+
+static void flush_chars(struct tty_struct *tty)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "flush_chars"))
+		return;
+	DBGINFO(("%s flush_chars entry tx_count=%d\n", info->device_name, info->tx_count));
+
+	if (info->tx_count <= 0 || tty->stopped ||
+	    tty->hw_stopped || !info->tx_buf)
+		return;
+
+	DBGINFO(("%s flush_chars start transmit\n", info->device_name));
+
+	spin_lock_irqsave(&info->lock,flags);
+	if (!info->tx_active && info->tx_count) {
+		tx_load(info, info->tx_buf,info->tx_count);
+	 	tx_start(info);
+	}
+	spin_unlock_irqrestore(&info->lock,flags);
+}
+
+static void flush_buffer(struct tty_struct *tty)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "flush_buffer"))
+		return;
+	DBGINFO(("%s flush_buffer\n", info->device_name));
+
+	spin_lock_irqsave(&info->lock,flags);
+	if (!info->tx_active)
+		info->tx_count = 0;
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	wake_up_interruptible(&tty->write_wait);
+	tty_wakeup(tty);
+}
+
+/*
+ * throttle (stop) transmitter
+ */
+static void tx_hold(struct tty_struct *tty)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "tx_hold"))
+		return;
+	DBGINFO(("%s tx_hold\n", info->device_name));
+	spin_lock_irqsave(&info->lock,flags);
+	if (info->tx_enabled && info->params.mode == MGSL_MODE_ASYNC)
+	 	tx_stop(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+}
+
+/*
+ * release (start) transmitter
+ */
+static void tx_release(struct tty_struct *tty)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "tx_release"))
+		return;
+	DBGINFO(("%s tx_release\n", info->device_name));
+	spin_lock_irqsave(&info->lock,flags);
+	if (!info->tx_active && info->tx_count) {
+		tx_load(info, info->tx_buf, info->tx_count);
+	 	tx_start(info);
+	}
+	spin_unlock_irqrestore(&info->lock,flags);
+}
+
+/*
+ * Service an IOCTL request
+ *
+ * Arguments
+ *
+ * 	tty	pointer to tty instance data
+ * 	file	pointer to associated file object for device
+ * 	cmd	IOCTL command code
+ * 	arg	command argument/context
+ *
+ * Return 0 if success, otherwise error code
+ */
+static int ioctl(struct tty_struct *tty, struct file *file,
+		 unsigned int cmd, unsigned long arg)
+{
+	struct slgt_info *info = tty->driver_data;
+	struct mgsl_icount cnow;	/* kernel counter temps */
+	struct serial_icounter_struct __user *p_cuser;	/* user space */
+	unsigned long flags;
+	void __user *argp = (void __user *)arg;
+
+	if (sanity_check(info, tty->name, "ioctl"))
+		return -ENODEV;
+	DBGINFO(("%s ioctl() cmd=%08X\n", info->device_name, cmd));
+
+	if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) &&
+	    (cmd != TIOCMIWAIT) && (cmd != TIOCGICOUNT)) {
+		if (tty->flags & (1 << TTY_IO_ERROR))
+		    return -EIO;
+	}
+
+	switch (cmd) {
+	case MGSL_IOCGPARAMS:
+		return get_params(info, argp);
+	case MGSL_IOCSPARAMS:
+		return set_params(info, argp);
+	case MGSL_IOCGTXIDLE:
+		return get_txidle(info, argp);
+	case MGSL_IOCSTXIDLE:
+		return set_txidle(info, (int)arg);
+	case MGSL_IOCTXENABLE:
+		return tx_enable(info, (int)arg);
+	case MGSL_IOCRXENABLE:
+		return rx_enable(info, (int)arg);
+	case MGSL_IOCTXABORT:
+		return tx_abort(info);
+	case MGSL_IOCGSTATS:
+		return get_stats(info, argp);
+	case MGSL_IOCWAITEVENT:
+		return wait_mgsl_event(info, argp);
+	case TIOCMIWAIT:
+		return modem_input_wait(info,(int)arg);
+	case MGSL_IOCGIF:
+		return get_interface(info, argp);
+	case MGSL_IOCSIF:
+		return set_interface(info,(int)arg);
+	case TIOCGICOUNT:
+		spin_lock_irqsave(&info->lock,flags);
+		cnow = info->icount;
+		spin_unlock_irqrestore(&info->lock,flags);
+		p_cuser = argp;
+		if (put_user(cnow.cts, &p_cuser->cts) ||
+		    put_user(cnow.dsr, &p_cuser->dsr) ||
+		    put_user(cnow.rng, &p_cuser->rng) ||
+		    put_user(cnow.dcd, &p_cuser->dcd) ||
+		    put_user(cnow.rx, &p_cuser->rx) ||
+		    put_user(cnow.tx, &p_cuser->tx) ||
+		    put_user(cnow.frame, &p_cuser->frame) ||
+		    put_user(cnow.overrun, &p_cuser->overrun) ||
+		    put_user(cnow.parity, &p_cuser->parity) ||
+		    put_user(cnow.brk, &p_cuser->brk) ||
+		    put_user(cnow.buf_overrun, &p_cuser->buf_overrun))
+			return -EFAULT;
+		return 0;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return 0;
+}
+
+/*
+ * proc fs support
+ */
+static inline int line_info(char *buf, struct slgt_info *info)
+{
+	char stat_buf[30];
+	int ret;
+	unsigned long flags;
+
+	ret = sprintf(buf, "%s: IO=%08X IRQ=%d MaxFrameSize=%u\n",
+		      info->device_name, info->phys_reg_addr,
+		      info->irq_level, info->max_frame_size);
+
+	/* output current serial signal states */
+	spin_lock_irqsave(&info->lock,flags);
+	get_signals(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	stat_buf[0] = 0;
+	stat_buf[1] = 0;
+	if (info->signals & SerialSignal_RTS)
+		strcat(stat_buf, "|RTS");
+	if (info->signals & SerialSignal_CTS)
+		strcat(stat_buf, "|CTS");
+	if (info->signals & SerialSignal_DTR)
+		strcat(stat_buf, "|DTR");
+	if (info->signals & SerialSignal_DSR)
+		strcat(stat_buf, "|DSR");
+	if (info->signals & SerialSignal_DCD)
+		strcat(stat_buf, "|CD");
+	if (info->signals & SerialSignal_RI)
+		strcat(stat_buf, "|RI");
+
+	if (info->params.mode != MGSL_MODE_ASYNC) {
+		ret += sprintf(buf+ret, "\tHDLC txok:%d rxok:%d",
+			       info->icount.txok, info->icount.rxok);
+		if (info->icount.txunder)
+			ret += sprintf(buf+ret, " txunder:%d", info->icount.txunder);
+		if (info->icount.txabort)
+			ret += sprintf(buf+ret, " txabort:%d", info->icount.txabort);
+		if (info->icount.rxshort)
+			ret += sprintf(buf+ret, " rxshort:%d", info->icount.rxshort);
+		if (info->icount.rxlong)
+			ret += sprintf(buf+ret, " rxlong:%d", info->icount.rxlong);
+		if (info->icount.rxover)
+			ret += sprintf(buf+ret, " rxover:%d", info->icount.rxover);
+		if (info->icount.rxcrc)
+			ret += sprintf(buf+ret, " rxcrc:%d", info->icount.rxcrc);
+	} else {
+		ret += sprintf(buf+ret, "\tASYNC tx:%d rx:%d",
+			       info->icount.tx, info->icount.rx);
+		if (info->icount.frame)
+			ret += sprintf(buf+ret, " fe:%d", info->icount.frame);
+		if (info->icount.parity)
+			ret += sprintf(buf+ret, " pe:%d", info->icount.parity);
+		if (info->icount.brk)
+			ret += sprintf(buf+ret, " brk:%d", info->icount.brk);
+		if (info->icount.overrun)
+			ret += sprintf(buf+ret, " oe:%d", info->icount.overrun);
+	}
+
+	/* Append serial signal status to end */
+	ret += sprintf(buf+ret, " %s\n", stat_buf+1);
+
+	ret += sprintf(buf+ret, "\ttxactive=%d bh_req=%d bh_run=%d pending_bh=%x\n",
+		       info->tx_active,info->bh_requested,info->bh_running,
+		       info->pending_bh);
+
+	return ret;
+}
+
+/* Called to print information about devices
+ */
+static int read_proc(char *page, char **start, off_t off, int count,
+		     int *eof, void *data)
+{
+	int len = 0, l;
+	off_t	begin = 0;
+	struct slgt_info *info;
+
+	len += sprintf(page, "synclink_gt driver:%s\n", driver_version);
+
+	info = slgt_device_list;
+	while( info ) {
+		l = line_info(page + len, info);
+		len += l;
+		if (len+begin > off+count)
+			goto done;
+		if (len+begin < off) {
+			begin += len;
+			len = 0;
+		}
+		info = info->next_device;
+	}
+
+	*eof = 1;
+done:
+	if (off >= len+begin)
+		return 0;
+	*start = page + (off-begin);
+	return ((count < begin+len-off) ? count : begin+len-off);
+}
+
+/*
+ * return count of bytes in transmit buffer
+ */
+static int chars_in_buffer(struct tty_struct *tty)
+{
+	struct slgt_info *info = tty->driver_data;
+	if (sanity_check(info, tty->name, "chars_in_buffer"))
+		return 0;
+	DBGINFO(("%s chars_in_buffer()=%d\n", info->device_name, info->tx_count));
+	return info->tx_count;
+}
+
+/*
+ * signal remote device to throttle send data (our receive data)
+ */
+static void throttle(struct tty_struct * tty)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "throttle"))
+		return;
+	DBGINFO(("%s throttle\n", info->device_name));
+	if (I_IXOFF(tty))
+		send_xchar(tty, STOP_CHAR(tty));
+ 	if (tty->termios->c_cflag & CRTSCTS) {
+		spin_lock_irqsave(&info->lock,flags);
+		info->signals &= ~SerialSignal_RTS;
+	 	set_signals(info);
+		spin_unlock_irqrestore(&info->lock,flags);
+	}
+}
+
+/*
+ * signal remote device to stop throttling send data (our receive data)
+ */
+static void unthrottle(struct tty_struct * tty)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "unthrottle"))
+		return;
+	DBGINFO(("%s unthrottle\n", info->device_name));
+	if (I_IXOFF(tty)) {
+		if (info->x_char)
+			info->x_char = 0;
+		else
+			send_xchar(tty, START_CHAR(tty));
+	}
+ 	if (tty->termios->c_cflag & CRTSCTS) {
+		spin_lock_irqsave(&info->lock,flags);
+		info->signals |= SerialSignal_RTS;
+	 	set_signals(info);
+		spin_unlock_irqrestore(&info->lock,flags);
+	}
+}
+
+/*
+ * set or clear transmit break condition
+ * break_state	-1=set break condition, 0=clear
+ */
+static void set_break(struct tty_struct *tty, int break_state)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned short value;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "set_break"))
+		return;
+	DBGINFO(("%s set_break(%d)\n", info->device_name, break_state));
+
+	spin_lock_irqsave(&info->lock,flags);
+	value = rd_reg16(info, TCR);
+ 	if (break_state == -1)
+		value |= BIT6;
+	else
+		value &= ~BIT6;
+	wr_reg16(info, TCR, value);
+	spin_unlock_irqrestore(&info->lock,flags);
+}
+
+#ifdef CONFIG_HDLC
+
+/**
+ * called by generic HDLC layer when protocol selected (PPP, frame relay, etc.)
+ * set encoding and frame check sequence (FCS) options
+ *
+ * dev       pointer to network device structure
+ * encoding  serial encoding setting
+ * parity    FCS setting
+ *
+ * returns 0 if success, otherwise error code
+ */
+static int hdlcdev_attach(struct net_device *dev, unsigned short encoding,
+			  unsigned short parity)
+{
+	struct slgt_info *info = dev_to_port(dev);
+	unsigned char  new_encoding;
+	unsigned short new_crctype;
+
+	/* return error if TTY interface open */
+	if (info->count)
+		return -EBUSY;
+
+	DBGINFO(("%s hdlcdev_attach\n", info->device_name));
+
+	switch (encoding)
+	{
+	case ENCODING_NRZ:        new_encoding = HDLC_ENCODING_NRZ; break;
+	case ENCODING_NRZI:       new_encoding = HDLC_ENCODING_NRZI_SPACE; break;
+	case ENCODING_FM_MARK:    new_encoding = HDLC_ENCODING_BIPHASE_MARK; break;
+	case ENCODING_FM_SPACE:   new_encoding = HDLC_ENCODING_BIPHASE_SPACE; break;
+	case ENCODING_MANCHESTER: new_encoding = HDLC_ENCODING_BIPHASE_LEVEL; break;
+	default: return -EINVAL;
+	}
+
+	switch (parity)
+	{
+	case PARITY_NONE:            new_crctype = HDLC_CRC_NONE; break;
+	case PARITY_CRC16_PR1_CCITT: new_crctype = HDLC_CRC_16_CCITT; break;
+	case PARITY_CRC32_PR1_CCITT: new_crctype = HDLC_CRC_32_CCITT; break;
+	default: return -EINVAL;
+	}
+
+	info->params.encoding = new_encoding;
+	info->params.crc_type = new_crctype;;
+
+	/* if network interface up, reprogram hardware */
+	if (info->netcount)
+		program_hw(info);
+
+	return 0;
+}
+
+/**
+ * called by generic HDLC layer to send frame
+ *
+ * skb  socket buffer containing HDLC frame
+ * dev  pointer to network device structure
+ *
+ * returns 0 if success, otherwise error code
+ */
+static int hdlcdev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct slgt_info *info = dev_to_port(dev);
+	struct net_device_stats *stats = hdlc_stats(dev);
+	unsigned long flags;
+
+	DBGINFO(("%s hdlc_xmit\n", dev->name));
+
+	/* stop sending until this frame completes */
+	netif_stop_queue(dev);
+
+	/* copy data to device buffers */
+	info->tx_count = skb->len;
+	tx_load(info, skb->data, skb->len);
+
+	/* update network statistics */
+	stats->tx_packets++;
+	stats->tx_bytes += skb->len;
+
+	/* done with socket buffer, so free it */
+	dev_kfree_skb(skb);
+
+	/* save start time for transmit timeout detection */
+	dev->trans_start = jiffies;
+
+	/* start hardware transmitter if necessary */
+	spin_lock_irqsave(&info->lock,flags);
+	if (!info->tx_active)
+	 	tx_start(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	return 0;
+}
+
+/**
+ * called by network layer when interface enabled
+ * claim resources and initialize hardware
+ *
+ * dev  pointer to network device structure
+ *
+ * returns 0 if success, otherwise error code
+ */
+static int hdlcdev_open(struct net_device *dev)
+{
+	struct slgt_info *info = dev_to_port(dev);
+	int rc;
+	unsigned long flags;
+
+	DBGINFO(("%s hdlcdev_open\n", dev->name));
+
+	/* generic HDLC layer open processing */
+	if ((rc = hdlc_open(dev)))
+		return rc;
+
+	/* arbitrate between network and tty opens */
+	spin_lock_irqsave(&info->netlock, flags);
+	if (info->count != 0 || info->netcount != 0) {
+		DBGINFO(("%s hdlc_open busy\n", dev->name));
+		spin_unlock_irqrestore(&info->netlock, flags);
+		return -EBUSY;
+	}
+	info->netcount=1;
+	spin_unlock_irqrestore(&info->netlock, flags);
+
+	/* claim resources and init adapter */
+	if ((rc = startup(info)) != 0) {
+		spin_lock_irqsave(&info->netlock, flags);
+		info->netcount=0;
+		spin_unlock_irqrestore(&info->netlock, flags);
+		return rc;
+	}
+
+	/* assert DTR and RTS, apply hardware settings */
+	info->signals |= SerialSignal_RTS + SerialSignal_DTR;
+	program_hw(info);
+
+	/* enable network layer transmit */
+	dev->trans_start = jiffies;
+	netif_start_queue(dev);
+
+	/* inform generic HDLC layer of current DCD status */
+	spin_lock_irqsave(&info->lock, flags);
+	get_signals(info);
+	spin_unlock_irqrestore(&info->lock, flags);
+	hdlc_set_carrier(info->signals & SerialSignal_DCD, dev);
+
+	return 0;
+}
+
+/**
+ * called by network layer when interface is disabled
+ * shutdown hardware and release resources
+ *
+ * dev  pointer to network device structure
+ *
+ * returns 0 if success, otherwise error code
+ */
+static int hdlcdev_close(struct net_device *dev)
+{
+	struct slgt_info *info = dev_to_port(dev);
+	unsigned long flags;
+
+	DBGINFO(("%s hdlcdev_close\n", dev->name));
+
+	netif_stop_queue(dev);
+
+	/* shutdown adapter and release resources */
+	shutdown(info);
+
+	hdlc_close(dev);
+
+	spin_lock_irqsave(&info->netlock, flags);
+	info->netcount=0;
+	spin_unlock_irqrestore(&info->netlock, flags);
+
+	return 0;
+}
+
+/**
+ * called by network layer to process IOCTL call to network device
+ *
+ * dev  pointer to network device structure
+ * ifr  pointer to network interface request structure
+ * cmd  IOCTL command code
+ *
+ * returns 0 if success, otherwise error code
+ */
+static int hdlcdev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	const size_t size = sizeof(sync_serial_settings);
+	sync_serial_settings new_line;
+	sync_serial_settings __user *line = ifr->ifr_settings.ifs_ifsu.sync;
+	struct slgt_info *info = dev_to_port(dev);
+	unsigned int flags;
+
+	DBGINFO(("%s hdlcdev_ioctl\n", dev->name));
+
+	/* return error if TTY interface open */
+	if (info->count)
+		return -EBUSY;
+
+	if (cmd != SIOCWANDEV)
+		return hdlc_ioctl(dev, ifr, cmd);
+
+	switch(ifr->ifr_settings.type) {
+	case IF_GET_IFACE: /* return current sync_serial_settings */
+
+		ifr->ifr_settings.type = IF_IFACE_SYNC_SERIAL;
+		if (ifr->ifr_settings.size < size) {
+			ifr->ifr_settings.size = size; /* data size wanted */
+			return -ENOBUFS;
+		}
+
+		flags = info->params.flags & (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_RXC_DPLL |
+					      HDLC_FLAG_RXC_BRG    | HDLC_FLAG_RXC_TXCPIN |
+					      HDLC_FLAG_TXC_TXCPIN | HDLC_FLAG_TXC_DPLL |
+					      HDLC_FLAG_TXC_BRG    | HDLC_FLAG_TXC_RXCPIN);
+
+		switch (flags){
+		case (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_TXCPIN): new_line.clock_type = CLOCK_EXT; break;
+		case (HDLC_FLAG_RXC_BRG    | HDLC_FLAG_TXC_BRG):    new_line.clock_type = CLOCK_INT; break;
+		case (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_BRG):    new_line.clock_type = CLOCK_TXINT; break;
+		case (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_RXCPIN): new_line.clock_type = CLOCK_TXFROMRX; break;
+		default: new_line.clock_type = CLOCK_DEFAULT;
+		}
+
+		new_line.clock_rate = info->params.clock_speed;
+		new_line.loopback   = info->params.loopback ? 1:0;
+
+		if (copy_to_user(line, &new_line, size))
+			return -EFAULT;
+		return 0;
+
+	case IF_IFACE_SYNC_SERIAL: /* set sync_serial_settings */
+
+		if(!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		if (copy_from_user(&new_line, line, size))
+			return -EFAULT;
+
+		switch (new_line.clock_type)
+		{
+		case CLOCK_EXT:      flags = HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_TXCPIN; break;
+		case CLOCK_TXFROMRX: flags = HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_RXCPIN; break;
+		case CLOCK_INT:      flags = HDLC_FLAG_RXC_BRG    | HDLC_FLAG_TXC_BRG;    break;
+		case CLOCK_TXINT:    flags = HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_BRG;    break;
+		case CLOCK_DEFAULT:  flags = info->params.flags &
+					     (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_RXC_DPLL |
+					      HDLC_FLAG_RXC_BRG    | HDLC_FLAG_RXC_TXCPIN |
+					      HDLC_FLAG_TXC_TXCPIN | HDLC_FLAG_TXC_DPLL |
+					      HDLC_FLAG_TXC_BRG    | HDLC_FLAG_TXC_RXCPIN); break;
+		default: return -EINVAL;
+		}
+
+		if (new_line.loopback != 0 && new_line.loopback != 1)
+			return -EINVAL;
+
+		info->params.flags &= ~(HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_RXC_DPLL |
+					HDLC_FLAG_RXC_BRG    | HDLC_FLAG_RXC_TXCPIN |
+					HDLC_FLAG_TXC_TXCPIN | HDLC_FLAG_TXC_DPLL |
+					HDLC_FLAG_TXC_BRG    | HDLC_FLAG_TXC_RXCPIN);
+		info->params.flags |= flags;
+
+		info->params.loopback = new_line.loopback;
+
+		if (flags & (HDLC_FLAG_RXC_BRG | HDLC_FLAG_TXC_BRG))
+			info->params.clock_speed = new_line.clock_rate;
+		else
+			info->params.clock_speed = 0;
+
+		/* if network interface up, reprogram hardware */
+		if (info->netcount)
+			program_hw(info);
+		return 0;
+
+	default:
+		return hdlc_ioctl(dev, ifr, cmd);
+	}
+}
+
+/**
+ * called by network layer when transmit timeout is detected
+ *
+ * dev  pointer to network device structure
+ */
+static void hdlcdev_tx_timeout(struct net_device *dev)
+{
+	struct slgt_info *info = dev_to_port(dev);
+	struct net_device_stats *stats = hdlc_stats(dev);
+	unsigned long flags;
+
+	DBGINFO(("%s hdlcdev_tx_timeout\n", dev->name));
+
+	stats->tx_errors++;
+	stats->tx_aborted_errors++;
+
+	spin_lock_irqsave(&info->lock,flags);
+	tx_stop(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	netif_wake_queue(dev);
+}
+
+/**
+ * called by device driver when transmit completes
+ * reenable network layer transmit if stopped
+ *
+ * info  pointer to device instance information
+ */
+static void hdlcdev_tx_done(struct slgt_info *info)
+{
+	if (netif_queue_stopped(info->netdev))
+		netif_wake_queue(info->netdev);
+}
+
+/**
+ * called by device driver when frame received
+ * pass frame to network layer
+ *
+ * info  pointer to device instance information
+ * buf   pointer to buffer contianing frame data
+ * size  count of data bytes in buf
+ */
+static void hdlcdev_rx(struct slgt_info *info, char *buf, int size)
+{
+	struct sk_buff *skb = dev_alloc_skb(size);
+	struct net_device *dev = info->netdev;
+	struct net_device_stats *stats = hdlc_stats(dev);
+
+	DBGINFO(("%s hdlcdev_rx\n", dev->name));
+
+	if (skb == NULL) {
+		DBGERR(("%s: can't alloc skb, drop packet\n", dev->name));
+		stats->rx_dropped++;
+		return;
+	}
+
+	memcpy(skb_put(skb, size),buf,size);
+
+	skb->protocol = hdlc_type_trans(skb, info->netdev);
+
+	stats->rx_packets++;
+	stats->rx_bytes += size;
+
+	netif_rx(skb);
+
+	info->netdev->last_rx = jiffies;
+}
+
+/**
+ * called by device driver when adding device instance
+ * do generic HDLC initialization
+ *
+ * info  pointer to device instance information
+ *
+ * returns 0 if success, otherwise error code
+ */
+static int hdlcdev_init(struct slgt_info *info)
+{
+	int rc;
+	struct net_device *dev;
+	hdlc_device *hdlc;
+
+	/* allocate and initialize network and HDLC layer objects */
+
+	if (!(dev = alloc_hdlcdev(info))) {
+		printk(KERN_ERR "%s hdlc device alloc failure\n", info->device_name);
+		return -ENOMEM;
+	}
+
+	/* for network layer reporting purposes only */
+	dev->mem_start = info->phys_reg_addr;
+	dev->mem_end   = info->phys_reg_addr + SLGT_REG_SIZE - 1;
+	dev->irq       = info->irq_level;
+
+	/* network layer callbacks and settings */
+	dev->do_ioctl       = hdlcdev_ioctl;
+	dev->open           = hdlcdev_open;
+	dev->stop           = hdlcdev_close;
+	dev->tx_timeout     = hdlcdev_tx_timeout;
+	dev->watchdog_timeo = 10*HZ;
+	dev->tx_queue_len   = 50;
+
+	/* generic HDLC layer callbacks and settings */
+	hdlc         = dev_to_hdlc(dev);
+	hdlc->attach = hdlcdev_attach;
+	hdlc->xmit   = hdlcdev_xmit;
+
+	/* register objects with HDLC layer */
+	if ((rc = register_hdlc_device(dev))) {
+		printk(KERN_WARNING "%s:unable to register hdlc device\n",__FILE__);
+		free_netdev(dev);
+		return rc;
+	}
+
+	info->netdev = dev;
+	return 0;
+}
+
+/**
+ * called by device driver when removing device instance
+ * do generic HDLC cleanup
+ *
+ * info  pointer to device instance information
+ */
+static void hdlcdev_exit(struct slgt_info *info)
+{
+	unregister_hdlc_device(info->netdev);
+	free_netdev(info->netdev);
+	info->netdev = NULL;
+}
+
+#endif /* ifdef CONFIG_HDLC */
+
+/*
+ * get async data from rx DMA buffers
+ */
+static void rx_async(struct slgt_info *info)
+{
+ 	struct tty_struct *tty = info->tty;
+ 	struct mgsl_icount *icount = &info->icount;
+	unsigned int start, end;
+	unsigned char *p;
+	unsigned char status;
+	struct slgt_desc *bufs = info->rbufs;
+	int i, count;
+
+	start = end = info->rbuf_current;
+
+	while(desc_complete(bufs[end])) {
+		count = desc_count(bufs[end]) - info->rbuf_index;
+		p     = bufs[end].buf + info->rbuf_index;
+
+		DBGISR(("%s rx_async count=%d\n", info->device_name, count));
+		DBGDATA(info, p, count, "rx");
+
+		for(i=0 ; i < count; i+=2, p+=2) {
+			if (tty) {
+				if (tty->flip.count >= TTY_FLIPBUF_SIZE)
+					tty_flip_buffer_push(tty);
+				if (tty->flip.count >= TTY_FLIPBUF_SIZE)
+					break;
+				*tty->flip.char_buf_ptr = *p;
+				*tty->flip.flag_buf_ptr = 0;
+			}
+			icount->rx++;
+
+			if ((status = *(p+1) & (BIT9 + BIT8))) {
+				if (status & BIT9)
+					icount->parity++;
+				else if (status & BIT8)
+					icount->frame++;
+				/* discard char if tty control flags say so */
+				if (status & info->ignore_status_mask)
+					continue;
+				if (tty) {
+					if (status & BIT9)
+						*tty->flip.flag_buf_ptr = TTY_PARITY;
+					else if (status & BIT8)
+						*tty->flip.flag_buf_ptr = TTY_FRAME;
+				}
+			}
+			if (tty) {
+				tty->flip.flag_buf_ptr++;
+				tty->flip.char_buf_ptr++;
+				tty->flip.count++;
+			}
+		}
+
+		if (i < count) {
+			/* receive buffer not completed */
+			info->rbuf_index += i;
+			info->rx_timer.expires = jiffies + 1;
+			add_timer(&info->rx_timer);
+			break;
+		}
+
+		info->rbuf_index = 0;
+		free_rbufs(info, end, end);
+
+		if (++end == info->rbuf_count)
+			end = 0;
+
+		/* if entire list searched then no frame available */
+		if (end == start)
+			break;
+	}
+
+	if (tty && tty->flip.count)
+		tty_flip_buffer_push(tty);
+}
+
+/*
+ * return next bottom half action to perform
+ */
+static int bh_action(struct slgt_info *info)
+{
+	unsigned long flags;
+	int rc;
+
+	spin_lock_irqsave(&info->lock,flags);
+
+	if (info->pending_bh & BH_RECEIVE) {
+		info->pending_bh &= ~BH_RECEIVE;
+		rc = BH_RECEIVE;
+	} else if (info->pending_bh & BH_TRANSMIT) {
+		info->pending_bh &= ~BH_TRANSMIT;
+		rc = BH_TRANSMIT;
+	} else if (info->pending_bh & BH_STATUS) {
+		info->pending_bh &= ~BH_STATUS;
+		rc = BH_STATUS;
+	} else {
+		/* Mark BH routine as complete */
+		info->bh_running   = 0;
+		info->bh_requested = 0;
+		rc = 0;
+	}
+
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	return rc;
+}
+
+/*
+ * perform bottom half processing
+ */
+static void bh_handler(void* context)
+{
+	struct slgt_info *info = context;
+	int action;
+
+	if (!info)
+		return;
+	info->bh_running = 1;
+
+	while((action = bh_action(info))) {
+		switch (action) {
+		case BH_RECEIVE:
+			DBGBH(("%s bh receive\n", info->device_name));
+			switch(info->params.mode) {
+			case MGSL_MODE_ASYNC:
+				rx_async(info);
+				break;
+			case MGSL_MODE_HDLC:
+				while(rx_get_frame(info));
+				break;
+			case MGSL_MODE_RAW:
+				while(rx_get_buf(info));
+				break;
+			}
+			/* restart receiver if rx DMA buffers exhausted */
+			if (info->rx_restart)
+				rx_start(info);
+			break;
+		case BH_TRANSMIT:
+			bh_transmit(info);
+			break;
+		case BH_STATUS:
+			DBGBH(("%s bh status\n", info->device_name));
+			info->ri_chkcount = 0;
+			info->dsr_chkcount = 0;
+			info->dcd_chkcount = 0;
+			info->cts_chkcount = 0;
+			break;
+		default:
+			DBGBH(("%s unknown action\n", info->device_name));
+			break;
+		}
+	}
+	DBGBH(("%s bh_handler exit\n", info->device_name));
+}
+
+static void bh_transmit(struct slgt_info *info)
+{
+	struct tty_struct *tty = info->tty;
+
+	DBGBH(("%s bh_transmit\n", info->device_name));
+	if (tty) {
+		tty_wakeup(tty);
+		wake_up_interruptible(&tty->write_wait);
+	}
+}
+
+static void dsr_change(struct slgt_info *info)
+{
+	get_signals(info);
+	DBGISR(("dsr_change %s signals=%04X\n", info->device_name, info->signals));
+	if ((info->dsr_chkcount)++ == IO_PIN_SHUTDOWN_LIMIT) {
+		slgt_irq_off(info, IRQ_DSR);
+		return;
+	}
+	info->icount.dsr++;
+	if (info->signals & SerialSignal_DSR)
+		info->input_signal_events.dsr_up++;
+	else
+		info->input_signal_events.dsr_down++;
+	wake_up_interruptible(&info->status_event_wait_q);
+	wake_up_interruptible(&info->event_wait_q);
+	info->pending_bh |= BH_STATUS;
+}
+
+static void cts_change(struct slgt_info *info)
+{
+	get_signals(info);
+	DBGISR(("cts_change %s signals=%04X\n", info->device_name, info->signals));
+	if ((info->cts_chkcount)++ == IO_PIN_SHUTDOWN_LIMIT) {
+		slgt_irq_off(info, IRQ_CTS);
+		return;
+	}
+	info->icount.cts++;
+	if (info->signals & SerialSignal_CTS)
+		info->input_signal_events.cts_up++;
+	else
+		info->input_signal_events.cts_down++;
+	wake_up_interruptible(&info->status_event_wait_q);
+	wake_up_interruptible(&info->event_wait_q);
+	info->pending_bh |= BH_STATUS;
+
+	if (info->flags & ASYNC_CTS_FLOW) {
+		if (info->tty) {
+			if (info->tty->hw_stopped) {
+				if (info->signals & SerialSignal_CTS) {
+		 			info->tty->hw_stopped = 0;
+					info->pending_bh |= BH_TRANSMIT;
+					return;
+				}
+			} else {
+				if (!(info->signals & SerialSignal_CTS))
+		 			info->tty->hw_stopped = 1;
+			}
+		}
+	}
+}
+
+static void dcd_change(struct slgt_info *info)
+{
+	get_signals(info);
+	DBGISR(("dcd_change %s signals=%04X\n", info->device_name, info->signals));
+	if ((info->dcd_chkcount)++ == IO_PIN_SHUTDOWN_LIMIT) {
+		slgt_irq_off(info, IRQ_DCD);
+		return;
+	}
+	info->icount.dcd++;
+	if (info->signals & SerialSignal_DCD) {
+		info->input_signal_events.dcd_up++;
+	} else {
+		info->input_signal_events.dcd_down++;
+	}
+#ifdef CONFIG_HDLC
+	if (info->netcount)
+		hdlc_set_carrier(info->signals & SerialSignal_DCD, info->netdev);
+#endif
+	wake_up_interruptible(&info->status_event_wait_q);
+	wake_up_interruptible(&info->event_wait_q);
+	info->pending_bh |= BH_STATUS;
+
+	if (info->flags & ASYNC_CHECK_CD) {
+		if (info->signals & SerialSignal_DCD)
+			wake_up_interruptible(&info->open_wait);
+		else {
+			if (info->tty)
+				tty_hangup(info->tty);
+		}
+	}
+}
+
+static void ri_change(struct slgt_info *info)
+{
+	get_signals(info);
+	DBGISR(("ri_change %s signals=%04X\n", info->device_name, info->signals));
+	if ((info->ri_chkcount)++ == IO_PIN_SHUTDOWN_LIMIT) {
+		slgt_irq_off(info, IRQ_RI);
+		return;
+	}
+	info->icount.dcd++;
+	if (info->signals & SerialSignal_RI) {
+		info->input_signal_events.ri_up++;
+	} else {
+		info->input_signal_events.ri_down++;
+	}
+	wake_up_interruptible(&info->status_event_wait_q);
+	wake_up_interruptible(&info->event_wait_q);
+	info->pending_bh |= BH_STATUS;
+}
+
+static void isr_serial(struct slgt_info *info)
+{
+	unsigned short status = rd_reg16(info, SSR);
+
+	DBGISR(("%s isr_serial status=%04X\n", info->device_name, status));
+
+	wr_reg16(info, SSR, status); /* clear pending */
+
+	info->irq_occurred = 1;
+
+	if (info->params.mode == MGSL_MODE_ASYNC) {
+		if (status & IRQ_TXIDLE) {
+			if (info->tx_count)
+				isr_txeom(info, status);
+		}
+		if ((status & IRQ_RXBREAK) && (status & RXBREAK)) {
+			info->icount.brk++;
+			/* process break detection if tty control allows */
+			if (info->tty) {
+				if (!(status & info->ignore_status_mask)) {
+					if (info->read_status_mask & MASK_BREAK) {
+						*info->tty->flip.flag_buf_ptr = TTY_BREAK;
+						if (info->flags & ASYNC_SAK)
+							do_SAK(info->tty);
+					}
+				}
+			}
+		}
+	} else {
+		if (status & (IRQ_TXIDLE + IRQ_TXUNDER))
+			isr_txeom(info, status);
+
+		if (status & IRQ_RXIDLE) {
+			if (status & RXIDLE)
+				info->icount.rxidle++;
+			else
+				info->icount.exithunt++;
+			wake_up_interruptible(&info->event_wait_q);
+		}
+
+		if (status & IRQ_RXOVER)
+			rx_start(info);
+	}
+
+	if (status & IRQ_DSR)
+		dsr_change(info);
+	if (status & IRQ_CTS)
+		cts_change(info);
+	if (status & IRQ_DCD)
+		dcd_change(info);
+	if (status & IRQ_RI)
+		ri_change(info);
+}
+
+static void isr_rdma(struct slgt_info *info)
+{
+	unsigned int status = rd_reg32(info, RDCSR);
+
+	DBGISR(("%s isr_rdma status=%08x\n", info->device_name, status));
+
+	/* RDCSR (rx DMA control/status)
+	 *
+	 * 31..07  reserved
+	 * 06      save status byte to DMA buffer
+	 * 05      error
+	 * 04      eol (end of list)
+	 * 03      eob (end of buffer)
+	 * 02      IRQ enable
+	 * 01      reset
+	 * 00      enable
+	 */
+	wr_reg32(info, RDCSR, status);	/* clear pending */
+
+	if (status & (BIT5 + BIT4)) {
+		DBGISR(("%s isr_rdma rx_restart=1\n", info->device_name));
+		info->rx_restart = 1;
+	}
+	info->pending_bh |= BH_RECEIVE;
+}
+
+static void isr_tdma(struct slgt_info *info)
+{
+	unsigned int status = rd_reg32(info, TDCSR);
+
+	DBGISR(("%s isr_tdma status=%08x\n", info->device_name, status));
+
+	/* TDCSR (tx DMA control/status)
+	 *
+	 * 31..06  reserved
+	 * 05      error
+	 * 04      eol (end of list)
+	 * 03      eob (end of buffer)
+	 * 02      IRQ enable
+	 * 01      reset
+	 * 00      enable
+	 */
+	wr_reg32(info, TDCSR, status);	/* clear pending */
+
+	if (status & (BIT5 + BIT4 + BIT3)) {
+		// another transmit buffer has completed
+		// run bottom half to get more send data from user
+		info->pending_bh |= BH_TRANSMIT;
+	}
+}
+
+static void isr_txeom(struct slgt_info *info, unsigned short status)
+{
+	DBGISR(("%s txeom status=%04x\n", info->device_name, status));
+
+	slgt_irq_off(info, IRQ_TXDATA + IRQ_TXIDLE + IRQ_TXUNDER);
+	tdma_reset(info);
+	reset_tbufs(info);
+	if (status & IRQ_TXUNDER) {
+		unsigned short val = rd_reg16(info, TCR);
+		wr_reg16(info, TCR, (unsigned short)(val | BIT2)); /* set reset bit */
+		wr_reg16(info, TCR, val); /* clear reset bit */
+	}
+
+	if (info->tx_active) {
+		if (info->params.mode != MGSL_MODE_ASYNC) {
+			if (status & IRQ_TXUNDER)
+				info->icount.txunder++;
+			else if (status & IRQ_TXIDLE)
+				info->icount.txok++;
+		}
+
+		info->tx_active = 0;
+		info->tx_count = 0;
+
+		del_timer(&info->tx_timer);
+
+		if (info->params.mode != MGSL_MODE_ASYNC && info->drop_rts_on_tx_done) {
+			info->signals &= ~SerialSignal_RTS;
+			info->drop_rts_on_tx_done = 0;
+			set_signals(info);
+		}
+
+#ifdef CONFIG_HDLC
+		if (info->netcount)
+			hdlcdev_tx_done(info);
+		else
+#endif
+		{
+			if (info->tty && (info->tty->stopped || info->tty->hw_stopped)) {
+				tx_stop(info);
+				return;
+			}
+			info->pending_bh |= BH_TRANSMIT;
+		}
+	}
+}
+
+/* interrupt service routine
+ *
+ * 	irq	interrupt number
+ * 	dev_id	device ID supplied during interrupt registration
+ * 	regs	interrupted processor context
+ */
+static irqreturn_t slgt_interrupt(int irq, void *dev_id, struct pt_regs * regs)
+{
+	struct slgt_info *info;
+	unsigned int gsr;
+	unsigned int i;
+
+	DBGISR(("slgt_interrupt irq=%d entry\n", irq));
+
+	info = dev_id;
+	if (!info)
+		return IRQ_NONE;
+
+	spin_lock(&info->lock);
+
+	while((gsr = rd_reg32(info, GSR) & 0xffffff00)) {
+		DBGISR(("%s gsr=%08x\n", info->device_name, gsr));
+		info->irq_occurred = 1;
+		for(i=0; i < info->port_count ; i++) {
+			if (info->port_array[i] == NULL)
+				continue;
+			if (gsr & (BIT8 << i))
+				isr_serial(info->port_array[i]);
+			if (gsr & (BIT16 << (i*2)))
+				isr_rdma(info->port_array[i]);
+			if (gsr & (BIT17 << (i*2)))
+				isr_tdma(info->port_array[i]);
+		}
+	}
+
+	for(i=0; i < info->port_count ; i++) {
+		struct slgt_info *port = info->port_array[i];
+
+		if (port && (port->count || port->netcount) &&
+		    port->pending_bh && !port->bh_running &&
+		    !port->bh_requested) {
+			DBGISR(("%s bh queued\n", port->device_name));
+			schedule_work(&port->task);
+			port->bh_requested = 1;
+		}
+	}
+
+	spin_unlock(&info->lock);
+
+	DBGISR(("slgt_interrupt irq=%d exit\n", irq));
+	return IRQ_HANDLED;
+}
+
+static int startup(struct slgt_info *info)
+{
+	DBGINFO(("%s startup\n", info->device_name));
+
+	if (info->flags & ASYNC_INITIALIZED)
+		return 0;
+
+	if (!info->tx_buf) {
+		info->tx_buf = kmalloc(info->max_frame_size, GFP_KERNEL);
+		if (!info->tx_buf) {
+			DBGERR(("%s can't allocate tx buffer\n", info->device_name));
+			return -ENOMEM;
+		}
+	}
+
+	info->pending_bh = 0;
+
+	memset(&info->icount, 0, sizeof(info->icount));
+
+	/* program hardware for current parameters */
+	change_params(info);
+
+	if (info->tty)
+		clear_bit(TTY_IO_ERROR, &info->tty->flags);
+
+	info->flags |= ASYNC_INITIALIZED;
+
+	return 0;
+}
+
+/*
+ *  called by close() and hangup() to shutdown hardware
+ */
+static void shutdown(struct slgt_info *info)
+{
+	unsigned long flags;
+
+	if (!(info->flags & ASYNC_INITIALIZED))
+		return;
+
+	DBGINFO(("%s shutdown\n", info->device_name));
+
+	/* clear status wait queue because status changes */
+	/* can't happen after shutting down the hardware */
+	wake_up_interruptible(&info->status_event_wait_q);
+	wake_up_interruptible(&info->event_wait_q);
+
+	del_timer_sync(&info->tx_timer);
+	del_timer_sync(&info->rx_timer);
+
+	kfree(info->tx_buf);
+	info->tx_buf = NULL;
+
+	spin_lock_irqsave(&info->lock,flags);
+
+	tx_stop(info);
+	rx_stop(info);
+
+	slgt_irq_off(info, IRQ_ALL | IRQ_MASTER);
+
+ 	if (!info->tty || info->tty->termios->c_cflag & HUPCL) {
+ 		info->signals &= ~(SerialSignal_DTR + SerialSignal_RTS);
+		set_signals(info);
+	}
+
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	if (info->tty)
+		set_bit(TTY_IO_ERROR, &info->tty->flags);
+
+	info->flags &= ~ASYNC_INITIALIZED;
+}
+
+static void program_hw(struct slgt_info *info)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&info->lock,flags);
+
+	rx_stop(info);
+	tx_stop(info);
+
+	if (info->params.mode == MGSL_MODE_HDLC ||
+	    info->params.mode == MGSL_MODE_RAW ||
+	    info->netcount)
+		hdlc_mode(info);
+	else
+		async_mode(info);
+
+	set_signals(info);
+
+	info->dcd_chkcount = 0;
+	info->cts_chkcount = 0;
+	info->ri_chkcount = 0;
+	info->dsr_chkcount = 0;
+
+	slgt_irq_on(info, IRQ_DCD | IRQ_CTS | IRQ_DSR);
+	get_signals(info);
+
+	if (info->netcount ||
+	    (info->tty && info->tty->termios->c_cflag & CREAD))
+		rx_start(info);
+
+	spin_unlock_irqrestore(&info->lock,flags);
+}
+
+/*
+ * reconfigure adapter based on new parameters
+ */
+static void change_params(struct slgt_info *info)
+{
+	unsigned cflag;
+	int bits_per_char;
+
+	if (!info->tty || !info->tty->termios)
+		return;
+	DBGINFO(("%s change_params\n", info->device_name));
+
+	cflag = info->tty->termios->c_cflag;
+
+	/* if B0 rate (hangup) specified then negate DTR and RTS */
+	/* otherwise assert DTR and RTS */
+ 	if (cflag & CBAUD)
+		info->signals |= SerialSignal_RTS + SerialSignal_DTR;
+	else
+		info->signals &= ~(SerialSignal_RTS + SerialSignal_DTR);
+
+	/* byte size and parity */
+
+	switch (cflag & CSIZE) {
+	case CS5: info->params.data_bits = 5; break;
+	case CS6: info->params.data_bits = 6; break;
+	case CS7: info->params.data_bits = 7; break;
+	case CS8: info->params.data_bits = 8; break;
+	default:  info->params.data_bits = 7; break;
+	}
+
+	info->params.stop_bits = (cflag & CSTOPB) ? 2 : 1;
+
+	if (cflag & PARENB)
+		info->params.parity = (cflag & PARODD) ? ASYNC_PARITY_ODD : ASYNC_PARITY_EVEN;
+	else
+		info->params.parity = ASYNC_PARITY_NONE;
+
+	/* calculate number of jiffies to transmit a full
+	 * FIFO (32 bytes) at specified data rate
+	 */
+	bits_per_char = info->params.data_bits +
+			info->params.stop_bits + 1;
+
+	info->params.data_rate = tty_get_baud_rate(info->tty);
+
+	if (info->params.data_rate) {
+		info->timeout = (32*HZ*bits_per_char) /
+				info->params.data_rate;
+	}
+	info->timeout += HZ/50;		/* Add .02 seconds of slop */
+
+	if (cflag & CRTSCTS)
+		info->flags |= ASYNC_CTS_FLOW;
+	else
+		info->flags &= ~ASYNC_CTS_FLOW;
+
+	if (cflag & CLOCAL)
+		info->flags &= ~ASYNC_CHECK_CD;
+	else
+		info->flags |= ASYNC_CHECK_CD;
+
+	/* process tty input control flags */
+
+	info->read_status_mask = IRQ_RXOVER;
+	if (I_INPCK(info->tty))
+		info->read_status_mask |= MASK_PARITY | MASK_FRAMING;
+ 	if (I_BRKINT(info->tty) || I_PARMRK(info->tty))
+ 		info->read_status_mask |= MASK_BREAK;
+	if (I_IGNPAR(info->tty))
+		info->ignore_status_mask |= MASK_PARITY | MASK_FRAMING;
+	if (I_IGNBRK(info->tty)) {
+		info->ignore_status_mask |= MASK_BREAK;
+		/* If ignoring parity and break indicators, ignore
+		 * overruns too.  (For real raw support).
+		 */
+		if (I_IGNPAR(info->tty))
+			info->ignore_status_mask |= MASK_OVERRUN;
+	}
+
+	program_hw(info);
+}
+
+static int get_stats(struct slgt_info *info, struct mgsl_icount __user *user_icount)
+{
+	DBGINFO(("%s get_stats\n",  info->device_name));
+	if (!user_icount) {
+		memset(&info->icount, 0, sizeof(info->icount));
+	} else {
+		if (copy_to_user(user_icount, &info->icount, sizeof(struct mgsl_icount)))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+static int get_params(struct slgt_info *info, MGSL_PARAMS __user *user_params)
+{
+	DBGINFO(("%s get_params\n", info->device_name));
+	if (copy_to_user(user_params, &info->params, sizeof(MGSL_PARAMS)))
+		return -EFAULT;
+	return 0;
+}
+
+static int set_params(struct slgt_info *info, MGSL_PARAMS __user *new_params)
+{
+ 	unsigned long flags;
+	MGSL_PARAMS tmp_params;
+
+	DBGINFO(("%s set_params\n", info->device_name));
+	if (copy_from_user(&tmp_params, new_params, sizeof(MGSL_PARAMS)))
+		return -EFAULT;
+
+	spin_lock_irqsave(&info->lock, flags);
+	memcpy(&info->params, &tmp_params, sizeof(MGSL_PARAMS));
+	spin_unlock_irqrestore(&info->lock, flags);
+
+ 	change_params(info);
+
+	return 0;
+}
+
+static int get_txidle(struct slgt_info *info, int __user *idle_mode)
+{
+	DBGINFO(("%s get_txidle=%d\n", info->device_name, info->idle_mode));
+	if (put_user(info->idle_mode, idle_mode))
+		return -EFAULT;
+	return 0;
+}
+
+static int set_txidle(struct slgt_info *info, int idle_mode)
+{
+ 	unsigned long flags;
+	DBGINFO(("%s set_txidle(%d)\n", info->device_name, idle_mode));
+	spin_lock_irqsave(&info->lock,flags);
+	info->idle_mode = idle_mode;
+	tx_set_idle(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+	return 0;
+}
+
+static int tx_enable(struct slgt_info *info, int enable)
+{
+ 	unsigned long flags;
+	DBGINFO(("%s tx_enable(%d)\n", info->device_name, enable));
+	spin_lock_irqsave(&info->lock,flags);
+	if (enable) {
+		if (!info->tx_enabled)
+			tx_start(info);
+	} else {
+		if (info->tx_enabled)
+			tx_stop(info);
+	}
+	spin_unlock_irqrestore(&info->lock,flags);
+	return 0;
+}
+
+/*
+ * abort transmit HDLC frame
+ */
+static int tx_abort(struct slgt_info *info)
+{
+ 	unsigned long flags;
+	DBGINFO(("%s tx_abort\n", info->device_name));
+	spin_lock_irqsave(&info->lock,flags);
+	tdma_reset(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+	return 0;
+}
+
+static int rx_enable(struct slgt_info *info, int enable)
+{
+ 	unsigned long flags;
+	DBGINFO(("%s rx_enable(%d)\n", info->device_name, enable));
+	spin_lock_irqsave(&info->lock,flags);
+	if (enable) {
+		if (!info->rx_enabled)
+			rx_start(info);
+	} else {
+		if (info->rx_enabled)
+			rx_stop(info);
+	}
+	spin_unlock_irqrestore(&info->lock,flags);
+	return 0;
+}
+
+/*
+ *  wait for specified event to occur
+ */
+static int wait_mgsl_event(struct slgt_info *info, int __user *mask_ptr)
+{
+ 	unsigned long flags;
+	int s;
+	int rc=0;
+	struct mgsl_icount cprev, cnow;
+	int events;
+	int mask;
+	struct	_input_signal_events oldsigs, newsigs;
+	DECLARE_WAITQUEUE(wait, current);
+
+	if (get_user(mask, mask_ptr))
+		return -EFAULT;
+
+	DBGINFO(("%s wait_mgsl_event(%d)\n", info->device_name, mask));
+
+	spin_lock_irqsave(&info->lock,flags);
+
+	/* return immediately if state matches requested events */
+	get_signals(info);
+	s = info->signals;
+
+	events = mask &
+		( ((s & SerialSignal_DSR) ? MgslEvent_DsrActive:MgslEvent_DsrInactive) +
+ 		  ((s & SerialSignal_DCD) ? MgslEvent_DcdActive:MgslEvent_DcdInactive) +
+		  ((s & SerialSignal_CTS) ? MgslEvent_CtsActive:MgslEvent_CtsInactive) +
+		  ((s & SerialSignal_RI)  ? MgslEvent_RiActive :MgslEvent_RiInactive) );
+	if (events) {
+		spin_unlock_irqrestore(&info->lock,flags);
+		goto exit;
+	}
+
+	/* save current irq counts */
+	cprev = info->icount;
+	oldsigs = info->input_signal_events;
+
+	/* enable hunt and idle irqs if needed */
+	if (mask & (MgslEvent_ExitHuntMode+MgslEvent_IdleReceived)) {
+		unsigned short val = rd_reg16(info, SCR);
+		if (!(val & IRQ_RXIDLE))
+			wr_reg16(info, SCR, (unsigned short)(val | IRQ_RXIDLE));
+	}
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	add_wait_queue(&info->event_wait_q, &wait);
+
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	for(;;) {
+		schedule();
+		if (signal_pending(current)) {
+			rc = -ERESTARTSYS;
+			break;
+		}
+
+		/* get current irq counts */
+		spin_lock_irqsave(&info->lock,flags);
+		cnow = info->icount;
+		newsigs = info->input_signal_events;
+		set_current_state(TASK_INTERRUPTIBLE);
+		spin_unlock_irqrestore(&info->lock,flags);
+
+		/* if no change, wait aborted for some reason */
+		if (newsigs.dsr_up   == oldsigs.dsr_up   &&
+		    newsigs.dsr_down == oldsigs.dsr_down &&
+		    newsigs.dcd_up   == oldsigs.dcd_up   &&
+		    newsigs.dcd_down == oldsigs.dcd_down &&
+		    newsigs.cts_up   == oldsigs.cts_up   &&
+		    newsigs.cts_down == oldsigs.cts_down &&
+		    newsigs.ri_up    == oldsigs.ri_up    &&
+		    newsigs.ri_down  == oldsigs.ri_down  &&
+		    cnow.exithunt    == cprev.exithunt   &&
+		    cnow.rxidle      == cprev.rxidle) {
+			rc = -EIO;
+			break;
+		}
+
+		events = mask &
+			( (newsigs.dsr_up   != oldsigs.dsr_up   ? MgslEvent_DsrActive:0)   +
+			  (newsigs.dsr_down != oldsigs.dsr_down ? MgslEvent_DsrInactive:0) +
+			  (newsigs.dcd_up   != oldsigs.dcd_up   ? MgslEvent_DcdActive:0)   +
+			  (newsigs.dcd_down != oldsigs.dcd_down ? MgslEvent_DcdInactive:0) +
+			  (newsigs.cts_up   != oldsigs.cts_up   ? MgslEvent_CtsActive:0)   +
+			  (newsigs.cts_down != oldsigs.cts_down ? MgslEvent_CtsInactive:0) +
+			  (newsigs.ri_up    != oldsigs.ri_up    ? MgslEvent_RiActive:0)    +
+			  (newsigs.ri_down  != oldsigs.ri_down  ? MgslEvent_RiInactive:0)  +
+			  (cnow.exithunt    != cprev.exithunt   ? MgslEvent_ExitHuntMode:0) +
+			  (cnow.rxidle      != cprev.rxidle     ? MgslEvent_IdleReceived:0) );
+		if (events)
+			break;
+
+		cprev = cnow;
+		oldsigs = newsigs;
+	}
+
+	remove_wait_queue(&info->event_wait_q, &wait);
+	set_current_state(TASK_RUNNING);
+
+
+	if (mask & (MgslEvent_ExitHuntMode + MgslEvent_IdleReceived)) {
+		spin_lock_irqsave(&info->lock,flags);
+		if (!waitqueue_active(&info->event_wait_q)) {
+			/* disable enable exit hunt mode/idle rcvd IRQs */
+			wr_reg16(info, SCR,
+				(unsigned short)(rd_reg16(info, SCR) & ~IRQ_RXIDLE));
+		}
+		spin_unlock_irqrestore(&info->lock,flags);
+	}
+exit:
+	if (rc == 0)
+		rc = put_user(events, mask_ptr);
+	return rc;
+}
+
+static int get_interface(struct slgt_info *info, int __user *if_mode)
+{
+	DBGINFO(("%s get_interface=%x\n", info->device_name, info->if_mode));
+	if (put_user(info->if_mode, if_mode))
+		return -EFAULT;
+	return 0;
+}
+
+static int set_interface(struct slgt_info *info, int if_mode)
+{
+ 	unsigned long flags;
+	unsigned char val;
+
+	DBGINFO(("%s set_interface=%x)\n", info->device_name, if_mode));
+	spin_lock_irqsave(&info->lock,flags);
+	info->if_mode = if_mode;
+
+	msc_set_vcr(info);
+
+	/* TCR (tx control) 07  1=RTS driver control */
+	val = rd_reg16(info, TCR);
+	if (info->if_mode & MGSL_INTERFACE_RTS_EN)
+		val |= BIT7;
+	else
+		val &= ~BIT7;
+	wr_reg16(info, TCR, val);
+
+	spin_unlock_irqrestore(&info->lock,flags);
+	return 0;
+}
+
+static int modem_input_wait(struct slgt_info *info,int arg)
+{
+ 	unsigned long flags;
+	int rc;
+	struct mgsl_icount cprev, cnow;
+	DECLARE_WAITQUEUE(wait, current);
+
+	/* save current irq counts */
+	spin_lock_irqsave(&info->lock,flags);
+	cprev = info->icount;
+	add_wait_queue(&info->status_event_wait_q, &wait);
+	set_current_state(TASK_INTERRUPTIBLE);
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	for(;;) {
+		schedule();
+		if (signal_pending(current)) {
+			rc = -ERESTARTSYS;
+			break;
+		}
+
+		/* get new irq counts */
+		spin_lock_irqsave(&info->lock,flags);
+		cnow = info->icount;
+		set_current_state(TASK_INTERRUPTIBLE);
+		spin_unlock_irqrestore(&info->lock,flags);
+
+		/* if no change, wait aborted for some reason */
+		if (cnow.rng == cprev.rng && cnow.dsr == cprev.dsr &&
+		    cnow.dcd == cprev.dcd && cnow.cts == cprev.cts) {
+			rc = -EIO;
+			break;
+		}
+
+		/* check for change in caller specified modem input */
+		if ((arg & TIOCM_RNG && cnow.rng != cprev.rng) ||
+		    (arg & TIOCM_DSR && cnow.dsr != cprev.dsr) ||
+		    (arg & TIOCM_CD  && cnow.dcd != cprev.dcd) ||
+		    (arg & TIOCM_CTS && cnow.cts != cprev.cts)) {
+			rc = 0;
+			break;
+		}
+
+		cprev = cnow;
+	}
+	remove_wait_queue(&info->status_event_wait_q, &wait);
+	set_current_state(TASK_RUNNING);
+	return rc;
+}
+
+/*
+ *  return state of serial control and status signals
+ */
+static int tiocmget(struct tty_struct *tty, struct file *file)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned int result;
+ 	unsigned long flags;
+
+	spin_lock_irqsave(&info->lock,flags);
+ 	get_signals(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	result = ((info->signals & SerialSignal_RTS) ? TIOCM_RTS:0) +
+		((info->signals & SerialSignal_DTR) ? TIOCM_DTR:0) +
+		((info->signals & SerialSignal_DCD) ? TIOCM_CAR:0) +
+		((info->signals & SerialSignal_RI)  ? TIOCM_RNG:0) +
+		((info->signals & SerialSignal_DSR) ? TIOCM_DSR:0) +
+		((info->signals & SerialSignal_CTS) ? TIOCM_CTS:0);
+
+	DBGINFO(("%s tiocmget value=%08X\n", info->device_name, result));
+	return result;
+}
+
+/*
+ * set modem control signals (DTR/RTS)
+ *
+ * 	cmd	signal command: TIOCMBIS = set bit TIOCMBIC = clear bit
+ *		TIOCMSET = set/clear signal values
+ * 	value	bit mask for command
+ */
+static int tiocmset(struct tty_struct *tty, struct file *file,
+		    unsigned int set, unsigned int clear)
+{
+	struct slgt_info *info = tty->driver_data;
+ 	unsigned long flags;
+
+	DBGINFO(("%s tiocmset(%x,%x)\n", info->device_name, set, clear));
+
+	if (set & TIOCM_RTS)
+		info->signals |= SerialSignal_RTS;
+	if (set & TIOCM_DTR)
+		info->signals |= SerialSignal_DTR;
+	if (clear & TIOCM_RTS)
+		info->signals &= ~SerialSignal_RTS;
+	if (clear & TIOCM_DTR)
+		info->signals &= ~SerialSignal_DTR;
+
+	spin_lock_irqsave(&info->lock,flags);
+ 	set_signals(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+	return 0;
+}
+
+/*
+ *  block current process until the device is ready to open
+ */
+static int block_til_ready(struct tty_struct *tty, struct file *filp,
+			   struct slgt_info *info)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	int		retval;
+	int		do_clocal = 0, extra_count = 0;
+	unsigned long	flags;
+
+	DBGINFO(("%s block_til_ready\n", tty->driver->name));
+
+	if (filp->f_flags & O_NONBLOCK || tty->flags & (1 << TTY_IO_ERROR)){
+		/* nonblock mode is set or port is not enabled */
+		info->flags |= ASYNC_NORMAL_ACTIVE;
+		return 0;
+	}
+
+	if (tty->termios->c_cflag & CLOCAL)
+		do_clocal = 1;
+
+	/* Wait for carrier detect and the line to become
+	 * free (i.e., not in use by the callout).  While we are in
+	 * this loop, info->count is dropped by one, so that
+	 * close() knows when to free things.  We restore it upon
+	 * exit, either normal or abnormal.
+	 */
+
+	retval = 0;
+	add_wait_queue(&info->open_wait, &wait);
+
+	spin_lock_irqsave(&info->lock, flags);
+	if (!tty_hung_up_p(filp)) {
+		extra_count = 1;
+		info->count--;
+	}
+	spin_unlock_irqrestore(&info->lock, flags);
+	info->blocked_open++;
+
+	while (1) {
+		if ((tty->termios->c_cflag & CBAUD)) {
+			spin_lock_irqsave(&info->lock,flags);
+			info->signals |= SerialSignal_RTS + SerialSignal_DTR;
+		 	set_signals(info);
+			spin_unlock_irqrestore(&info->lock,flags);
+		}
+
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (tty_hung_up_p(filp) || !(info->flags & ASYNC_INITIALIZED)){
+			retval = (info->flags & ASYNC_HUP_NOTIFY) ?
+					-EAGAIN : -ERESTARTSYS;
+			break;
+		}
+
+		spin_lock_irqsave(&info->lock,flags);
+	 	get_signals(info);
+		spin_unlock_irqrestore(&info->lock,flags);
+
+ 		if (!(info->flags & ASYNC_CLOSING) &&
+ 		    (do_clocal || (info->signals & SerialSignal_DCD)) ) {
+ 			break;
+		}
+
+		if (signal_pending(current)) {
+			retval = -ERESTARTSYS;
+			break;
+		}
+
+		DBGINFO(("%s block_til_ready wait\n", tty->driver->name));
+		schedule();
+	}
+
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(&info->open_wait, &wait);
+
+	if (extra_count)
+		info->count++;
+	info->blocked_open--;
+
+	if (!retval)
+		info->flags |= ASYNC_NORMAL_ACTIVE;
+
+	DBGINFO(("%s block_til_ready ready, rc=%d\n", tty->driver->name, retval));
+	return retval;
+}
+
+static int alloc_tmp_rbuf(struct slgt_info *info)
+{
+	info->tmp_rbuf = kmalloc(info->max_frame_size, GFP_KERNEL);
+	if (info->tmp_rbuf == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void free_tmp_rbuf(struct slgt_info *info)
+{
+	kfree(info->tmp_rbuf);
+	info->tmp_rbuf = NULL;
+}
+
+/*
+ * allocate DMA descriptor lists.
+ */
+static int alloc_desc(struct slgt_info *info)
+{
+	unsigned int i;
+	unsigned int pbufs;
+
+	/* allocate memory to hold descriptor lists */
+	info->bufs = pci_alloc_consistent(info->pdev, DESC_LIST_SIZE, &info->bufs_dma_addr);
+	if (info->bufs == NULL)
+		return -ENOMEM;
+
+	memset(info->bufs, 0, DESC_LIST_SIZE);
+
+	info->rbufs = (struct slgt_desc*)info->bufs;
+	info->tbufs = ((struct slgt_desc*)info->bufs) + info->rbuf_count;
+
+	pbufs = (unsigned int)info->bufs_dma_addr;
+
+	/*
+	 * Build circular lists of descriptors
+	 */
+
+	for (i=0; i < info->rbuf_count; i++) {
+		/* physical address of this descriptor */
+		info->rbufs[i].pdesc = pbufs + (i * sizeof(struct slgt_desc));
+
+		/* physical address of next descriptor */
+		if (i == info->rbuf_count - 1)
+			info->rbufs[i].next = cpu_to_le32(pbufs);
+		else
+			info->rbufs[i].next = cpu_to_le32(pbufs + ((i+1) * sizeof(struct slgt_desc)));
+		set_desc_count(info->rbufs[i], DMABUFSIZE);
+	}
+
+	for (i=0; i < info->tbuf_count; i++) {
+		/* physical address of this descriptor */
+		info->tbufs[i].pdesc = pbufs + ((info->rbuf_count + i) * sizeof(struct slgt_desc));
+
+		/* physical address of next descriptor */
+		if (i == info->tbuf_count - 1)
+			info->tbufs[i].next = cpu_to_le32(pbufs + info->rbuf_count * sizeof(struct slgt_desc));
+		else
+			info->tbufs[i].next = cpu_to_le32(pbufs + ((info->rbuf_count + i + 1) * sizeof(struct slgt_desc)));
+	}
+
+	return 0;
+}
+
+static void free_desc(struct slgt_info *info)
+{
+	if (info->bufs != NULL) {
+		pci_free_consistent(info->pdev, DESC_LIST_SIZE, info->bufs, info->bufs_dma_addr);
+		info->bufs  = NULL;
+		info->rbufs = NULL;
+		info->tbufs = NULL;
+	}
+}
+
+static int alloc_bufs(struct slgt_info *info, struct slgt_desc *bufs, int count)
+{
+	int i;
+	for (i=0; i < count; i++) {
+		if ((bufs[i].buf = pci_alloc_consistent(info->pdev, DMABUFSIZE, &bufs[i].buf_dma_addr)) == NULL)
+			return -ENOMEM;
+		bufs[i].pbuf  = cpu_to_le32((unsigned int)bufs[i].buf_dma_addr);
+	}
+	return 0;
+}
+
+static void free_bufs(struct slgt_info *info, struct slgt_desc *bufs, int count)
+{
+	int i;
+	for (i=0; i < count; i++) {
+		if (bufs[i].buf == NULL)
+			continue;
+		pci_free_consistent(info->pdev, DMABUFSIZE, bufs[i].buf, bufs[i].buf_dma_addr);
+		bufs[i].buf = NULL;
+	}
+}
+
+static int alloc_dma_bufs(struct slgt_info *info)
+{
+	info->rbuf_count = 32;
+	info->tbuf_count = 32;
+
+	if (alloc_desc(info) < 0 ||
+	    alloc_bufs(info, info->rbufs, info->rbuf_count) < 0 ||
+	    alloc_bufs(info, info->tbufs, info->tbuf_count) < 0 ||
+	    alloc_tmp_rbuf(info) < 0) {
+		DBGERR(("%s DMA buffer alloc fail\n", info->device_name));
+		return -ENOMEM;
+	}
+	reset_rbufs(info);
+	return 0;
+}
+
+static void free_dma_bufs(struct slgt_info *info)
+{
+	if (info->bufs) {
+		free_bufs(info, info->rbufs, info->rbuf_count);
+		free_bufs(info, info->tbufs, info->tbuf_count);
+		free_desc(info);
+	}
+	free_tmp_rbuf(info);
+}
+
+static int claim_resources(struct slgt_info *info)
+{
+	if (request_mem_region(info->phys_reg_addr, SLGT_REG_SIZE, "synclink_gt") == NULL) {
+		DBGERR(("%s reg addr conflict, addr=%08X\n",
+			info->device_name, info->phys_reg_addr));
+		info->init_error = DiagStatus_AddressConflict;
+		goto errout;
+	}
+	else
+		info->reg_addr_requested = 1;
+
+	info->reg_addr = ioremap(info->phys_reg_addr, PAGE_SIZE);
+	if (!info->reg_addr) {
+		DBGERR(("%s cant map device registers, addr=%08X\n",
+			info->device_name, info->phys_reg_addr));
+		info->init_error = DiagStatus_CantAssignPciResources;
+		goto errout;
+	}
+	info->reg_addr += info->reg_offset;
+	return 0;
+
+errout:
+	release_resources(info);
+	return -ENODEV;
+}
+
+static void release_resources(struct slgt_info *info)
+{
+	if (info->irq_requested) {
+		free_irq(info->irq_level, info);
+		info->irq_requested = 0;
+	}
+
+	if (info->reg_addr_requested) {
+		release_mem_region(info->phys_reg_addr, SLGT_REG_SIZE);
+		info->reg_addr_requested = 0;
+	}
+
+	if (info->reg_addr) {
+		iounmap(info->reg_addr - info->reg_offset);
+		info->reg_addr = NULL;
+	}
+}
+
+/* Add the specified device instance data structure to the
+ * global linked list of devices and increment the device count.
+ */
+static void add_device(struct slgt_info *info)
+{
+	char *devstr;
+
+	info->next_device = NULL;
+	info->line = slgt_device_count;
+	sprintf(info->device_name, "%s%d", tty_dev_prefix, info->line);
+
+	if (info->line < MAX_DEVICES) {
+		if (maxframe[info->line])
+			info->max_frame_size = maxframe[info->line];
+		info->dosyncppp = dosyncppp[info->line];
+	}
+
+	slgt_device_count++;
+
+	if (!slgt_device_list)
+		slgt_device_list = info;
+	else {
+		struct slgt_info *current_dev = slgt_device_list;
+		while(current_dev->next_device)
+			current_dev = current_dev->next_device;
+		current_dev->next_device = info;
+	}
+
+	if (info->max_frame_size < 4096)
+		info->max_frame_size = 4096;
+	else if (info->max_frame_size > 65535)
+		info->max_frame_size = 65535;
+
+	switch(info->pdev->device) {
+	case SYNCLINK_GT_DEVICE_ID:
+		devstr = "GT";
+		break;
+	case SYNCLINK_GT4_DEVICE_ID:
+		devstr = "GT4";
+		break;
+	case SYNCLINK_AC_DEVICE_ID:
+		devstr = "AC";
+		info->params.mode = MGSL_MODE_ASYNC;
+		break;
+	default:
+		devstr = "(unknown model)";
+	}
+	printk("SyncLink %s %s IO=%08x IRQ=%d MaxFrameSize=%u\n",
+		devstr, info->device_name, info->phys_reg_addr,
+		info->irq_level, info->max_frame_size);
+
+#ifdef CONFIG_HDLC
+	hdlcdev_init(info);
+#endif
+}
+
+/*
+ *  allocate device instance structure, return NULL on failure
+ */
+static struct slgt_info *alloc_dev(int adapter_num, int port_num, struct pci_dev *pdev)
+{
+	struct slgt_info *info;
+
+	info = kmalloc(sizeof(struct slgt_info), GFP_KERNEL);
+
+	if (!info) {
+		DBGERR(("%s device alloc failed adapter=%d port=%d\n",
+			driver_name, adapter_num, port_num));
+	} else {
+		memset(info, 0, sizeof(struct slgt_info));
+		info->magic = MGSL_MAGIC;
+		INIT_WORK(&info->task, bh_handler, info);
+		info->max_frame_size = 4096;
+		info->raw_rx_size = DMABUFSIZE;
+		info->close_delay = 5*HZ/10;
+		info->closing_wait = 30*HZ;
+		init_waitqueue_head(&info->open_wait);
+		init_waitqueue_head(&info->close_wait);
+		init_waitqueue_head(&info->status_event_wait_q);
+		init_waitqueue_head(&info->event_wait_q);
+		spin_lock_init(&info->netlock);
+		memcpy(&info->params,&default_params,sizeof(MGSL_PARAMS));
+		info->idle_mode = HDLC_TXIDLE_FLAGS;
+		info->adapter_num = adapter_num;
+		info->port_num = port_num;
+
+		init_timer(&info->tx_timer);
+		info->tx_timer.data = (unsigned long)info;
+		info->tx_timer.function = tx_timeout;
+
+		init_timer(&info->rx_timer);
+		info->rx_timer.data = (unsigned long)info;
+		info->rx_timer.function = rx_timeout;
+
+		/* Copy configuration info to device instance data */
+		info->pdev = pdev;
+		info->irq_level = pdev->irq;
+		info->phys_reg_addr = pci_resource_start(pdev,0);
+
+		/* veremap works on page boundaries
+		 * map full page starting at the page boundary
+		 */
+		info->reg_offset    = info->phys_reg_addr & (PAGE_SIZE-1);
+		info->phys_reg_addr &= ~(PAGE_SIZE-1);
+
+		info->bus_type = MGSL_BUS_TYPE_PCI;
+		info->irq_flags = SA_SHIRQ;
+
+		info->init_error = -1; /* assume error, set to 0 on successful init */
+	}
+
+	return info;
+}
+
+static void device_init(int adapter_num, struct pci_dev *pdev)
+{
+	struct slgt_info *port_array[SLGT_MAX_PORTS];
+	int i;
+	int port_count = 1;
+
+	if (pdev->device == SYNCLINK_GT4_DEVICE_ID)
+		port_count = 4;
+
+	/* allocate device instances for all ports */
+	for (i=0; i < port_count; ++i) {
+		port_array[i] = alloc_dev(adapter_num, i, pdev);
+		if (port_array[i] == NULL) {
+			for (--i; i >= 0; --i)
+				kfree(port_array[i]);
+			return;
+		}
+	}
+
+	/* give copy of port_array to all ports and add to device list  */
+	for (i=0; i < port_count; ++i) {
+		memcpy(port_array[i]->port_array, port_array, sizeof(port_array));
+		add_device(port_array[i]);
+		port_array[i]->port_count = port_count;
+		spin_lock_init(&port_array[i]->lock);
+	}
+
+	/* Allocate and claim adapter resources */
+	if (!claim_resources(port_array[0])) {
+
+		alloc_dma_bufs(port_array[0]);
+
+		/* copy resource information from first port to others */
+		for (i = 1; i < port_count; ++i) {
+			port_array[i]->lock      = port_array[0]->lock;
+			port_array[i]->irq_level = port_array[0]->irq_level;
+			port_array[i]->reg_addr  = port_array[0]->reg_addr;
+			alloc_dma_bufs(port_array[i]);
+		}
+
+		if (request_irq(port_array[0]->irq_level,
+					slgt_interrupt,
+					port_array[0]->irq_flags,
+					port_array[0]->device_name,
+					port_array[0]) < 0) {
+			DBGERR(("%s request_irq failed IRQ=%d\n",
+				port_array[0]->device_name,
+				port_array[0]->irq_level));
+		} else {
+			port_array[0]->irq_requested = 1;
+			adapter_test(port_array[0]);
+			for (i=1 ; i < port_count ; i++)
+				port_array[i]->init_error = port_array[0]->init_error;
+		}
+	}
+}
+
+static int __devinit init_one(struct pci_dev *dev,
+			      const struct pci_device_id *ent)
+{
+	if (pci_enable_device(dev)) {
+		printk("error enabling pci device %p\n", dev);
+		return -EIO;
+	}
+	pci_set_master(dev);
+	device_init(slgt_device_count, dev);
+	return 0;
+}
+
+static void __devexit remove_one(struct pci_dev *dev)
+{
+}
+
+static struct tty_operations ops = {
+	.open = open,
+	.close = close,
+	.write = write,
+	.put_char = put_char,
+	.flush_chars = flush_chars,
+	.write_room = write_room,
+	.chars_in_buffer = chars_in_buffer,
+	.flush_buffer = flush_buffer,
+	.ioctl = ioctl,
+	.throttle = throttle,
+	.unthrottle = unthrottle,
+	.send_xchar = send_xchar,
+	.break_ctl = set_break,
+	.wait_until_sent = wait_until_sent,
+ 	.read_proc = read_proc,
+	.set_termios = set_termios,
+	.stop = tx_hold,
+	.start = tx_release,
+	.hangup = hangup,
+	.tiocmget = tiocmget,
+	.tiocmset = tiocmset,
+};
+
+static void slgt_cleanup(void)
+{
+	int rc;
+	struct slgt_info *info;
+	struct slgt_info *tmp;
+
+	printk("unload %s %s\n", driver_name, driver_version);
+
+	if (serial_driver) {
+		if ((rc = tty_unregister_driver(serial_driver)))
+			DBGERR(("tty_unregister_driver error=%d\n", rc));
+		put_tty_driver(serial_driver);
+	}
+
+	/* reset devices */
+	info = slgt_device_list;
+	while(info) {
+		reset_port(info);
+		info = info->next_device;
+	}
+
+	/* release devices */
+	info = slgt_device_list;
+	while(info) {
+#ifdef CONFIG_HDLC
+		hdlcdev_exit(info);
+#endif
+		free_dma_bufs(info);
+		free_tmp_rbuf(info);
+		if (info->port_num == 0)
+			release_resources(info);
+		tmp = info;
+		info = info->next_device;
+		kfree(tmp);
+	}
+
+	if (pci_registered)
+		pci_unregister_driver(&pci_driver);
+}
+
+/*
+ *  Driver initialization entry point.
+ */
+static int __init slgt_init(void)
+{
+	int rc;
+
+ 	printk("%s %s\n", driver_name, driver_version);
+
+	slgt_device_count = 0;
+	if ((rc = pci_register_driver(&pci_driver)) < 0) {
+		printk("%s pci_register_driver error=%d\n", driver_name, rc);
+		return rc;
+	}
+	pci_registered = 1;
+
+	if (!slgt_device_list) {
+		printk("%s no devices found\n",driver_name);
+		return -ENODEV;
+	}
+
+	serial_driver = alloc_tty_driver(MAX_DEVICES);
+	if (!serial_driver) {
+		rc = -ENOMEM;
+		goto error;
+	}
+
+	/* Initialize the tty_driver structure */
+
+	serial_driver->owner = THIS_MODULE;
+	serial_driver->driver_name = tty_driver_name;
+	serial_driver->name = tty_dev_prefix;
+	serial_driver->major = ttymajor;
+	serial_driver->minor_start = 64;
+	serial_driver->type = TTY_DRIVER_TYPE_SERIAL;
+	serial_driver->subtype = SERIAL_TYPE_NORMAL;
+	serial_driver->init_termios = tty_std_termios;
+	serial_driver->init_termios.c_cflag =
+		B9600 | CS8 | CREAD | HUPCL | CLOCAL;
+	serial_driver->flags = TTY_DRIVER_REAL_RAW;
+	tty_set_operations(serial_driver, &ops);
+	if ((rc = tty_register_driver(serial_driver)) < 0) {
+		DBGERR(("%s can't register serial driver\n", driver_name));
+		put_tty_driver(serial_driver);
+		serial_driver = NULL;
+		goto error;
+	}
+
+ 	printk("%s %s, tty major#%d\n",
+		driver_name, driver_version,
+		serial_driver->major);
+
+	return 0;
+
+error:
+	slgt_cleanup();
+	return rc;
+}
+
+static void __exit slgt_exit(void)
+{
+	slgt_cleanup();
+}
+
+module_init(slgt_init);
+module_exit(slgt_exit);
+
+/*
+ * register access routines
+ */
+
+#define CALC_REGADDR() \
+	unsigned long reg_addr = ((unsigned long)info->reg_addr) + addr; \
+	if (addr >= 0x80) \
+		reg_addr += (info->port_num) * 32;
+
+static __u8 rd_reg8(struct slgt_info *info, unsigned int addr)
+{
+	CALC_REGADDR();
+	return readb((void __iomem *)reg_addr);
+}
+
+static void wr_reg8(struct slgt_info *info, unsigned int addr, __u8 value)
+{
+	CALC_REGADDR();
+	writeb(value, (void __iomem *)reg_addr);
+}
+
+static __u16 rd_reg16(struct slgt_info *info, unsigned int addr)
+{
+	CALC_REGADDR();
+	return readw((void __iomem *)reg_addr);
+}
+
+static void wr_reg16(struct slgt_info *info, unsigned int addr, __u16 value)
+{
+	CALC_REGADDR();
+	writew(value, (void __iomem *)reg_addr);
+}
+
+static __u32 rd_reg32(struct slgt_info *info, unsigned int addr)
+{
+	CALC_REGADDR();
+	return readl((void __iomem *)reg_addr);
+}
+
+static void wr_reg32(struct slgt_info *info, unsigned int addr, __u32 value)
+{
+	CALC_REGADDR();
+	writel(value, (void __iomem *)reg_addr);
+}
+
+static void rdma_reset(struct slgt_info *info)
+{
+	unsigned int i;
+
+	/* set reset bit */
+	wr_reg32(info, RDCSR, BIT1);
+
+	/* wait for enable bit cleared */
+	for(i=0 ; i < 1000 ; i++)
+		if (!(rd_reg32(info, RDCSR) & BIT0))
+			break;
+}
+
+static void tdma_reset(struct slgt_info *info)
+{
+	unsigned int i;
+
+	/* set reset bit */
+	wr_reg32(info, TDCSR, BIT1);
+
+	/* wait for enable bit cleared */
+	for(i=0 ; i < 1000 ; i++)
+		if (!(rd_reg32(info, TDCSR) & BIT0))
+			break;
+}
+
+/*
+ * enable internal loopback
+ * TxCLK and RxCLK are generated from BRG
+ * and TxD is looped back to RxD internally.
+ */
+static void enable_loopback(struct slgt_info *info)
+{
+	/* SCR (serial control) BIT2=looopback enable */
+	wr_reg16(info, SCR, (unsigned short)(rd_reg16(info, SCR) | BIT2));
+
+	if (info->params.mode != MGSL_MODE_ASYNC) {
+		/* CCR (clock control)
+		 * 07..05  tx clock source (010 = BRG)
+		 * 04..02  rx clock source (010 = BRG)
+		 * 01      auxclk enable   (0 = disable)
+		 * 00      BRG enable      (1 = enable)
+		 *
+		 * 0100 1001
+		 */
+		wr_reg8(info, CCR, 0x49);
+
+		/* set speed if available, otherwise use default */
+		if (info->params.clock_speed)
+			set_rate(info, info->params.clock_speed);
+		else
+			set_rate(info, 3686400);
+	}
+}
+
+/*
+ *  set baud rate generator to specified rate
+ */
+static void set_rate(struct slgt_info *info, u32 rate)
+{
+	unsigned int div;
+	static unsigned int osc = 14745600;
+
+	/* div = osc/rate - 1
+	 *
+	 * Round div up if osc/rate is not integer to
+	 * force to next slowest rate.
+	 */
+
+	if (rate) {
+		div = osc/rate;
+		if (!(osc % rate) && div)
+			div--;
+		wr_reg16(info, BDR, (unsigned short)div);
+	}
+}
+
+static void rx_stop(struct slgt_info *info)
+{
+	unsigned short val;
+
+	/* disable and reset receiver */
+	val = rd_reg16(info, RCR) & ~BIT1;          /* clear enable bit */
+	wr_reg16(info, RCR, (unsigned short)(val | BIT2)); /* set reset bit */
+	wr_reg16(info, RCR, val);                  /* clear reset bit */
+
+	slgt_irq_off(info, IRQ_RXOVER + IRQ_RXDATA + IRQ_RXIDLE);
+
+	/* clear pending rx interrupts */
+	wr_reg16(info, SSR, IRQ_RXIDLE + IRQ_RXOVER);
+
+	rdma_reset(info);
+
+	info->rx_enabled = 0;
+	info->rx_restart = 0;
+}
+
+static void rx_start(struct slgt_info *info)
+{
+	unsigned short val;
+
+	slgt_irq_off(info, IRQ_RXOVER + IRQ_RXDATA);
+
+	/* clear pending rx overrun IRQ */
+	wr_reg16(info, SSR, IRQ_RXOVER);
+
+	/* reset and disable receiver */
+	val = rd_reg16(info, RCR) & ~BIT1; /* clear enable bit */
+	wr_reg16(info, RCR, (unsigned short)(val | BIT2)); /* set reset bit */
+	wr_reg16(info, RCR, val);                  /* clear reset bit */
+
+	rdma_reset(info);
+	reset_rbufs(info);
+
+	/* set 1st descriptor address */
+	wr_reg32(info, RDDAR, info->rbufs[0].pdesc);
+
+	if (info->params.mode != MGSL_MODE_ASYNC) {
+		/* enable rx DMA and DMA interrupt */
+		wr_reg32(info, RDCSR, (BIT2 + BIT0));
+	} else {
+		/* enable saving of rx status, rx DMA and DMA interrupt */
+		wr_reg32(info, RDCSR, (BIT6 + BIT2 + BIT0));
+	}
+
+	slgt_irq_on(info, IRQ_RXOVER);
+
+	/* enable receiver */
+	wr_reg16(info, RCR, (unsigned short)(rd_reg16(info, RCR) | BIT1));
+
+	info->rx_restart = 0;
+	info->rx_enabled = 1;
+}
+
+static void tx_start(struct slgt_info *info)
+{
+	if (!info->tx_enabled) {
+		wr_reg16(info, TCR,
+			(unsigned short)(rd_reg16(info, TCR) | BIT1));
+		info->tx_enabled = TRUE;
+	}
+
+	if (info->tx_count) {
+		info->drop_rts_on_tx_done = 0;
+
+		if (info->params.mode != MGSL_MODE_ASYNC) {
+			if (info->params.flags & HDLC_FLAG_AUTO_RTS) {
+				get_signals(info);
+				if (!(info->signals & SerialSignal_RTS)) {
+					info->signals |= SerialSignal_RTS;
+					set_signals(info);
+					info->drop_rts_on_tx_done = 1;
+				}
+			}
+
+			slgt_irq_off(info, IRQ_TXDATA);
+			slgt_irq_on(info, IRQ_TXUNDER + IRQ_TXIDLE);
+			/* clear tx idle and underrun status bits */
+			wr_reg16(info, SSR, (unsigned short)(IRQ_TXIDLE + IRQ_TXUNDER));
+
+			if (!(rd_reg32(info, TDCSR) & BIT0)) {
+				/* tx DMA stopped, restart tx DMA */
+				tdma_reset(info);
+				/* set 1st descriptor address */
+				wr_reg32(info, TDDAR, info->tbufs[info->tbuf_start].pdesc);
+				if (info->params.mode == MGSL_MODE_RAW)
+					wr_reg32(info, TDCSR, BIT2 + BIT0); /* IRQ + DMA enable */
+				else
+					wr_reg32(info, TDCSR, BIT0); /* DMA enable */
+			}
+
+			if (info->params.mode != MGSL_MODE_RAW) {
+				info->tx_timer.expires = jiffies + msecs_to_jiffies(5000);
+				add_timer(&info->tx_timer);
+			}
+		} else {
+			tdma_reset(info);
+			/* set 1st descriptor address */
+			wr_reg32(info, TDDAR, info->tbufs[info->tbuf_start].pdesc);
+
+			slgt_irq_off(info, IRQ_TXDATA);
+			slgt_irq_on(info, IRQ_TXIDLE);
+			/* clear tx idle status bit */
+			wr_reg16(info, SSR, IRQ_TXIDLE);
+
+			/* enable tx DMA */
+			wr_reg32(info, TDCSR, BIT0);
+		}
+
+		info->tx_active = 1;
+	}
+}
+
+static void tx_stop(struct slgt_info *info)
+{
+	unsigned short val;
+
+	del_timer(&info->tx_timer);
+
+	tdma_reset(info);
+
+	/* reset and disable transmitter */
+	val = rd_reg16(info, TCR) & ~BIT1;          /* clear enable bit */
+	wr_reg16(info, TCR, (unsigned short)(val | BIT2)); /* set reset bit */
+	wr_reg16(info, TCR, val);                  /* clear reset */
+
+	slgt_irq_off(info, IRQ_TXDATA + IRQ_TXIDLE + IRQ_TXUNDER);
+
+	/* clear tx idle and underrun status bit */
+	wr_reg16(info, SSR, (unsigned short)(IRQ_TXIDLE + IRQ_TXUNDER));
+
+	reset_tbufs(info);
+
+	info->tx_enabled = 0;
+	info->tx_active  = 0;
+}
+
+static void reset_port(struct slgt_info *info)
+{
+	if (!info->reg_addr)
+		return;
+
+	tx_stop(info);
+	rx_stop(info);
+
+	info->signals &= ~(SerialSignal_DTR + SerialSignal_RTS);
+	set_signals(info);
+
+	slgt_irq_off(info, IRQ_ALL | IRQ_MASTER);
+}
+
+static void reset_adapter(struct slgt_info *info)
+{
+	int i;
+	for (i=0; i < info->port_count; ++i) {
+		if (info->port_array[i])
+			reset_port(info->port_array[i]);
+	}
+}
+
+static void async_mode(struct slgt_info *info)
+{
+  	unsigned short val;
+
+	slgt_irq_off(info, IRQ_ALL | IRQ_MASTER);
+	tx_stop(info);
+	rx_stop(info);
+
+	/* TCR (tx control)
+	 *
+	 * 15..13  mode, 010=async
+	 * 12..10  encoding, 000=NRZ
+	 * 09      parity enable
+	 * 08      1=odd parity, 0=even parity
+	 * 07      1=RTS driver control
+	 * 06      1=break enable
+	 * 05..04  character length
+	 *         00=5 bits
+	 *         01=6 bits
+	 *         10=7 bits
+	 *         11=8 bits
+	 * 03      0=1 stop bit, 1=2 stop bits
+	 * 02      reset
+	 * 01      enable
+	 * 00      auto-CTS enable
+	 */
+	val = 0x4000;
+
+	if (info->if_mode & MGSL_INTERFACE_RTS_EN)
+		val |= BIT7;
+
+	if (info->params.parity != ASYNC_PARITY_NONE) {
+		val |= BIT9;
+		if (info->params.parity == ASYNC_PARITY_ODD)
+			val |= BIT8;
+	}
+
+	switch (info->params.data_bits)
+	{
+	case 6: val |= BIT4; break;
+	case 7: val |= BIT5; break;
+	case 8: val |= BIT5 + BIT4; break;
+	}
+
+	if (info->params.stop_bits != 1)
+		val |= BIT3;
+
+	if (info->params.flags & HDLC_FLAG_AUTO_CTS)
+		val |= BIT0;
+
+	wr_reg16(info, TCR, val);
+
+	/* RCR (rx control)
+	 *
+	 * 15..13  mode, 010=async
+	 * 12..10  encoding, 000=NRZ
+	 * 09      parity enable
+	 * 08      1=odd parity, 0=even parity
+	 * 07..06  reserved, must be 0
+	 * 05..04  character length
+	 *         00=5 bits
+	 *         01=6 bits
+	 *         10=7 bits
+	 *         11=8 bits
+	 * 03      reserved, must be zero
+	 * 02      reset
+	 * 01      enable
+	 * 00      auto-DCD enable
+	 */
+	val = 0x4000;
+
+	if (info->params.parity != ASYNC_PARITY_NONE) {
+		val |= BIT9;
+		if (info->params.parity == ASYNC_PARITY_ODD)
+			val |= BIT8;
+	}
+
+	switch (info->params.data_bits)
+	{
+	case 6: val |= BIT4; break;
+	case 7: val |= BIT5; break;
+	case 8: val |= BIT5 + BIT4; break;
+	}
+
+	if (info->params.flags & HDLC_FLAG_AUTO_DCD)
+		val |= BIT0;
+
+	wr_reg16(info, RCR, val);
+
+	/* CCR (clock control)
+	 *
+	 * 07..05  011 = tx clock source is BRG/16
+	 * 04..02  010 = rx clock source is BRG
+	 * 01      0 = auxclk disabled
+	 * 00      1 = BRG enabled
+	 *
+	 * 0110 1001
+	 */
+	wr_reg8(info, CCR, 0x69);
+
+	msc_set_vcr(info);
+
+	tx_set_idle(info);
+
+	/* SCR (serial control)
+	 *
+	 * 15  1=tx req on FIFO half empty
+	 * 14  1=rx req on FIFO half full
+	 * 13  tx data  IRQ enable
+	 * 12  tx idle  IRQ enable
+	 * 11  rx break on IRQ enable
+	 * 10  rx data  IRQ enable
+	 * 09  rx break off IRQ enable
+	 * 08  overrun  IRQ enable
+	 * 07  DSR      IRQ enable
+	 * 06  CTS      IRQ enable
+	 * 05  DCD      IRQ enable
+	 * 04  RI       IRQ enable
+	 * 03  reserved, must be zero
+	 * 02  1=txd->rxd internal loopback enable
+	 * 01  reserved, must be zero
+	 * 00  1=master IRQ enable
+	 */
+	val = BIT15 + BIT14 + BIT0;
+	wr_reg16(info, SCR, val);
+
+	slgt_irq_on(info, IRQ_RXBREAK | IRQ_RXOVER);
+
+	set_rate(info, info->params.data_rate * 16);
+
+	if (info->params.loopback)
+		enable_loopback(info);
+}
+
+static void hdlc_mode(struct slgt_info *info)
+{
+	unsigned short val;
+
+	slgt_irq_off(info, IRQ_ALL | IRQ_MASTER);
+	tx_stop(info);
+	rx_stop(info);
+
+	/* TCR (tx control)
+	 *
+	 * 15..13  mode, 000=HDLC 001=raw sync
+	 * 12..10  encoding
+	 * 09      CRC enable
+	 * 08      CRC32
+	 * 07      1=RTS driver control
+	 * 06      preamble enable
+	 * 05..04  preamble length
+	 * 03      share open/close flag
+	 * 02      reset
+	 * 01      enable
+	 * 00      auto-CTS enable
+	 */
+	val = 0;
+
+	if (info->params.mode == MGSL_MODE_RAW)
+		val |= BIT13;
+	if (info->if_mode & MGSL_INTERFACE_RTS_EN)
+		val |= BIT7;
+
+	switch(info->params.encoding)
+	{
+	case HDLC_ENCODING_NRZB:          val |= BIT10; break;
+	case HDLC_ENCODING_NRZI_MARK:     val |= BIT11; break;
+	case HDLC_ENCODING_NRZI:          val |= BIT11 + BIT10; break;
+	case HDLC_ENCODING_BIPHASE_MARK:  val |= BIT12; break;
+	case HDLC_ENCODING_BIPHASE_SPACE: val |= BIT12 + BIT10; break;
+	case HDLC_ENCODING_BIPHASE_LEVEL: val |= BIT12 + BIT11; break;
+	case HDLC_ENCODING_DIFF_BIPHASE_LEVEL: val |= BIT12 + BIT11 + BIT10; break;
+	}
+
+	switch (info->params.crc_type)
+	{
+	case HDLC_CRC_16_CCITT: val |= BIT9; break;
+	case HDLC_CRC_32_CCITT: val |= BIT9 + BIT8; break;
+	}
+
+	if (info->params.preamble != HDLC_PREAMBLE_PATTERN_NONE)
+		val |= BIT6;
+
+	switch (info->params.preamble_length)
+	{
+	case HDLC_PREAMBLE_LENGTH_16BITS: val |= BIT5; break;
+	case HDLC_PREAMBLE_LENGTH_32BITS: val |= BIT4; break;
+	case HDLC_PREAMBLE_LENGTH_64BITS: val |= BIT5 + BIT4; break;
+	}
+
+	if (info->params.flags & HDLC_FLAG_AUTO_CTS)
+		val |= BIT0;
+
+	wr_reg16(info, TCR, val);
+
+	/* TPR (transmit preamble) */
+
+	switch (info->params.preamble)
+	{
+	case HDLC_PREAMBLE_PATTERN_FLAGS: val = 0x7e; break;
+	case HDLC_PREAMBLE_PATTERN_ONES:  val = 0xff; break;
+	case HDLC_PREAMBLE_PATTERN_ZEROS: val = 0x00; break;
+	case HDLC_PREAMBLE_PATTERN_10:    val = 0x55; break;
+	case HDLC_PREAMBLE_PATTERN_01:    val = 0xaa; break;
+	default:                          val = 0x7e; break;
+	}
+	wr_reg8(info, TPR, (unsigned char)val);
+
+	/* RCR (rx control)
+	 *
+	 * 15..13  mode, 000=HDLC 001=raw sync
+	 * 12..10  encoding
+	 * 09      CRC enable
+	 * 08      CRC32
+	 * 07..03  reserved, must be 0
+	 * 02      reset
+	 * 01      enable
+	 * 00      auto-DCD enable
+	 */
+	val = 0;
+
+	if (info->params.mode == MGSL_MODE_RAW)
+		val |= BIT13;
+
+	switch(info->params.encoding)
+	{
+	case HDLC_ENCODING_NRZB:          val |= BIT10; break;
+	case HDLC_ENCODING_NRZI_MARK:     val |= BIT11; break;
+	case HDLC_ENCODING_NRZI:          val |= BIT11 + BIT10; break;
+	case HDLC_ENCODING_BIPHASE_MARK:  val |= BIT12; break;
+	case HDLC_ENCODING_BIPHASE_SPACE: val |= BIT12 + BIT10; break;
+	case HDLC_ENCODING_BIPHASE_LEVEL: val |= BIT12 + BIT11; break;
+	case HDLC_ENCODING_DIFF_BIPHASE_LEVEL: val |= BIT12 + BIT11 + BIT10; break;
+	}
+
+	switch (info->params.crc_type)
+	{
+	case HDLC_CRC_16_CCITT: val |= BIT9; break;
+	case HDLC_CRC_32_CCITT: val |= BIT9 + BIT8; break;
+	}
+
+	if (info->params.flags & HDLC_FLAG_AUTO_DCD)
+		val |= BIT0;
+
+	wr_reg16(info, RCR, val);
+
+	/* CCR (clock control)
+	 *
+	 * 07..05  tx clock source
+	 * 04..02  rx clock source
+	 * 01      auxclk enable
+	 * 00      BRG enable
+	 */
+	val = 0;
+
+	if (info->params.flags & HDLC_FLAG_TXC_BRG)
+	{
+		// when RxC source is DPLL, BRG generates 16X DPLL
+		// reference clock, so take TxC from BRG/16 to get
+		// transmit clock at actual data rate
+		if (info->params.flags & HDLC_FLAG_RXC_DPLL)
+			val |= BIT6 + BIT5;	/* 011, txclk = BRG/16 */
+		else
+			val |= BIT6;	/* 010, txclk = BRG */
+	}
+	else if (info->params.flags & HDLC_FLAG_TXC_DPLL)
+		val |= BIT7;	/* 100, txclk = DPLL Input */
+	else if (info->params.flags & HDLC_FLAG_TXC_RXCPIN)
+		val |= BIT5;	/* 001, txclk = RXC Input */
+
+	if (info->params.flags & HDLC_FLAG_RXC_BRG)
+		val |= BIT3;	/* 010, rxclk = BRG */
+	else if (info->params.flags & HDLC_FLAG_RXC_DPLL)
+		val |= BIT4;	/* 100, rxclk = DPLL */
+	else if (info->params.flags & HDLC_FLAG_RXC_TXCPIN)
+		val |= BIT2;	/* 001, rxclk = TXC Input */
+
+	if (info->params.clock_speed)
+		val |= BIT1 + BIT0;
+
+	wr_reg8(info, CCR, (unsigned char)val);
+
+	if (info->params.flags & (HDLC_FLAG_TXC_DPLL + HDLC_FLAG_RXC_DPLL))
+	{
+		// program DPLL mode
+		switch(info->params.encoding)
+		{
+		case HDLC_ENCODING_BIPHASE_MARK:
+		case HDLC_ENCODING_BIPHASE_SPACE:
+			val = BIT7; break;
+		case HDLC_ENCODING_BIPHASE_LEVEL:
+		case HDLC_ENCODING_DIFF_BIPHASE_LEVEL:
+			val = BIT7 + BIT6; break;
+		default: val = BIT6;	// NRZ encodings
+		}
+		wr_reg16(info, RCR, (unsigned short)(rd_reg16(info, RCR) | val));
+
+		// DPLL requires a 16X reference clock from BRG
+		set_rate(info, info->params.clock_speed * 16);
+	}
+	else
+		set_rate(info, info->params.clock_speed);
+
+	tx_set_idle(info);
+
+	msc_set_vcr(info);
+
+	/* SCR (serial control)
+	 *
+	 * 15  1=tx req on FIFO half empty
+	 * 14  1=rx req on FIFO half full
+	 * 13  tx data  IRQ enable
+	 * 12  tx idle  IRQ enable
+	 * 11  underrun IRQ enable
+	 * 10  rx data  IRQ enable
+	 * 09  rx idle  IRQ enable
+	 * 08  overrun  IRQ enable
+	 * 07  DSR      IRQ enable
+	 * 06  CTS      IRQ enable
+	 * 05  DCD      IRQ enable
+	 * 04  RI       IRQ enable
+	 * 03  reserved, must be zero
+	 * 02  1=txd->rxd internal loopback enable
+	 * 01  reserved, must be zero
+	 * 00  1=master IRQ enable
+	 */
+	wr_reg16(info, SCR, BIT15 + BIT14 + BIT0);
+
+	if (info->params.loopback)
+		enable_loopback(info);
+}
+
+/*
+ *  set transmit idle mode
+ */
+static void tx_set_idle(struct slgt_info *info)
+{
+	unsigned char val = 0xff;
+
+	switch(info->idle_mode)
+	{
+	case HDLC_TXIDLE_FLAGS:          val = 0x7e; break;
+	case HDLC_TXIDLE_ALT_ZEROS_ONES: val = 0xaa; break;
+	case HDLC_TXIDLE_ZEROS:          val = 0x00; break;
+	case HDLC_TXIDLE_ONES:           val = 0xff; break;
+	case HDLC_TXIDLE_ALT_MARK_SPACE: val = 0xaa; break;
+	case HDLC_TXIDLE_SPACE:          val = 0x00; break;
+	case HDLC_TXIDLE_MARK:           val = 0xff; break;
+	}
+
+	wr_reg8(info, TIR, val);
+}
+
+/*
+ * get state of V24 status (input) signals
+ */
+static void get_signals(struct slgt_info *info)
+{
+	unsigned short status = rd_reg16(info, SSR);
+
+	/* clear all serial signals except DTR and RTS */
+	info->signals &= SerialSignal_DTR + SerialSignal_RTS;
+
+	if (status & BIT3)
+		info->signals |= SerialSignal_DSR;
+	if (status & BIT2)
+		info->signals |= SerialSignal_CTS;
+	if (status & BIT1)
+		info->signals |= SerialSignal_DCD;
+	if (status & BIT0)
+		info->signals |= SerialSignal_RI;
+}
+
+/*
+ * set V.24 Control Register based on current configuration
+ */
+static void msc_set_vcr(struct slgt_info *info)
+{
+	unsigned char val = 0;
+
+	/* VCR (V.24 control)
+	 *
+	 * 07..04  serial IF select
+	 * 03      DTR
+	 * 02      RTS
+	 * 01      LL
+	 * 00      RL
+	 */
+
+	switch(info->if_mode & MGSL_INTERFACE_MASK)
+	{
+	case MGSL_INTERFACE_RS232:
+		val |= BIT5; /* 0010 */
+		break;
+	case MGSL_INTERFACE_V35:
+		val |= BIT7 + BIT6 + BIT5; /* 1110 */
+		break;
+	case MGSL_INTERFACE_RS422:
+		val |= BIT6; /* 0100 */
+		break;
+	}
+
+	if (info->signals & SerialSignal_DTR)
+		val |= BIT3;
+	if (info->signals & SerialSignal_RTS)
+		val |= BIT2;
+	if (info->if_mode & MGSL_INTERFACE_LL)
+		val |= BIT1;
+	if (info->if_mode & MGSL_INTERFACE_RL)
+		val |= BIT0;
+	wr_reg8(info, VCR, val);
+}
+
+/*
+ * set state of V24 control (output) signals
+ */
+static void set_signals(struct slgt_info *info)
+{
+	unsigned char val = rd_reg8(info, VCR);
+	if (info->signals & SerialSignal_DTR)
+		val |= BIT3;
+	else
+		val &= ~BIT3;
+	if (info->signals & SerialSignal_RTS)
+		val |= BIT2;
+	else
+		val &= ~BIT2;
+	wr_reg8(info, VCR, val);
+}
+
+/*
+ * free range of receive DMA buffers (i to last)
+ */
+static void free_rbufs(struct slgt_info *info, unsigned int i, unsigned int last)
+{
+	int done = 0;
+
+	while(!done) {
+		/* reset current buffer for reuse */
+		info->rbufs[i].status = 0;
+		if (info->params.mode == MGSL_MODE_RAW)
+			set_desc_count(info->rbufs[i], info->raw_rx_size);
+		else
+			set_desc_count(info->rbufs[i], DMABUFSIZE);
+
+		if (i == last)
+			done = 1;
+		if (++i == info->rbuf_count)
+			i = 0;
+	}
+	info->rbuf_current = i;
+}
+
+/*
+ * mark all receive DMA buffers as free
+ */
+static void reset_rbufs(struct slgt_info *info)
+{
+	free_rbufs(info, 0, info->rbuf_count - 1);
+}
+
+/*
+ * pass receive HDLC frame to upper layer
+ *
+ * return 1 if frame available, otherwise 0
+ */
+static int rx_get_frame(struct slgt_info *info)
+{
+	unsigned int start, end;
+	unsigned short status;
+	unsigned int framesize = 0;
+	int rc = 0;
+	unsigned long flags;
+	struct tty_struct *tty = info->tty;
+	unsigned char addr_field = 0xff;
+
+check_again:
+
+	framesize = 0;
+	addr_field = 0xff;
+	start = end = info->rbuf_current;
+
+	for (;;) {
+		if (!desc_complete(info->rbufs[end]))
+			goto cleanup;
+
+		if (framesize == 0 && info->params.addr_filter != 0xff)
+			addr_field = info->rbufs[end].buf[0];
+
+		framesize += desc_count(info->rbufs[end]);
+
+		if (desc_eof(info->rbufs[end]))
+			break;
+
+		if (++end == info->rbuf_count)
+			end = 0;
+
+		if (end == info->rbuf_current) {
+			if (info->rx_enabled){
+				spin_lock_irqsave(&info->lock,flags);
+				rx_start(info);
+				spin_unlock_irqrestore(&info->lock,flags);
+			}
+			goto cleanup;
+		}
+	}
+
+	/* status
+	 *
+	 * 15      buffer complete
+	 * 14..06  reserved
+	 * 05..04  residue
+	 * 02      eof (end of frame)
+	 * 01      CRC error
+	 * 00      abort
+	 */
+	status = desc_status(info->rbufs[end]);
+
+	/* ignore CRC bit if not using CRC (bit is undefined) */
+	if (info->params.crc_type == HDLC_CRC_NONE)
+		status &= ~BIT1;
+
+	if (framesize == 0 ||
+		 (addr_field != 0xff && addr_field != info->params.addr_filter)) {
+		free_rbufs(info, start, end);
+		goto check_again;
+	}
+
+	if (framesize < 2 || status & (BIT1+BIT0)) {
+		if (framesize < 2 || (status & BIT0))
+			info->icount.rxshort++;
+		else
+			info->icount.rxcrc++;
+		framesize = 0;
+
+#ifdef CONFIG_HDLC
+		{
+			struct net_device_stats *stats = hdlc_stats(info->netdev);
+			stats->rx_errors++;
+			stats->rx_frame_errors++;
+		}
+#endif
+	} else {
+		/* adjust frame size for CRC, if any */
+		if (info->params.crc_type == HDLC_CRC_16_CCITT)
+			framesize -= 2;
+		else if (info->params.crc_type == HDLC_CRC_32_CCITT)
+			framesize -= 4;
+	}
+
+	DBGBH(("%s rx frame status=%04X size=%d\n",
+		info->device_name, status, framesize));
+	DBGDATA(info, info->rbufs[start].buf, min_t(int, framesize, DMABUFSIZE), "rx");
+
+	if (framesize) {
+		if (framesize > info->max_frame_size)
+			info->icount.rxlong++;
+		else {
+			/* copy dma buffer(s) to contiguous temp buffer */
+			int copy_count = framesize;
+			int i = start;
+			unsigned char *p = info->tmp_rbuf;
+			info->tmp_rbuf_count = framesize;
+
+			info->icount.rxok++;
+
+			while(copy_count) {
+				int partial_count = min(copy_count, DMABUFSIZE);
+				memcpy(p, info->rbufs[i].buf, partial_count);
+				p += partial_count;
+				copy_count -= partial_count;
+				if (++i == info->rbuf_count)
+					i = 0;
+			}
+
+#ifdef CONFIG_HDLC
+			if (info->netcount)
+				hdlcdev_rx(info,info->tmp_rbuf, framesize);
+			else
+#endif
+				ldisc_receive_buf(tty, info->tmp_rbuf, info->flag_buf, framesize);
+		}
+	}
+	free_rbufs(info, start, end);
+	rc = 1;
+
+cleanup:
+	return rc;
+}
+
+/*
+ * pass receive buffer (RAW synchronous mode) to tty layer
+ * return 1 if buffer available, otherwise 0
+ */
+static int rx_get_buf(struct slgt_info *info)
+{
+	unsigned int i = info->rbuf_current;
+
+	if (!desc_complete(info->rbufs[i]))
+		return 0;
+	DBGDATA(info, info->rbufs[i].buf, desc_count(info->rbufs[i]), "rx");
+	DBGINFO(("rx_get_buf size=%d\n", desc_count(info->rbufs[i])));
+	ldisc_receive_buf(info->tty, info->rbufs[i].buf,
+			  info->flag_buf, desc_count(info->rbufs[i]));
+	free_rbufs(info, i, i);
+	return 1;
+}
+
+static void reset_tbufs(struct slgt_info *info)
+{
+	unsigned int i;
+	info->tbuf_current = 0;
+	for (i=0 ; i < info->tbuf_count ; i++) {
+		info->tbufs[i].status = 0;
+		info->tbufs[i].count  = 0;
+	}
+}
+
+/*
+ * return number of free transmit DMA buffers
+ */
+static unsigned int free_tbuf_count(struct slgt_info *info)
+{
+	unsigned int count = 0;
+	unsigned int i = info->tbuf_current;
+
+	do
+	{
+		if (desc_count(info->tbufs[i]))
+			break; /* buffer in use */
+		++count;
+		if (++i == info->tbuf_count)
+			i=0;
+	} while (i != info->tbuf_current);
+
+	/* last buffer with zero count may be in use, assume it is */
+	if (count)
+		--count;
+
+	return count;
+}
+
+/*
+ * load transmit DMA buffer(s) with data
+ */
+static void tx_load(struct slgt_info *info, const char *buf, unsigned int size)
+{
+	unsigned short count;
+	unsigned int i;
+	struct slgt_desc *d;
+
+	if (size == 0)
+		return;
+
+	DBGDATA(info, buf, size, "tx");
+
+	info->tbuf_start = i = info->tbuf_current;
+
+	while (size) {
+		d = &info->tbufs[i];
+		if (++i == info->tbuf_count)
+			i = 0;
+
+		count = (unsigned short)((size > DMABUFSIZE) ? DMABUFSIZE : size);
+		memcpy(d->buf, buf, count);
+
+		size -= count;
+		buf  += count;
+
+		if (!size && info->params.mode != MGSL_MODE_RAW)
+			set_desc_eof(*d, 1); /* HDLC: set EOF of last desc */
+		else
+			set_desc_eof(*d, 0);
+
+		set_desc_count(*d, count);
+	}
+
+	info->tbuf_current = i;
+}
+
+static int register_test(struct slgt_info *info)
+{
+	static unsigned short patterns[] =
+		{0x0000, 0xffff, 0xaaaa, 0x5555, 0x6969, 0x9696};
+	static unsigned int count = sizeof(patterns)/sizeof(patterns[0]);
+	unsigned int i;
+	int rc = 0;
+
+	for (i=0 ; i < count ; i++) {
+		wr_reg16(info, TIR, patterns[i]);
+		wr_reg16(info, BDR, patterns[(i+1)%count]);
+		if ((rd_reg16(info, TIR) != patterns[i]) ||
+		    (rd_reg16(info, BDR) != patterns[(i+1)%count])) {
+			rc = -ENODEV;
+			break;
+		}
+	}
+
+	info->init_error = rc ? 0 : DiagStatus_AddressFailure;
+	return rc;
+}
+
+static int irq_test(struct slgt_info *info)
+{
+	unsigned long timeout;
+	unsigned long flags;
+	struct tty_struct *oldtty = info->tty;
+	u32 speed = info->params.data_rate;
+
+	info->params.data_rate = 921600;
+	info->tty = NULL;
+
+	spin_lock_irqsave(&info->lock, flags);
+	async_mode(info);
+	slgt_irq_on(info, IRQ_TXIDLE);
+
+	/* enable transmitter */
+	wr_reg16(info, TCR,
+		(unsigned short)(rd_reg16(info, TCR) | BIT1));
+
+	/* write one byte and wait for tx idle */
+	wr_reg16(info, TDR, 0);
+
+	/* assume failure */
+	info->init_error = DiagStatus_IrqFailure;
+	info->irq_occurred = FALSE;
+
+	spin_unlock_irqrestore(&info->lock, flags);
+
+	timeout=100;
+	while(timeout-- && !info->irq_occurred)
+		msleep_interruptible(10);
+
+	spin_lock_irqsave(&info->lock,flags);
+	reset_port(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	info->params.data_rate = speed;
+	info->tty = oldtty;
+
+	info->init_error = info->irq_occurred ? 0 : DiagStatus_IrqFailure;
+	return info->irq_occurred ? 0 : -ENODEV;
+}
+
+static int loopback_test_rx(struct slgt_info *info)
+{
+	unsigned char *src, *dest;
+	int count;
+
+	if (desc_complete(info->rbufs[0])) {
+		count = desc_count(info->rbufs[0]);
+		src   = info->rbufs[0].buf;
+		dest  = info->tmp_rbuf;
+
+		for( ; count ; count-=2, src+=2) {
+			/* src=data byte (src+1)=status byte */
+			if (!(*(src+1) & (BIT9 + BIT8))) {
+				*dest = *src;
+				dest++;
+				info->tmp_rbuf_count++;
+			}
+		}
+		DBGDATA(info, info->tmp_rbuf, info->tmp_rbuf_count, "rx");
+		return 1;
+	}
+	return 0;
+}
+
+static int loopback_test(struct slgt_info *info)
+{
+#define TESTFRAMESIZE 20
+
+	unsigned long timeout;
+	u16 count = TESTFRAMESIZE;
+	unsigned char buf[TESTFRAMESIZE];
+	int rc = -ENODEV;
+	unsigned long flags;
+
+	struct tty_struct *oldtty = info->tty;
+	MGSL_PARAMS params;
+
+	memcpy(&params, &info->params, sizeof(params));
+
+	info->params.mode = MGSL_MODE_ASYNC;
+	info->params.data_rate = 921600;
+	info->params.loopback = 1;
+	info->tty = NULL;
+
+	/* build and send transmit frame */
+	for (count = 0; count < TESTFRAMESIZE; ++count)
+		buf[count] = (unsigned char)count;
+
+	info->tmp_rbuf_count = 0;
+	memset(info->tmp_rbuf, 0, TESTFRAMESIZE);
+
+	/* program hardware for HDLC and enabled receiver */
+	spin_lock_irqsave(&info->lock,flags);
+	async_mode(info);
+	rx_start(info);
+	info->tx_count = count;
+	tx_load(info, buf, count);
+	tx_start(info);
+	spin_unlock_irqrestore(&info->lock, flags);
+
+	/* wait for receive complete */
+	for (timeout = 100; timeout; --timeout) {
+		msleep_interruptible(10);
+		if (loopback_test_rx(info)) {
+			rc = 0;
+			break;
+		}
+	}
+
+	/* verify received frame length and contents */
+	if (!rc && (info->tmp_rbuf_count != count ||
+		  memcmp(buf, info->tmp_rbuf, count))) {
+		rc = -ENODEV;
+	}
+
+	spin_lock_irqsave(&info->lock,flags);
+	reset_adapter(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	memcpy(&info->params, &params, sizeof(info->params));
+	info->tty = oldtty;
+
+	info->init_error = rc ? DiagStatus_DmaFailure : 0;
+	return rc;
+}
+
+static int adapter_test(struct slgt_info *info)
+{
+	DBGINFO(("testing %s\n", info->device_name));
+	if ((info->init_error = register_test(info)) < 0) {
+		printk("register test failure %s addr=%08X\n",
+			info->device_name, info->phys_reg_addr);
+	} else if ((info->init_error = irq_test(info)) < 0) {
+		printk("IRQ test failure %s IRQ=%d\n",
+			info->device_name, info->irq_level);
+	} else if ((info->init_error = loopback_test(info)) < 0) {
+		printk("loopback test failure %s\n", info->device_name);
+	}
+	return info->init_error;
+}
+
+/*
+ * transmit timeout handler
+ */
+static void tx_timeout(unsigned long context)
+{
+	struct slgt_info *info = (struct slgt_info*)context;
+	unsigned long flags;
+
+	DBGINFO(("%s tx_timeout\n", info->device_name));
+	if(info->tx_active && info->params.mode == MGSL_MODE_HDLC) {
+		info->icount.txtimeout++;
+	}
+	spin_lock_irqsave(&info->lock,flags);
+	info->tx_active = 0;
+	info->tx_count = 0;
+	spin_unlock_irqrestore(&info->lock,flags);
+
+#ifdef CONFIG_HDLC
+	if (info->netcount)
+		hdlcdev_tx_done(info);
+	else
+#endif
+		bh_transmit(info);
+}
+
+/*
+ * receive buffer polling timer
+ */
+static void rx_timeout(unsigned long context)
+{
+	struct slgt_info *info = (struct slgt_info*)context;
+	unsigned long flags;
+
+	DBGINFO(("%s rx_timeout\n", info->device_name));
+	spin_lock_irqsave(&info->lock, flags);
+	info->pending_bh |= BH_RECEIVE;
+	spin_unlock_irqrestore(&info->lock, flags);
+	bh_handler(info);
+}
+
diff --git a/include/linux/synclink.h b/include/linux/synclink.h
index 763bd290f28d..1b7cd8d1a71b 100644
--- a/include/linux/synclink.h
+++ b/include/linux/synclink.h
@@ -1,7 +1,7 @@
 /*
  * SyncLink Multiprotocol Serial Adapter Driver
  *
- * $Id: synclink.h,v 3.6 2002/02/20 21:58:20 paulkf Exp $
+ * $Id: synclink.h,v 3.10 2005/11/08 19:50:54 paulkf Exp $
  *
  * Copyright (C) 1998-2000 by Microgate Corporation
  *
@@ -128,10 +128,14 @@
 #define MGSL_BUS_TYPE_EISA	2
 #define MGSL_BUS_TYPE_PCI	5
 
+#define MGSL_INTERFACE_MASK     0xf
 #define MGSL_INTERFACE_DISABLE  0
 #define MGSL_INTERFACE_RS232    1
 #define MGSL_INTERFACE_V35      2
 #define MGSL_INTERFACE_RS422    3
+#define MGSL_INTERFACE_RTS_EN   0x10
+#define MGSL_INTERFACE_LL       0x20
+#define MGSL_INTERFACE_RL       0x40
 
 typedef struct _MGSL_PARAMS
 {
@@ -163,6 +167,9 @@ typedef struct _MGSL_PARAMS
 #define SYNCLINK_DEVICE_ID 0x0010
 #define MGSCC_DEVICE_ID 0x0020
 #define SYNCLINK_SCA_DEVICE_ID 0x0030
+#define SYNCLINK_GT_DEVICE_ID 0x0070
+#define SYNCLINK_GT4_DEVICE_ID 0x0080
+#define SYNCLINK_AC_DEVICE_ID  0x0090
 #define MGSL_MAX_SERIAL_NUMBER 30
 
 /*
-- 
cgit v1.2.3-71-gd317


From 9ded96f24c3a5fcbef954e88c443385a1af37eb9 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+lkml@arm.linux.org.uk>
Date: Sun, 8 Jan 2006 01:02:07 -0800
Subject: [PATCH] IRQ type flags

Some ARM platforms have the ability to program the interrupt controller to
detect various interrupt edges and/or levels.  For some platforms, this is
critical to setup correctly, particularly those which the setting is dependent
on the device.

Currently, ARM drivers do (eg) the following:

	err = request_irq(irq, ...);

	set_irq_type(irq, IRQT_RISING);

However, if the interrupt has previously been programmed to be level sensitive
(for whatever reason) then this will cause an interrupt storm.

Hence, if we combine set_irq_type() with request_irq(), we can then safely set
the type prior to unmasking the interrupt.  The unfortunate problem is that in
order to support this, these flags need to be visible outside of the ARM
architecture - drivers such as smc91x need these flags and they're
cross-architecture.

Finally, the SA_TRIGGER_* flag passed to request_irq() should reflect the
property that the device would like.  The IRQ controller code should do its
best to select the most appropriate supported mode.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/kernel/irq.c              | 14 ++++++++++++--
 arch/arm/mach-omap1/serial.c       |  3 +--
 arch/arm/mach-pxa/corgi.c          |  7 +++----
 arch/arm/mach-pxa/poodle.c         |  7 +++----
 arch/arm/mach-pxa/spitz.c          |  7 +++----
 arch/arm/mach-s3c2410/usb-simtec.c |  6 +++---
 drivers/i2c/chips/tps65010.c       | 11 ++++++-----
 drivers/input/keyboard/corgikbd.c  |  6 ++----
 drivers/input/keyboard/spitzkbd.c  | 27 ++++++++++++++-------------
 drivers/mfd/ucb1x00-core.c         |  5 ++---
 drivers/net/smc91x.c               |  5 +----
 drivers/net/smc91x.h               | 18 +++++++++---------
 include/asm-arm/irq.h              | 12 ++++++++----
 include/linux/signal.h             | 13 +++++++++++++
 14 files changed, 80 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index 869c466e6258..b5645c4462cf 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -684,8 +684,12 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 	spin_lock_irqsave(&irq_controller_lock, flags);
 	p = &desc->action;
 	if ((old = *p) != NULL) {
-		/* Can't share interrupts unless both agree to */
-		if (!(old->flags & new->flags & SA_SHIRQ)) {
+		/*
+		 * Can't share interrupts unless both agree to and are
+		 * the same type.
+		 */
+		if (!(old->flags & new->flags & SA_SHIRQ) ||
+		    (~old->flags & new->flags) & SA_TRIGGER_MASK) {
 			spin_unlock_irqrestore(&irq_controller_lock, flags);
 			return -EBUSY;
 		}
@@ -705,6 +709,12 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 		desc->running = 0;
 		desc->pending = 0;
 		desc->disable_depth = 1;
+
+		if (new->flags & SA_TRIGGER_MASK) {
+			unsigned int type = new->flags & SA_TRIGGER_MASK;
+			desc->chip->set_type(irq, type);
+		}
+
 		if (!desc->noautoenable) {
 			desc->disable_depth = 0;
 			desc->chip->unmask(irq);
diff --git a/arch/arm/mach-omap1/serial.c b/arch/arm/mach-omap1/serial.c
index fcfb81d13cfe..7a68f098a025 100644
--- a/arch/arm/mach-omap1/serial.c
+++ b/arch/arm/mach-omap1/serial.c
@@ -252,9 +252,8 @@ static void __init omap_serial_set_port_wakeup(int gpio_nr)
 		return;
 	}
 	omap_set_gpio_direction(gpio_nr, 1);
-	set_irq_type(OMAP_GPIO_IRQ(gpio_nr), IRQT_RISING);
 	ret = request_irq(OMAP_GPIO_IRQ(gpio_nr), &omap_serial_wake_interrupt,
-			  0, "serial wakeup", NULL);
+			  SA_TRIGGER_RISING, "serial wakeup", NULL);
 	if (ret) {
 		omap_free_gpio(gpio_nr);
 		printk(KERN_ERR "No interrupt for UART wake GPIO: %i\n",
diff --git a/arch/arm/mach-pxa/corgi.c b/arch/arm/mach-pxa/corgi.c
index 100fb31b5156..5a7b873f29b3 100644
--- a/arch/arm/mach-pxa/corgi.c
+++ b/arch/arm/mach-pxa/corgi.c
@@ -213,15 +213,14 @@ static int corgi_mci_init(struct device *dev, irqreturn_t (*corgi_detect_int)(in
 
 	corgi_mci_platform_data.detect_delay = msecs_to_jiffies(250);
 
-	err = request_irq(CORGI_IRQ_GPIO_nSD_DETECT, corgi_detect_int, SA_INTERRUPT,
-			     "MMC card detect", data);
+	err = request_irq(CORGI_IRQ_GPIO_nSD_DETECT, corgi_detect_int,
+			  SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING,
+			  "MMC card detect", data);
 	if (err) {
 		printk(KERN_ERR "corgi_mci_init: MMC/SD: can't request MMC card detect IRQ\n");
 		return -1;
 	}
 
-	set_irq_type(CORGI_IRQ_GPIO_nSD_DETECT, IRQT_BOTHEDGE);
-
 	return 0;
 }
 
diff --git a/arch/arm/mach-pxa/poodle.c b/arch/arm/mach-pxa/poodle.c
index eef3de26ad37..663c95005985 100644
--- a/arch/arm/mach-pxa/poodle.c
+++ b/arch/arm/mach-pxa/poodle.c
@@ -146,15 +146,14 @@ static int poodle_mci_init(struct device *dev, irqreturn_t (*poodle_detect_int)(
 
 	poodle_mci_platform_data.detect_delay = msecs_to_jiffies(250);
 
-	err = request_irq(POODLE_IRQ_GPIO_nSD_DETECT, poodle_detect_int, SA_INTERRUPT,
-			     "MMC card detect", data);
+	err = request_irq(POODLE_IRQ_GPIO_nSD_DETECT, poodle_detect_int,
+			  SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING,
+			  "MMC card detect", data);
 	if (err) {
 		printk(KERN_ERR "poodle_mci_init: MMC/SD: can't request MMC card detect IRQ\n");
 		return -1;
 	}
 
-	set_irq_type(POODLE_IRQ_GPIO_nSD_DETECT, IRQT_BOTHEDGE);
-
 	return 0;
 }
 
diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c
index f2007db0cda5..a9eacc06555f 100644
--- a/arch/arm/mach-pxa/spitz.c
+++ b/arch/arm/mach-pxa/spitz.c
@@ -296,15 +296,14 @@ static int spitz_mci_init(struct device *dev, irqreturn_t (*spitz_detect_int)(in
 
 	spitz_mci_platform_data.detect_delay = msecs_to_jiffies(250);
 
-	err = request_irq(SPITZ_IRQ_GPIO_nSD_DETECT, spitz_detect_int, SA_INTERRUPT,
-			     "MMC card detect", data);
+	err = request_irq(SPITZ_IRQ_GPIO_nSD_DETECT, spitz_detect_int,
+			  SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING,
+			  "MMC card detect", data);
 	if (err) {
 		printk(KERN_ERR "spitz_mci_init: MMC/SD: can't request MMC card detect IRQ\n");
 		return -1;
 	}
 
-	set_irq_type(SPITZ_IRQ_GPIO_nSD_DETECT, IRQT_BOTHEDGE);
-
 	return 0;
 }
 
diff --git a/arch/arm/mach-s3c2410/usb-simtec.c b/arch/arm/mach-s3c2410/usb-simtec.c
index 5098b50158a3..495f8c6ffcb6 100644
--- a/arch/arm/mach-s3c2410/usb-simtec.c
+++ b/arch/arm/mach-s3c2410/usb-simtec.c
@@ -84,13 +84,13 @@ static void usb_simtec_enableoc(struct s3c2410_hcd_info *info, int on)
 	int ret;
 
 	if (on) {
-		ret = request_irq(IRQ_USBOC, usb_simtec_ocirq, SA_INTERRUPT,
+		ret = request_irq(IRQ_USBOC, usb_simtec_ocirq,
+				  SA_INTERRUPT | SA_TRIGGER_RISING |
+				   SA_TRIGGER_FALLING,
 				  "USB Over-current", info);
 		if (ret != 0) {
 			printk(KERN_ERR "failed to request usb oc irq\n");
 		}
-
-		set_irq_type(IRQ_USBOC, IRQT_BOTHEDGE);
 	} else {
 		free_irq(IRQ_USBOC, info);
 	}
diff --git a/drivers/i2c/chips/tps65010.c b/drivers/i2c/chips/tps65010.c
index e70b3db69edd..1af3dfbb8086 100644
--- a/drivers/i2c/chips/tps65010.c
+++ b/drivers/i2c/chips/tps65010.c
@@ -494,6 +494,7 @@ tps65010_probe(struct i2c_adapter *bus, int address, int kind)
 {
 	struct tps65010		*tps;
 	int			status;
+	unsigned long		irqflags;
 
 	if (the_tps) {
 		dev_dbg(&bus->dev, "only one %s for now\n", DRIVER_NAME);
@@ -520,13 +521,14 @@ tps65010_probe(struct i2c_adapter *bus, int address, int kind)
 	}
 
 #ifdef	CONFIG_ARM
+	irqflags = SA_SAMPLE_RANDOM | SA_TRIGGER_LOW;
 	if (machine_is_omap_h2()) {
 		tps->model = TPS65010;
 		omap_cfg_reg(W4_GPIO58);
 		tps->irq = OMAP_GPIO_IRQ(58);
 		omap_request_gpio(58);
 		omap_set_gpio_direction(58, 1);
-		set_irq_type(tps->irq, IRQT_FALLING);
+		irqflags |= SA_TRIGGER_FALLING;
 	}
 	if (machine_is_omap_osk()) {
 		tps->model = TPS65010;
@@ -534,7 +536,7 @@ tps65010_probe(struct i2c_adapter *bus, int address, int kind)
 		tps->irq = OMAP_GPIO_IRQ(OMAP_MPUIO(1));
 		omap_request_gpio(OMAP_MPUIO(1));
 		omap_set_gpio_direction(OMAP_MPUIO(1), 1);
-		set_irq_type(tps->irq, IRQT_FALLING);
+		irqflags |= SA_TRIGGER_FALLING;
 	}
 	if (machine_is_omap_h3()) {
 		tps->model = TPS65013;
@@ -542,13 +544,12 @@ tps65010_probe(struct i2c_adapter *bus, int address, int kind)
 		// FIXME set up this board's IRQ ...
 	}
 #else
-#define set_irq_type(num,trigger)	do{}while(0)
+	irqflags = SA_SAMPLE_RANDOM;
 #endif
 
 	if (tps->irq > 0) {
-		set_irq_type(tps->irq, IRQT_LOW);
 		status = request_irq(tps->irq, tps65010_irq,
-			SA_SAMPLE_RANDOM, DRIVER_NAME, tps);
+			irqflags, DRIVER_NAME, tps);
 		if (status < 0) {
 			dev_dbg(&tps->client.dev, "can't get IRQ %d, err %d\n",
 					tps->irq, status);
diff --git a/drivers/input/keyboard/corgikbd.c b/drivers/input/keyboard/corgikbd.c
index 64672d491222..e301ee4ca264 100644
--- a/drivers/input/keyboard/corgikbd.c
+++ b/drivers/input/keyboard/corgikbd.c
@@ -19,7 +19,6 @@
 #include <linux/jiffies.h>
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <asm/irq.h>
 
 #include <asm/arch/corgi.h>
 #include <asm/arch/hardware.h>
@@ -343,10 +342,9 @@ static int __init corgikbd_probe(struct platform_device *pdev)
 	for (i = 0; i < CORGI_KEY_SENSE_NUM; i++) {
 		pxa_gpio_mode(CORGI_GPIO_KEY_SENSE(i) | GPIO_IN);
 		if (request_irq(CORGI_IRQ_GPIO_KEY_SENSE(i), corgikbd_interrupt,
-						SA_INTERRUPT, "corgikbd", corgikbd))
+				SA_INTERRUPT | SA_TRIGGER_RISING,
+				"corgikbd", corgikbd))
 			printk(KERN_WARNING "corgikbd: Can't get IRQ: %d!\n", i);
-		else
-			set_irq_type(CORGI_IRQ_GPIO_KEY_SENSE(i),IRQT_RISING);
 	}
 
 	/* Set Strobe lines as outputs - set high */
diff --git a/drivers/input/keyboard/spitzkbd.c b/drivers/input/keyboard/spitzkbd.c
index 6a15fe3bc527..83999d583122 100644
--- a/drivers/input/keyboard/spitzkbd.c
+++ b/drivers/input/keyboard/spitzkbd.c
@@ -19,7 +19,6 @@
 #include <linux/jiffies.h>
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <asm/irq.h>
 
 #include <asm/arch/spitz.h>
 #include <asm/arch/hardware.h>
@@ -407,10 +406,9 @@ static int __init spitzkbd_probe(struct platform_device *dev)
 	for (i = 0; i < SPITZ_KEY_SENSE_NUM; i++) {
 		pxa_gpio_mode(spitz_senses[i] | GPIO_IN);
 		if (request_irq(IRQ_GPIO(spitz_senses[i]), spitzkbd_interrupt,
-						SA_INTERRUPT, "Spitzkbd Sense", spitzkbd))
+				SA_INTERRUPT|SA_TRIGGER_RISING,
+				"Spitzkbd Sense", spitzkbd))
 			printk(KERN_WARNING "spitzkbd: Can't get Sense IRQ: %d!\n", i);
-		else
-			set_irq_type(IRQ_GPIO(spitz_senses[i]),IRQT_RISING);
 	}
 
 	/* Set Strobe lines as outputs - set high */
@@ -422,15 +420,18 @@ static int __init spitzkbd_probe(struct platform_device *dev)
 	pxa_gpio_mode(SPITZ_GPIO_SWA | GPIO_IN);
 	pxa_gpio_mode(SPITZ_GPIO_SWB | GPIO_IN);
 
-	request_irq(SPITZ_IRQ_GPIO_SYNC, spitzkbd_interrupt, SA_INTERRUPT, "Spitzkbd Sync", spitzkbd);
-	request_irq(SPITZ_IRQ_GPIO_ON_KEY, spitzkbd_interrupt, SA_INTERRUPT, "Spitzkbd PwrOn", spitzkbd);
-	request_irq(SPITZ_IRQ_GPIO_SWA, spitzkbd_hinge_isr, SA_INTERRUPT, "Spitzkbd SWA", spitzkbd);
-	request_irq(SPITZ_IRQ_GPIO_SWB, spitzkbd_hinge_isr, SA_INTERRUPT, "Spitzkbd SWB", spitzkbd);
-
-	set_irq_type(SPITZ_IRQ_GPIO_SYNC, IRQT_BOTHEDGE);
-	set_irq_type(SPITZ_IRQ_GPIO_ON_KEY, IRQT_BOTHEDGE);
-	set_irq_type(SPITZ_IRQ_GPIO_SWA, IRQT_BOTHEDGE);
-	set_irq_type(SPITZ_IRQ_GPIO_SWB, IRQT_BOTHEDGE);
+	request_irq(SPITZ_IRQ_GPIO_SYNC, spitzkbd_interrupt,
+		    SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING,
+		    "Spitzkbd Sync", spitzkbd);
+	request_irq(SPITZ_IRQ_GPIO_ON_KEY, spitzkbd_interrupt,
+		    SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING,
+		    "Spitzkbd PwrOn", spitzkbd);
+	request_irq(SPITZ_IRQ_GPIO_SWA, spitzkbd_hinge_isr,
+		    SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING,
+		    "Spitzkbd SWA", spitzkbd);
+	request_irq(SPITZ_IRQ_GPIO_SWB, spitzkbd_hinge_isr,
+		    SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING,
+		    "Spitzkbd SWB", spitzkbd);
 
 	printk(KERN_INFO "input: Spitz Keyboard Registered\n");
 
diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c
index e335d54c4659..b42e0fbab59b 100644
--- a/drivers/mfd/ucb1x00-core.c
+++ b/drivers/mfd/ucb1x00-core.c
@@ -27,7 +27,6 @@
 
 #include <asm/dma.h>
 #include <asm/hardware.h>
-#include <asm/irq.h>
 
 #include "ucb1x00.h"
 
@@ -507,14 +506,14 @@ static int ucb1x00_probe(struct mcp *mcp)
 		goto err_free;
 	}
 
-	ret = request_irq(ucb->irq, ucb1x00_irq, 0, "UCB1x00", ucb);
+	ret = request_irq(ucb->irq, ucb1x00_irq, SA_TRIGGER_RISING,
+			  "UCB1x00", ucb);
 	if (ret) {
 		printk(KERN_ERR "ucb1x00: unable to grab irq%d: %d\n",
 			ucb->irq, ret);
 		goto err_free;
 	}
 
-	set_irq_type(ucb->irq, IRQT_RISING);
 	mcp_set_drvdata(mcp, ucb);
 
 	ret = class_device_register(&ucb->cdev);
diff --git a/drivers/net/smc91x.c b/drivers/net/smc91x.c
index 28bf2e69eb5e..7ec08127c9d6 100644
--- a/drivers/net/smc91x.c
+++ b/drivers/net/smc91x.c
@@ -88,7 +88,6 @@ static const char version[] =
 #include <linux/skbuff.h>
 
 #include <asm/io.h>
-#include <asm/irq.h>
 
 #include "smc91x.h"
 
@@ -2007,12 +2006,10 @@ static int __init smc_probe(struct net_device *dev, void __iomem *ioaddr)
 	}
 
 	/* Grab the IRQ */
-      	retval = request_irq(dev->irq, &smc_interrupt, 0, dev->name, dev);
+      	retval = request_irq(dev->irq, &smc_interrupt, SMC_IRQ_FLAGS, dev->name, dev);
       	if (retval)
       		goto err_out;
 
-	set_irq_type(dev->irq, SMC_IRQ_TRIGGER_TYPE);
-
 #ifdef SMC_USE_PXA_DMA
 	{
 		int dma = pxa_request_dma(dev->name, DMA_PRIO_LOW,
diff --git a/drivers/net/smc91x.h b/drivers/net/smc91x.h
index 5c2824be4ee6..e0efd1964e72 100644
--- a/drivers/net/smc91x.h
+++ b/drivers/net/smc91x.h
@@ -90,7 +90,7 @@
 			__l--;						\
 		}							\
 	} while (0)
-#define set_irq_type(irq, type)
+#define SMC_IRQ_FLAGS		(0)
 
 #elif defined(CONFIG_SA1100_PLEB)
 /* We can only do 16-bit reads and writes in the static memory space. */
@@ -109,7 +109,7 @@
 #define SMC_outw(v, a, r)	writew(v, (a) + (r))
 #define SMC_outsw(a, r, p, l)	writesw((a) + (r), p, l)
 
-#define set_irq_type(irq, type) do {} while (0)
+#define SMC_IRQ_FLAGS		(0)
 
 #elif defined(CONFIG_SA1100_ASSABET)
 
@@ -185,11 +185,11 @@ SMC_outw(u16 val, void __iomem *ioaddr, int reg)
 #include <asm/mach-types.h>
 #include <asm/arch/cpu.h>
 
-#define	SMC_IRQ_TRIGGER_TYPE (( \
+#define	SMC_IRQ_FLAGS (( \
 		   machine_is_omap_h2() \
 		|| machine_is_omap_h3() \
 		|| (machine_is_omap_innovator() && !cpu_is_omap1510()) \
-	) ? IRQT_FALLING : IRQT_RISING)
+	) ? SA_TRIGGER_FALLING : SA_TRIGGER_RISING)
 
 
 #elif	defined(CONFIG_SH_SH4202_MICRODEV)
@@ -209,7 +209,7 @@ SMC_outw(u16 val, void __iomem *ioaddr, int reg)
 #define SMC_insw(a, r, p, l)	insw((a) + (r) - 0xa0000000, p, l)
 #define SMC_outsw(a, r, p, l)	outsw((a) + (r) - 0xa0000000, p, l)
 
-#define set_irq_type(irq, type)	do {} while(0)
+#define SMC_IRQ_FLAGS		(0)
 
 #elif	defined(CONFIG_ISA)
 
@@ -237,7 +237,7 @@ SMC_outw(u16 val, void __iomem *ioaddr, int reg)
 #define SMC_insw(a, r, p, l)	insw(((u32)a) + (r), p, l)
 #define SMC_outsw(a, r, p, l)	outsw(((u32)a) + (r), p, l)
 
-#define set_irq_type(irq, type)	do {} while(0)
+#define SMC_IRQ_FLAGS		(0)
 
 #define RPC_LSA_DEFAULT		RPC_LED_TX_RX
 #define RPC_LSB_DEFAULT		RPC_LED_100_10
@@ -319,7 +319,7 @@ static inline void SMC_outsw (unsigned long a, int r, unsigned char* p, int l)
 			au_writew(*_p++ , _a); \
 	} while(0)
 
-#define set_irq_type(irq, type) do {} while (0)
+#define SMC_IRQ_FLAGS		(0)
 
 #else
 
@@ -342,8 +342,8 @@ static inline void SMC_outsw (unsigned long a, int r, unsigned char* p, int l)
 
 #endif
 
-#ifndef	SMC_IRQ_TRIGGER_TYPE
-#define	SMC_IRQ_TRIGGER_TYPE	IRQT_RISING
+#ifndef	SMC_IRQ_FLAGS
+#define	SMC_IRQ_FLAGS		SA_TRIGGER_RISING
 #endif
 
 #ifdef SMC_USE_PXA_DMA
diff --git a/include/asm-arm/irq.h b/include/asm-arm/irq.h
index 59975ee43cf1..7772432d3fd7 100644
--- a/include/asm-arm/irq.h
+++ b/include/asm-arm/irq.h
@@ -25,10 +25,14 @@ extern void disable_irq_nosync(unsigned int);
 extern void disable_irq(unsigned int);
 extern void enable_irq(unsigned int);
 
-#define __IRQT_FALEDGE	(1 << 0)
-#define __IRQT_RISEDGE	(1 << 1)
-#define __IRQT_LOWLVL	(1 << 2)
-#define __IRQT_HIGHLVL	(1 << 3)
+/*
+ * These correspond with the SA_TRIGGER_* defines, and therefore the
+ * IRQRESOURCE_IRQ_* defines.
+ */
+#define __IRQT_RISEDGE	(1 << 0)
+#define __IRQT_FALEDGE	(1 << 1)
+#define __IRQT_HIGHLVL	(1 << 2)
+#define __IRQT_LOWLVL	(1 << 3)
 
 #define IRQT_NOEDGE	(0)
 #define IRQT_RISING	(__IRQT_RISEDGE)
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 5dd5f02c5c5f..ea9eff16c4b7 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -18,6 +18,19 @@
 #define SA_PROBE		SA_ONESHOT
 #define SA_SAMPLE_RANDOM	SA_RESTART
 #define SA_SHIRQ		0x04000000
+/*
+ * As above, these correspond to the IORESOURCE_IRQ_* defines in
+ * linux/ioport.h to select the interrupt line behaviour.  When
+ * requesting an interrupt without specifying a SA_TRIGGER, the
+ * setting should be assumed to be "as already configured", which
+ * may be as per machine or firmware initialisation.
+ */
+#define SA_TRIGGER_LOW		0x00000008
+#define SA_TRIGGER_HIGH		0x00000004
+#define SA_TRIGGER_FALLING	0x00000002
+#define SA_TRIGGER_RISING	0x00000001
+#define SA_TRIGGER_MASK	(SA_TRIGGER_HIGH|SA_TRIGGER_LOW|\
+				 SA_TRIGGER_RISING|SA_TRIGGER_FALLING)
 
 /*
  * Real Time signals may be queued.
-- 
cgit v1.2.3-71-gd317


From e5174baaea7585760f02eef23b225847d209a8db Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Sun, 8 Jan 2006 01:02:11 -0800
Subject: [PATCH] fat: support ->direct_IO()

This patch add to support of ->direct_IO() for mostly read.

The user of this seems to want to use for streaming read.  So, current direct
I/O has limitation, it can only overwrite.  (For write operation, mainly we
need to handle the hole etc..)

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fat/cache.c           | 14 +++++++--
 fs/fat/dir.c             |  6 ++--
 fs/fat/inode.c           | 82 ++++++++++++++++++++++++++++++++++++++++++------
 include/linux/msdos_fs.h |  3 +-
 4 files changed, 89 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 77c24fcf712a..1acc941245fb 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -295,7 +295,8 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
 	return dclus;
 }
 
-int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys)
+int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+	     unsigned long *mapped_blocks)
 {
 	struct super_block *sb = inode->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -303,9 +304,12 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys)
 	int cluster, offset;
 
 	*phys = 0;
+	*mapped_blocks = 0;
 	if ((sbi->fat_bits != 32) && (inode->i_ino == MSDOS_ROOT_INO)) {
-		if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits))
+		if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) {
 			*phys = sector + sbi->dir_start;
+			*mapped_blocks = 1;
+		}
 		return 0;
 	}
 	last_block = (MSDOS_I(inode)->mmu_private + (sb->s_blocksize - 1))
@@ -318,7 +322,11 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys)
 	cluster = fat_bmap_cluster(inode, cluster);
 	if (cluster < 0)
 		return cluster;
-	else if (cluster)
+	else if (cluster) {
 		*phys = fat_clus_to_blknr(sbi, cluster) + offset;
+		*mapped_blocks = sbi->sec_per_clus - offset;
+		if (*mapped_blocks > last_block - sector)
+			*mapped_blocks = last_block - sector;
+	}
 	return 0;
 }
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 4ce77475aed3..eef1b81aa294 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -68,8 +68,8 @@ static int fat__get_entry(struct inode *dir, loff_t *pos,
 {
 	struct super_block *sb = dir->i_sb;
 	sector_t phys, iblock;
-	int offset;
-	int err;
+	unsigned long mapped_blocks;
+	int err, offset;
 
 next:
 	if (*bh)
@@ -77,7 +77,7 @@ next:
 
 	*bh = NULL;
 	iblock = *pos >> sb->s_blocksize_bits;
-	err = fat_bmap(dir, iblock, &phys);
+	err = fat_bmap(dir, iblock, &phys, &mapped_blocks);
 	if (err || !phys)
 		return -1;	/* beyond EOF or error */
 
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 932c8d6d1f54..e7f4aa7fc686 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -23,6 +23,7 @@
 #include <linux/mount.h>
 #include <linux/vfs.h>
 #include <linux/parser.h>
+#include <linux/uio.h>
 #include <asm/unaligned.h>
 
 #ifndef CONFIG_FAT_DEFAULT_IOCHARSET
@@ -49,43 +50,77 @@ static int fat_add_cluster(struct inode *inode)
 	return err;
 }
 
-static int fat_get_block(struct inode *inode, sector_t iblock,
-			 struct buffer_head *bh_result, int create)
+static int __fat_get_blocks(struct inode *inode, sector_t iblock,
+			    unsigned long *max_blocks,
+			    struct buffer_head *bh_result, int create)
 {
 	struct super_block *sb = inode->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	sector_t phys;
-	int err;
+	unsigned long mapped_blocks;
+	int err, offset;
 
-	err = fat_bmap(inode, iblock, &phys);
+	err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
 	if (err)
 		return err;
 	if (phys) {
 		map_bh(bh_result, sb, phys);
+		*max_blocks = min(mapped_blocks, *max_blocks);
 		return 0;
 	}
 	if (!create)
 		return 0;
+
 	if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) {
 		fat_fs_panic(sb, "corrupted file size (i_pos %lld, %lld)",
 			     MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private);
 		return -EIO;
 	}
-	if (!((unsigned long)iblock & (MSDOS_SB(sb)->sec_per_clus - 1))) {
+
+	offset = (unsigned long)iblock & (sbi->sec_per_clus - 1);
+	if (!offset) {
+		/* TODO: multiple cluster allocation would be desirable. */
 		err = fat_add_cluster(inode);
 		if (err)
 			return err;
 	}
-	MSDOS_I(inode)->mmu_private += sb->s_blocksize;
-	err = fat_bmap(inode, iblock, &phys);
+	/* available blocks on this cluster */
+	mapped_blocks = sbi->sec_per_clus - offset;
+
+	*max_blocks = min(mapped_blocks, *max_blocks);
+	MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
+
+	err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
 	if (err)
 		return err;
-	if (!phys)
-		BUG();
+	BUG_ON(!phys);
+	BUG_ON(*max_blocks != mapped_blocks);
 	set_buffer_new(bh_result);
 	map_bh(bh_result, sb, phys);
 	return 0;
 }
 
+static int fat_get_blocks(struct inode *inode, sector_t iblock,
+			  unsigned long max_blocks,
+			  struct buffer_head *bh_result, int create)
+{
+	struct super_block *sb = inode->i_sb;
+	int err;
+
+	err = __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create);
+	if (err)
+		return err;
+	bh_result->b_size = max_blocks << sb->s_blocksize_bits;
+	return 0;
+}
+
+static int fat_get_block(struct inode *inode, sector_t iblock,
+			 struct buffer_head *bh_result, int create)
+{
+	unsigned long max_blocks = 1;
+	return __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create);
+}
+
 static int fat_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return block_write_full_page(page, fat_get_block, wbc);
@@ -128,6 +163,34 @@ static int fat_commit_write(struct file *file, struct page *page,
 	return err;
 }
 
+static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
+			     const struct iovec *iov,
+			     loff_t offset, unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+
+	if (rw == WRITE) {
+		/*
+		 * FIXME: blockdev_direct_IO() doesn't use ->prepare_write(),
+		 * so we need to update the ->mmu_private to block boundary.
+		 *
+		 * But we must fill the remaining area or hole by nul for
+		 * updating ->mmu_private.
+		 */
+		loff_t size = offset + iov_length(iov, nr_segs);
+		if (MSDOS_I(inode)->mmu_private < size)
+			return -EINVAL;
+	}
+
+	/*
+	 * FAT need to use the DIO_LOCKING for avoiding the race
+	 * condition of fat_get_block() and ->truncate().
+	 */
+	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+				  offset, nr_segs, fat_get_blocks, NULL);
+}
+
 static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping, block, fat_get_block);
@@ -141,6 +204,7 @@ static struct address_space_operations fat_aops = {
 	.sync_page	= block_sync_page,
 	.prepare_write	= fat_prepare_write,
 	.commit_write	= fat_commit_write,
+	.direct_IO	= fat_direct_IO,
 	.bmap		= _fat_bmap
 };
 
diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h
index 941da5c016a0..e933e2a355ad 100644
--- a/include/linux/msdos_fs.h
+++ b/include/linux/msdos_fs.h
@@ -329,7 +329,8 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
 extern void fat_cache_inval_inode(struct inode *inode);
 extern int fat_get_cluster(struct inode *inode, int cluster,
 			   int *fclus, int *dclus);
-extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys);
+extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+		    unsigned long *mapped_blocks);
 
 /* fat/dir.c */
 extern struct file_operations fat_dir_operations;
-- 
cgit v1.2.3-71-gd317


From 268fc16e343b4f8e249468747db2e658da46a814 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Sun, 8 Jan 2006 01:02:12 -0800
Subject: [PATCH] export/change sync_page_range/_nolock()

This exports/changes the sync_page_range/_nolock().  The fatfs needs
sync_page_range/_nolock() for expanding truncate, and changes "size_t count"
to "loff_t count".

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/writeback.h | 4 +++-
 mm/filemap.c              | 8 ++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index b096159086e8..beaef5c7a0ea 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -103,7 +103,9 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping);
 int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
 int sync_page_range(struct inode *inode, struct address_space *mapping,
-			loff_t pos, size_t count);
+			loff_t pos, loff_t count);
+int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
+			   loff_t pos, loff_t count);
 
 /* pdflush.c */
 extern int nr_pdflush_threads;	/* Global so it can be exported to sysctl
diff --git a/mm/filemap.c b/mm/filemap.c
index 4ef24a397684..8fdf36508023 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -280,7 +280,7 @@ static int wait_on_page_writeback_range(struct address_space *mapping,
  * it is otherwise livelockable.
  */
 int sync_page_range(struct inode *inode, struct address_space *mapping,
-			loff_t pos, size_t count)
+			loff_t pos, loff_t count)
 {
 	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
 	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -305,9 +305,8 @@ EXPORT_SYMBOL(sync_page_range);
  * as it forces O_SYNC writers to different parts of the same file
  * to be serialised right until io completion.
  */
-static int sync_page_range_nolock(struct inode *inode,
-				  struct address_space *mapping,
-				  loff_t pos, size_t count)
+int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
+			   loff_t pos, loff_t count)
 {
 	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
 	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -322,6 +321,7 @@ static int sync_page_range_nolock(struct inode *inode,
 		ret = wait_on_page_writeback_range(mapping, start, end);
 	return ret;
 }
+EXPORT_SYMBOL(sync_page_range_nolock);
 
 /**
  * filemap_fdatawait - walk the list of under-writeback pages of the given
-- 
cgit v1.2.3-71-gd317


From 05eb0b51fb46430050d5873458612f53e0234f2e Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Sun, 8 Jan 2006 01:02:13 -0800
Subject: [PATCH] fat: support a truncate() for expanding size
 (generic_cont_expand)

This patch changes generic_cont_expand(), in order to share the code
with fatfs.

  - Use vmtruncate() if ->prepare_write() returns a error.

Even if ->prepare_write() returns an error, it may already have added some
blocks.  So, this truncates blocks outside of ->i_size by vmtruncate().

  - Add generic_cont_expand_simple().

The generic_cont_expand_simple() assumes that ->prepare_write() can handle
the block boundary.  With this, we don't need to care the extra byte.

And for expanding a file size by truncate(), fatfs uses the
added generic_cont_expand_simple().

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c                 | 60 ++++++++++++++++++++++++++++++++++-----------
 fs/fat/file.c               | 31 ++++++++++++++++++++---
 include/linux/buffer_head.h |  3 ++-
 3 files changed, 76 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 5287be18633b..55023231e460 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2160,11 +2160,12 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
  * truncates.  Uses prepare/commit_write to allow the filesystem to
  * deal with the hole.  
  */
-int generic_cont_expand(struct inode *inode, loff_t size)
+static int __generic_cont_expand(struct inode *inode, loff_t size,
+				 pgoff_t index, unsigned int offset)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
-	unsigned long index, offset, limit;
+	unsigned long limit;
 	int err;
 
 	err = -EFBIG;
@@ -2176,24 +2177,24 @@ int generic_cont_expand(struct inode *inode, loff_t size)
 	if (size > inode->i_sb->s_maxbytes)
 		goto out;
 
-	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
-
-	/* ugh.  in prepare/commit_write, if from==to==start of block, we 
-	** skip the prepare.  make sure we never send an offset for the start
-	** of a block
-	*/
-	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
-		offset++;
-	}
-	index = size >> PAGE_CACHE_SHIFT;
 	err = -ENOMEM;
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		goto out;
 	err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
-	if (!err) {
-		err = mapping->a_ops->commit_write(NULL, page, offset, offset);
+	if (err) {
+		/*
+		 * ->prepare_write() may have instantiated a few blocks
+		 * outside i_size.  Trim these off again.
+		 */
+		unlock_page(page);
+		page_cache_release(page);
+		vmtruncate(inode, inode->i_size);
+		goto out;
 	}
+
+	err = mapping->a_ops->commit_write(NULL, page, offset, offset);
+
 	unlock_page(page);
 	page_cache_release(page);
 	if (err > 0)
@@ -2202,6 +2203,36 @@ out:
 	return err;
 }
 
+int generic_cont_expand(struct inode *inode, loff_t size)
+{
+	pgoff_t index;
+	unsigned int offset;
+
+	offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
+
+	/* ugh.  in prepare/commit_write, if from==to==start of block, we
+	** skip the prepare.  make sure we never send an offset for the start
+	** of a block
+	*/
+	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
+		/* caller must handle this extra byte. */
+		offset++;
+	}
+	index = size >> PAGE_CACHE_SHIFT;
+
+	return __generic_cont_expand(inode, size, index, offset);
+}
+
+int generic_cont_expand_simple(struct inode *inode, loff_t size)
+{
+	loff_t pos = size - 1;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
+
+	/* prepare/commit_write can handle even if from==to==start of block. */
+	return __generic_cont_expand(inode, size, index, offset);
+}
+
 /*
  * For moronic filesystems that do not allow holes in file.
  * We may have to extend the file.
@@ -3145,6 +3176,7 @@ EXPORT_SYMBOL(fsync_bdev);
 EXPORT_SYMBOL(generic_block_bmap);
 EXPORT_SYMBOL(generic_commit_write);
 EXPORT_SYMBOL(generic_cont_expand);
+EXPORT_SYMBOL(generic_cont_expand_simple);
 EXPORT_SYMBOL(init_buffer);
 EXPORT_SYMBOL(invalidate_bdev);
 EXPORT_SYMBOL(ll_rw_block);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 15229fe569c3..9b07c328a6fc 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -11,6 +11,7 @@
 #include <linux/msdos_fs.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/writeback.h>
 
 int fat_generic_ioctl(struct inode *inode, struct file *filp,
 		      unsigned int cmd, unsigned long arg)
@@ -124,6 +125,24 @@ struct file_operations fat_file_operations = {
 	.sendfile	= generic_file_sendfile,
 };
 
+static int fat_cont_expand(struct inode *inode, loff_t size)
+{
+	struct address_space *mapping = inode->i_mapping;
+	loff_t start = inode->i_size, count = size - inode->i_size;
+	int err;
+
+	err = generic_cont_expand_simple(inode, size);
+	if (err)
+		goto out;
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+	if (IS_SYNC(inode))
+		err = sync_page_range_nolock(inode, mapping, start, count);
+out:
+	return err;
+}
+
 int fat_notify_change(struct dentry *dentry, struct iattr *attr)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
@@ -132,11 +151,17 @@ int fat_notify_change(struct dentry *dentry, struct iattr *attr)
 
 	lock_kernel();
 
-	/* FAT cannot truncate to a longer file */
+	/*
+	 * Expand the file. Since inode_setattr() updates ->i_size
+	 * before calling the ->truncate(), but FAT needs to fill the
+	 * hole before it.
+	 */
 	if (attr->ia_valid & ATTR_SIZE) {
 		if (attr->ia_size > inode->i_size) {
-			error = -EPERM;
-			goto out;
+			error = fat_cont_expand(inode, attr->ia_size);
+			if (error || attr->ia_valid == ATTR_SIZE)
+				goto out;
+			attr->ia_valid &= ~ATTR_SIZE;
 		}
 	}
 
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 1db061bb6b08..9f159baf153f 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -197,7 +197,8 @@ int block_read_full_page(struct page*, get_block_t*);
 int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
 int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
 				loff_t *);
-int generic_cont_expand(struct inode *inode, loff_t size) ;
+int generic_cont_expand(struct inode *inode, loff_t size);
+int generic_cont_expand_simple(struct inode *inode, loff_t size);
 int block_commit_write(struct page *page, unsigned from, unsigned to);
 int block_sync_page(struct page *);
 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
-- 
cgit v1.2.3-71-gd317


From 2a10e0b28b196051ae71829e5b989cba00513289 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sun, 8 Jan 2006 01:02:15 -0800
Subject: [PATCH] move rtc_interrupt() prototype to rtc.h

This patch moves the rtc_interrupt() prototype to rtc.h and removes the
prototypes from C files.

It also renames static rtc_interrupt() functions in
arch/arm/mach-integrator/time.c and arch/sh64/kernel/time.c to avoid compile
problems.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Paul Gortmaker <p_gortmaker@yahoo.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/mach-integrator/time.c | 5 +++--
 arch/i386/kernel/time_hpet.c    | 2 --
 arch/sh64/kernel/time.c         | 7 ++++---
 arch/x86_64/kernel/time.c       | 2 --
 include/linux/rtc.h             | 3 +++
 5 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-integrator/time.c b/arch/arm/mach-integrator/time.c
index 9f46aaef8968..3c22c16b38bf 100644
--- a/arch/arm/mach-integrator/time.c
+++ b/arch/arm/mach-integrator/time.c
@@ -96,7 +96,8 @@ static struct rtc_ops rtc_ops = {
 	.set_alarm	= rtc_set_alarm,
 };
 
-static irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+static irqreturn_t arm_rtc_interrupt(int irq, void *dev_id,
+				     struct pt_regs *regs)
 {
 	writel(0, rtc_base + RTC_EOI);
 	return IRQ_HANDLED;
@@ -124,7 +125,7 @@ static int rtc_probe(struct amba_device *dev, void *id)
 
 	xtime.tv_sec = __raw_readl(rtc_base + RTC_DR);
 
-	ret = request_irq(dev->irq[0], rtc_interrupt, SA_INTERRUPT,
+	ret = request_irq(dev->irq[0], arm_rtc_interrupt, SA_INTERRUPT,
 			  "rtc-pl030", dev);
 	if (ret)
 		goto map_out;
diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c
index 9caeaa315cd7..a529f0cdce17 100644
--- a/arch/i386/kernel/time_hpet.c
+++ b/arch/i386/kernel/time_hpet.c
@@ -259,8 +259,6 @@ __setup("hpet=", hpet_setup);
 #include <linux/mc146818rtc.h>
 #include <linux/rtc.h>
 
-extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs);
-
 #define DEFAULT_RTC_INT_FREQ 	64
 #define RTC_NUM_INTS 		1
 
diff --git a/arch/sh64/kernel/time.c b/arch/sh64/kernel/time.c
index 870fe5327e09..1195af37ee5a 100644
--- a/arch/sh64/kernel/time.c
+++ b/arch/sh64/kernel/time.c
@@ -417,7 +417,7 @@ static __init unsigned int get_cpu_hz(void)
 	/*
 	** Regardless the toolchain, force the compiler to use the
 	** arbitrary register r3 as a clock tick counter.
-	** NOTE: r3 must be in accordance with rtc_interrupt()
+	** NOTE: r3 must be in accordance with sh64_rtc_interrupt()
 	*/
 	register unsigned long long  __rtc_irq_flag __asm__ ("r3");
 
@@ -482,7 +482,8 @@ static __init unsigned int get_cpu_hz(void)
 #endif
 }
 
-static irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+static irqreturn_t sh64_rtc_interrupt(int irq, void *dev_id,
+				      struct pt_regs *regs)
 {
 	ctrl_outb(0, RCR1);	/* Disable Carry Interrupts */
 	regs->regs[3] = 1;	/* Using r3 */
@@ -491,7 +492,7 @@ static irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 }
 
 static struct irqaction irq0  = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL};
-static struct irqaction irq1  = { rtc_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "rtc", NULL, NULL};
+static struct irqaction irq1  = { sh64_rtc_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "rtc", NULL, NULL};
 
 void __init time_init(void)
 {
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 74102796e5c0..43c9fa0f8d5f 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -1075,8 +1075,6 @@ device_initcall(time_init_device);
  */
 #include <linux/rtc.h>
 
-extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs);
-
 #define DEFAULT_RTC_INT_FREQ 	64
 #define RTC_NUM_INTS 		1
 
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index e1aaf1fac8e0..0b2ba67ff13c 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -11,6 +11,8 @@
 #ifndef _LINUX_RTC_H_
 #define _LINUX_RTC_H_
 
+#include <linux/interrupt.h>
+
 /*
  * The struct used to pass data via the following ioctl. Similar to the
  * struct tm in <time.h>, but it needs to be here so that the kernel 
@@ -102,6 +104,7 @@ int rtc_register(rtc_task_t *task);
 int rtc_unregister(rtc_task_t *task);
 int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg);
 void rtc_get_rtc_time(struct rtc_time *rtc_tm);
+irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs);
 
 #endif /* __KERNEL__ */
 
-- 
cgit v1.2.3-71-gd317


From 095975da26dba21698582e91e96be10f7417333f Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sun, 8 Jan 2006 01:02:19 -0800
Subject: [PATCH] rcu file: use atomic primitives

Use atomic_inc_not_zero for rcu files instead of special case rcuref.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/RCU/rcuref.txt |  87 ++++++++---------
 fs/aio.c                     |   3 +-
 fs/file_table.c              |   8 +-
 include/linux/fs.h           |   3 +-
 include/linux/radix-tree.h   |   1 +
 include/linux/rcuref.h       | 220 -------------------------------------------
 kernel/rcupdate.c            |  14 ---
 kernel/rcutorture.c          |   1 -
 security/selinux/hooks.c     |   2 +-
 9 files changed, 48 insertions(+), 291 deletions(-)
 delete mode 100644 include/linux/rcuref.h

(limited to 'include/linux')

diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt
index a23fee66064d..3f60db41b2f0 100644
--- a/Documentation/RCU/rcuref.txt
+++ b/Documentation/RCU/rcuref.txt
@@ -1,74 +1,67 @@
-Refcounter framework for elements of lists/arrays protected by
-RCU.
+Refcounter design for elements of lists/arrays protected by RCU.
 
 Refcounting on elements of  lists which are protected by traditional
 reader/writer spinlocks or semaphores are straight forward as in:
 
-1.					2.
-add()					search_and_reference()
-{					{
-	alloc_object				read_lock(&list_lock);
-	...					search_for_element
-	atomic_set(&el->rc, 1);			atomic_inc(&el->rc);
-	write_lock(&list_lock);			...
-	add_element				read_unlock(&list_lock);
-	...					...
-	write_unlock(&list_lock);	}
+1.				2.
+add()				search_and_reference()
+{				{
+    alloc_object		    read_lock(&list_lock);
+    ...				    search_for_element
+    atomic_set(&el->rc, 1);	    atomic_inc(&el->rc);
+    write_lock(&list_lock);	     ...
+    add_element			    read_unlock(&list_lock);
+    ...				    ...
+    write_unlock(&list_lock);	}
 }
 
 3.					4.
 release_referenced()			delete()
 {					{
-	...				write_lock(&list_lock);
-	atomic_dec(&el->rc, relfunc)	...
-	...				delete_element
-}					write_unlock(&list_lock);
- 					...
-					if (atomic_dec_and_test(&el->rc))
-						kfree(el);
-					...
+    ...					    write_lock(&list_lock);
+    atomic_dec(&el->rc, relfunc)	    ...
+    ...					    delete_element
+}					    write_unlock(&list_lock);
+ 					    ...
+					    if (atomic_dec_and_test(&el->rc))
+					        kfree(el);
+					    ...
 					}
 
 If this list/array is made lock free using rcu as in changing the
 write_lock in add() and delete() to spin_lock and changing read_lock
-in search_and_reference to rcu_read_lock(), the rcuref_get in
+in search_and_reference to rcu_read_lock(), the atomic_get in
 search_and_reference could potentially hold reference to an element which
-has already been deleted from the list/array.  rcuref_lf_get_rcu takes
+has already been deleted from the list/array.  atomic_inc_not_zero takes
 care of this scenario. search_and_reference should look as;
 
 1.					2.
 add()					search_and_reference()
 {					{
- 	alloc_object				rcu_read_lock();
-	...					search_for_element
-	atomic_set(&el->rc, 1);			if (rcuref_inc_lf(&el->rc)) {
-	write_lock(&list_lock);				rcu_read_unlock();
-							return FAIL;
-	add_element				}
-	...					...
-	write_unlock(&list_lock);		rcu_read_unlock();
+    alloc_object			    rcu_read_lock();
+    ...					    search_for_element
+    atomic_set(&el->rc, 1);		    if (atomic_inc_not_zero(&el->rc)) {
+    write_lock(&list_lock);		        rcu_read_unlock();
+					        return FAIL;
+    add_element				    }
+    ...					    ...
+    write_unlock(&list_lock);		    rcu_read_unlock();
 }					}
 3.					4.
 release_referenced()			delete()
 {					{
-	...				write_lock(&list_lock);
-	rcuref_dec(&el->rc, relfunc)	...
-	...				delete_element
-}					write_unlock(&list_lock);
- 					...
-					if (rcuref_dec_and_test(&el->rc))
-						call_rcu(&el->head, el_free);
-					...
+    ...					    write_lock(&list_lock);
+    atomic_dec(&el->rc, relfunc)	    ...
+    ...					    delete_element
+}					    write_unlock(&list_lock);
+ 					    ...
+					    if (atomic_dec_and_test(&el->rc))
+					        call_rcu(&el->head, el_free);
+					    ...
 					}
 
 Sometimes, reference to the element need to be obtained in the
-update (write) stream.  In such cases, rcuref_inc_lf might be an overkill
-since the spinlock serialising list updates are held. rcuref_inc
+update (write) stream.  In such cases, atomic_inc_not_zero might be an
+overkill since the spinlock serialising list updates are held. atomic_inc
 is to be used in such cases.
-For arches which do not have cmpxchg rcuref_inc_lf
-api uses a hashed spinlock implementation and the same hashed spinlock
-is acquired in all rcuref_xxx primitives to preserve atomicity.
-Note: Use rcuref_inc api only if you need to use rcuref_inc_lf on the
-refcounter atleast at one place.  Mixing rcuref_inc and atomic_xxx api
-might lead to races. rcuref_inc_lf() must be used in lockfree
-RCU critical sections only.
+
diff --git a/fs/aio.c b/fs/aio.c
index 5a28b69ad223..aec2b1916d1b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -29,7 +29,6 @@
 #include <linux/highmem.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
-#include <linux/rcuref.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -514,7 +513,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 	/* Must be done under the lock to serialise against cancellation.
 	 * Call this aio_fput as it duplicates fput via the fput_work.
 	 */
-	if (unlikely(rcuref_dec_and_test(&req->ki_filp->f_count))) {
+	if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
diff --git a/fs/file_table.c b/fs/file_table.c
index c3a5e2fd663b..6142250104a6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(get_empty_filp);
 
 void fastcall fput(struct file *file)
 {
-	if (rcuref_dec_and_test(&file->f_count))
+	if (atomic_dec_and_test(&file->f_count))
 		__fput(file);
 }
 
@@ -166,7 +166,7 @@ struct file fastcall *fget(unsigned int fd)
 	rcu_read_lock();
 	file = fcheck_files(files, fd);
 	if (file) {
-		if (!rcuref_inc_lf(&file->f_count)) {
+		if (!atomic_inc_not_zero(&file->f_count)) {
 			/* File object ref couldn't be taken */
 			rcu_read_unlock();
 			return NULL;
@@ -198,7 +198,7 @@ struct file fastcall *fget_light(unsigned int fd, int *fput_needed)
 		rcu_read_lock();
 		file = fcheck_files(files, fd);
 		if (file) {
-			if (rcuref_inc_lf(&file->f_count))
+			if (atomic_inc_not_zero(&file->f_count))
 				*fput_needed = 1;
 			else
 				/* Didn't get the reference, someone's freed */
@@ -213,7 +213,7 @@ struct file fastcall *fget_light(unsigned int fd, int *fput_needed)
 
 void put_filp(struct file *file)
 {
-	if (rcuref_dec_and_test(&file->f_count)) {
+	if (atomic_dec_and_test(&file->f_count)) {
 		security_file_free(file);
 		file_kill(file);
 		file_free(file);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2c9c48d65630..ef29500b5df8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -9,7 +9,6 @@
 #include <linux/config.h>
 #include <linux/limits.h>
 #include <linux/ioctl.h>
-#include <linux/rcuref.h>
 
 /*
  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -653,7 +652,7 @@ extern spinlock_t files_lock;
 #define file_list_lock() spin_lock(&files_lock);
 #define file_list_unlock() spin_unlock(&files_lock);
 
-#define get_file(x)	rcuref_inc(&(x)->f_count)
+#define get_file(x)	atomic_inc(&(x)->f_count)
 #define file_count(x)	atomic_read(&(x)->f_count)
 
 #define	MAX_NON_LFS	((1UL<<31) - 1)
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 36e5d269612f..c57ff2fcb30a 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -19,6 +19,7 @@
 #ifndef _LINUX_RADIX_TREE_H
 #define _LINUX_RADIX_TREE_H
 
+#include <linux/sched.h>
 #include <linux/preempt.h>
 #include <linux/types.h>
 
diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h
deleted file mode 100644
index e1adbba14b67..000000000000
--- a/include/linux/rcuref.h
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * rcuref.h
- *
- * Reference counting for elements of lists/arrays protected by
- * RCU.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2005
- *
- * Author: Dipankar Sarma <dipankar@in.ibm.com>
- *	   Ravikiran Thirumalai <kiran_th@gmail.com>
- *
- * See Documentation/RCU/rcuref.txt for detailed user guide.
- *
- */
-
-#ifndef _RCUREF_H_
-#define _RCUREF_H_
-
-#ifdef __KERNEL__
-
-#include <linux/types.h>
-#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <asm/atomic.h>
-
-/*
- * These APIs work on traditional atomic_t counters used in the
- * kernel for reference counting. Under special circumstances
- * where a lock-free get() operation races with a put() operation
- * these APIs can be used. See Documentation/RCU/rcuref.txt.
- */
-
-#ifdef __HAVE_ARCH_CMPXCHG
-
-/**
- * rcuref_inc - increment refcount for object.
- * @rcuref: reference counter in the object in question.
- *
- * This should be used only for objects where we use RCU and
- * use the rcuref_inc_lf() api to acquire a reference
- * in a lock-free reader-side critical section.
- */
-static inline void rcuref_inc(atomic_t *rcuref)
-{
-	atomic_inc(rcuref);
-}
-
-/**
- * rcuref_dec - decrement refcount for object.
- * @rcuref: reference counter in the object in question.
- *
- * This should be used only for objects where we use RCU and
- * use the rcuref_inc_lf() api to acquire a reference
- * in a lock-free reader-side critical section.
- */
-static inline void rcuref_dec(atomic_t *rcuref)
-{
-	atomic_dec(rcuref);
-}
-
-/**
- * rcuref_dec_and_test - decrement refcount for object and test
- * @rcuref: reference counter in the object.
- * @release: pointer to the function that will clean up the object
- *	     when the last reference to the object is released.
- *	     This pointer is required.
- *
- * Decrement the refcount, and if 0, return 1. Else return 0.
- *
- * This should be used only for objects where we use RCU and
- * use the rcuref_inc_lf() api to acquire a reference
- * in a lock-free reader-side critical section.
- */
-static inline int rcuref_dec_and_test(atomic_t *rcuref)
-{
-	return atomic_dec_and_test(rcuref);
-}
-
-/*
- * cmpxchg is needed on UP too, if deletions to the list/array can happen
- * in interrupt context.
- */
-
-/**
- * rcuref_inc_lf - Take reference to an object in a read-side
- * critical section protected by RCU.
- * @rcuref: reference counter in the object in question.
- *
- * Try and increment the refcount by 1.  The increment might fail if
- * the reference counter has been through a 1 to 0 transition and
- * is no longer part of the lock-free list.
- * Returns non-zero on successful increment and zero otherwise.
- */
-static inline int rcuref_inc_lf(atomic_t *rcuref)
-{
-	int c, old;
-	c = atomic_read(rcuref);
-	while (c && (old = cmpxchg(&rcuref->counter, c, c + 1)) != c)
-		c = old;
-	return c;
-}
-
-#else				/* !__HAVE_ARCH_CMPXCHG */
-
-extern spinlock_t __rcuref_hash[];
-
-/*
- * Use a hash table of locks to protect the reference count
- * since cmpxchg is not available in this arch.
- */
-#ifdef	CONFIG_SMP
-#define RCUREF_HASH_SIZE	4
-#define RCUREF_HASH(k) \
-	(&__rcuref_hash[(((unsigned long)k)>>8) & (RCUREF_HASH_SIZE-1)])
-#else
-#define	RCUREF_HASH_SIZE	1
-#define RCUREF_HASH(k) 	&__rcuref_hash[0]
-#endif				/* CONFIG_SMP */
-
-/**
- * rcuref_inc - increment refcount for object.
- * @rcuref: reference counter in the object in question.
- *
- * This should be used only for objects where we use RCU and
- * use the rcuref_inc_lf() api to acquire a reference in a lock-free
- * reader-side critical section.
- */
-static inline void rcuref_inc(atomic_t *rcuref)
-{
-	unsigned long flags;
-	spin_lock_irqsave(RCUREF_HASH(rcuref), flags);
-	rcuref->counter += 1;
-	spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags);
-}
-
-/**
- * rcuref_dec - decrement refcount for object.
- * @rcuref: reference counter in the object in question.
- *
- * This should be used only for objects where we use RCU and
- * use the rcuref_inc_lf() api to acquire a reference in a lock-free
- * reader-side critical section.
- */
-static inline void rcuref_dec(atomic_t *rcuref)
-{
-	unsigned long flags;
-	spin_lock_irqsave(RCUREF_HASH(rcuref), flags);
-	rcuref->counter -= 1;
-	spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags);
-}
-
-/**
- * rcuref_dec_and_test - decrement refcount for object and test
- * @rcuref: reference counter in the object.
- * @release: pointer to the function that will clean up the object
- *	     when the last reference to the object is released.
- *	     This pointer is required.
- *
- * Decrement the refcount, and if 0, return 1. Else return 0.
- *
- * This should be used only for objects where we use RCU and
- * use the rcuref_inc_lf() api to acquire a reference in a lock-free
- * reader-side critical section.
- */
-static inline int rcuref_dec_and_test(atomic_t *rcuref)
-{
-	unsigned long flags;
-	spin_lock_irqsave(RCUREF_HASH(rcuref), flags);
-	rcuref->counter--;
-	if (!rcuref->counter) {
-		spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags);
-		return 1;
-	} else {
-		spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags);
-		return 0;
-	}
-}
-
-/**
- * rcuref_inc_lf - Take reference to an object of a lock-free collection
- * by traversing a lock-free list/array.
- * @rcuref: reference counter in the object in question.
- *
- * Try and increment the refcount by 1.  The increment might fail if
- * the reference counter has been through a 1 to 0 transition and
- * object is no longer part of the lock-free list.
- * Returns non-zero on successful increment and zero otherwise.
- */
-static inline int rcuref_inc_lf(atomic_t *rcuref)
-{
-	int ret;
-	unsigned long flags;
-	spin_lock_irqsave(RCUREF_HASH(rcuref), flags);
-	if (rcuref->counter)
-		ret = rcuref->counter++;
-	else
-		ret = 0;
-	spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags);
-	return ret;
-}
-
-
-#endif /* !__HAVE_ARCH_CMPXCHG */
-
-#endif /* __KERNEL__ */
-#endif /* _RCUREF_H_ */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 0a669bd2f6d1..30b0bba03859 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -46,7 +46,6 @@
 #include <linux/percpu.h>
 #include <linux/notifier.h>
 #include <linux/rcupdate.h>
-#include <linux/rcuref.h>
 #include <linux/cpu.h>
 
 /* Definition for rcupdate control block. */
@@ -74,19 +73,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
 static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
 static int maxbatch = 10000;
 
-#ifndef __HAVE_ARCH_CMPXCHG
-/*
- * We use an array of spinlocks for the rcurefs -- similar to ones in sparc
- * 32 bit atomic_t implementations, and a hash function similar to that
- * for our refcounting needs.
- * Can't help multiprocessors which donot have cmpxchg :(
- */
-
-spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = {
-	[0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED
-};
-#endif
-
 /**
  * call_rcu - Queue an RCU callback for invocation after a grace period.
  * @head: structure to be used for queueing the RCU updates.
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 36efe088ad81..75174c81529a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -39,7 +39,6 @@
 #include <linux/moduleparam.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
-#include <linux/rcuref.h>
 #include <linux/cpu.h>
 #include <linux/random.h>
 #include <linux/delay.h>
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 3d496eae1b47..6647204e4636 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1663,7 +1663,7 @@ static inline void flush_unauthorized_files(struct files_struct * files)
 						continue;
 					}
 					if (devnull) {
-						rcuref_inc(&devnull->f_count);
+						get_file(devnull);
 					} else {
 						devnull = dentry_open(dget(selinux_null), mntget(selinuxfs_mount), O_RDWR);
 						if (!devnull) {
-- 
cgit v1.2.3-71-gd317


From b3f3d6141f8636f627bf19fd44eaf59a52637ac8 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Sun, 8 Jan 2006 01:02:20 -0800
Subject: [PATCH] ELF: symbol table type additions

Needed for the Novell kernel debugger and perhaps some per-cpu data on x86_64
in the future.

Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/elf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/elf.h b/include/linux/elf.h
index ff955dbf510d..d3bfacb24496 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -151,6 +151,8 @@ typedef __s64	Elf64_Sxword;
 #define STT_FUNC    2
 #define STT_SECTION 3
 #define STT_FILE    4
+#define STT_COMMON  5
+#define STT_TLS     6
 
 #define ELF_ST_BIND(x)		((x) >> 4)
 #define ELF_ST_TYPE(x)		(((unsigned int) x) & 0xf)
-- 
cgit v1.2.3-71-gd317


From 907f2c77d1653ce235e8e1fd6ce5c46005814e78 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@us.ibm.com>
Date: Sun, 8 Jan 2006 01:02:24 -0800
Subject: [PATCH] relayfs: export relayfs_create_file() with fileops param

This patch adds a mandatory fileops param to relayfs_create_file() and exports
that function so that clients can use it to create files defined by their own
set of file operations, in relayfs.  The purpose is to allow relayfs
applications to create their own set of 'control' files alongside their relay
files in relayfs rather than having to create them in /proc or debugfs for
instance.  relayfs_create_file() is also used by relay_open_buf() to create
the relay files for a channel.  In this case, a pointer to
relayfs_file_operations is passed in, along with a pointer to the buffer
associated with the file.

Signed-off-by: Tom Zanussi <zanussi@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/relayfs/inode.c         | 41 ++++++++++++++++++++++++++---------------
 fs/relayfs/relay.c         |  3 ++-
 fs/relayfs/relay.h         |  4 ----
 include/linux/relayfs_fs.h |  7 ++++++-
 4 files changed, 34 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
index 379e07cd2b34..a5e6d4f2efb9 100644
--- a/fs/relayfs/inode.c
+++ b/fs/relayfs/inode.c
@@ -33,7 +33,9 @@ static struct backing_dev_info		relayfs_backing_dev_info = {
 	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
 };
 
-static struct inode *relayfs_get_inode(struct super_block *sb, int mode,
+static struct inode *relayfs_get_inode(struct super_block *sb,
+				       int mode,
+ 				       struct file_operations *fops,
 				       void *data)
 {
 	struct inode *inode;
@@ -51,8 +53,8 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode,
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	switch (mode & S_IFMT) {
 	case S_IFREG:
-		inode->i_fop = &relayfs_file_operations;
-		RELAYFS_I(inode)->buf = data;
+		inode->i_fop = fops;
+		RELAYFS_I(inode)->data = data;
 		break;
 	case S_IFDIR:
 		inode->i_op = &simple_dir_inode_operations;
@@ -73,6 +75,7 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode,
  *	@name: the name of the file to create
  *	@parent: parent directory
  *	@mode: mode
+ *	@fops: file operations to use for the file
  *	@data: user-associated data for this file
  *
  *	Returns the new dentry, NULL on failure
@@ -82,6 +85,7 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode,
 static struct dentry *relayfs_create_entry(const char *name,
 					   struct dentry *parent,
 					   int mode,
+					   struct file_operations *fops,
 					   void *data)
 {
 	struct dentry *d;
@@ -117,7 +121,7 @@ static struct dentry *relayfs_create_entry(const char *name,
 		goto release_mount;
 	}
 
-	inode = relayfs_get_inode(parent->d_inode->i_sb, mode, data);
+	inode = relayfs_get_inode(parent->d_inode->i_sb, mode, fops, data);
 	if (!inode) {
 		d = NULL;
 		goto release_mount;
@@ -145,20 +149,26 @@ exit:
  *	@name: the name of the file to create
  *	@parent: parent directory
  *	@mode: mode, if not specied the default perms are used
+ *	@fops: file operations to use for the file
  *	@data: user-associated data for this file
  *
  *	Returns file dentry if successful, NULL otherwise.
  *
  *	The file will be created user r on behalf of current user.
  */
-struct dentry *relayfs_create_file(const char *name, struct dentry *parent,
-				   int mode, void *data)
+struct dentry *relayfs_create_file(const char *name,
+				   struct dentry *parent,
+				   int mode,
+				   struct file_operations *fops,
+				   void *data)
 {
+	BUG_ON(!fops);
+
 	if (!mode)
 		mode = S_IRUSR;
 	mode = (mode & S_IALLUGO) | S_IFREG;
 
-	return relayfs_create_entry(name, parent, mode, data);
+	return relayfs_create_entry(name, parent, mode, fops, data);
 }
 
 /**
@@ -173,7 +183,7 @@ struct dentry *relayfs_create_file(const char *name, struct dentry *parent,
 struct dentry *relayfs_create_dir(const char *name, struct dentry *parent)
 {
 	int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
-	return relayfs_create_entry(name, parent, mode, NULL);
+	return relayfs_create_entry(name, parent, mode, NULL, NULL);
 }
 
 /**
@@ -234,7 +244,7 @@ int relayfs_remove_dir(struct dentry *dentry)
  */
 static int relayfs_open(struct inode *inode, struct file *filp)
 {
-	struct rchan_buf *buf = RELAYFS_I(inode)->buf;
+	struct rchan_buf *buf = RELAYFS_I(inode)->data;
 	kref_get(&buf->kref);
 
 	return 0;
@@ -250,7 +260,7 @@ static int relayfs_open(struct inode *inode, struct file *filp)
 static int relayfs_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
-	return relay_mmap_buf(RELAYFS_I(inode)->buf, vma);
+	return relay_mmap_buf(RELAYFS_I(inode)->data, vma);
 }
 
 /**
@@ -264,7 +274,7 @@ static unsigned int relayfs_poll(struct file *filp, poll_table *wait)
 {
 	unsigned int mask = 0;
 	struct inode *inode = filp->f_dentry->d_inode;
-	struct rchan_buf *buf = RELAYFS_I(inode)->buf;
+	struct rchan_buf *buf = RELAYFS_I(inode)->data;
 
 	if (buf->finalized)
 		return POLLERR;
@@ -288,7 +298,7 @@ static unsigned int relayfs_poll(struct file *filp, poll_table *wait)
  */
 static int relayfs_release(struct inode *inode, struct file *filp)
 {
-	struct rchan_buf *buf = RELAYFS_I(inode)->buf;
+	struct rchan_buf *buf = RELAYFS_I(inode)->data;
 	kref_put(&buf->kref, relay_remove_buf);
 
 	return 0;
@@ -450,7 +460,7 @@ static ssize_t relayfs_read(struct file *filp,
 			    loff_t *ppos)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
-	struct rchan_buf *buf = RELAYFS_I(inode)->buf;
+	struct rchan_buf *buf = RELAYFS_I(inode)->data;
 	size_t read_start, avail;
 	ssize_t ret = 0;
 	void *from;
@@ -485,7 +495,7 @@ static struct inode *relayfs_alloc_inode(struct super_block *sb)
 	struct relayfs_inode_info *p = kmem_cache_alloc(relayfs_inode_cachep, SLAB_KERNEL);
 	if (!p)
 		return NULL;
-	p->buf = NULL;
+	p->data = NULL;
 
 	return &p->vfs_inode;
 }
@@ -531,7 +541,7 @@ static int relayfs_fill_super(struct super_block * sb, void * data, int silent)
 	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
 	sb->s_magic = RELAYFS_MAGIC;
 	sb->s_op = &relayfs_ops;
-	inode = relayfs_get_inode(sb, mode, NULL);
+	inode = relayfs_get_inode(sb, mode, NULL, NULL);
 
 	if (!inode)
 		return -ENOMEM;
@@ -589,6 +599,7 @@ module_exit(exit_relayfs_fs)
 EXPORT_SYMBOL_GPL(relayfs_file_operations);
 EXPORT_SYMBOL_GPL(relayfs_create_dir);
 EXPORT_SYMBOL_GPL(relayfs_remove_dir);
+EXPORT_SYMBOL_GPL(relayfs_create_file);
 
 MODULE_AUTHOR("Tom Zanussi <zanussi@us.ibm.com> and Karim Yaghmour <karim@opersys.com>");
 MODULE_DESCRIPTION("Relay Filesystem");
diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c
index 7fbda177ad8f..a9cd5585c45c 100644
--- a/fs/relayfs/relay.c
+++ b/fs/relayfs/relay.c
@@ -176,7 +176,8 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan,
  		return NULL;
 
 	/* Create file in fs */
-	dentry = relayfs_create_file(filename, parent, S_IRUSR, buf);
+	dentry = relayfs_create_file(filename, parent, S_IRUSR,
+				     &relayfs_file_operations, buf);
  	if (!dentry) {
  		relay_destroy_buf(buf);
 		return NULL;
diff --git a/fs/relayfs/relay.h b/fs/relayfs/relay.h
index c325bb243549..0993d3e5753b 100644
--- a/fs/relayfs/relay.h
+++ b/fs/relayfs/relay.h
@@ -1,10 +1,6 @@
 #ifndef _RELAY_H
 #define _RELAY_H
 
-struct dentry *relayfs_create_file(const char *name,
-				   struct dentry *parent,
-				   int mode,
-				   void *data);
 extern int relayfs_remove(struct dentry *dentry);
 extern int relay_buf_empty(struct rchan_buf *buf);
 extern void relay_destroy_channel(struct kref *kref);
diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h
index fb7e80737325..a122df2d9880 100644
--- a/include/linux/relayfs_fs.h
+++ b/include/linux/relayfs_fs.h
@@ -70,7 +70,7 @@ struct rchan
 struct relayfs_inode_info
 {
 	struct inode vfs_inode;
-	struct rchan_buf *buf;
+	void *data;
 };
 
 static inline struct relayfs_inode_info *RELAYFS_I(struct inode *inode)
@@ -148,6 +148,11 @@ extern size_t relay_switch_subbuf(struct rchan_buf *buf,
 extern struct dentry *relayfs_create_dir(const char *name,
 					 struct dentry *parent);
 extern int relayfs_remove_dir(struct dentry *dentry);
+extern struct dentry *relayfs_create_file(const char *name,
+					  struct dentry *parent,
+					  int mode,
+					  struct file_operations *fops,
+					  void *data);
 
 /**
  *	relay_write - write data into the channel
-- 
cgit v1.2.3-71-gd317


From 7431733791feb0b19453d8047b0723c744667040 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@us.ibm.com>
Date: Sun, 8 Jan 2006 01:02:25 -0800
Subject: [PATCH] relayfs: add relayfs_remove_file()

This patch adds and exports relayfs_remove_file(), for API symmetry (with
relayfs_create_file()).

Signed-off-by: Tom Zanussi <zanussi@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/relayfs/inode.c         | 12 ++++++++++++
 include/linux/relayfs_fs.h |  1 +
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
index a5e6d4f2efb9..b2f50655736b 100644
--- a/fs/relayfs/inode.c
+++ b/fs/relayfs/inode.c
@@ -224,6 +224,17 @@ int relayfs_remove(struct dentry *dentry)
 	return error;
 }
 
+/**
+ *	relayfs_remove_file - remove a file from relay filesystem
+ *	@dentry: directory dentry
+ *
+ *	Returns 0 if successful, negative otherwise.
+ */
+int relayfs_remove_file(struct dentry *dentry)
+{
+	return relayfs_remove(dentry);
+}
+
 /**
  *	relayfs_remove_dir - remove a directory in the relay filesystem
  *	@dentry: directory dentry
@@ -600,6 +611,7 @@ EXPORT_SYMBOL_GPL(relayfs_file_operations);
 EXPORT_SYMBOL_GPL(relayfs_create_dir);
 EXPORT_SYMBOL_GPL(relayfs_remove_dir);
 EXPORT_SYMBOL_GPL(relayfs_create_file);
+EXPORT_SYMBOL_GPL(relayfs_remove_file);
 
 MODULE_AUTHOR("Tom Zanussi <zanussi@us.ibm.com> and Karim Yaghmour <karim@opersys.com>");
 MODULE_DESCRIPTION("Relay Filesystem");
diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h
index a122df2d9880..921540b1cdf8 100644
--- a/include/linux/relayfs_fs.h
+++ b/include/linux/relayfs_fs.h
@@ -153,6 +153,7 @@ extern struct dentry *relayfs_create_file(const char *name,
 					  int mode,
 					  struct file_operations *fops,
 					  void *data);
+extern int relayfs_remove_file(struct dentry *dentry);
 
 /**
  *	relay_write - write data into the channel
-- 
cgit v1.2.3-71-gd317


From aaea25d7a68a7f72e167dc1698b66a15edc71883 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@us.ibm.com>
Date: Sun, 8 Jan 2006 01:02:26 -0800
Subject: [PATCH] relayfs: remove unused alloc/destroy_inode()

Since we're no longer using relayfs_inode_info, remove relayfs_alloc_inode()
and relayfs_destroy_inode() along with the relayfs inode cache.

Signed-off-by: Tom Zanussi <zanussi@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/relayfs/inode.c         | 46 +---------------------------------------------
 include/linux/relayfs_fs.h | 14 --------------
 2 files changed, 1 insertion(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
index 7f6d2c8e91c2..b4c3e0466e98 100644
--- a/fs/relayfs/inode.c
+++ b/fs/relayfs/inode.c
@@ -26,7 +26,6 @@
 
 static struct vfsmount *		relayfs_mount;
 static int				relayfs_mount_count;
-static kmem_cache_t *			relayfs_inode_cachep;
 
 static struct backing_dev_info		relayfs_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
@@ -499,34 +498,6 @@ out:
 	return ret;
 }
 
-/**
- *	relayfs alloc_inode() implementation
- */
-static struct inode *relayfs_alloc_inode(struct super_block *sb)
-{
-	struct relayfs_inode_info *p = kmem_cache_alloc(relayfs_inode_cachep, SLAB_KERNEL);
-	if (!p)
-		return NULL;
-	p->data = NULL;
-
-	return &p->vfs_inode;
-}
-
-/**
- *	relayfs destroy_inode() implementation
- */
-static void relayfs_destroy_inode(struct inode *inode)
-{
-	kmem_cache_free(relayfs_inode_cachep, RELAYFS_I(inode));
-}
-
-static void init_once(void *p, kmem_cache_t *cachep, unsigned long flags)
-{
-	struct relayfs_inode_info *i = p;
-	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&i->vfs_inode);
-}
-
 struct file_operations relayfs_file_operations = {
 	.open		= relayfs_open,
 	.poll		= relayfs_poll,
@@ -539,8 +510,6 @@ struct file_operations relayfs_file_operations = {
 static struct super_operations relayfs_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
-	.alloc_inode	= relayfs_alloc_inode,
-	.destroy_inode	= relayfs_destroy_inode,
 };
 
 static int relayfs_fill_super(struct super_block * sb, void * data, int silent)
@@ -584,25 +553,12 @@ static struct file_system_type relayfs_fs_type = {
 
 static int __init init_relayfs_fs(void)
 {
-	int err;
-
-	relayfs_inode_cachep = kmem_cache_create("relayfs_inode_cache",
-				sizeof(struct relayfs_inode_info), 0,
-				0, init_once, NULL);
-	if (!relayfs_inode_cachep)
-		return -ENOMEM;
-
-	err = register_filesystem(&relayfs_fs_type);
-	if (err)
-		kmem_cache_destroy(relayfs_inode_cachep);
-
-	return err;
+	return register_filesystem(&relayfs_fs_type);
 }
 
 static void __exit exit_relayfs_fs(void)
 {
 	unregister_filesystem(&relayfs_fs_type);
-	kmem_cache_destroy(relayfs_inode_cachep);
 }
 
 module_init(init_relayfs_fs)
diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h
index 921540b1cdf8..8200ecbe6e0f 100644
--- a/include/linux/relayfs_fs.h
+++ b/include/linux/relayfs_fs.h
@@ -64,20 +64,6 @@ struct rchan
 	struct rchan_buf *buf[NR_CPUS]; /* per-cpu channel buffers */
 };
 
-/*
- * Relayfs inode
- */
-struct relayfs_inode_info
-{
-	struct inode vfs_inode;
-	void *data;
-};
-
-static inline struct relayfs_inode_info *RELAYFS_I(struct inode *inode)
-{
-	return container_of(inode, struct relayfs_inode_info, vfs_inode);
-}
-
 /*
  * Relay channel client callbacks
  */
-- 
cgit v1.2.3-71-gd317


From 08c541a7ade230883c48225f4ea406a0117e7c2f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@us.ibm.com>
Date: Sun, 8 Jan 2006 01:02:28 -0800
Subject: [PATCH] relayfs: add support for relay files in other filesystems

This patch adds a couple of callback functions that allow a client to hook
into relay_open()/close() and supply the files that will be used to represent
the channel buffers; the default implementation if no callbacks are defined is
to create the files in relayfs.  This is to support the creation and use of
relay files in other filesystems such as debugfs, as implied by the fact that
relayfs_file_operations are exported.

Signed-off-by: Tom Zanussi <zanussi@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/relayfs/buffers.c       |  2 +-
 fs/relayfs/relay.c         | 30 ++++++++++++++++++++++++++++--
 include/linux/relayfs_fs.h | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/relayfs/buffers.c b/fs/relayfs/buffers.c
index 667b529944c5..10187812771e 100644
--- a/fs/relayfs/buffers.c
+++ b/fs/relayfs/buffers.c
@@ -185,6 +185,6 @@ void relay_destroy_buf(struct rchan_buf *buf)
 void relay_remove_buf(struct kref *kref)
 {
 	struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
-	relayfs_remove(buf->dentry);
+	buf->chan->cb->remove_buf_file(buf->dentry);
 	relay_destroy_buf(buf);
 }
diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c
index a9cd5585c45c..b9bb56903272 100644
--- a/fs/relayfs/relay.c
+++ b/fs/relayfs/relay.c
@@ -80,11 +80,33 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf,
 {
 }
 
+/*
+ * create_buf_file_create() default callback.  Creates file to represent buf.
+ */
+static struct dentry *create_buf_file_default_callback(const char *filename,
+						       struct dentry *parent,
+						       int mode,
+						       struct rchan_buf *buf)
+{
+	return relayfs_create_file(filename, parent, mode,
+				   &relayfs_file_operations, buf);
+}
+
+/*
+ * remove_buf_file() default callback.  Removes file representing relay buffer.
+ */
+static int remove_buf_file_default_callback(struct dentry *dentry)
+{
+	return relayfs_remove(dentry);
+}
+
 /* relay channel default callbacks */
 static struct rchan_callbacks default_channel_callbacks = {
 	.subbuf_start = subbuf_start_default_callback,
 	.buf_mapped = buf_mapped_default_callback,
 	.buf_unmapped = buf_unmapped_default_callback,
+	.create_buf_file = create_buf_file_default_callback,
+	.remove_buf_file = remove_buf_file_default_callback,
 };
 
 /**
@@ -176,8 +198,8 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan,
  		return NULL;
 
 	/* Create file in fs */
-	dentry = relayfs_create_file(filename, parent, S_IRUSR,
-				     &relayfs_file_operations, buf);
+ 	dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR,
+ 					   buf);
  	if (!dentry) {
  		relay_destroy_buf(buf);
 		return NULL;
@@ -220,6 +242,10 @@ static inline void setup_callbacks(struct rchan *chan,
 		cb->buf_mapped = buf_mapped_default_callback;
 	if (!cb->buf_unmapped)
 		cb->buf_unmapped = buf_unmapped_default_callback;
+	if (!cb->create_buf_file)
+		cb->create_buf_file = create_buf_file_default_callback;
+	if (!cb->remove_buf_file)
+		cb->remove_buf_file = remove_buf_file_default_callback;
 	chan->cb = cb;
 }
 
diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h
index 8200ecbe6e0f..8c2177105857 100644
--- a/include/linux/relayfs_fs.h
+++ b/include/linux/relayfs_fs.h
@@ -110,6 +110,40 @@ struct rchan_callbacks
 	 */
         void (*buf_unmapped)(struct rchan_buf *buf,
 			     struct file *filp);
+	/*
+	 * create_buf_file - create file to represent a relayfs channel buffer
+	 * @filename: the name of the file to create
+	 * @parent: the parent of the file to create
+	 * @mode: the mode of the file to create
+	 * @buf: the channel buffer
+	 *
+	 * Called during relay_open(), once for each per-cpu buffer,
+	 * to allow the client to create a file to be used to
+	 * represent the corresponding channel buffer.  If the file is
+	 * created outside of relayfs, the parent must also exist in
+	 * that filesystem.
+	 *
+	 * The callback should return the dentry of the file created
+	 * to represent the relay buffer.
+	 *
+	 * See Documentation/filesystems/relayfs.txt for more info.
+	 */
+	struct dentry *(*create_buf_file)(const char *filename,
+					  struct dentry *parent,
+					  int mode,
+					  struct rchan_buf *buf);
+
+	/*
+	 * remove_buf_file - remove file representing a relayfs channel buffer
+	 * @dentry: the dentry of the file to remove
+	 *
+	 * Called during relay_close(), once for each per-cpu buffer,
+	 * to allow the client to remove a file used to represent a
+	 * channel buffer.
+	 *
+	 * The callback should return 0 if successful, negative if not.
+	 */
+	int (*remove_buf_file)(struct dentry *dentry);
 };
 
 /*
-- 
cgit v1.2.3-71-gd317


From e6c08367b8fc6dce6dfd1106f53f6ef28215b313 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@us.ibm.com>
Date: Sun, 8 Jan 2006 01:02:29 -0800
Subject: [PATCH] relayfs: add support for global relay buffers

This patch adds the optional is_global outparam to the create_buf_file()
callback.  This can be used by clients to create a single global relayfs
buffer instead of the default per-cpu buffers.  This was suggested as being
useful for certain debugging applications where it's more convenient to be
able to get all the data from a single channel without having to go to the
bother of dealing with per-cpu files.

Signed-off-by: Tom Zanussi <zanussi@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/relayfs/relay.c         | 35 +++++++++++++++++++++++++----------
 include/linux/relayfs_fs.h |  8 +++++++-
 2 files changed, 32 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c
index b9bb56903272..2935a6ab8ffa 100644
--- a/fs/relayfs/relay.c
+++ b/fs/relayfs/relay.c
@@ -86,7 +86,8 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf,
 static struct dentry *create_buf_file_default_callback(const char *filename,
 						       struct dentry *parent,
 						       int mode,
-						       struct rchan_buf *buf)
+						       struct rchan_buf *buf,
+						       int *is_global)
 {
 	return relayfs_create_file(filename, parent, mode,
 				   &relayfs_file_operations, buf);
@@ -170,14 +171,16 @@ static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
 void relay_reset(struct rchan *chan)
 {
 	unsigned int i;
+	struct rchan_buf *prev = NULL;
 
 	if (!chan)
 		return;
 
 	for (i = 0; i < NR_CPUS; i++) {
-		if (!chan->buf[i])
-			continue;
+		if (!chan->buf[i] || chan->buf[i] == prev)
+			break;
 		__relay_reset(chan->buf[i], 0);
+		prev = chan->buf[i];
 	}
 }
 
@@ -188,18 +191,22 @@ void relay_reset(struct rchan *chan)
  */
 static struct rchan_buf *relay_open_buf(struct rchan *chan,
 					const char *filename,
-					struct dentry *parent)
+					struct dentry *parent,
+					int *is_global)
 {
 	struct rchan_buf *buf;
 	struct dentry *dentry;
 
+	if (*is_global)
+		return chan->buf[0];
+
  	buf = relay_create_buf(chan);
  	if (!buf)
  		return NULL;
 
 	/* Create file in fs */
  	dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR,
- 					   buf);
+ 					   buf, is_global);
  	if (!dentry) {
  		relay_destroy_buf(buf);
 		return NULL;
@@ -273,6 +280,7 @@ struct rchan *relay_open(const char *base_filename,
 	unsigned int i;
 	struct rchan *chan;
 	char *tmpname;
+	int is_global = 0;
 
 	if (!base_filename)
 		return NULL;
@@ -297,7 +305,8 @@ struct rchan *relay_open(const char *base_filename,
 
 	for_each_online_cpu(i) {
 		sprintf(tmpname, "%s%d", base_filename, i);
-		chan->buf[i] = relay_open_buf(chan, tmpname, parent);
+		chan->buf[i] = relay_open_buf(chan, tmpname, parent,
+					      &is_global);
 		chan->buf[i]->cpu = i;
 		if (!chan->buf[i])
 			goto free_bufs;
@@ -311,6 +320,8 @@ free_bufs:
 		if (!chan->buf[i])
 			break;
 		relay_close_buf(chan->buf[i]);
+		if (is_global)
+			break;
 	}
 	kfree(tmpname);
 
@@ -420,14 +431,16 @@ void relay_destroy_channel(struct kref *kref)
 void relay_close(struct rchan *chan)
 {
 	unsigned int i;
+	struct rchan_buf *prev = NULL;
 
 	if (!chan)
 		return;
 
 	for (i = 0; i < NR_CPUS; i++) {
-		if (!chan->buf[i])
-			continue;
+		if (!chan->buf[i] || chan->buf[i] == prev)
+			break;
 		relay_close_buf(chan->buf[i]);
+		prev = chan->buf[i];
 	}
 
 	if (chan->last_toobig)
@@ -447,14 +460,16 @@ void relay_close(struct rchan *chan)
 void relay_flush(struct rchan *chan)
 {
 	unsigned int i;
+	struct rchan_buf *prev = NULL;
 
 	if (!chan)
 		return;
 
 	for (i = 0; i < NR_CPUS; i++) {
-		if (!chan->buf[i])
-			continue;
+		if (!chan->buf[i] || chan->buf[i] == prev)
+			break;
 		relay_switch_subbuf(chan->buf[i], 0);
+		prev = chan->buf[i];
 	}
 }
 
diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h
index 8c2177105857..30f45511b40d 100644
--- a/include/linux/relayfs_fs.h
+++ b/include/linux/relayfs_fs.h
@@ -116,6 +116,7 @@ struct rchan_callbacks
 	 * @parent: the parent of the file to create
 	 * @mode: the mode of the file to create
 	 * @buf: the channel buffer
+	 * @is_global: outparam - set non-zero if the buffer should be global
 	 *
 	 * Called during relay_open(), once for each per-cpu buffer,
 	 * to allow the client to create a file to be used to
@@ -126,12 +127,17 @@ struct rchan_callbacks
 	 * The callback should return the dentry of the file created
 	 * to represent the relay buffer.
 	 *
+	 * Setting the is_global outparam to a non-zero value will
+	 * cause relay_open() to create a single global buffer rather
+	 * than the default set of per-cpu buffers.
+	 *
 	 * See Documentation/filesystems/relayfs.txt for more info.
 	 */
 	struct dentry *(*create_buf_file)(const char *filename,
 					  struct dentry *parent,
 					  int mode,
-					  struct rchan_buf *buf);
+					  struct rchan_buf *buf,
+					  int *is_global);
 
 	/*
 	 * remove_buf_file - remove file representing a relayfs channel buffer
-- 
cgit v1.2.3-71-gd317


From 761da5c88aca34586e5b7295bd8b9be2438906f2 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@us.ibm.com>
Date: Sun, 8 Jan 2006 01:02:31 -0800
Subject: [PATCH] relayfs: cleanup, change relayfs_file_* to relay_file_*

This patch renames relayfs_file_operations to relay_file_operations, and the
file operations themselves from relayfs_XXX to relay_file_XXX, to make it more
clear that they refer to relay files.

Signed-off-by: Tom Zanussi <zanussi@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/relayfs/inode.c         | 89 ++++++++++++++++++++++++----------------------
 fs/relayfs/relay.c         |  2 +-
 include/linux/relayfs_fs.h |  5 ++-
 3 files changed, 50 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
index b4c3e0466e98..7b7f2cb5f0e1 100644
--- a/fs/relayfs/inode.c
+++ b/fs/relayfs/inode.c
@@ -247,13 +247,13 @@ int relayfs_remove_dir(struct dentry *dentry)
 }
 
 /**
- *	relayfs_open - open file op for relayfs files
+ *	relay_file_open - open file op for relay files
  *	@inode: the inode
  *	@filp: the file
  *
  *	Increments the channel buffer refcount.
  */
-static int relayfs_open(struct inode *inode, struct file *filp)
+static int relay_file_open(struct inode *inode, struct file *filp)
 {
 	struct rchan_buf *buf = inode->u.generic_ip;
 	kref_get(&buf->kref);
@@ -263,26 +263,26 @@ static int relayfs_open(struct inode *inode, struct file *filp)
 }
 
 /**
- *	relayfs_mmap - mmap file op for relayfs files
+ *	relay_file_mmap - mmap file op for relay files
  *	@filp: the file
  *	@vma: the vma describing what to map
  *
  *	Calls upon relay_mmap_buf to map the file into user space.
  */
-static int relayfs_mmap(struct file *filp, struct vm_area_struct *vma)
+static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	struct rchan_buf *buf = filp->private_data;
 	return relay_mmap_buf(buf, vma);
 }
 
 /**
- *	relayfs_poll - poll file op for relayfs files
+ *	relay_file_poll - poll file op for relay files
  *	@filp: the file
  *	@wait: poll table
  *
  *	Poll implemention.
  */
-static unsigned int relayfs_poll(struct file *filp, poll_table *wait)
+static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
 {
 	unsigned int mask = 0;
 	struct rchan_buf *buf = filp->private_data;
@@ -300,14 +300,14 @@ static unsigned int relayfs_poll(struct file *filp, poll_table *wait)
 }
 
 /**
- *	relayfs_release - release file op for relayfs files
+ *	relay_file_release - release file op for relay files
  *	@inode: the inode
  *	@filp: the file
  *
  *	Decrements the channel refcount, as the filesystem is
  *	no longer using it.
  */
-static int relayfs_release(struct inode *inode, struct file *filp)
+static int relay_file_release(struct inode *inode, struct file *filp)
 {
 	struct rchan_buf *buf = filp->private_data;
 	kref_put(&buf->kref, relay_remove_buf);
@@ -316,11 +316,11 @@ static int relayfs_release(struct inode *inode, struct file *filp)
 }
 
 /**
- *	relayfs_read_consume - update the consumed count for the buffer
+ *	relay_file_read_consume - update the consumed count for the buffer
  */
-static void relayfs_read_consume(struct rchan_buf *buf,
-				 size_t read_pos,
-				 size_t bytes_consumed)
+static void relay_file_read_consume(struct rchan_buf *buf,
+				    size_t read_pos,
+				    size_t bytes_consumed)
 {
 	size_t subbuf_size = buf->chan->subbuf_size;
 	size_t n_subbufs = buf->chan->n_subbufs;
@@ -343,9 +343,9 @@ static void relayfs_read_consume(struct rchan_buf *buf,
 }
 
 /**
- *	relayfs_read_avail - boolean, are there unconsumed bytes available?
+ *	relay_file_read_avail - boolean, are there unconsumed bytes available?
  */
-static int relayfs_read_avail(struct rchan_buf *buf, size_t read_pos)
+static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
 {
 	size_t bytes_produced, bytes_consumed, write_offset;
 	size_t subbuf_size = buf->chan->subbuf_size;
@@ -376,16 +376,16 @@ static int relayfs_read_avail(struct rchan_buf *buf, size_t read_pos)
 	if (bytes_produced == bytes_consumed)
 		return 0;
 
-	relayfs_read_consume(buf, read_pos, 0);
+	relay_file_read_consume(buf, read_pos, 0);
 
 	return 1;
 }
 
 /**
- *	relayfs_read_subbuf_avail - return bytes available in sub-buffer
+ *	relay_file_read_subbuf_avail - return bytes available in sub-buffer
  */
-static size_t relayfs_read_subbuf_avail(size_t read_pos,
-					struct rchan_buf *buf)
+static size_t relay_file_read_subbuf_avail(size_t read_pos,
+					   struct rchan_buf *buf)
 {
 	size_t padding, avail = 0;
 	size_t read_subbuf, read_offset, write_subbuf, write_offset;
@@ -407,14 +407,14 @@ static size_t relayfs_read_subbuf_avail(size_t read_pos,
 }
 
 /**
- *	relayfs_read_start_pos - find the first available byte to read
+ *	relay_file_read_start_pos - find the first available byte to read
  *
  *	If the read_pos is in the middle of padding, return the
  *	position of the first actually available byte, otherwise
  *	return the original value.
  */
-static size_t relayfs_read_start_pos(size_t read_pos,
-				     struct rchan_buf *buf)
+static size_t relay_file_read_start_pos(size_t read_pos,
+					struct rchan_buf *buf)
 {
 	size_t read_subbuf, padding, padding_start, padding_end;
 	size_t subbuf_size = buf->chan->subbuf_size;
@@ -433,11 +433,11 @@ static size_t relayfs_read_start_pos(size_t read_pos,
 }
 
 /**
- *	relayfs_read_end_pos - return the new read position
+ *	relay_file_read_end_pos - return the new read position
  */
-static size_t relayfs_read_end_pos(struct rchan_buf *buf,
-				   size_t read_pos,
-				   size_t count)
+static size_t relay_file_read_end_pos(struct rchan_buf *buf,
+				      size_t read_pos,
+				      size_t count)
 {
 	size_t read_subbuf, padding, end_pos;
 	size_t subbuf_size = buf->chan->subbuf_size;
@@ -456,7 +456,7 @@ static size_t relayfs_read_end_pos(struct rchan_buf *buf,
 }
 
 /**
- *	relayfs_read - read file op for relayfs files
+ *	relay_file_read - read file op for relay files
  *	@filp: the file
  *	@buffer: the userspace buffer
  *	@count: number of bytes to read
@@ -465,10 +465,10 @@ static size_t relayfs_read_end_pos(struct rchan_buf *buf,
  *	Reads count bytes or the number of bytes available in the
  *	current sub-buffer being read, whichever is smaller.
  */
-static ssize_t relayfs_read(struct file *filp,
-			    char __user *buffer,
-			    size_t count,
-			    loff_t *ppos)
+static ssize_t relay_file_read(struct file *filp,
+			       char __user *buffer,
+			       size_t count,
+			       loff_t *ppos)
 {
 	struct rchan_buf *buf = filp->private_data;
 	struct inode *inode = filp->f_dentry->d_inode;
@@ -477,11 +477,11 @@ static ssize_t relayfs_read(struct file *filp,
 	void *from;
 
 	down(&inode->i_sem);
-	if(!relayfs_read_avail(buf, *ppos))
+	if(!relay_file_read_avail(buf, *ppos))
 		goto out;
 
-	read_start = relayfs_read_start_pos(*ppos, buf);
-	avail = relayfs_read_subbuf_avail(read_start, buf);
+	read_start = relay_file_read_start_pos(*ppos, buf);
+	avail = relay_file_read_subbuf_avail(read_start, buf);
 	if (!avail)
 		goto out;
 
@@ -491,20 +491,20 @@ static ssize_t relayfs_read(struct file *filp,
 		ret = -EFAULT;
 		goto out;
 	}
-	relayfs_read_consume(buf, read_start, count);
-	*ppos = relayfs_read_end_pos(buf, read_start, count);
+	relay_file_read_consume(buf, read_start, count);
+	*ppos = relay_file_read_end_pos(buf, read_start, count);
 out:
 	up(&inode->i_sem);
 	return ret;
 }
 
-struct file_operations relayfs_file_operations = {
-	.open		= relayfs_open,
-	.poll		= relayfs_poll,
-	.mmap		= relayfs_mmap,
-	.read		= relayfs_read,
+struct file_operations relay_file_operations = {
+	.open		= relay_file_open,
+	.poll		= relay_file_poll,
+	.mmap		= relay_file_mmap,
+	.read		= relay_file_read,
 	.llseek		= no_llseek,
-	.release	= relayfs_release,
+	.release	= relay_file_release,
 };
 
 static struct super_operations relayfs_ops = {
@@ -558,13 +558,18 @@ static int __init init_relayfs_fs(void)
 
 static void __exit exit_relayfs_fs(void)
 {
+
+
+
+
+
 	unregister_filesystem(&relayfs_fs_type);
 }
 
 module_init(init_relayfs_fs)
 module_exit(exit_relayfs_fs)
 
-EXPORT_SYMBOL_GPL(relayfs_file_operations);
+EXPORT_SYMBOL_GPL(relay_file_operations);
 EXPORT_SYMBOL_GPL(relayfs_create_dir);
 EXPORT_SYMBOL_GPL(relayfs_remove_dir);
 EXPORT_SYMBOL_GPL(relayfs_create_file);
diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c
index 2935a6ab8ffa..abf3ceaace49 100644
--- a/fs/relayfs/relay.c
+++ b/fs/relayfs/relay.c
@@ -90,7 +90,7 @@ static struct dentry *create_buf_file_default_callback(const char *filename,
 						       int *is_global)
 {
 	return relayfs_create_file(filename, parent, mode,
-				   &relayfs_file_operations, buf);
+				   &relay_file_operations, buf);
 }
 
 /*
diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h
index 30f45511b40d..7342e66247fb 100644
--- a/include/linux/relayfs_fs.h
+++ b/include/linux/relayfs_fs.h
@@ -279,10 +279,9 @@ static inline void subbuf_start_reserve(struct rchan_buf *buf,
 }
 
 /*
- * exported relayfs file operations, fs/relayfs/inode.c
+ * exported relay file operations, fs/relayfs/inode.c
  */
-
-extern struct file_operations relayfs_file_operations;
+extern struct file_operations relay_file_operations;
 
 #endif /* _LINUX_RELAYFS_FS_H */
 
-- 
cgit v1.2.3-71-gd317


From 6b9c7ed84837753a436415097063232422e29a35 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 8 Jan 2006 01:02:33 -0800
Subject: [PATCH] use ptrace_get_task_struct in various places

The ptrace_get_task_struct() helper that I added as part of the ptrace
consolidation is useful in variety of places that currently opencode it.
Switch them to the common helpers.

Add a ptrace_traceme() helper that needs to be explicitly called, and simplify
the ptrace_get_task_struct() interface.  We don't need the request argument
now, and we return the task_struct directly, using ERR_PTR() for error
returns.  It's a bit more code in the callers, but we have two sane routines
that do one thing well now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/ptrace.c     | 24 +++----------
 arch/ia64/ia32/sys_ia32.c      | 16 +++------
 arch/ia64/kernel/ptrace.c      |  9 +----
 arch/m32r/kernel/ptrace.c      | 22 +++---------
 arch/mips/kernel/ptrace32.c    | 26 ++++----------
 arch/powerpc/kernel/ptrace32.c | 28 ++++-----------
 arch/s390/kernel/ptrace.c      | 29 ++++------------
 arch/sparc/kernel/ptrace.c     | 35 ++++---------------
 arch/sparc64/kernel/ptrace.c   | 34 +++----------------
 arch/x86_64/ia32/ptrace32.c    | 44 ++++++------------------
 include/linux/ptrace.h         |  2 ++
 kernel/ptrace.c                | 77 +++++++++++++++++++++++++-----------------
 12 files changed, 105 insertions(+), 241 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c
index bbd37536d14e..9969d212e94d 100644
--- a/arch/alpha/kernel/ptrace.c
+++ b/arch/alpha/kernel/ptrace.c
@@ -265,30 +265,16 @@ do_sys_ptrace(long request, long pid, long addr, long data,
 	lock_kernel();
 	DBG(DBG_MEM, ("request=%ld pid=%ld addr=0x%lx data=0x%lx\n",
 		      request, pid, addr, data));
-	ret = -EPERM;
 	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out_notsk;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out_notsk;
-		/* set the ptrace bit in the process ptrace flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
+		ret = ptrace_traceme();
 		goto out_notsk;
 	}
-	if (pid == 1)		/* you may not mess with init */
-		goto out_notsk;
 
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
 		goto out_notsk;
+	}
 
 	if (request == PTRACE_ATTACH) {
 		ret = ptrace_attach(child);
diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c
index dc282710421a..9f8e8d558873 100644
--- a/arch/ia64/ia32/sys_ia32.c
+++ b/arch/ia64/ia32/sys_ia32.c
@@ -1761,21 +1761,15 @@ sys32_ptrace (int request, pid_t pid, unsigned int addr, unsigned int data)
 
 	lock_kernel();
 	if (request == PTRACE_TRACEME) {
-		ret = sys_ptrace(request, pid, addr, data);
+		ret = ptrace_traceme();
 		goto out;
 	}
 
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
 		goto out;
-	ret = -EPERM;
-	if (pid == 1)		/* no messing around with init! */
-		goto out_tsk;
+	}
 
 	if (request == PTRACE_ATTACH) {
 		ret = sys_ptrace(request, pid, addr, data);
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 4b19d0410632..8d88eeea02d1 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -1422,14 +1422,7 @@ sys_ptrace (long request, pid_t pid, unsigned long addr, unsigned long data)
 	lock_kernel();
 	ret = -EPERM;
 	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out;
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
+		ret = ptrace_traceme();
 		goto out;
 	}
 
diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c
index 078d2a0e71c2..9b75caaf5cec 100644
--- a/arch/m32r/kernel/ptrace.c
+++ b/arch/m32r/kernel/ptrace.c
@@ -762,28 +762,16 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 	int ret;
 
 	lock_kernel();
-	ret = -EPERM;
 	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
+		ret = ptrace_traceme();
 		goto out;
 	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
 
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
 		goto out;
+	}
 
 	if (request == PTRACE_ATTACH) {
 		ret = ptrace_attach(child);
diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c
index 9a9b04972132..7e55457a491f 100644
--- a/arch/mips/kernel/ptrace32.c
+++ b/arch/mips/kernel/ptrace32.c
@@ -57,30 +57,16 @@ asmlinkage int sys32_ptrace(int request, int pid, int addr, int data)
 	       (unsigned long) data);
 #endif
 	lock_kernel();
-	ret = -EPERM;
 	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		if ((ret = security_ptrace(current->parent, current)))
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
+		ret = ptrace_traceme();
 		goto out;
 	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
 
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
+		goto out;
+	}
 
 	if (request == PTRACE_ATTACH) {
 		ret = ptrace_attach(child);
diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c
index 61762640b877..826ee3d056de 100644
--- a/arch/powerpc/kernel/ptrace32.c
+++ b/arch/powerpc/kernel/ptrace32.c
@@ -45,33 +45,19 @@ long compat_sys_ptrace(int request, int pid, unsigned long addr,
 		       unsigned long data)
 {
 	struct task_struct *child;
-	int ret = -EPERM;
+	int ret;
 
 	lock_kernel();
 	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
+		ret = ptrace_traceme();
 		goto out;
 	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
 
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
+		goto out;
+	}
 
 	if (request == PTRACE_ATTACH) {
 		ret = ptrace_attach(child);
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 8ecda6d66de4..cc02232aa96e 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -712,35 +712,18 @@ sys_ptrace(long request, long pid, long addr, long data)
 	int ret;
 
 	lock_kernel();
-
 	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		ret = -EPERM;
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		goto out;
+		 ret = ptrace_traceme();
+		 goto out;
 	}
 
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out;
-
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
 		goto out;
+	}
 
 	ret = do_ptrace(child, request, addr, data);
-
 	put_task_struct(child);
 out:
 	unlock_kernel();
diff --git a/arch/sparc/kernel/ptrace.c b/arch/sparc/kernel/ptrace.c
index 475c4c13462c..fc470c0e9dc6 100644
--- a/arch/sparc/kernel/ptrace.c
+++ b/arch/sparc/kernel/ptrace.c
@@ -286,40 +286,17 @@ asmlinkage void do_ptrace(struct pt_regs *regs)
 			       s, (int) request, (int) pid, addr, data, addr2);
 	}
 #endif
-	if (request == PTRACE_TRACEME) {
-		int my_ret;
-
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED) {
-			pt_error_return(regs, EPERM);
-			goto out;
-		}
-		my_ret = security_ptrace(current->parent, current);
-		if (my_ret) {
-			pt_error_return(regs, -my_ret);
-			goto out;
-		}
 
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
+	if (request == PTRACE_TRACEME) {
+		ret = ptrace_traceme();
 		pt_succ_return(regs, 0);
 		goto out;
 	}
-#ifndef ALLOW_INIT_TRACING
-	if (pid == 1) {
-		/* Can't dork with init. */
-		pt_error_return(regs, EPERM);
-		goto out;
-	}
-#endif
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
 
-	if (!child) {
-		pt_error_return(regs, ESRCH);
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
+		pt_error_return(regs, -ret);
 		goto out;
 	}
 
diff --git a/arch/sparc64/kernel/ptrace.c b/arch/sparc64/kernel/ptrace.c
index 774ecbb8a031..84d3df2264cb 100644
--- a/arch/sparc64/kernel/ptrace.c
+++ b/arch/sparc64/kernel/ptrace.c
@@ -198,39 +198,15 @@ asmlinkage void do_ptrace(struct pt_regs *regs)
 	}
 #endif
 	if (request == PTRACE_TRACEME) {
-		int ret;
-
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED) {
-			pt_error_return(regs, EPERM);
-			goto out;
-		}
-		ret = security_ptrace(current->parent, current);
-		if (ret) {
-			pt_error_return(regs, -ret);
-			goto out;
-		}
-
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
+		ret = ptrace_traceme();
 		pt_succ_return(regs, 0);
 		goto out;
 	}
-#ifndef ALLOW_INIT_TRACING
-	if (pid == 1) {
-		/* Can't dork with init. */
-		pt_error_return(regs, EPERM);
-		goto out;
-	}
-#endif
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
 
-	if (!child) {
-		pt_error_return(regs, ESRCH);
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
+		pt_error_return(regs, -ret);
 		goto out;
 	}
 
diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c
index 2a925e2af390..5f4cdfa56901 100644
--- a/arch/x86_64/ia32/ptrace32.c
+++ b/arch/x86_64/ia32/ptrace32.c
@@ -196,36 +196,6 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
 
 #undef R32
 
-static struct task_struct *find_target(int request, int pid, int *err)
-{ 
-	struct task_struct *child;
-
-	*err = -EPERM; 
-	if (pid == 1)
-		return NULL; 
-
-	*err = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (child) { 
-		*err = -EPERM;
-		if (child->pid == 1) 
-			goto out;
-		*err = ptrace_check_attach(child, request == PTRACE_KILL); 
-		if (*err < 0) 
-			goto out;
-		return child; 
-	} 
- out:
-	if (child)
-	put_task_struct(child);
-	return NULL; 
-	
-} 
-
 asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
 {
 	struct task_struct *child;
@@ -254,9 +224,16 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
 		break;
 	} 
 
-	child = find_target(request, pid, &ret);
-	if (!child)
-		return ret;
+	if (request == PTRACE_TRACEME)
+		return ptrace_traceme();
+
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child))
+		return PTR_ERR(child);
+
+	ret = ptrace_check_attach(child, request == PTRACE_KILL);
+	if (ret < 0)
+		goto out;
 
 	childregs = (struct pt_regs *)(child->thread.rsp0 - sizeof(struct pt_regs)); 
 
@@ -373,6 +350,7 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
 		break;
 	}
 
+ out:
 	put_task_struct(child);
 	return ret;
 }
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index b2b3dba1298d..864791996b5f 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -80,6 +80,8 @@
 
 
 extern long arch_ptrace(struct task_struct *child, long request, long addr, long data);
+extern struct task_struct *ptrace_get_task_struct(pid_t pid);
+extern int ptrace_traceme(void);
 extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len);
 extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len);
 extern int ptrace_attach(struct task_struct *tsk);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 656476eedb1b..cceaf09ac413 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -408,54 +408,62 @@ int ptrace_request(struct task_struct *child, long request,
 	return ret;
 }
 
-#ifndef __ARCH_SYS_PTRACE
-static int ptrace_get_task_struct(long request, long pid,
-		struct task_struct **childp)
+/**
+ * ptrace_traceme  --  helper for PTRACE_TRACEME
+ *
+ * Performs checks and sets PT_PTRACED.
+ * Should be used by all ptrace implementations for PTRACE_TRACEME.
+ */
+int ptrace_traceme(void)
 {
-	struct task_struct *child;
 	int ret;
 
 	/*
-	 * Callers use child == NULL as an indication to exit early even
-	 * when the return value is 0, so make sure it is non-NULL here.
+	 * Are we already being traced?
+	 */
+	if (current->ptrace & PT_PTRACED)
+		return -EPERM;
+	ret = security_ptrace(current->parent, current);
+	if (ret)
+		return -EPERM;
+	/*
+	 * Set the ptrace bit in the process ptrace flags.
 	 */
-	*childp = NULL;
+	current->ptrace |= PT_PTRACED;
+	return 0;
+}
 
-	if (request == PTRACE_TRACEME) {
-		/*
-		 * Are we already being traced?
-		 */
-		if (current->ptrace & PT_PTRACED)
-			return -EPERM;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			return -EPERM;
-		/*
-		 * Set the ptrace bit in the process ptrace flags.
-		 */
-		current->ptrace |= PT_PTRACED;
-		return 0;
-	}
+/**
+ * ptrace_get_task_struct  --  grab a task struct reference for ptrace
+ * @pid:       process id to grab a task_struct reference of
+ *
+ * This function is a helper for ptrace implementations.  It checks
+ * permissions and then grabs a task struct for use of the actual
+ * ptrace implementation.
+ *
+ * Returns the task_struct for @pid or an ERR_PTR() on failure.
+ */
+struct task_struct *ptrace_get_task_struct(pid_t pid)
+{
+	struct task_struct *child;
 
 	/*
-	 * You may not mess with init
+	 * Tracing init is not allowed.
 	 */
 	if (pid == 1)
-		return -EPERM;
+		return ERR_PTR(-EPERM);
 
-	ret = -ESRCH;
 	read_lock(&tasklist_lock);
 	child = find_task_by_pid(pid);
 	if (child)
 		get_task_struct(child);
 	read_unlock(&tasklist_lock);
 	if (!child)
-		return -ESRCH;
-
-	*childp = child;
-	return 0;
+		return ERR_PTR(-ESRCH);
+	return child;
 }
 
+#ifndef __ARCH_SYS_PTRACE
 asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 {
 	struct task_struct *child;
@@ -465,9 +473,16 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 	 * This lock_kernel fixes a subtle race with suid exec
 	 */
 	lock_kernel();
-	ret = ptrace_get_task_struct(request, pid, &child);
-	if (!child)
+	if (request == PTRACE_TRACEME) {
+		ret = ptrace_traceme();
 		goto out;
+	}
+
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
+		goto out;
+	}
 
 	if (request == PTRACE_ATTACH) {
 		ret = ptrace_attach(child);
-- 
cgit v1.2.3-71-gd317


From 788540141f4549637e89aadca6e25cf25eb53383 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sun, 8 Jan 2006 01:02:37 -0800
Subject: [PATCH] Permit multiple inclusion of linux/pagevec.h

Make it possible to include linux/pagevec.h multiple times without
incurring errors due to duplicate definitions.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pagevec.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index def32c5715be..8eb7fa76c1d0 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -5,6 +5,9 @@
  * pages.  A pagevec is a multipage container which is used for that.
  */
 
+#ifndef _LINUX_PAGEVEC_H
+#define _LINUX_PAGEVEC_H
+
 /* 14 pointers + two long's align the pagevec structure to a power of two */
 #define PAGEVEC_SIZE	14
 
@@ -83,3 +86,5 @@ static inline void pagevec_lru_add(struct pagevec *pvec)
 	if (pagevec_count(pvec))
 		__pagevec_lru_add(pvec);
 }
+
+#endif /* _LINUX_PAGEVEC_H */
-- 
cgit v1.2.3-71-gd317


From 4a30131e7dbb17e5fec6958bfac9da9aff1fa29b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Sun, 8 Jan 2006 01:02:39 -0800
Subject: [PATCH] Fix some problems with truncate and mtime semantics.

SUS requires that when truncating a file to the size that it currently
is:
  truncate and ftruncate should NOT modify ctime or mtime
  O_TRUNC SHOULD modify ctime and mtime.

Currently mtime and ctime are always modified on most local
filesystems (side effect of ->truncate) or never modified (on NFS).

With this patch:
  ATTR_CTIME|ATTR_MTIME are sent with ATTR_SIZE precisely when
    an update of these times is required whether size changes or not
    (via a new argument to do_truncate).  This allows NFS to do
    the right thing for O_TRUNC.
  inode_setattr nolonger forces ATTR_MTIME|ATTR_CTIME when the ATTR_SIZE
    sets the size to it's current value.  This allows local filesystems
    to do the right thing for f?truncate.

Also, the logic in inode_setattr is changed a bit so there are two return
points.  One returns the error from vmtruncate if it failed, the other
returns 0 (there can be no other failure).

Finally, if vmtruncate succeeds, and ATTR_SIZE is the only change
requested, we now fall-through and mark_inode_dirty.  If a filesystem did
not have a ->truncate function, then vmtruncate will have changed i_size,
without marking the inode as 'dirty', and I think this is wrong.

Signed-off-by: Neil Brown <neilb@suse.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/attr.c          | 24 ++++++++----------------
 fs/exec.c          |  2 +-
 fs/namei.c         |  2 +-
 fs/open.c          |  9 +++++----
 include/linux/fs.h |  3 ++-
 5 files changed, 17 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/fs/attr.c b/fs/attr.c
index 67bcd9b14ea5..b34732506f1d 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -67,20 +67,12 @@ EXPORT_SYMBOL(inode_change_ok);
 int inode_setattr(struct inode * inode, struct iattr * attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
-	int error = 0;
-
-	if (ia_valid & ATTR_SIZE) {
-		if (attr->ia_size != i_size_read(inode)) {
-			error = vmtruncate(inode, attr->ia_size);
-			if (error || (ia_valid == ATTR_SIZE))
-				goto out;
-		} else {
-			/*
-			 * We skipped the truncate but must still update
-			 * timestamps
-			 */
-			ia_valid |= ATTR_MTIME|ATTR_CTIME;
-		}
+
+	if (ia_valid & ATTR_SIZE &&
+	    attr->ia_size != i_size_read(inode)) {
+		int error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
 	}
 
 	if (ia_valid & ATTR_UID)
@@ -104,8 +96,8 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
 		inode->i_mode = mode;
 	}
 	mark_inode_dirty(inode);
-out:
-	return error;
+
+	return 0;
 }
 EXPORT_SYMBOL(inode_setattr);
 
diff --git a/fs/exec.c b/fs/exec.c
index e9650cd22a3b..2075b674d85e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1505,7 +1505,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 		goto close_fail;
 	if (!file->f_op->write)
 		goto close_fail;
-	if (do_truncate(file->f_dentry, 0, file) != 0)
+	if (do_truncate(file->f_dentry, 0, 0, file) != 0)
 		goto close_fail;
 
 	retval = binfmt->core_dump(signr, regs, file);
diff --git a/fs/namei.c b/fs/namei.c
index 6dbbd42d8b95..300eae088d5f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1491,7 +1491,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 		if (!error) {
 			DQUOT_INIT(inode);
 			
-			error = do_truncate(dentry, 0, NULL);
+			error = do_truncate(dentry, 0, ATTR_MTIME|ATTR_CTIME, NULL);
 		}
 		put_write_access(inode);
 		if (error)
diff --git a/fs/open.c b/fs/open.c
index f53a5b9ffb7d..94968cb3afca 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -194,7 +194,8 @@ out:
 	return error;
 }
 
-int do_truncate(struct dentry *dentry, loff_t length, struct file *filp)
+int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
+	struct file *filp)
 {
 	int err;
 	struct iattr newattrs;
@@ -204,7 +205,7 @@ int do_truncate(struct dentry *dentry, loff_t length, struct file *filp)
 		return -EINVAL;
 
 	newattrs.ia_size = length;
-	newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
+	newattrs.ia_valid = ATTR_SIZE | time_attrs;
 	if (filp) {
 		newattrs.ia_file = filp;
 		newattrs.ia_valid |= ATTR_FILE;
@@ -266,7 +267,7 @@ static inline long do_sys_truncate(const char __user * path, loff_t length)
 	error = locks_verify_truncate(inode, NULL, length);
 	if (!error) {
 		DQUOT_INIT(inode);
-		error = do_truncate(nd.dentry, length, NULL);
+		error = do_truncate(nd.dentry, length, 0, NULL);
 	}
 	put_write_access(inode);
 
@@ -318,7 +319,7 @@ static inline long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 
 	error = locks_verify_truncate(inode, file, length);
 	if (!error)
-		error = do_truncate(dentry, length, file);
+		error = do_truncate(dentry, length, 0, file);
 out_putf:
 	fput(file);
 out:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ef29500b5df8..74c01aabd4ab 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1344,7 +1344,8 @@ static inline int break_lease(struct inode *inode, unsigned int mode)
 
 /* fs/open.c */
 
-extern int do_truncate(struct dentry *, loff_t start, struct file *filp);
+extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
+		       struct file *filp);
 extern long do_sys_open(const char __user *filename, int flags, int mode);
 extern struct file *filp_open(const char *, int, int);
 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
-- 
cgit v1.2.3-71-gd317


From 017679c4d45783158dba1dd6f79e712c22bb3d9a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sun, 8 Jan 2006 01:02:43 -0800
Subject: [PATCH] keys: Permit key expiry time to be set

Add a new keyctl function that allows the expiry time to be set on a key or
removed from a key, provided the caller has attribute modification access.

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Alexander Zangerl <az@bond.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/keys.txt   | 15 ++++++++++++++-
 include/linux/keyctl.h   |  1 +
 security/keys/compat.c   |  3 +++
 security/keys/internal.h |  1 +
 security/keys/keyctl.c   | 44 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 63 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 6304db59bfe4..c17c4ca74302 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -498,7 +498,7 @@ The keyctl syscall functions are:
      keyring is full, error ENFILE will result.
 
      The link procedure checks the nesting of the keyrings, returning ELOOP if
-     it appears to deep or EDEADLK if the link would introduce a cycle.
+     it appears too deep or EDEADLK if the link would introduce a cycle.
 
 
  (*) Unlink a key or keyring from another keyring:
@@ -628,6 +628,19 @@ The keyctl syscall functions are:
      there is one, otherwise the user default session keyring.
 
 
+ (*) Set the timeout on a key.
+
+	long keyctl(KEYCTL_SET_TIMEOUT, key_serial_t key, unsigned timeout);
+
+     This sets or clears the timeout on a key. The timeout can be 0 to clear
+     the timeout or a number of seconds to set the expiry time that far into
+     the future.
+
+     The process must have attribute modification access on a key to set its
+     timeout. Timeouts may not be set with this function on negative, revoked
+     or expired keys.
+
+
 ===============
 KERNEL SERVICES
 ===============
diff --git a/include/linux/keyctl.h b/include/linux/keyctl.h
index 8d7c59a29e09..ec8f3d622a8d 100644
--- a/include/linux/keyctl.h
+++ b/include/linux/keyctl.h
@@ -46,5 +46,6 @@
 #define KEYCTL_INSTANTIATE		12	/* instantiate a partially constructed key */
 #define KEYCTL_NEGATE			13	/* negate a partially constructed key */
 #define KEYCTL_SET_REQKEY_KEYRING	14	/* set default request-key keyring */
+#define KEYCTL_SET_TIMEOUT		15	/* set key timeout */
 
 #endif /*  _LINUX_KEYCTL_H */
diff --git a/security/keys/compat.c b/security/keys/compat.c
index 3303673c636e..e8e7ef4a290c 100644
--- a/security/keys/compat.c
+++ b/security/keys/compat.c
@@ -74,6 +74,9 @@ asmlinkage long compat_sys_keyctl(u32 option,
 	case KEYCTL_SET_REQKEY_KEYRING:
 		return keyctl_set_reqkey_keyring(arg2);
 
+	case KEYCTL_SET_TIMEOUT:
+		return keyctl_set_timeout(arg2, arg3);
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 39cba97c5eb9..51f37c0bdb32 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -136,6 +136,7 @@ extern long keyctl_instantiate_key(key_serial_t, const void __user *,
 				   size_t, key_serial_t);
 extern long keyctl_negate_key(key_serial_t, unsigned, key_serial_t);
 extern long keyctl_set_reqkey_keyring(int);
+extern long keyctl_set_timeout(key_serial_t, unsigned);
 
 
 /*
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index b7a468fabdf9..299f0ae11cf0 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -965,6 +965,46 @@ long keyctl_set_reqkey_keyring(int reqkey_defl)
 
 } /* end keyctl_set_reqkey_keyring() */
 
+/*****************************************************************************/
+/*
+ * set or clear the timeout for a key
+ */
+long keyctl_set_timeout(key_serial_t id, unsigned timeout)
+{
+	struct timespec now;
+	struct key *key;
+	key_ref_t key_ref;
+	time_t expiry;
+	long ret;
+
+	key_ref = lookup_user_key(NULL, id, 1, 1, KEY_SETATTR);
+	if (IS_ERR(key_ref)) {
+		ret = PTR_ERR(key_ref);
+		goto error;
+	}
+
+	key = key_ref_to_ptr(key_ref);
+
+	/* make the changes with the locks held to prevent races */
+	down_write(&key->sem);
+
+	expiry = 0;
+	if (timeout > 0) {
+		now = current_kernel_time();
+		expiry = now.tv_sec + timeout;
+	}
+
+	key->expiry = expiry;
+
+	up_write(&key->sem);
+	key_put(key);
+
+	ret = 0;
+error:
+	return ret;
+
+} /* end keyctl_set_timeout() */
+
 /*****************************************************************************/
 /*
  * the key control system call
@@ -1038,6 +1078,10 @@ asmlinkage long sys_keyctl(int option, unsigned long arg2, unsigned long arg3,
 	case KEYCTL_SET_REQKEY_KEYRING:
 		return keyctl_set_reqkey_keyring(arg2);
 
+	case KEYCTL_SET_TIMEOUT:
+		return keyctl_set_timeout((key_serial_t) arg2,
+					  (unsigned) arg3);
+
 	default:
 		return -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3-71-gd317


From b5f545c880a2a47947ba2118b2509644ab7a2969 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sun, 8 Jan 2006 01:02:47 -0800
Subject: [PATCH] keys: Permit running process to instantiate keys

Make it possible for a running process (such as gssapid) to be able to
instantiate a key, as was requested by Trond Myklebust for NFS4.

The patch makes the following changes:

 (1) A new, optional key type method has been added. This permits a key type
     to intercept requests at the point /sbin/request-key is about to be
     spawned and do something else with them - passing them over the
     rpc_pipefs files or netlink sockets for instance.

     The uninstantiated key, the authorisation key and the intended operation
     name are passed to the method.

 (2) The callout_info is no longer passed as an argument to /sbin/request-key
     to prevent unauthorised viewing of this data using ps or by looking in
     /proc/pid/cmdline.

     This means that the old /sbin/request-key program will not work with the
     patched kernel as it will expect to see an extra argument that is no
     longer there.

     A revised keyutils package will be made available tomorrow.

 (3) The callout_info is now attached to the authorisation key. Reading this
     key will retrieve the information.

 (4) A new field has been added to the task_struct. This holds the
     authorisation key currently active for a thread. Searches now look here
     for the caller's set of keys rather than looking for an auth key in the
     lowest level of the session keyring.

     This permits a thread to be servicing multiple requests at once and to
     switch between them. Note that this is per-thread, not per-process, and
     so is usable in multithreaded programs.

     The setting of this field is inherited across fork and exec.

 (5) A new keyctl function (KEYCTL_ASSUME_AUTHORITY) has been added that
     permits a thread to assume the authority to deal with an uninstantiated
     key. Assumption is only permitted if the authorisation key associated
     with the uninstantiated key is somewhere in the thread's keyrings.

     This function can also clear the assumption.

 (6) A new magic key specifier has been added to refer to the currently
     assumed authorisation key (KEY_SPEC_REQKEY_AUTH_KEY).

 (7) Instantiation will only proceed if the appropriate authorisation key is
     assumed first. The assumed authorisation key is discarded if
     instantiation is successful.

 (8) key_validate() is moved from the file of request_key functions to the
     file of permissions functions.

 (9) The documentation is updated.

From: <Valdis.Kletnieks@vt.edu>

    Build fix.

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Alexander Zangerl <az@bond.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/keys-request-key.txt |  22 +++--
 Documentation/keys.txt             |  24 +++++
 include/linux/key.h                |  12 +++
 include/linux/keyctl.h             |   2 +
 include/linux/sched.h              |   1 +
 security/keys/compat.c             |   3 +
 security/keys/internal.h           |   4 +-
 security/keys/keyctl.c             | 107 ++++++++++++++++-----
 security/keys/keyring.c            |  45 ---------
 security/keys/permission.c         |  32 +++++++
 security/keys/process_keys.c       |  71 +++++++-------
 security/keys/request_key.c        | 108 ++++++++++-----------
 security/keys/request_key_auth.c   | 192 ++++++++++++++++++++++---------------
 13 files changed, 378 insertions(+), 245 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/keys-request-key.txt b/Documentation/keys-request-key.txt
index 5f2b9c5edbb5..22488d791168 100644
--- a/Documentation/keys-request-key.txt
+++ b/Documentation/keys-request-key.txt
@@ -56,10 +56,12 @@ A request proceeds in the following manner:
  (4) request_key() then forks and executes /sbin/request-key with a new session
      keyring that contains a link to auth key V.
 
- (5) /sbin/request-key execs an appropriate program to perform the actual
+ (5) /sbin/request-key assumes the authority associated with key U.
+
+ (6) /sbin/request-key execs an appropriate program to perform the actual
      instantiation.
 
- (6) The program may want to access another key from A's context (say a
+ (7) The program may want to access another key from A's context (say a
      Kerberos TGT key). It just requests the appropriate key, and the keyring
      search notes that the session keyring has auth key V in its bottom level.
 
@@ -67,19 +69,19 @@ A request proceeds in the following manner:
      UID, GID, groups and security info of process A as if it was process A,
      and come up with key W.
 
- (7) The program then does what it must to get the data with which to
+ (8) The program then does what it must to get the data with which to
      instantiate key U, using key W as a reference (perhaps it contacts a
      Kerberos server using the TGT) and then instantiates key U.
 
- (8) Upon instantiating key U, auth key V is automatically revoked so that it
+ (9) Upon instantiating key U, auth key V is automatically revoked so that it
      may not be used again.
 
- (9) The program then exits 0 and request_key() deletes key V and returns key
+(10) The program then exits 0 and request_key() deletes key V and returns key
      U to the caller.
 
-This also extends further. If key W (step 5 above) didn't exist, key W would be
-created uninstantiated, another auth key (X) would be created [as per step 3]
-and another copy of /sbin/request-key spawned [as per step 4]; but the context
+This also extends further. If key W (step 7 above) didn't exist, key W would be
+created uninstantiated, another auth key (X) would be created (as per step 3)
+and another copy of /sbin/request-key spawned (as per step 4); but the context
 specified by auth key X will still be process A, as it was in auth key V.
 
 This is because process A's keyrings can't simply be attached to
@@ -138,8 +140,8 @@ until one succeeds:
 
  (3) The process's session keyring is searched.
 
- (4) If the process has a request_key() authorisation key in its session
-     keyring then:
+ (4) If the process has assumed the authority associated with a request_key()
+     authorisation key then:
 
      (a) If extant, the calling process's thread keyring is searched.
 
diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index eeda00f82d2c..aaa01b0e3ee9 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -308,6 +308,8 @@ process making the call:
 	KEY_SPEC_USER_KEYRING		-4	UID-specific keyring
 	KEY_SPEC_USER_SESSION_KEYRING	-5	UID-session keyring
 	KEY_SPEC_GROUP_KEYRING		-6	GID-specific keyring
+	KEY_SPEC_REQKEY_AUTH_KEY	-7	assumed request_key()
+						  authorisation key
 
 
 The main syscalls are:
@@ -645,6 +647,28 @@ The keyctl syscall functions are:
      or expired keys.
 
 
+ (*) Assume the authority granted to instantiate a key
+
+	long keyctl(KEYCTL_ASSUME_AUTHORITY, key_serial_t key);
+
+     This assumes or divests the authority required to instantiate the
+     specified key. Authority can only be assumed if the thread has the
+     authorisation key associated with the specified key in its keyrings
+     somewhere.
+
+     Once authority is assumed, searches for keys will also search the
+     requester's keyrings using the requester's security label, UID, GID and
+     groups.
+
+     If the requested authority is unavailable, error EPERM will be returned,
+     likewise if the authority has been revoked because the target key is
+     already instantiated.
+
+     If the specified key is 0, then any assumed authority will be divested.
+
+     The assumed authorititive key is inherited across fork and exec.
+
+
 ===============
 KERNEL SERVICES
 ===============
diff --git a/include/linux/key.h b/include/linux/key.h
index 4d189e51bc6c..cbf464ad9589 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -177,6 +177,8 @@ struct key {
 /*
  * kernel managed key type definition
  */
+typedef int (*request_key_actor_t)(struct key *key, struct key *authkey, const char *op);
+
 struct key_type {
 	/* name of the type */
 	const char *name;
@@ -218,6 +220,16 @@ struct key_type {
 	 */
 	long (*read)(const struct key *key, char __user *buffer, size_t buflen);
 
+	/* handle request_key() for this type instead of invoking
+	 * /sbin/request-key (optional)
+	 * - key is the key to instantiate
+	 * - authkey is the authority to assume when instantiating this key
+	 * - op is the operation to be done, usually "create"
+	 * - the call must not return until the instantiation process has run
+	 *   its course
+	 */
+	request_key_actor_t request_key;
+
 	/* internal fields */
 	struct list_head	link;		/* link in types list */
 };
diff --git a/include/linux/keyctl.h b/include/linux/keyctl.h
index ec8f3d622a8d..3365945640c9 100644
--- a/include/linux/keyctl.h
+++ b/include/linux/keyctl.h
@@ -19,6 +19,7 @@
 #define KEY_SPEC_USER_KEYRING		-4	/* - key ID for UID-specific keyring */
 #define KEY_SPEC_USER_SESSION_KEYRING	-5	/* - key ID for UID-session keyring */
 #define KEY_SPEC_GROUP_KEYRING		-6	/* - key ID for GID-specific keyring */
+#define KEY_SPEC_REQKEY_AUTH_KEY	-7	/* - key ID for assumed request_key auth key */
 
 /* request-key default keyrings */
 #define KEY_REQKEY_DEFL_NO_CHANGE		-1
@@ -47,5 +48,6 @@
 #define KEYCTL_NEGATE			13	/* negate a partially constructed key */
 #define KEYCTL_SET_REQKEY_KEYRING	14	/* set default request-key keyring */
 #define KEYCTL_SET_TIMEOUT		15	/* set key timeout */
+#define KEYCTL_ASSUME_AUTHORITY		16	/* assume request_key() authorisation */
 
 #endif /*  _LINUX_KEYCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 20bd70749104..78eb92ae4d94 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -771,6 +771,7 @@ struct task_struct {
 	unsigned keep_capabilities:1;
 	struct user_struct *user;
 #ifdef CONFIG_KEYS
+	struct key *request_key_auth;	/* assumed request_key authority */
 	struct key *thread_keyring;	/* keyring private to this thread */
 	unsigned char jit_keyring;	/* default keyring to attach requested keys to */
 #endif
diff --git a/security/keys/compat.c b/security/keys/compat.c
index e8e7ef4a290c..bcdb28533733 100644
--- a/security/keys/compat.c
+++ b/security/keys/compat.c
@@ -77,6 +77,9 @@ asmlinkage long compat_sys_keyctl(u32 option,
 	case KEYCTL_SET_TIMEOUT:
 		return keyctl_set_timeout(arg2, arg3);
 
+	case KEYCTL_ASSUME_AUTHORITY:
+		return keyctl_assume_authority(arg2);
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 51f37c0bdb32..e066e6057955 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -107,12 +107,13 @@ extern struct key *request_key_and_link(struct key_type *type,
 struct request_key_auth {
 	struct key		*target_key;
 	struct task_struct	*context;
+	const char		*callout_info;
 	pid_t			pid;
 };
 
 extern struct key_type key_type_request_key_auth;
 extern struct key *request_key_auth_new(struct key *target,
-					struct key **_rkakey);
+					const char *callout_info);
 
 extern struct key *key_get_instantiation_authkey(key_serial_t target_id);
 
@@ -137,6 +138,7 @@ extern long keyctl_instantiate_key(key_serial_t, const void __user *,
 extern long keyctl_negate_key(key_serial_t, unsigned, key_serial_t);
 extern long keyctl_set_reqkey_keyring(int);
 extern long keyctl_set_timeout(key_serial_t, unsigned);
+extern long keyctl_assume_authority(key_serial_t);
 
 
 /*
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 299f0ae11cf0..3d2ebae029c1 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -834,6 +834,17 @@ long keyctl_instantiate_key(key_serial_t id,
 	if (plen > 32767)
 		goto error;
 
+	/* the appropriate instantiation authorisation key must have been
+	 * assumed before calling this */
+	ret = -EPERM;
+	instkey = current->request_key_auth;
+	if (!instkey)
+		goto error;
+
+	rka = instkey->payload.data;
+	if (rka->target_key->serial != id)
+		goto error;
+
 	/* pull the payload in if one was supplied */
 	payload = NULL;
 
@@ -848,15 +859,6 @@ long keyctl_instantiate_key(key_serial_t id,
 			goto error2;
 	}
 
-	/* find the instantiation authorisation key */
-	instkey = key_get_instantiation_authkey(id);
-	if (IS_ERR(instkey)) {
-		ret = PTR_ERR(instkey);
-		goto error2;
-	}
-
-	rka = instkey->payload.data;
-
 	/* find the destination keyring amongst those belonging to the
 	 * requesting task */
 	keyring_ref = NULL;
@@ -865,7 +867,7 @@ long keyctl_instantiate_key(key_serial_t id,
 					      KEY_WRITE);
 		if (IS_ERR(keyring_ref)) {
 			ret = PTR_ERR(keyring_ref);
-			goto error3;
+			goto error2;
 		}
 	}
 
@@ -874,11 +876,17 @@ long keyctl_instantiate_key(key_serial_t id,
 				       key_ref_to_ptr(keyring_ref), instkey);
 
 	key_ref_put(keyring_ref);
- error3:
-	key_put(instkey);
- error2:
+
+	/* discard the assumed authority if it's just been disabled by
+	 * instantiation of the key */
+	if (ret == 0) {
+		key_put(current->request_key_auth);
+		current->request_key_auth = NULL;
+	}
+
+error2:
 	kfree(payload);
- error:
+error:
 	return ret;
 
 } /* end keyctl_instantiate_key() */
@@ -895,14 +903,16 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 	key_ref_t keyring_ref;
 	long ret;
 
-	/* find the instantiation authorisation key */
-	instkey = key_get_instantiation_authkey(id);
-	if (IS_ERR(instkey)) {
-		ret = PTR_ERR(instkey);
+	/* the appropriate instantiation authorisation key must have been
+	 * assumed before calling this */
+	ret = -EPERM;
+	instkey = current->request_key_auth;
+	if (!instkey)
 		goto error;
-	}
 
 	rka = instkey->payload.data;
+	if (rka->target_key->serial != id)
+		goto error;
 
 	/* find the destination keyring if present (which must also be
 	 * writable) */
@@ -911,7 +921,7 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 		keyring_ref = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE);
 		if (IS_ERR(keyring_ref)) {
 			ret = PTR_ERR(keyring_ref);
-			goto error2;
+			goto error;
 		}
 	}
 
@@ -920,9 +930,15 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 				  key_ref_to_ptr(keyring_ref), instkey);
 
 	key_ref_put(keyring_ref);
- error2:
-	key_put(instkey);
- error:
+
+	/* discard the assumed authority if it's just been disabled by
+	 * instantiation of the key */
+	if (ret == 0) {
+		key_put(current->request_key_auth);
+		current->request_key_auth = NULL;
+	}
+
+error:
 	return ret;
 
 } /* end keyctl_negate_key() */
@@ -1005,6 +1021,48 @@ error:
 
 } /* end keyctl_set_timeout() */
 
+/*****************************************************************************/
+/*
+ * assume the authority to instantiate the specified key
+ */
+long keyctl_assume_authority(key_serial_t id)
+{
+	struct key *authkey;
+	long ret;
+
+	/* special key IDs aren't permitted */
+	ret = -EINVAL;
+	if (id < 0)
+		goto error;
+
+	/* we divest ourselves of authority if given an ID of 0 */
+	if (id == 0) {
+		key_put(current->request_key_auth);
+		current->request_key_auth = NULL;
+		ret = 0;
+		goto error;
+	}
+
+	/* attempt to assume the authority temporarily granted to us whilst we
+	 * instantiate the specified key
+	 * - the authorisation key must be in the current task's keyrings
+	 *   somewhere
+	 */
+	authkey = key_get_instantiation_authkey(id);
+	if (IS_ERR(authkey)) {
+		ret = PTR_ERR(authkey);
+		goto error;
+	}
+
+	key_put(current->request_key_auth);
+	current->request_key_auth = authkey;
+	ret = authkey->serial;
+
+error:
+	return ret;
+
+} /* end keyctl_assume_authority() */
+
 /*****************************************************************************/
 /*
  * the key control system call
@@ -1082,6 +1140,9 @@ asmlinkage long sys_keyctl(int option, unsigned long arg2, unsigned long arg3,
 		return keyctl_set_timeout((key_serial_t) arg2,
 					  (unsigned) arg3);
 
+	case KEYCTL_ASSUME_AUTHORITY:
+		return keyctl_assume_authority((key_serial_t) arg2);
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 09d92d52ef75..d65a180f888d 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -479,51 +479,6 @@ key_ref_t __keyring_search_one(key_ref_t keyring_ref,
 
 } /* end __keyring_search_one() */
 
-/*****************************************************************************/
-/*
- * search for an instantiation authorisation key matching a target key
- * - the RCU read lock must be held by the caller
- * - a target_id of zero specifies any valid token
- */
-struct key *keyring_search_instkey(struct key *keyring,
-				   key_serial_t target_id)
-{
-	struct request_key_auth *rka;
-	struct keyring_list *klist;
-	struct key *instkey;
-	int loop;
-
-	klist = rcu_dereference(keyring->payload.subscriptions);
-	if (klist) {
-		for (loop = 0; loop < klist->nkeys; loop++) {
-			instkey = klist->keys[loop];
-
-			if (instkey->type != &key_type_request_key_auth)
-				continue;
-
-			rka = instkey->payload.data;
-			if (target_id && rka->target_key->serial != target_id)
-				continue;
-
-			/* the auth key is revoked during instantiation */
-			if (!test_bit(KEY_FLAG_REVOKED, &instkey->flags))
-				goto found;
-
-			instkey = ERR_PTR(-EKEYREVOKED);
-			goto error;
-		}
-	}
-
-	instkey = ERR_PTR(-EACCES);
-	goto error;
-
-found:
-	atomic_inc(&instkey->usage);
-error:
-	return instkey;
-
-} /* end keyring_search_instkey() */
-
 /*****************************************************************************/
 /*
  * find a keyring with the specified name
diff --git a/security/keys/permission.c b/security/keys/permission.c
index e7f579c0eaf5..3b41f9b52537 100644
--- a/security/keys/permission.c
+++ b/security/keys/permission.c
@@ -73,3 +73,35 @@ use_these_perms:
 } /* end key_task_permission() */
 
 EXPORT_SYMBOL(key_task_permission);
+
+/*****************************************************************************/
+/*
+ * validate a key
+ */
+int key_validate(struct key *key)
+{
+	struct timespec now;
+	int ret = 0;
+
+	if (key) {
+		/* check it's still accessible */
+		ret = -EKEYREVOKED;
+		if (test_bit(KEY_FLAG_REVOKED, &key->flags) ||
+		    test_bit(KEY_FLAG_DEAD, &key->flags))
+			goto error;
+
+		/* check it hasn't expired */
+		ret = 0;
+		if (key->expiry) {
+			now = current_kernel_time();
+			if (now.tv_sec >= key->expiry)
+				ret = -EKEYEXPIRED;
+		}
+	}
+
+ error:
+	return ret;
+
+} /* end key_validate() */
+
+EXPORT_SYMBOL(key_validate);
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 566b1cc0118a..74cb79eb917e 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -270,9 +270,14 @@ int copy_thread_group_keys(struct task_struct *tsk)
 int copy_keys(unsigned long clone_flags, struct task_struct *tsk)
 {
 	key_check(tsk->thread_keyring);
+	key_check(tsk->request_key_auth);
 
 	/* no thread keyring yet */
 	tsk->thread_keyring = NULL;
+
+	/* copy the request_key() authorisation for this thread */
+	key_get(tsk->request_key_auth);
+
 	return 0;
 
 } /* end copy_keys() */
@@ -290,11 +295,12 @@ void exit_thread_group_keys(struct signal_struct *tg)
 
 /*****************************************************************************/
 /*
- * dispose of keys upon thread exit
+ * dispose of per-thread keys upon thread exit
  */
 void exit_keys(struct task_struct *tsk)
 {
 	key_put(tsk->thread_keyring);
+	key_put(tsk->request_key_auth);
 
 } /* end exit_keys() */
 
@@ -382,7 +388,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 				  struct task_struct *context)
 {
 	struct request_key_auth *rka;
-	key_ref_t key_ref, ret, err, instkey_ref;
+	key_ref_t key_ref, ret, err;
 
 	/* we want to return -EAGAIN or -ENOKEY if any of the keyrings were
 	 * searchable, but we failed to find a key or we found a negative key;
@@ -461,30 +467,12 @@ key_ref_t search_process_keyrings(struct key_type *type,
 			err = key_ref;
 			break;
 		}
-
-		/* if this process has a session keyring and that has an
-		 * instantiation authorisation key in the bottom level, then we
-		 * also search the keyrings of the process mentioned there */
-		if (context != current)
-			goto no_key;
-
-		rcu_read_lock();
-		instkey_ref = __keyring_search_one(
-			make_key_ref(rcu_dereference(
-					     context->signal->session_keyring),
-				     1),
-			&key_type_request_key_auth, NULL, 0);
-		rcu_read_unlock();
-
-		if (IS_ERR(instkey_ref))
-			goto no_key;
-
-		rka = key_ref_to_ptr(instkey_ref)->payload.data;
-
-		key_ref = search_process_keyrings(type, description, match,
-						  rka->context);
-		key_ref_put(instkey_ref);
-
+	}
+	/* or search the user-session keyring */
+	else {
+		key_ref = keyring_search_aux(
+			make_key_ref(context->user->session_keyring, 1),
+			context, type, description, match);
 		if (!IS_ERR(key_ref))
 			goto found;
 
@@ -500,11 +488,21 @@ key_ref_t search_process_keyrings(struct key_type *type,
 			break;
 		}
 	}
-	/* or search the user-session keyring */
-	else {
-		key_ref = keyring_search_aux(
-			make_key_ref(context->user->session_keyring, 1),
-			context, type, description, match);
+
+	/* if this process has an instantiation authorisation key, then we also
+	 * search the keyrings of the process mentioned there
+	 * - we don't permit access to request_key auth keys via this method
+	 */
+	if (context->request_key_auth &&
+	    context == current &&
+	    type != &key_type_request_key_auth &&
+	    key_validate(context->request_key_auth) == 0
+	    ) {
+		rka = context->request_key_auth->payload.data;
+
+		key_ref = search_process_keyrings(type, description, match,
+						  rka->context);
+
 		if (!IS_ERR(key_ref))
 			goto found;
 
@@ -521,8 +519,6 @@ key_ref_t search_process_keyrings(struct key_type *type,
 		}
 	}
 
-
-no_key:
 	/* no key - decide on the error we're going to go for */
 	key_ref = ret ? ret : err;
 
@@ -628,6 +624,15 @@ key_ref_t lookup_user_key(struct task_struct *context, key_serial_t id,
 		key = ERR_PTR(-EINVAL);
 		goto error;
 
+	case KEY_SPEC_REQKEY_AUTH_KEY:
+		key = context->request_key_auth;
+		if (!key)
+			goto error;
+
+		atomic_inc(&key->usage);
+		key_ref = make_key_ref(key, 1);
+		break;
+
 	default:
 		key_ref = ERR_PTR(-EINVAL);
 		if (id < 1)
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 5cc4bba70db6..f030a0ccbb93 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -29,28 +29,36 @@ DECLARE_WAIT_QUEUE_HEAD(request_key_conswq);
 /*****************************************************************************/
 /*
  * request userspace finish the construction of a key
- * - execute "/sbin/request-key <op> <key> <uid> <gid> <keyring> <keyring> <keyring> <info>"
+ * - execute "/sbin/request-key <op> <key> <uid> <gid> <keyring> <keyring> <keyring>"
  */
-static int call_request_key(struct key *key,
-			    const char *op,
-			    const char *callout_info)
+static int call_sbin_request_key(struct key *key,
+				 struct key *authkey,
+				 const char *op)
 {
 	struct task_struct *tsk = current;
 	key_serial_t prkey, sskey;
-	struct key *session_keyring, *rkakey;
-	char *argv[10], *envp[3], uid_str[12], gid_str[12];
+	struct key *keyring;
+	char *argv[9], *envp[3], uid_str[12], gid_str[12];
 	char key_str[12], keyring_str[3][12];
+	char desc[20];
 	int ret, i;
 
-	kenter("{%d},%s,%s", key->serial, op, callout_info);
+	kenter("{%d},{%d},%s", key->serial, authkey->serial, op);
 
-	/* generate a new session keyring with an auth key in it */
-	session_keyring = request_key_auth_new(key, &rkakey);
-	if (IS_ERR(session_keyring)) {
-		ret = PTR_ERR(session_keyring);
-		goto error;
+	/* allocate a new session keyring */
+	sprintf(desc, "_req.%u", key->serial);
+
+	keyring = keyring_alloc(desc, current->fsuid, current->fsgid, 1, NULL);
+	if (IS_ERR(keyring)) {
+		ret = PTR_ERR(keyring);
+		goto error_alloc;
 	}
 
+	/* attach the auth key to the session keyring */
+	ret = __key_link(keyring, authkey);
+	if (ret < 0)
+		goto error_link;
+
 	/* record the UID and GID */
 	sprintf(uid_str, "%d", current->fsuid);
 	sprintf(gid_str, "%d", current->fsgid);
@@ -95,22 +103,19 @@ static int call_request_key(struct key *key,
 	argv[i++] = keyring_str[0];
 	argv[i++] = keyring_str[1];
 	argv[i++] = keyring_str[2];
-	argv[i++] = (char *) callout_info;
 	argv[i] = NULL;
 
 	/* do it */
-	ret = call_usermodehelper_keys(argv[0], argv, envp, session_keyring, 1);
+	ret = call_usermodehelper_keys(argv[0], argv, envp, keyring, 1);
 
-	/* dispose of the special keys */
-	key_revoke(rkakey);
-	key_put(rkakey);
-	key_put(session_keyring);
+error_link:
+	key_put(keyring);
 
- error:
+error_alloc:
 	kleave(" = %d", ret);
 	return ret;
 
-} /* end call_request_key() */
+} /* end call_sbin_request_key() */
 
 /*****************************************************************************/
 /*
@@ -122,9 +127,10 @@ static struct key *__request_key_construction(struct key_type *type,
 					      const char *description,
 					      const char *callout_info)
 {
+	request_key_actor_t actor;
 	struct key_construction cons;
 	struct timespec now;
-	struct key *key;
+	struct key *key, *authkey;
 	int ret, negated;
 
 	kenter("%s,%s,%s", type->name, description, callout_info);
@@ -143,8 +149,19 @@ static struct key *__request_key_construction(struct key_type *type,
 	/* we drop the construction sem here on behalf of the caller */
 	up_write(&key_construction_sem);
 
+	/* allocate an authorisation key */
+	authkey = request_key_auth_new(key, callout_info);
+	if (IS_ERR(authkey)) {
+		ret = PTR_ERR(authkey);
+		authkey = NULL;
+		goto alloc_authkey_failed;
+	}
+
 	/* make the call */
-	ret = call_request_key(key, "create", callout_info);
+	actor = call_sbin_request_key;
+	if (type->request_key)
+		actor = type->request_key;
+	ret = actor(key, authkey, "create");
 	if (ret < 0)
 		goto request_failed;
 
@@ -153,22 +170,29 @@ static struct key *__request_key_construction(struct key_type *type,
 	if (!test_bit(KEY_FLAG_INSTANTIATED, &key->flags))
 		goto request_failed;
 
+	key_revoke(authkey);
+	key_put(authkey);
+
 	down_write(&key_construction_sem);
 	list_del(&cons.link);
 	up_write(&key_construction_sem);
 
 	/* also give an error if the key was negatively instantiated */
- check_not_negative:
+check_not_negative:
 	if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) {
 		key_put(key);
 		key = ERR_PTR(-ENOKEY);
 	}
 
- out:
+out:
 	kleave(" = %p", key);
 	return key;
 
- request_failed:
+request_failed:
+	key_revoke(authkey);
+	key_put(authkey);
+
+alloc_authkey_failed:
 	/* it wasn't instantiated
 	 * - remove from construction queue
 	 * - mark the key as dead
@@ -217,7 +241,7 @@ static struct key *__request_key_construction(struct key_type *type,
 	key = ERR_PTR(ret);
 	goto out;
 
- alloc_failed:
+alloc_failed:
 	up_write(&key_construction_sem);
 	goto out;
 
@@ -464,35 +488,3 @@ struct key *request_key(struct key_type *type,
 } /* end request_key() */
 
 EXPORT_SYMBOL(request_key);
-
-/*****************************************************************************/
-/*
- * validate a key
- */
-int key_validate(struct key *key)
-{
-	struct timespec now;
-	int ret = 0;
-
-	if (key) {
-		/* check it's still accessible */
-		ret = -EKEYREVOKED;
-		if (test_bit(KEY_FLAG_REVOKED, &key->flags) ||
-		    test_bit(KEY_FLAG_DEAD, &key->flags))
-			goto error;
-
-		/* check it hasn't expired */
-		ret = 0;
-		if (key->expiry) {
-			now = current_kernel_time();
-			if (now.tv_sec >= key->expiry)
-				ret = -EKEYEXPIRED;
-		}
-	}
-
- error:
-	return ret;
-
-} /* end key_validate() */
-
-EXPORT_SYMBOL(key_validate);
diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c
index a8e4069d48cb..cce6ba6b0323 100644
--- a/security/keys/request_key_auth.c
+++ b/security/keys/request_key_auth.c
@@ -15,11 +15,13 @@
 #include <linux/sched.h>
 #include <linux/err.h>
 #include <linux/seq_file.h>
+#include <asm/uaccess.h>
 #include "internal.h"
 
 static int request_key_auth_instantiate(struct key *, const void *, size_t);
 static void request_key_auth_describe(const struct key *, struct seq_file *);
 static void request_key_auth_destroy(struct key *);
+static long request_key_auth_read(const struct key *, char __user *, size_t);
 
 /*
  * the request-key authorisation key type definition
@@ -30,51 +32,25 @@ struct key_type key_type_request_key_auth = {
 	.instantiate	= request_key_auth_instantiate,
 	.describe	= request_key_auth_describe,
 	.destroy	= request_key_auth_destroy,
+	.read		= request_key_auth_read,
 };
 
 /*****************************************************************************/
 /*
- * instantiate a request-key authorisation record
+ * instantiate a request-key authorisation key
  */
 static int request_key_auth_instantiate(struct key *key,
 					const void *data,
 					size_t datalen)
 {
-	struct request_key_auth *rka, *irka;
-	struct key *instkey;
-	int ret;
-
-	ret = -ENOMEM;
-	rka = kmalloc(sizeof(*rka), GFP_KERNEL);
-	if (rka) {
-		/* see if the calling process is already servicing the key
-		 * request of another process */
-		instkey = key_get_instantiation_authkey(0);
-		if (!IS_ERR(instkey)) {
-			/* it is - use that instantiation context here too */
-			irka = instkey->payload.data;
-			rka->context = irka->context;
-			rka->pid = irka->pid;
-			key_put(instkey);
-		}
-		else {
-			/* it isn't - use this process as the context */
-			rka->context = current;
-			rka->pid = current->pid;
-		}
-
-		rka->target_key = key_get((struct key *) data);
-		key->payload.data = rka;
-		ret = 0;
-	}
-
-	return ret;
+	key->payload.data = (struct request_key_auth *) data;
+	return 0;
 
 } /* end request_key_auth_instantiate() */
 
 /*****************************************************************************/
 /*
- *
+ * reading a request-key authorisation key retrieves the callout information
  */
 static void request_key_auth_describe(const struct key *key,
 				      struct seq_file *m)
@@ -83,10 +59,38 @@ static void request_key_auth_describe(const struct key *key,
 
 	seq_puts(m, "key:");
 	seq_puts(m, key->description);
-	seq_printf(m, " pid:%d", rka->pid);
+	seq_printf(m, " pid:%d ci:%zu", rka->pid, strlen(rka->callout_info));
 
 } /* end request_key_auth_describe() */
 
+/*****************************************************************************/
+/*
+ * read the callout_info data
+ * - the key's semaphore is read-locked
+ */
+static long request_key_auth_read(const struct key *key,
+				  char __user *buffer, size_t buflen)
+{
+	struct request_key_auth *rka = key->payload.data;
+	size_t datalen;
+	long ret;
+
+	datalen = strlen(rka->callout_info);
+	ret = datalen;
+
+	/* we can return the data as is */
+	if (buffer && buflen > 0) {
+		if (buflen > datalen)
+			buflen = datalen;
+
+		if (copy_to_user(buffer, rka->callout_info, buflen) != 0)
+			ret = -EFAULT;
+	}
+
+	return ret;
+
+} /* end request_key_auth_read() */
+
 /*****************************************************************************/
 /*
  * destroy an instantiation authorisation token key
@@ -104,54 +108,87 @@ static void request_key_auth_destroy(struct key *key)
 
 /*****************************************************************************/
 /*
- * create a session keyring to be for the invokation of /sbin/request-key and
- * stick an authorisation token in it
+ * create an authorisation token for /sbin/request-key or whoever to gain
+ * access to the caller's security data
  */
-struct key *request_key_auth_new(struct key *target, struct key **_rkakey)
+struct key *request_key_auth_new(struct key *target, const char *callout_info)
 {
-	struct key *keyring, *rkakey = NULL;
+	struct request_key_auth *rka, *irka;
+	struct key *authkey = NULL;
 	char desc[20];
 	int ret;
 
 	kenter("%d,", target->serial);
 
-	/* allocate a new session keyring */
-	sprintf(desc, "_req.%u", target->serial);
+	/* allocate a auth record */
+	rka = kmalloc(sizeof(*rka), GFP_KERNEL);
+	if (!rka) {
+		kleave(" = -ENOMEM");
+		return ERR_PTR(-ENOMEM);
+	}
 
-	keyring = keyring_alloc(desc, current->fsuid, current->fsgid, 1, NULL);
-	if (IS_ERR(keyring)) {
-		kleave("= %ld", PTR_ERR(keyring));
-		return keyring;
+	/* see if the calling process is already servicing the key request of
+	 * another process */
+	if (current->request_key_auth) {
+		/* it is - use that instantiation context here too */
+		irka = current->request_key_auth->payload.data;
+		rka->context = irka->context;
+		rka->pid = irka->pid;
 	}
+	else {
+		/* it isn't - use this process as the context */
+		rka->context = current;
+		rka->pid = current->pid;
+	}
+
+	rka->target_key = key_get(target);
+	rka->callout_info = callout_info;
 
 	/* allocate the auth key */
 	sprintf(desc, "%x", target->serial);
 
-	rkakey = key_alloc(&key_type_request_key_auth, desc,
-			   current->fsuid, current->fsgid,
-			   KEY_POS_VIEW | KEY_USR_VIEW, 1);
-	if (IS_ERR(rkakey)) {
-		key_put(keyring);
-		kleave("= %ld", PTR_ERR(rkakey));
-		return rkakey;
+	authkey = key_alloc(&key_type_request_key_auth, desc,
+			    current->fsuid, current->fsgid,
+			    KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH |
+			    KEY_USR_VIEW, 1);
+	if (IS_ERR(authkey)) {
+		ret = PTR_ERR(authkey);
+		goto error_alloc;
 	}
 
 	/* construct and attach to the keyring */
-	ret = key_instantiate_and_link(rkakey, target, 0, keyring, NULL);
-	if (ret < 0) {
-		key_revoke(rkakey);
-		key_put(rkakey);
-		key_put(keyring);
-		kleave("= %d", ret);
-		return ERR_PTR(ret);
-	}
+	ret = key_instantiate_and_link(authkey, rka, 0, NULL, NULL);
+	if (ret < 0)
+		goto error_inst;
 
-	*_rkakey = rkakey;
-	kleave(" = {%d} ({%d})", keyring->serial, rkakey->serial);
-	return keyring;
+	kleave(" = {%d})", authkey->serial);
+	return authkey;
+
+error_inst:
+	key_revoke(authkey);
+	key_put(authkey);
+error_alloc:
+	key_put(rka->target_key);
+	kfree(rka);
+	kleave("= %d", ret);
+	return ERR_PTR(ret);
 
 } /* end request_key_auth_new() */
 
+/*****************************************************************************/
+/*
+ * see if an authorisation key is associated with a particular key
+ */
+static int key_get_instantiation_authkey_match(const struct key *key,
+					       const void *_id)
+{
+	struct request_key_auth *rka = key->payload.data;
+	key_serial_t id = (key_serial_t)(unsigned long) _id;
+
+	return rka->target_key->serial == id;
+
+} /* end key_get_instantiation_authkey_match() */
+
 /*****************************************************************************/
 /*
  * get the authorisation key for instantiation of a specific key if attached to
@@ -162,22 +199,27 @@ struct key *request_key_auth_new(struct key *target, struct key **_rkakey)
  */
 struct key *key_get_instantiation_authkey(key_serial_t target_id)
 {
-	struct task_struct *tsk = current;
-	struct key *instkey;
-
-	/* we must have our own personal session keyring */
-	if (!tsk->signal->session_keyring)
-		return ERR_PTR(-EACCES);
-
-	/* and it must contain a suitable request authorisation key
-	 * - lock RCU against session keyring changing
-	 */
-	rcu_read_lock();
+	struct key *authkey;
+	key_ref_t authkey_ref;
+
+	authkey_ref = search_process_keyrings(
+		&key_type_request_key_auth,
+		(void *) (unsigned long) target_id,
+		key_get_instantiation_authkey_match,
+		current);
+
+	if (IS_ERR(authkey_ref)) {
+		authkey = ERR_PTR(PTR_ERR(authkey_ref));
+		goto error;
+	}
 
-	instkey = keyring_search_instkey(
-		rcu_dereference(tsk->signal->session_keyring), target_id);
+	authkey = key_ref_to_ptr(authkey_ref);
+	if (test_bit(KEY_FLAG_REVOKED, &authkey->flags)) {
+		key_put(authkey);
+		authkey = ERR_PTR(-EKEYREVOKED);
+	}
 
-	rcu_read_unlock();
-	return instkey;
+error:
+	return authkey;
 
 } /* end key_get_instantiation_authkey() */
-- 
cgit v1.2.3-71-gd317


From 71fabd5e4835309b4feca6209122ce56c595c461 Mon Sep 17 00:00:00 2001
From: George Anzinger <george@mvista.com>
Date: Sun, 8 Jan 2006 01:02:48 -0800
Subject: [PATCH] sigaction should clear all signals on SIG_IGN, not just < 32

While rooting aroung in the signal code trying to understand how to fix the
SIG_IGN ploy (set sig handler to SIG_IGN and flood system with high speed
repeating timers) I came across what, I think, is a problem in sigaction()
in that when processing a SIG_IGN request it flushes signals from 1 to
SIGRTMIN and leaves the rest.  Attempt to fix this.

Signed-off-by: George Anzinger <george@mvista.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/signal.h | 17 +++++++++++++++++
 kernel/signal.c        | 34 ++++++++++++++++++++++++++++++++--
 2 files changed, 49 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/signal.h b/include/linux/signal.h
index ea9eff16c4b7..b7d093520bb6 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -94,6 +94,23 @@ static inline int sigfindinword(unsigned long word)
 
 #endif /* __HAVE_ARCH_SIG_BITOPS */
 
+static inline int sigisemptyset(sigset_t *set)
+{
+	extern void _NSIG_WORDS_is_unsupported_size(void);
+	switch (_NSIG_WORDS) {
+	case 4:
+		return (set->sig[3] | set->sig[2] |
+			set->sig[1] | set->sig[0]) == 0;
+	case 2:
+		return (set->sig[1] | set->sig[0]) == 0;
+	case 1:
+		return set->sig[0] == 0;
+	default:
+		_NSIG_WORDS_is_unsupported_size();
+		return 0;
+	}
+}
+
 #define sigmask(sig)	(1UL << ((sig) - 1))
 
 #ifndef __HAVE_ARCH_SIG_SETOPS
diff --git a/kernel/signal.c b/kernel/signal.c
index 9b6fda5e87f1..e20724af9b36 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -620,6 +620,33 @@ void signal_wake_up(struct task_struct *t, int resume)
 		kick_process(t);
 }
 
+/*
+ * Remove signals in mask from the pending set and queue.
+ * Returns 1 if any signals were found.
+ *
+ * All callers must be holding the siglock.
+ *
+ * This version takes a sigset mask and looks at all signals,
+ * not just those in the first mask word.
+ */
+static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
+{
+	struct sigqueue *q, *n;
+	sigset_t m;
+
+	sigandsets(&m, mask, &s->signal);
+	if (sigisemptyset(&m))
+		return 0;
+
+	signandsets(&s->signal, &s->signal, mask);
+	list_for_each_entry_safe(q, n, &s->list, list) {
+		if (sigismember(mask, q->info.si_signo)) {
+			list_del_init(&q->list);
+			__sigqueue_free(q);
+		}
+	}
+	return 1;
+}
 /*
  * Remove signals in mask from the pending set and queue.
  * Returns 1 if any signals were found.
@@ -2408,6 +2435,7 @@ int
 do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
 {
 	struct k_sigaction *k;
+	sigset_t mask;
 
 	if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
 		return -EINVAL;
@@ -2455,9 +2483,11 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
 			*k = *act;
 			sigdelsetmask(&k->sa.sa_mask,
 				      sigmask(SIGKILL) | sigmask(SIGSTOP));
-			rm_from_queue(sigmask(sig), &t->signal->shared_pending);
+			sigemptyset(&mask);
+			sigaddset(&mask, sig);
+			rm_from_queue_full(&mask, &t->signal->shared_pending);
 			do {
-				rm_from_queue(sigmask(sig), &t->pending);
+				rm_from_queue_full(&mask, &t->pending);
 				recalc_sigpending_tsk(t);
 				t = next_thread(t);
 			} while (t != current);
-- 
cgit v1.2.3-71-gd317


From a885c8c4316e1c1d2d2c8755da3f3d14f852528d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 8 Jan 2006 01:02:50 -0800
Subject: [PATCH] Add block_device_operations.getgeo block device method

HDIO_GETGEO is implemented in most block drivers, and all of them have to
duplicate the code to copy the structure to userspace, as well as getting
the start sector.  This patch moves that to common code [1] and adds a
->getgeo method to fill out the raw kernel hd_geometry structure.  For many
drivers this means ->ioctl can go away now.

[1] the s390 block drivers are odd in this respect.  xpram sets ->start
    to 4 always which seems more than odd, and the dasd driver shifts
    the start offset around, probably because of it's non-standard
    sector size.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Jens Axboe <axboe@suse.de>
Cc: <mike.miller@hp.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Paolo Giarrusso <blaisorblade@yahoo.it>
Cc: Bartlomiej Zolnierkiewicz <B.Zolnierkiewicz@elka.pw.edu.pl>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Cc: Markus Lidel <Markus.Lidel@shadowconnect.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/ubd_kern.c      | 21 +++++++++--------
 block/ioctl.c                   | 22 ++++++++++++++++++
 drivers/acorn/block/mfmhd.c     | 36 ++++++-----------------------
 drivers/block/DAC960.c          | 35 ++++++++++++-----------------
 drivers/block/acsi.c            | 26 +++++++++++----------
 drivers/block/amiflop.c         | 23 +++++++++----------
 drivers/block/aoe/aoeblk.c      | 26 ++++++---------------
 drivers/block/cciss.c           | 31 ++++++++++++-------------
 drivers/block/cpqarray.c        | 36 +++++++++++++++--------------
 drivers/block/floppy.c          | 35 +++++++++++++++--------------
 drivers/block/paride/pd.c       | 34 +++++++++++++++-------------
 drivers/block/paride/pf.c       | 50 +++++++++++++++++++++--------------------
 drivers/block/ps2esdi.c         | 25 +++++++--------------
 drivers/block/sx8.c             | 35 +++++++----------------------
 drivers/block/umem.c            | 41 +++++++++++++--------------------
 drivers/block/viodasd.c         | 44 ++++++++----------------------------
 drivers/block/xd.c              | 25 +++++++++++----------
 drivers/ide/ide-disk.c          | 12 ++++++++++
 drivers/ide/ide-floppy.c        | 12 ++++++++++
 drivers/ide/ide.c               | 13 -----------
 drivers/ide/legacy/hd.c         | 24 +++++++-------------
 drivers/md/md.c                 | 30 +++++++++----------------
 drivers/message/i2o/i2o_block.c | 18 +++++++--------
 drivers/mmc/mmc_block.c         | 25 +++++----------------
 drivers/mtd/mtd_blkdevs.c       | 25 ++++++++-------------
 drivers/s390/block/dasd.c       | 23 +++++++++++++++++++
 drivers/s390/block/dasd_ioctl.c | 28 -----------------------
 drivers/s390/block/xpram.c      | 18 ++++++---------
 drivers/scsi/sd.c               | 21 +++++------------
 include/linux/fs.h              |  2 ++
 30 files changed, 340 insertions(+), 456 deletions(-)

(limited to 'include/linux')

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 73f9652b2ee9..3a93c6f772fa 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -117,6 +117,7 @@ static int ubd_open(struct inode * inode, struct file * filp);
 static int ubd_release(struct inode * inode, struct file * file);
 static int ubd_ioctl(struct inode * inode, struct file * file,
 		     unsigned int cmd, unsigned long arg);
+static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 #define MAX_DEV (8)
 
@@ -125,6 +126,7 @@ static struct block_device_operations ubd_blops = {
         .open		= ubd_open,
         .release	= ubd_release,
         .ioctl		= ubd_ioctl,
+	.getgeo		= ubd_getgeo,
 };
 
 /* Protected by the queue_lock */
@@ -1058,6 +1060,16 @@ static void do_ubd_request(request_queue_t *q)
 	}
 }
 
+static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct ubd *dev = bdev->bd_disk->private_data;
+
+	geo->heads = 128;
+	geo->sectors = 32;
+	geo->cylinders = dev->size / (128 * 32 * 512);
+	return 0;
+}
+
 static int ubd_ioctl(struct inode * inode, struct file * file,
 		     unsigned int cmd, unsigned long arg)
 {
@@ -1070,16 +1082,7 @@ static int ubd_ioctl(struct inode * inode, struct file * file,
 	};
 
 	switch (cmd) {
-	        struct hd_geometry g;
 		struct cdrom_volctrl volume;
-	case HDIO_GETGEO:
-		if(!loc) return(-EINVAL);
-		g.heads = 128;
-		g.sectors = 32;
-		g.cylinders = dev->size / (128 * 32 * 512);
-		g.start = get_start_sect(inode->i_bdev);
-		return(copy_to_user(loc, &g, sizeof(g)) ? -EFAULT : 0);
-
 	case HDIO_GET_IDENTITY:
 		ubd_id.cyls = dev->size / (128 * 32 * 512);
 		if(copy_to_user((char __user *) arg, (char *) &ubd_id,
diff --git a/block/ioctl.c b/block/ioctl.c
index 6e278474f9a8..82030e1dfd63 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -1,6 +1,7 @@
 #include <linux/sched.h>		/* for capable() */
 #include <linux/blkdev.h>
 #include <linux/blkpg.h>
+#include <linux/hdreg.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/smp_lock.h>
@@ -245,6 +246,27 @@ int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
 		set_device_ro(bdev, n);
 		unlock_kernel();
 		return 0;
+	case HDIO_GETGEO: {
+		struct hd_geometry geo;
+
+		if (!arg)
+			return -EINVAL;
+		if (!disk->fops->getgeo)
+			return -ENOTTY;
+
+		/*
+		 * We need to set the startsect first, the driver may
+		 * want to override it.
+		 */
+		geo.start = get_start_sect(bdev);
+		ret = disk->fops->getgeo(bdev, &geo);
+		if (ret)
+			return ret;
+		if (copy_to_user((struct hd_geometry __user *)arg, &geo,
+					sizeof(geo)))
+			return -EFAULT;
+		return 0;
+	}
 	}
 
 	lock_kernel();
diff --git a/drivers/acorn/block/mfmhd.c b/drivers/acorn/block/mfmhd.c
index 4b65f74d66b1..ce074f6f3369 100644
--- a/drivers/acorn/block/mfmhd.c
+++ b/drivers/acorn/block/mfmhd.c
@@ -129,19 +129,6 @@ static DEFINE_SPINLOCK(mfm_lock);
 #define MAJOR_NR	MFM_ACORN_MAJOR
 #define QUEUE (mfm_queue)
 #define CURRENT elv_next_request(mfm_queue)
-/*
- * This sort of stuff should be in a header file shared with ide.c, hd.c, xd.c etc
- */
-#ifndef HDIO_GETGEO
-#define HDIO_GETGEO 0x301
-struct hd_geometry {
-	unsigned char heads;
-	unsigned char sectors;
-	unsigned short cylinders;
-	unsigned long start;
-};
-#endif
-
 
 /*
  * Configuration section
@@ -1153,22 +1140,13 @@ static int mfm_initdrives(void)
  * The 'front' end of the mfm driver follows...
  */
 
-static int mfm_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long arg)
+static int mfm_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	struct mfm_info *p = inode->i_bdev->bd_disk->private_data;
-	struct hd_geometry *geo = (struct hd_geometry *) arg;
-	if (cmd != HDIO_GETGEO)
-		return -EINVAL;
-	if (!arg)
-		return -EINVAL;
-	if (put_user (p->heads, &geo->heads))
-		return -EFAULT;
-	if (put_user (p->sectors, &geo->sectors))
-		return -EFAULT;
-	if (put_user (p->cylinders, &geo->cylinders))
-		return -EFAULT;
-	if (put_user (get_start_sect(inode->i_bdev), &geo->start))
-		return -EFAULT;
+	struct mfm_info *p = bdev->bd_disk->private_data;
+
+	geo->heads = p->heads;
+	geo->sectors = p->sectors;
+	geo->cylinders = p->cylinders;
 	return 0;
 }
 
@@ -1219,7 +1197,7 @@ void xd_set_geometry(struct block_device *bdev, unsigned char secsptrack,
 static struct block_device_operations mfm_fops =
 {
 	.owner		= THIS_MODULE,
-	.ioctl		= mfm_ioctl,
+	.getgeo		= mfm_getgeo,
 };
 
 /*
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 21097a39a057..179c68a3cef3 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -92,34 +92,28 @@ static int DAC960_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int DAC960_ioctl(struct inode *inode, struct file *file,
-			unsigned int cmd, unsigned long arg)
+static int DAC960_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	struct gendisk *disk = inode->i_bdev->bd_disk;
+	struct gendisk *disk = bdev->bd_disk;
 	DAC960_Controller_T *p = disk->queue->queuedata;
 	int drive_nr = (long)disk->private_data;
-	struct hd_geometry g;
-	struct hd_geometry __user *loc = (struct hd_geometry __user *)arg;
-
-	if (cmd != HDIO_GETGEO || !loc)
-		return -EINVAL;
 
 	if (p->FirmwareType == DAC960_V1_Controller) {
-		g.heads = p->V1.GeometryTranslationHeads;
-		g.sectors = p->V1.GeometryTranslationSectors;
-		g.cylinders = p->V1.LogicalDriveInformation[drive_nr].
-			LogicalDriveSize / (g.heads * g.sectors);
+		geo->heads = p->V1.GeometryTranslationHeads;
+		geo->sectors = p->V1.GeometryTranslationSectors;
+		geo->cylinders = p->V1.LogicalDriveInformation[drive_nr].
+			LogicalDriveSize / (geo->heads * geo->sectors);
 	} else {
 		DAC960_V2_LogicalDeviceInfo_T *i =
 			p->V2.LogicalDeviceInformation[drive_nr];
 		switch (i->DriveGeometry) {
 		case DAC960_V2_Geometry_128_32:
-			g.heads = 128;
-			g.sectors = 32;
+			geo->heads = 128;
+			geo->sectors = 32;
 			break;
 		case DAC960_V2_Geometry_255_63:
-			g.heads = 255;
-			g.sectors = 63;
+			geo->heads = 255;
+			geo->sectors = 63;
 			break;
 		default:
 			DAC960_Error("Illegal Logical Device Geometry %d\n",
@@ -127,12 +121,11 @@ static int DAC960_ioctl(struct inode *inode, struct file *file,
 			return -EINVAL;
 		}
 
-		g.cylinders = i->ConfigurableDeviceSize / (g.heads * g.sectors);
+		geo->cylinders = i->ConfigurableDeviceSize /
+			(geo->heads * geo->sectors);
 	}
 	
-	g.start = get_start_sect(inode->i_bdev);
-
-	return copy_to_user(loc, &g, sizeof g) ? -EFAULT : 0; 
+	return 0;
 }
 
 static int DAC960_media_changed(struct gendisk *disk)
@@ -157,7 +150,7 @@ static int DAC960_revalidate_disk(struct gendisk *disk)
 static struct block_device_operations DAC960_BlockDeviceOperations = {
 	.owner			= THIS_MODULE,
 	.open			= DAC960_open,
-	.ioctl			= DAC960_ioctl,
+	.getgeo			= DAC960_getgeo,
 	.media_changed		= DAC960_media_changed,
 	.revalidate_disk	= DAC960_revalidate_disk,
 };
diff --git a/drivers/block/acsi.c b/drivers/block/acsi.c
index 5d2d649f7e8d..196c0ec9cd54 100644
--- a/drivers/block/acsi.c
+++ b/drivers/block/acsi.c
@@ -1079,6 +1079,19 @@ static void redo_acsi_request( void )
  *
  ***********************************************************************/
 
+static int acsi_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct acsi_info_struct *aip = bdev->bd_disk->private_data;
+
+	/*
+	 * Just fake some geometry here, it's nonsense anyway
+	 * To make it easy, use Adaptec's usual 64/32 mapping
+	 */
+	geo->heads = 64;
+	geo->sectors = 32;
+	geo->cylinders = aip->size >> 11;
+	return 0;
+}
 
 static int acsi_ioctl( struct inode *inode, struct file *file,
 					   unsigned int cmd, unsigned long arg )
@@ -1086,18 +1099,6 @@ static int acsi_ioctl( struct inode *inode, struct file *file,
 	struct gendisk *disk = inode->i_bdev->bd_disk;
 	struct acsi_info_struct *aip = disk->private_data;
 	switch (cmd) {
-	  case HDIO_GETGEO:
-		/* HDIO_GETGEO is supported more for getting the partition's
-		 * start sector... */
-	  { struct hd_geometry *geo = (struct hd_geometry *)arg;
-	    /* just fake some geometry here, it's nonsense anyway; to make it
-		 * easy, use Adaptec's usual 64/32 mapping */
-	    put_user( 64, &geo->heads );
-	    put_user( 32, &geo->sectors );
-	    put_user( aip->size >> 11, &geo->cylinders );
-		put_user(get_start_sect(inode->i_bdev), &geo->start);
-		return 0;
-	  }
 	  case SCSI_IOCTL_GET_IDLUN:
 		/* SCSI compatible GET_IDLUN call to get target's ID and LUN number */
 		put_user( aip->target | (aip->lun << 8),
@@ -1592,6 +1593,7 @@ static struct block_device_operations acsi_fops = {
 	.open		= acsi_open,
 	.release	= acsi_release,
 	.ioctl		= acsi_ioctl,
+	.getgeo		= acsi_getgeo,
 	.media_changed	= acsi_media_change,
 	.revalidate_disk= acsi_revalidate,
 };
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 0acbfff8ad28..cb2a545e57dc 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1424,6 +1424,16 @@ static void do_fd_request(request_queue_t * q)
 	redo_fd_request();
 }
 
+static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	int drive = MINOR(bdev->bd_dev) & 3;
+
+	geo->heads = unit[drive].type->heads;
+	geo->sectors = unit[drive].dtype->sects * unit[drive].type->sect_mult;
+	geo->cylinders = unit[drive].type->tracks;
+	return 0;
+}
+
 static int fd_ioctl(struct inode *inode, struct file *filp,
 		    unsigned int cmd, unsigned long param)
 {
@@ -1431,18 +1441,6 @@ static int fd_ioctl(struct inode *inode, struct file *filp,
 	static struct floppy_struct getprm;
 
 	switch(cmd){
-	case HDIO_GETGEO:
-	{
-		struct hd_geometry loc;
-		loc.heads = unit[drive].type->heads;
-		loc.sectors = unit[drive].dtype->sects * unit[drive].type->sect_mult;
-		loc.cylinders = unit[drive].type->tracks;
-		loc.start = 0;
-		if (copy_to_user((void *)param, (void *)&loc,
-				 sizeof(struct hd_geometry)))
-			return -EFAULT;
-		break;
-	}
 	case FDFMTBEG:
 		get_fdc(drive);
 		if (fd_ref[drive] > 1) {
@@ -1652,6 +1650,7 @@ static struct block_device_operations floppy_fops = {
 	.open		= floppy_open,
 	.release	= floppy_release,
 	.ioctl		= fd_ioctl,
+	.getgeo		= fd_getgeo,
 	.media_changed	= amiga_floppy_change,
 };
 
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 0e97fcb9f3a1..c05ee8bffd97 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -169,38 +169,26 @@ aoeblk_make_request(request_queue_t *q, struct bio *bio)
 	return 0;
 }
 
-/* This ioctl implementation expects userland to have the device node
- * permissions set so that only priviledged users can open an aoe
- * block device directly.
- */
 static int
-aoeblk_ioctl(struct inode *inode, struct file *filp, uint cmd, ulong arg)
+aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	struct aoedev *d;
-
-	if (!arg)
-		return -EINVAL;
+	struct aoedev *d = bdev->bd_disk->private_data;
 
-	d = inode->i_bdev->bd_disk->private_data;
 	if ((d->flags & DEVFL_UP) == 0) {
 		printk(KERN_ERR "aoe: aoeblk_ioctl: disk not up\n");
 		return -ENODEV;
 	}
 
-	if (cmd == HDIO_GETGEO) {
-		d->geo.start = get_start_sect(inode->i_bdev);
-		if (!copy_to_user((void __user *) arg, &d->geo, sizeof d->geo))
-			return 0;
-		return -EFAULT;
-	}
-	printk(KERN_INFO "aoe: aoeblk_ioctl: unknown ioctl %d\n", cmd);
-	return -EINVAL;
+	geo->cylinders = d->geo.cylinders;
+	geo->heads = d->geo.heads;
+	geo->sectors = d->geo.sectors;
+	return 0;
 }
 
 static struct block_device_operations aoe_bdops = {
 	.open = aoeblk_open,
 	.release = aoeblk_release,
-	.ioctl = aoeblk_ioctl,
+	.getgeo = aoeblk_getgeo,
 	.owner = THIS_MODULE,
 };
 
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index d2815b7a9150..bdb9c2717d40 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -153,6 +153,7 @@ static int cciss_open(struct inode *inode, struct file *filep);
 static int cciss_release(struct inode *inode, struct file *filep);
 static int cciss_ioctl(struct inode *inode, struct file *filep, 
 		unsigned int cmd, unsigned long arg);
+static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 static int revalidate_allvol(ctlr_info_t *host);
 static int cciss_revalidate(struct gendisk *disk);
@@ -194,6 +195,7 @@ static struct block_device_operations cciss_fops  = {
 	.open		= cciss_open, 
 	.release       	= cciss_release,
         .ioctl		= cciss_ioctl,
+        .getgeo		= cciss_getgeo,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl   = cciss_compat_ioctl,
 #endif
@@ -633,6 +635,20 @@ static int cciss_ioctl32_big_passthru(struct file *file, unsigned cmd, unsigned
 	return err;
 }
 #endif
+
+static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	drive_info_struct *drv = get_drv(bdev->bd_disk);
+
+	if (!drv->cylinders)
+		return -ENXIO;
+
+	geo->heads = drv->heads;
+	geo->sectors = drv->sectors;
+	geo->cylinders = drv->cylinders;
+	return 0;
+}
+
 /*
  * ioctl 
  */
@@ -651,21 +667,6 @@ static int cciss_ioctl(struct inode *inode, struct file *filep,
 #endif /* CCISS_DEBUG */ 
 	
 	switch(cmd) {
-	case HDIO_GETGEO:
-	{
-                struct hd_geometry driver_geo;
-                if (drv->cylinders) {
-                        driver_geo.heads = drv->heads;
-                        driver_geo.sectors = drv->sectors;
-                        driver_geo.cylinders = drv->cylinders;
-                } else
-			return -ENXIO;
-                driver_geo.start= get_start_sect(inode->i_bdev);
-                if (copy_to_user(argp, &driver_geo, sizeof(struct hd_geometry)))
-                        return  -EFAULT;
-                return(0);
-	}
-
 	case CCISS_GETPCIINFO:
 	{
 		cciss_pci_info_struct pciinfo;
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 9bddb6874873..9f0664dd3800 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -160,6 +160,7 @@ static int sendcmd(
 static int ida_open(struct inode *inode, struct file *filep);
 static int ida_release(struct inode *inode, struct file *filep);
 static int ida_ioctl(struct inode *inode, struct file *filep, unsigned int cmd, unsigned long arg);
+static int ida_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 static int ida_ctlr_ioctl(ctlr_info_t *h, int dsk, ida_ioctl_t *io);
 
 static void do_ida_request(request_queue_t *q);
@@ -199,6 +200,7 @@ static struct block_device_operations ida_fops  = {
 	.open		= ida_open,
 	.release	= ida_release,
 	.ioctl		= ida_ioctl,
+	.getgeo		= ida_getgeo,
 	.revalidate_disk= ida_revalidate,
 };
 
@@ -1124,6 +1126,23 @@ static void ida_timer(unsigned long tdata)
 	h->misc_tflags = 0;
 }
 
+static int ida_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	drv_info_t *drv = get_drv(bdev->bd_disk);
+
+	if (drv->cylinders) {
+		geo->heads = drv->heads;
+		geo->sectors = drv->sectors;
+		geo->cylinders = drv->cylinders;
+	} else {
+		geo->heads = 0xff;
+		geo->sectors = 0x3f;
+		geo->cylinders = drv->nr_blks / (0xff*0x3f);
+	}
+
+	return 0;
+}
+
 /*
  *  ida_ioctl does some miscellaneous stuff like reporting drive geometry,
  *  setting readahead and submitting commands from userspace to the controller.
@@ -1133,27 +1152,10 @@ static int ida_ioctl(struct inode *inode, struct file *filep, unsigned int cmd,
 	drv_info_t *drv = get_drv(inode->i_bdev->bd_disk);
 	ctlr_info_t *host = get_host(inode->i_bdev->bd_disk);
 	int error;
-	int diskinfo[4];
-	struct hd_geometry __user *geo = (struct hd_geometry __user *)arg;
 	ida_ioctl_t __user *io = (ida_ioctl_t __user *)arg;
 	ida_ioctl_t *my_io;
 
 	switch(cmd) {
-	case HDIO_GETGEO:
-		if (drv->cylinders) {
-			diskinfo[0] = drv->heads;
-			diskinfo[1] = drv->sectors;
-			diskinfo[2] = drv->cylinders;
-		} else {
-			diskinfo[0] = 0xff;
-			diskinfo[1] = 0x3f;
-			diskinfo[2] = drv->nr_blks / (0xff*0x3f);
-		}
-		put_user(diskinfo[0], &geo->heads);
-		put_user(diskinfo[1], &geo->sectors);
-		put_user(diskinfo[2], &geo->cylinders);
-		put_user(get_start_sect(inode->i_bdev), &geo->start);
-		return 0;
 	case IDAGETDRVINFO:
 		if (copy_to_user(&io->c.drv, drv, sizeof(drv_info_t)))
 			return -EFAULT;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index a5b857c5c4b8..b86613b21cf1 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3445,6 +3445,23 @@ static int get_floppy_geometry(int drive, int type, struct floppy_struct **g)
 	return 0;
 }
 
+static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	int drive = (long)bdev->bd_disk->private_data;
+	int type = ITYPE(drive_state[drive].fd_device);
+	struct floppy_struct *g;
+	int ret;
+
+	ret = get_floppy_geometry(drive, type, &g);
+	if (ret)
+		return ret;
+
+	geo->heads = g->head;
+	geo->sectors = g->sect;
+	geo->cylinders = g->track;
+	return 0;
+}
+
 static int fd_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 		    unsigned long param)
 {
@@ -3474,23 +3491,6 @@ static int fd_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 		cmd = FDEJECT;
 	}
 
-	/* generic block device ioctls */
-	switch (cmd) {
-		/* the following have been inspired by the corresponding
-		 * code for other block devices. */
-		struct floppy_struct *g;
-	case HDIO_GETGEO:
-		{
-			struct hd_geometry loc;
-			ECALL(get_floppy_geometry(drive, type, &g));
-			loc.heads = g->head;
-			loc.sectors = g->sect;
-			loc.cylinders = g->track;
-			loc.start = 0;
-			return _COPYOUT(loc);
-		}
-	}
-
 	/* convert the old style command into a new style command */
 	if ((cmd & 0xff00) == 0x0200) {
 		ECALL(normalize_ioctl(&cmd, &size));
@@ -3938,6 +3938,7 @@ static struct block_device_operations floppy_fops = {
 	.open		= floppy_open,
 	.release	= floppy_release,
 	.ioctl		= fd_ioctl,
+	.getgeo		= fd_getgeo,
 	.media_changed	= check_floppy_change,
 	.revalidate_disk = floppy_revalidate,
 };
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index fa49d62626ba..62d2464c12f2 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -747,32 +747,33 @@ static int pd_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int pd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct pd_unit *disk = bdev->bd_disk->private_data;
+
+	if (disk->alt_geom) {
+		geo->heads = PD_LOG_HEADS;
+		geo->sectors = PD_LOG_SECTS;
+		geo->cylinders = disk->capacity / (geo->heads * geo->sectors);
+	} else {
+		geo->heads = disk->heads;
+		geo->sectors = disk->sectors;
+		geo->cylinders = disk->cylinders;
+	}
+
+	return 0;
+}
+
 static int pd_ioctl(struct inode *inode, struct file *file,
 	 unsigned int cmd, unsigned long arg)
 {
 	struct pd_unit *disk = inode->i_bdev->bd_disk->private_data;
-	struct hd_geometry __user *geo = (struct hd_geometry __user *) arg;
-	struct hd_geometry g;
 
 	switch (cmd) {
 	case CDROMEJECT:
 		if (disk->access == 1)
 			pd_special_command(disk, pd_eject);
 		return 0;
-	case HDIO_GETGEO:
-		if (disk->alt_geom) {
-			g.heads = PD_LOG_HEADS;
-			g.sectors = PD_LOG_SECTS;
-			g.cylinders = disk->capacity / (g.heads * g.sectors);
-		} else {
-			g.heads = disk->heads;
-			g.sectors = disk->sectors;
-			g.cylinders = disk->cylinders;
-		}
-		g.start = get_start_sect(inode->i_bdev);
-		if (copy_to_user(geo, &g, sizeof(struct hd_geometry)))
-			return -EFAULT;
-		return 0;
 	default:
 		return -EINVAL;
 	}
@@ -815,6 +816,7 @@ static struct block_device_operations pd_fops = {
 	.open		= pd_open,
 	.release	= pd_release,
 	.ioctl		= pd_ioctl,
+	.getgeo		= pd_getgeo,
 	.media_changed	= pd_check_media,
 	.revalidate_disk= pd_revalidate
 };
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index e9746af29b9f..852b564e903a 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -205,6 +205,7 @@ static int pf_open(struct inode *inode, struct file *file);
 static void do_pf_request(request_queue_t * q);
 static int pf_ioctl(struct inode *inode, struct file *file,
 		    unsigned int cmd, unsigned long arg);
+static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 static int pf_release(struct inode *inode, struct file *file);
 
@@ -266,6 +267,7 @@ static struct block_device_operations pf_fops = {
 	.open		= pf_open,
 	.release	= pf_release,
 	.ioctl		= pf_ioctl,
+	.getgeo		= pf_getgeo,
 	.media_changed	= pf_check_media,
 };
 
@@ -313,34 +315,34 @@ static int pf_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int pf_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg)
+static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	struct pf_unit *pf = inode->i_bdev->bd_disk->private_data;
-	struct hd_geometry __user *geo = (struct hd_geometry __user *) arg;
-	struct hd_geometry g;
-	sector_t capacity;
-
-	if (cmd == CDROMEJECT) {
-		if (pf->access == 1) {
-			pf_eject(pf);
-			return 0;
-		}
-		return -EBUSY;
-	}
-	if (cmd != HDIO_GETGEO)
-		return -EINVAL;
-	capacity = get_capacity(pf->disk);
+	struct pf_unit *pf = bdev->bd_disk->private_data;
+	sector_t capacity = get_capacity(pf->disk);
+
 	if (capacity < PF_FD_MAX) {
-		g.cylinders = sector_div(capacity, PF_FD_HDS * PF_FD_SPT);
-		g.heads = PF_FD_HDS;
-		g.sectors = PF_FD_SPT;
+		geo->cylinders = sector_div(capacity, PF_FD_HDS * PF_FD_SPT);
+		geo->heads = PF_FD_HDS;
+		geo->sectors = PF_FD_SPT;
 	} else {
-		g.cylinders = sector_div(capacity, PF_HD_HDS * PF_HD_SPT);
-		g.heads = PF_HD_HDS;
-		g.sectors = PF_HD_SPT;
+		geo->cylinders = sector_div(capacity, PF_HD_HDS * PF_HD_SPT);
+		geo->heads = PF_HD_HDS;
+		geo->sectors = PF_HD_SPT;
 	}
-	if (copy_to_user(geo, &g, sizeof(g)))
-		return -EFAULT;
+
+	return 0;
+}
+
+static int pf_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct pf_unit *pf = inode->i_bdev->bd_disk->private_data;
+
+	if (cmd != CDROMEJECT)
+		return -EINVAL;
+
+	if (pf->access != 1)
+		return -EBUSY;
+	pf_eject(pf);
 	return 0;
 }
 
diff --git a/drivers/block/ps2esdi.c b/drivers/block/ps2esdi.c
index 29d1518be72a..43415f69839f 100644
--- a/drivers/block/ps2esdi.c
+++ b/drivers/block/ps2esdi.c
@@ -81,8 +81,7 @@ static void (*current_int_handler) (u_int) = NULL;
 static void ps2esdi_normal_interrupt_handler(u_int);
 static void ps2esdi_initial_reset_int_handler(u_int);
 static void ps2esdi_geometry_int_handler(u_int);
-static int ps2esdi_ioctl(struct inode *inode, struct file *file,
-			 u_int cmd, u_long arg);
+static int ps2esdi_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 static int ps2esdi_read_status_words(int num_words, int max_words, u_short * buffer);
 
@@ -132,7 +131,7 @@ static struct ps2esdi_i_struct ps2esdi_info[MAX_HD] =
 static struct block_device_operations ps2esdi_fops =
 {
 	.owner		= THIS_MODULE,
-	.ioctl		= ps2esdi_ioctl,
+	.getgeo		= ps2esdi_getgeo,
 };
 
 static struct gendisk *ps2esdi_gendisk[2];
@@ -1058,21 +1057,13 @@ static void dump_cmd_complete_status(u_int int_ret_code)
 
 }
 
-static int ps2esdi_ioctl(struct inode *inode,
-			 struct file *file, u_int cmd, u_long arg)
+static int ps2esdi_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	struct ps2esdi_i_struct *p = inode->i_bdev->bd_disk->private_data;
-	struct ps2esdi_geometry geom;
-
-	if (cmd != HDIO_GETGEO)
-		return -EINVAL;
-	memset(&geom, 0, sizeof(geom));
-	geom.heads = p->head;
-	geom.sectors = p->sect;
-	geom.cylinders = p->cyl;
-	geom.start = get_start_sect(inode->i_bdev);
-	if (copy_to_user((void __user *)arg, &geom, sizeof(geom)))
-		return -EFAULT;
+	struct ps2esdi_i_struct *p = bdev->bd_disk->private_data;
+
+	geo->heads = p->head;
+	geo->sectors = p->sect;
+	geo->cylinders = p->cyl;
 	return 0;
 }
 
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 9251f4131b53..c0cdc182a8b0 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -407,8 +407,7 @@ struct carm_array_info {
 
 static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent);
 static void carm_remove_one (struct pci_dev *pdev);
-static int carm_bdev_ioctl(struct inode *ino, struct file *fil,
-			   unsigned int cmd, unsigned long arg);
+static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 static struct pci_device_id carm_pci_tbl[] = {
 	{ PCI_VENDOR_ID_PROMISE, 0x8000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, },
@@ -426,7 +425,7 @@ static struct pci_driver carm_driver = {
 
 static struct block_device_operations carm_bd_ops = {
 	.owner		= THIS_MODULE,
-	.ioctl		= carm_bdev_ioctl,
+	.getgeo		= carm_bdev_getgeo,
 };
 
 static unsigned int carm_host_id;
@@ -434,32 +433,14 @@ static unsigned long carm_major_alloc;
 
 
-static int carm_bdev_ioctl(struct inode *ino, struct file *fil,
-			   unsigned int cmd, unsigned long arg)
+static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	void __user *usermem = (void __user *) arg;
-	struct carm_port *port = ino->i_bdev->bd_disk->private_data;
-	struct hd_geometry geom;
+	struct carm_port *port = bdev->bd_disk->private_data;
 
-	switch (cmd) {
-	case HDIO_GETGEO:
-		if (!usermem)
-			return -EINVAL;
-
-		geom.heads = (u8) port->dev_geom_head;
-		geom.sectors = (u8) port->dev_geom_sect;
-		geom.cylinders = port->dev_geom_cyl;
-		geom.start = get_start_sect(ino->i_bdev);
-
-		if (copy_to_user(usermem, &geom, sizeof(geom)))
-			return -EFAULT;
-		return 0;
-
-	default:
-		break;
-	}
-
-	return -EOPNOTSUPP;
+	geo->heads = (u8) port->dev_geom_head;
+	geo->sectors = (u8) port->dev_geom_sect;
+	geo->cylinders = port->dev_geom_cyl;
+	return 0;
 }
 
 static const u32 msg_sizes[] = { 32, 64, 128, CARM_MSG_SIZE };
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 0f48301342da..15299e7a1ade 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -809,34 +809,23 @@ static int mm_revalidate(struct gendisk *disk)
 	set_capacity(disk, card->mm_size << 1);
 	return 0;
 }
-/*
------------------------------------------------------------------------------------
---                            mm_ioctl
------------------------------------------------------------------------------------
-*/
-static int mm_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
+
+static int mm_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	if (cmd == HDIO_GETGEO) {
-		struct cardinfo *card = i->i_bdev->bd_disk->private_data;
-		int size = card->mm_size * (1024 / MM_HARDSECT);
-		struct hd_geometry geo;
-		/*
-		 * get geometry: we have to fake one...  trim the size to a
-		 * multiple of 2048 (1M): tell we have 32 sectors, 64 heads,
-		 * whatever cylinders.
-		 */
-		geo.heads     = 64;
-		geo.sectors   = 32;
-		geo.start     = get_start_sect(i->i_bdev);
-		geo.cylinders = size / (geo.heads * geo.sectors);
-
-		if (copy_to_user((void __user *) arg, &geo, sizeof(geo)))
-			return -EFAULT;
-		return 0;
-	}
+	struct cardinfo *card = bdev->bd_disk->private_data;
+	int size = card->mm_size * (1024 / MM_HARDSECT);
 
-	return -EINVAL;
+	/*
+	 * get geometry: we have to fake one...  trim the size to a
+	 * multiple of 2048 (1M): tell we have 32 sectors, 64 heads,
+	 * whatever cylinders.
+	 */
+	geo->heads     = 64;
+	geo->sectors   = 32;
+	geo->cylinders = size / (geo->heads * geo->sectors);
+	return 0;
 }
+
 /*
 -----------------------------------------------------------------------------------
 --                                mm_check_change
@@ -855,7 +844,7 @@ static int mm_check_change(struct gendisk *disk)
 */
 static struct block_device_operations mm_fops = {
 	.owner		= THIS_MODULE,
-	.ioctl		= mm_ioctl,
+	.getgeo		= mm_getgeo,
 	.revalidate_disk= mm_revalidate,
 	.media_changed	= mm_check_change,
 };
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index 063f0304a163..d1aaf31bd97e 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -247,43 +247,17 @@ static int viodasd_release(struct inode *ino, struct file *fil)
 
 /* External ioctl entry point.
  */
-static int viodasd_ioctl(struct inode *ino, struct file *fil,
-			 unsigned int cmd, unsigned long arg)
+static int viodasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	unsigned char sectors;
-	unsigned char heads;
-	unsigned short cylinders;
-	struct hd_geometry *geo;
-	struct gendisk *gendisk;
-	struct viodasd_device *d;
+	struct gendisk *disk = bdev->bd_disk;
+	struct viodasd_device *d = disk->private_data;
 
-	switch (cmd) {
-	case HDIO_GETGEO:
-		geo = (struct hd_geometry *)arg;
-		if (geo == NULL)
-			return -EINVAL;
-		if (!access_ok(VERIFY_WRITE, geo, sizeof(*geo)))
-			return -EFAULT;
-		gendisk = ino->i_bdev->bd_disk;
-		d = gendisk->private_data;
-		sectors = d->sectors;
-		if (sectors == 0)
-			sectors = 32;
-		heads = d->tracks;
-		if (heads == 0)
-			heads = 64;
-		cylinders = d->cylinders;
-		if (cylinders == 0)
-			cylinders = get_capacity(gendisk) / (sectors * heads);
-		if (__put_user(sectors, &geo->sectors) ||
-		    __put_user(heads, &geo->heads) ||
-		    __put_user(cylinders, &geo->cylinders) ||
-		    __put_user(get_start_sect(ino->i_bdev), &geo->start))
-			return -EFAULT;
-		return 0;
-	}
+	geo->sectors = d->sectors ? d->sectors : 0;
+	geo->heads = d->tracks ? d->tracks  : 64;
+	geo->cylinders = d->cylinders ? d->cylinders :
+		get_capacity(disk) / (geo->cylinders * geo->heads);
 
-	return -EINVAL;
+	return 0;
 }
 
 /*
@@ -293,7 +267,7 @@ static struct block_device_operations viodasd_fops = {
 	.owner = THIS_MODULE,
 	.open = viodasd_open,
 	.release = viodasd_release,
-	.ioctl = viodasd_ioctl,
+	.getgeo = viodasd_getgeo,
 };
 
 /*
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 68b6d7b154cf..97f5dab24b5a 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -128,9 +128,12 @@ static DEFINE_SPINLOCK(xd_lock);
 
 static struct gendisk *xd_gendisk[2];
 
+static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
+
 static struct block_device_operations xd_fops = {
 	.owner	= THIS_MODULE,
 	.ioctl	= xd_ioctl,
+	.getgeo = xd_getgeo,
 };
 static DECLARE_WAIT_QUEUE_HEAD(xd_wait_int);
 static u_char xd_drives, xd_irq = 5, xd_dma = 3, xd_maxsectors;
@@ -330,22 +333,20 @@ static void do_xd_request (request_queue_t * q)
 	}
 }
 
+static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	XD_INFO *p = bdev->bd_disk->private_data;
+
+	geo->heads = p->heads;
+	geo->sectors = p->sectors;
+	geo->cylinders = p->cylinders;
+	return 0;
+}
+
 /* xd_ioctl: handle device ioctl's */
 static int xd_ioctl (struct inode *inode,struct file *file,u_int cmd,u_long arg)
 {
-	XD_INFO *p = inode->i_bdev->bd_disk->private_data;
-
 	switch (cmd) {
-		case HDIO_GETGEO:
-		{
-			struct hd_geometry g;
-			struct hd_geometry __user *geom= (void __user *)arg;
-			g.heads = p->heads;
-			g.sectors = p->sectors;
-			g.cylinders = p->cylinders;
-			g.start = get_start_sect(inode->i_bdev);
-			return copy_to_user(geom, &g, sizeof(g)) ? -EFAULT : 0;
-		}
 		case HDIO_SET_DMA:
 			if (!capable(CAP_SYS_ADMIN)) return -EACCES;
 			if (xdc_busy) return -EBUSY;
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 4b441720b6ba..cab362ea0336 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -1130,6 +1130,17 @@ static int idedisk_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static int idedisk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct ide_disk_obj *idkp = ide_disk_g(bdev->bd_disk);
+	ide_drive_t *drive = idkp->drive;
+
+	geo->heads = drive->bios_head;
+	geo->sectors = drive->bios_sect;
+	geo->cylinders = (u16)drive->bios_cyl; /* truncate */
+	return 0;
+}
+
 static int idedisk_ioctl(struct inode *inode, struct file *file,
 			unsigned int cmd, unsigned long arg)
 {
@@ -1164,6 +1175,7 @@ static struct block_device_operations idedisk_ops = {
 	.open		= idedisk_open,
 	.release	= idedisk_release,
 	.ioctl		= idedisk_ioctl,
+	.getgeo		= idedisk_getgeo,
 	.media_changed	= idedisk_media_changed,
 	.revalidate_disk= idedisk_revalidate_disk
 };
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index fba3fffc2d66..5945f551aaaa 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -2031,6 +2031,17 @@ static int idefloppy_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static int idefloppy_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct ide_floppy_obj *floppy = ide_floppy_g(bdev->bd_disk);
+	ide_drive_t *drive = floppy->drive;
+
+	geo->heads = drive->bios_head;
+	geo->sectors = drive->bios_sect;
+	geo->cylinders = (u16)drive->bios_cyl; /* truncate */
+	return 0;
+}
+
 static int idefloppy_ioctl(struct inode *inode, struct file *file,
 			unsigned int cmd, unsigned long arg)
 {
@@ -2120,6 +2131,7 @@ static struct block_device_operations idefloppy_ops = {
 	.open		= idefloppy_open,
 	.release	= idefloppy_release,
 	.ioctl		= idefloppy_ioctl,
+	.getgeo		= idefloppy_getgeo,
 	.media_changed	= idefloppy_media_changed,
 	.revalidate_disk= idefloppy_revalidate_disk
 };
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index 4b524f6b3ecd..b069b13b75a7 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -1278,19 +1278,6 @@ int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device
 	up(&ide_setting_sem);
 
 	switch (cmd) {
-		case HDIO_GETGEO:
-		{
-			struct hd_geometry geom;
-			if (!p || (drive->media != ide_disk && drive->media != ide_floppy)) return -EINVAL;
-			geom.heads = drive->bios_head;
-			geom.sectors = drive->bios_sect;
-			geom.cylinders = (u16)drive->bios_cyl; /* truncate */
-			geom.start = get_start_sect(bdev);
-			if (copy_to_user(p, &geom, sizeof(struct hd_geometry)))
-				return -EFAULT;
-			return 0;
-		}
-
 		case HDIO_OBSOLETE_IDENTITY:
 		case HDIO_GET_IDENTITY:
 			if (bdev != bdev->bd_contains)
diff --git a/drivers/ide/legacy/hd.c b/drivers/ide/legacy/hd.c
index 242029c9c0ca..6439dec66881 100644
--- a/drivers/ide/legacy/hd.c
+++ b/drivers/ide/legacy/hd.c
@@ -658,22 +658,14 @@ static void do_hd_request (request_queue_t * q)
 	enable_irq(HD_IRQ);
 }
 
-static int hd_ioctl(struct inode * inode, struct file * file,
-	unsigned int cmd, unsigned long arg)
+static int hd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	struct hd_i_struct *disk = inode->i_bdev->bd_disk->private_data;
-	struct hd_geometry __user *loc = (struct hd_geometry __user *) arg;
-	struct hd_geometry g; 
-
-	if (cmd != HDIO_GETGEO)
-		return -EINVAL;
-	if (!loc)
-		return -EINVAL;
-	g.heads = disk->head;
-	g.sectors = disk->sect;
-	g.cylinders = disk->cyl;
-	g.start = get_start_sect(inode->i_bdev);
-	return copy_to_user(loc, &g, sizeof g) ? -EFAULT : 0; 
+	struct hd_i_struct *disk = bdev->bd_disk->private_data;
+
+	geo->heads = disk->head;
+	geo->sectors = disk->sect;
+	geo->cylinders = disk->cyl;
+	return 0;
 }
 
 /*
@@ -695,7 +687,7 @@ static irqreturn_t hd_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 }
 
 static struct block_device_operations hd_fops = {
-	.ioctl =	hd_ioctl,
+	.getgeo =	hd_getgeo,
 };
 
 /*
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1b76fb29fb70..e423a16ba3c9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3598,12 +3598,21 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev)
 	return 0;
 }
 
+static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	mddev_t *mddev = bdev->bd_disk->private_data;
+
+	geo->heads = 2;
+	geo->sectors = 4;
+	geo->cylinders = get_capacity(mddev->gendisk) / 8;
+	return 0;
+}
+
 static int md_ioctl(struct inode *inode, struct file *file,
 			unsigned int cmd, unsigned long arg)
 {
 	int err = 0;
 	void __user *argp = (void __user *)arg;
-	struct hd_geometry __user *loc = argp;
 	mddev_t *mddev = NULL;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -3765,24 +3774,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
 	 * 4 sectors (with a BIG number of cylinders...). This drives
 	 * dosfs just mad... ;-)
 	 */
-		case HDIO_GETGEO:
-			if (!loc) {
-				err = -EINVAL;
-				goto abort_unlock;
-			}
-			err = put_user (2, (char __user *) &loc->heads);
-			if (err)
-				goto abort_unlock;
-			err = put_user (4, (char __user *) &loc->sectors);
-			if (err)
-				goto abort_unlock;
-			err = put_user(get_capacity(mddev->gendisk)/8,
-					(short __user *) &loc->cylinders);
-			if (err)
-				goto abort_unlock;
-			err = put_user (get_start_sect(inode->i_bdev),
-						(long __user *) &loc->start);
-			goto done_unlock;
 	}
 
 	/*
@@ -3911,6 +3902,7 @@ static struct block_device_operations md_fops =
 	.open		= md_open,
 	.release	= md_release,
 	.ioctl		= md_ioctl,
+	.getgeo		= md_getgeo,
 	.media_changed	= md_media_changed,
 	.revalidate_disk= md_revalidate,
 };
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 5b1febed3133..b09fb6307153 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -662,6 +662,13 @@ static int i2o_block_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int i2o_block_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	i2o_block_biosparam(get_capacity(bdev->bd_disk),
+			    &geo->cylinders, &geo->heads, &geo->sectors);
+	return 0;
+}
+
 /**
  *	i2o_block_ioctl - Issue device specific ioctl calls.
  *	@cmd: ioctl command
@@ -676,7 +683,6 @@ static int i2o_block_ioctl(struct inode *inode, struct file *file,
 {
 	struct gendisk *disk = inode->i_bdev->bd_disk;
 	struct i2o_block_device *dev = disk->private_data;
-	void __user *argp = (void __user *)arg;
 
 	/* Anyone capable of this syscall can do *real bad* things */
 
@@ -684,15 +690,6 @@ static int i2o_block_ioctl(struct inode *inode, struct file *file,
 		return -EPERM;
 
 	switch (cmd) {
-	case HDIO_GETGEO:
-		{
-			struct hd_geometry g;
-			i2o_block_biosparam(get_capacity(disk),
-					    &g.cylinders, &g.heads, &g.sectors);
-			g.start = get_start_sect(inode->i_bdev);
-			return copy_to_user(argp, &g, sizeof(g)) ? -EFAULT : 0;
-		}
-
 	case BLKI2OGRSTRAT:
 		return put_user(dev->rcache, (int __user *)arg);
 	case BLKI2OGWSTRAT:
@@ -962,6 +959,7 @@ static struct block_device_operations i2o_block_fops = {
 	.open = i2o_block_open,
 	.release = i2o_block_release,
 	.ioctl = i2o_block_ioctl,
+	.getgeo = i2o_block_getgeo,
 	.media_changed = i2o_block_media_changed
 };
 
diff --git a/drivers/mmc/mmc_block.c b/drivers/mmc/mmc_block.c
index 198561d21710..d5f28981596b 100644
--- a/drivers/mmc/mmc_block.c
+++ b/drivers/mmc/mmc_block.c
@@ -113,31 +113,18 @@ static int mmc_blk_release(struct inode *inode, struct file *filp)
 }
 
 static int
-mmc_blk_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg)
+mmc_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	struct block_device *bdev = inode->i_bdev;
-
-	if (cmd == HDIO_GETGEO) {
-		struct hd_geometry geo;
-
-		memset(&geo, 0, sizeof(struct hd_geometry));
-
-		geo.cylinders	= get_capacity(bdev->bd_disk) / (4 * 16);
-		geo.heads	= 4;
-		geo.sectors	= 16;
-		geo.start	= get_start_sect(bdev);
-
-		return copy_to_user((void __user *)arg, &geo, sizeof(geo))
-			? -EFAULT : 0;
-	}
-
-	return -ENOTTY;
+	geo->cylinders = get_capacity(bdev->bd_disk) / (4 * 16);
+	geo->heads = 4;
+	geo->sectors = 16;
+	return 0;
 }
 
 static struct block_device_operations mmc_bdops = {
 	.open			= mmc_blk_open,
 	.release		= mmc_blk_release,
-	.ioctl			= mmc_blk_ioctl,
+	.getgeo			= mmc_blk_getgeo,
 	.owner			= THIS_MODULE,
 };
 
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 339cb1218eaa..7f3ff500b68e 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -194,6 +194,14 @@ static int blktrans_release(struct inode *i, struct file *f)
 	return ret;
 }
 
+static int blktrans_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct mtd_blktrans_dev *dev = bdev->bd_disk->private_data;
+
+	if (dev->tr->getgeo)
+		return dev->tr->getgeo(dev, geo);
+	return -ENOTTY;
+}
 
 static int blktrans_ioctl(struct inode *inode, struct file *file,
 			      unsigned int cmd, unsigned long arg)
@@ -207,22 +215,6 @@ static int blktrans_ioctl(struct inode *inode, struct file *file,
 			return tr->flush(dev);
 		/* The core code did the work, we had nothing to do. */
 		return 0;
-
-	case HDIO_GETGEO:
-		if (tr->getgeo) {
-			struct hd_geometry g;
-			int ret;
-
-			memset(&g, 0, sizeof(g));
-			ret = tr->getgeo(dev, &g);
-			if (ret)
-				return ret;
-
-			g.start = get_start_sect(inode->i_bdev);
-			if (copy_to_user((void __user *)arg, &g, sizeof(g)))
-				return -EFAULT;
-			return 0;
-		} /* else */
 	default:
 		return -ENOTTY;
 	}
@@ -233,6 +225,7 @@ struct block_device_operations mtd_blktrans_ops = {
 	.open		= blktrans_open,
 	.release	= blktrans_release,
 	.ioctl		= blktrans_ioctl,
+	.getgeo		= blktrans_getgeo,
 };
 
 int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index f779f674dfa0..2472fa1a1be1 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -18,6 +18,7 @@
 #include <linux/major.h>
 #include <linux/slab.h>
 #include <linux/buffer_head.h>
+#include <linux/hdreg.h>
 
 #include <asm/ccwdev.h>
 #include <asm/ebcdic.h>
@@ -1723,12 +1724,34 @@ dasd_release(struct inode *inp, struct file *filp)
 	return 0;
 }
 
+/*
+ * Return disk geometry.
+ */
+static int
+dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct dasd_device *device;
+
+	device = bdev->bd_disk->private_data;
+	if (!device)
+		return -ENODEV;
+
+	if (!device->discipline ||
+	    !device->discipline->fill_geometry)
+		return -EINVAL;
+
+	device->discipline->fill_geometry(device, geo);
+	geo->start = get_start_sect(bdev) >> device->s2b_shift;
+	return 0;
+}
+
 struct block_device_operations
 dasd_device_operations = {
 	.owner		= THIS_MODULE,
 	.open		= dasd_open,
 	.release	= dasd_release,
 	.ioctl		= dasd_ioctl,
+	.getgeo		= dasd_getgeo,
 };
 
 
diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c
index 044b75371990..8e4dcd58599e 100644
--- a/drivers/s390/block/dasd_ioctl.c
+++ b/drivers/s390/block/dasd_ioctl.c
@@ -485,33 +485,6 @@ dasd_ioctl_set_ro(struct block_device *bdev, int no, long args)
 	return rc;
 }
 
-/*
- * Return disk geometry.
- */
-static int
-dasd_ioctl_getgeo(struct block_device *bdev, int no, long args)
-{
-	struct hd_geometry geo = { 0, };
-	struct dasd_device *device;
-
-	device =  bdev->bd_disk->private_data;
-	if (device == NULL)
-		return -ENODEV;
-
-	if (device == NULL || device->discipline == NULL ||
-	    device->discipline->fill_geometry == NULL)
-		return -EINVAL;
-
-	geo = (struct hd_geometry) {};
-	device->discipline->fill_geometry(device, &geo);
-	geo.start = get_start_sect(bdev) >> device->s2b_shift;
-	if (copy_to_user((struct hd_geometry __user *) args, &geo,
-			 sizeof (struct hd_geometry)))
-		return -EFAULT;
-
-	return 0;
-}
-
 /*
  * List of static ioctls.
  */
@@ -528,7 +501,6 @@ static struct { int no; dasd_ioctl_fn_t fn; } dasd_ioctls[] =
 	{ BIODASDPRRST, dasd_ioctl_reset_profile },
 	{ BLKROSET, dasd_ioctl_set_ro },
 	{ DASDAPIVER, dasd_ioctl_api_version },
-	{ HDIO_GETGEO, dasd_ioctl_getgeo },
 	{ -1, NULL }
 };
 
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index bf3a67c3cc5e..54ecd548c318 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -328,31 +328,27 @@ fail:
 	return 0;
 }
 
-static int xpram_ioctl (struct inode *inode, struct file *filp,
-		 unsigned int cmd, unsigned long arg)
+static int xpram_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	struct hd_geometry __user *geo;
 	unsigned long size;
-	if (cmd != HDIO_GETGEO)
-		return -EINVAL;
+
 	/*
 	 * get geometry: we have to fake one...  trim the size to a
 	 * multiple of 64 (32k): tell we have 16 sectors, 4 heads,
 	 * whatever cylinders. Tell also that data starts at sector. 4.
 	 */
-	geo = (struct hd_geometry __user *) arg;
 	size = (xpram_pages * 8) & ~0x3f;
-	put_user(size >> 6, &geo->cylinders);
-	put_user(4, &geo->heads);
-	put_user(16, &geo->sectors);
-	put_user(4, &geo->start);
+	geo->cylinders = size >> 6;
+	geo->heads = 4;
+	geo->sectors = 16;
+	geo->start = 4;
 	return 0;
 }
 
 static struct block_device_operations xpram_devops =
 {
 	.owner	= THIS_MODULE,
-	.ioctl	= xpram_ioctl,
+	.getgeo	= xpram_getgeo,
 };
 
 /*
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 32d4d8d7b9f3..4c5127ed379c 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -527,7 +527,7 @@ static int sd_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static int sd_hdio_getgeo(struct block_device *bdev, struct hd_geometry __user *loc)
+static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
 	struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk);
 	struct scsi_device *sdp = sdkp->device;
@@ -545,15 +545,9 @@ static int sd_hdio_getgeo(struct block_device *bdev, struct hd_geometry __user *
 	else
 		scsicam_bios_param(bdev, sdkp->capacity, diskinfo);
 
-	if (put_user(diskinfo[0], &loc->heads))
-		return -EFAULT;
-	if (put_user(diskinfo[1], &loc->sectors))
-		return -EFAULT;
-	if (put_user(diskinfo[2], &loc->cylinders))
-		return -EFAULT;
-	if (put_user((unsigned)get_start_sect(bdev),
-	             (unsigned long __user *)&loc->start))
-		return -EFAULT;
+	geo->heads = diskinfo[0];
+	geo->sectors = diskinfo[1];
+	geo->cylinders = diskinfo[2];
 	return 0;
 }
 
@@ -593,12 +587,6 @@ static int sd_ioctl(struct inode * inode, struct file * filp,
 	if (!scsi_block_when_processing_errors(sdp) || !error)
 		return error;
 
-	if (cmd == HDIO_GETGEO) {
-		if (!arg)
-			return -EINVAL;
-		return sd_hdio_getgeo(bdev, p);
-	}
-
 	/*
 	 * Send SCSI addressing ioctls directly to mid level, send other
 	 * ioctls to block level and then onto mid level if they can't be
@@ -800,6 +788,7 @@ static struct block_device_operations sd_fops = {
 	.open			= sd_open,
 	.release		= sd_release,
 	.ioctl			= sd_ioctl,
+	.getgeo			= sd_getgeo,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl		= sd_compat_ioctl,
 #endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 74c01aabd4ab..7f140480c6a8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -224,6 +224,7 @@ extern int dir_notify_enable;
 #include <asm/semaphore.h>
 #include <asm/byteorder.h>
 
+struct hd_geometry;
 struct iovec;
 struct nameidata;
 struct kiocb;
@@ -962,6 +963,7 @@ struct block_device_operations {
 	int (*direct_access) (struct block_device *, sector_t, unsigned long *);
 	int (*media_changed) (struct gendisk *);
 	int (*revalidate_disk) (struct gendisk *);
+	int (*getgeo)(struct block_device *, struct hd_geometry *);
 	struct module *owner;
 };
 
-- 
cgit v1.2.3-71-gd317


From bf066c7db775a04bd761f8ea206f5522d0cf40ff Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Sun, 8 Jan 2006 01:03:19 -0800
Subject: [PATCH] shared mounts: cleanup

Small cleanups in shared mounts code.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: <viro@parcelfarce.linux.theplanet.co.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c        | 2 +-
 fs/pnode.c            | 2 +-
 include/linux/fs.h    | 2 +-
 include/linux/mount.h | 3 ++-
 4 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index e5aa1eeb5748..3e8fb61ad597 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -451,7 +451,7 @@ EXPORT_SYMBOL(may_umount);
 void release_mounts(struct list_head *head)
 {
 	struct vfsmount *mnt;
-	while(!list_empty(head)) {
+	while (!list_empty(head)) {
 		mnt = list_entry(head->next, struct vfsmount, mnt_hash);
 		list_del_init(&mnt->mnt_hash);
 		if (mnt->mnt_parent != mnt) {
diff --git a/fs/pnode.c b/fs/pnode.c
index aeeec8ba8dd2..f1871f773f64 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -103,7 +103,7 @@ static struct vfsmount *propagation_next(struct vfsmount *m,
 		struct vfsmount *next;
 		struct vfsmount *master = m->mnt_master;
 
-		if ( master == origin->mnt_master ) {
+		if (master == origin->mnt_master) {
 			next = next_peer(m);
 			return ((next == origin) ? NULL : next);
 		} else if (m->mnt_slave.next != &master->mnt_slave_list)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7f140480c6a8..a1e28f0895c0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -103,11 +103,11 @@ extern int dir_notify_enable;
 #define MS_MOVE		8192
 #define MS_REC		16384
 #define MS_VERBOSE	32768
+#define MS_POSIXACL	(1<<16)	/* VFS does not apply the umask */
 #define MS_UNBINDABLE	(1<<17)	/* change to unbindable */
 #define MS_PRIVATE	(1<<18)	/* change to private */
 #define MS_SLAVE	(1<<19)	/* change to slave */
 #define MS_SHARED	(1<<20)	/* change to shared */
-#define MS_POSIXACL	(1<<16)	/* VFS does not apply the umask */
 #define MS_ACTIVE	(1<<30)
 #define MS_NOUSER	(1<<31)
 
diff --git a/include/linux/mount.h b/include/linux/mount.h
index dd4e83eba933..b98a709f1794 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -22,7 +22,8 @@
 #define MNT_NOEXEC	0x04
 #define MNT_SHARED	0x10	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x20	/* if the vfsmount is a unbindable mount */
-#define MNT_PNODE_MASK	0x30	/* propogation flag mask */
+
+#define MNT_PNODE_MASK	(MNT_SHARED | MNT_UNBINDABLE)
 
 struct vfsmount {
 	struct list_head mnt_hash;
-- 
cgit v1.2.3-71-gd317


From 5160ee6fc891a9ca114be0e90fa6655647bb64b2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sun, 8 Jan 2006 01:03:32 -0800
Subject: [PATCH] shrink dentry struct

Some long time ago, dentry struct was carefully tuned so that on 32 bits
UP, sizeof(struct dentry) was exactly 128, ie a power of 2, and a multiple
of memory cache lines.

Then RCU was added and dentry struct enlarged by two pointers, with nice
results for SMP, but not so good on UP, because breaking the above tuning
(128 + 8 = 136 bytes)

This patch reverts this unwanted side effect, by using an union (d_u),
where d_rcu and d_child are placed so that these two fields can share their
memory needs.

At the time d_free() is called (and d_rcu is really used), d_child is known
to be empty and not touched by the dentry freeing.

Lockless lookups only access d_name, d_parent, d_lock, d_op, d_flags (so
the previous content of d_child is not needed if said dentry was unhashed
but still accessed by a CPU because of RCU constraints)

As dentry cache easily contains millions of entries, a size reduction is
worth the extra complexity of the ugly C union.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Maneesh Soni <maneesh@in.ibm.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Ian Kent <raven@themaw.net>
Cc: Paul Jackson <pj@sgi.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Cc: James Morris <jmorris@namei.org>
Cc: Stephen Smalley <sds@epoch.ncsc.mil>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/usb/core/inode.c     |  6 +++---
 fs/autofs4/autofs_i.h        |  2 +-
 fs/autofs4/expire.c          | 12 ++++++------
 fs/autofs4/inode.c           |  4 ++--
 fs/autofs4/root.c            |  3 ++-
 fs/coda/cache.c              |  2 +-
 fs/dcache.c                  | 34 +++++++++++++++++-----------------
 fs/libfs.c                   | 12 ++++++------
 fs/ncpfs/dir.c               |  2 +-
 fs/ncpfs/ncplib_kernel.h     |  4 ++--
 fs/smbfs/cache.c             |  4 ++--
 include/linux/dcache.h       |  9 +++++++--
 kernel/cpuset.c              |  4 ++--
 net/sunrpc/rpc_pipe.c        |  2 +-
 security/selinux/selinuxfs.c |  2 +-
 15 files changed, 54 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index c44bbedec817..4ddc453023a2 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -186,7 +186,7 @@ static void update_bus(struct dentry *bus)
 
 	down(&bus->d_inode->i_sem);
 
-	list_for_each_entry(dev, &bus->d_subdirs, d_child)
+	list_for_each_entry(dev, &bus->d_subdirs, d_u.d_child)
 		if (dev->d_inode)
 			update_dev(dev);
 
@@ -203,7 +203,7 @@ static void update_sb(struct super_block *sb)
 
 	down(&root->d_inode->i_sem);
 
-	list_for_each_entry(bus, &root->d_subdirs, d_child) {
+	list_for_each_entry(bus, &root->d_subdirs, d_u.d_child) {
 		if (bus->d_inode) {
 			switch (S_IFMT & bus->d_inode->i_mode) {
 			case S_IFDIR:
@@ -319,7 +319,7 @@ static int usbfs_empty (struct dentry *dentry)
 	spin_lock(&dcache_lock);
 
 	list_for_each(list, &dentry->d_subdirs) {
-		struct dentry *de = list_entry(list, struct dentry, d_child);
+		struct dentry *de = list_entry(list, struct dentry, d_u.d_child);
 		if (usbfs_positive(de)) {
 			spin_unlock(&dcache_lock);
 			return 0;
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index fca83e28edcf..385bed09b0d8 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -209,7 +209,7 @@ static inline int simple_empty_nolock(struct dentry *dentry)
 	struct dentry *child;
 	int ret = 0;
 
-	list_for_each_entry(child, &dentry->d_subdirs, d_child)
+	list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child)
 		if (simple_positive(child))
 			goto out;
 	ret = 1;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index feb6ac427d05..dc39589df165 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -105,7 +105,7 @@ repeat:
 	next = this_parent->d_subdirs.next;
 resume:
 	while (next != &this_parent->d_subdirs) {
-		struct dentry *dentry = list_entry(next, struct dentry, d_child);
+		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
 
 		/* Negative dentry - give up */
 		if (!simple_positive(dentry)) {
@@ -138,7 +138,7 @@ resume:
 	}
 
 	if (this_parent != top) {
-		next = this_parent->d_child.next;
+		next = this_parent->d_u.d_child.next;
 		this_parent = this_parent->d_parent;
 		goto resume;
 	}
@@ -163,7 +163,7 @@ repeat:
 	next = this_parent->d_subdirs.next;
 resume:
 	while (next != &this_parent->d_subdirs) {
-		struct dentry *dentry = list_entry(next, struct dentry, d_child);
+		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
 
 		/* Negative dentry - give up */
 		if (!simple_positive(dentry)) {
@@ -199,7 +199,7 @@ cont:
 	}
 
 	if (this_parent != parent) {
-		next = this_parent->d_child.next;
+		next = this_parent->d_u.d_child.next;
 		this_parent = this_parent->d_parent;
 		goto resume;
 	}
@@ -238,7 +238,7 @@ static struct dentry *autofs4_expire(struct super_block *sb,
 	/* On exit from the loop expire is set to a dgot dentry
 	 * to expire or it's NULL */
 	while ( next != &root->d_subdirs ) {
-		struct dentry *dentry = list_entry(next, struct dentry, d_child);
+		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
 
 		/* Negative dentry - give up */
 		if ( !simple_positive(dentry) ) {
@@ -302,7 +302,7 @@ next:
 			expired, (int)expired->d_name.len, expired->d_name.name);
 		spin_lock(&dcache_lock);
 		list_del(&expired->d_parent->d_subdirs);
-		list_add(&expired->d_parent->d_subdirs, &expired->d_child);
+		list_add(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
 		spin_unlock(&dcache_lock);
 		return expired;
 	}
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 818b37be5153..2d3082854a29 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -91,7 +91,7 @@ repeat:
 	next = this_parent->d_subdirs.next;
 resume:
 	while (next != &this_parent->d_subdirs) {
-		struct dentry *dentry = list_entry(next, struct dentry, d_child);
+		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
 
 		/* Negative dentry - don`t care */
 		if (!simple_positive(dentry)) {
@@ -117,7 +117,7 @@ resume:
 	if (this_parent != sbi->root) {
 		struct dentry *dentry = this_parent;
 
-		next = this_parent->d_child.next;
+		next = this_parent->d_u.d_child.next;
 		this_parent = this_parent->d_parent;
 		spin_unlock(&dcache_lock);
 		DPRINTK("parent dentry %p %.*s",
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 2a771ec66956..2241405ffc41 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -143,7 +143,8 @@ static int autofs4_dcache_readdir(struct file * filp, void * dirent, filldir_t f
 			}
 
 			while(1) {
-				struct dentry *de = list_entry(list, struct dentry, d_child);
+				struct dentry *de = list_entry(list,
+						struct dentry, d_u.d_child);
 
 				if (!d_unhashed(de) && de->d_inode) {
 					spin_unlock(&dcache_lock);
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 80072fd9b7fa..c607d923350a 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -93,7 +93,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
 	spin_lock(&dcache_lock);
 	list_for_each(child, &parent->d_subdirs)
 	{
-		de = list_entry(child, struct dentry, d_child);
+		de = list_entry(child, struct dentry, d_u.d_child);
 		/* don't know what to do with negative dentries */
 		if ( ! de->d_inode ) 
 			continue;
diff --git a/fs/dcache.c b/fs/dcache.c
index 17e439138681..1536f15c4d4c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -71,7 +71,7 @@ struct dentry_stat_t dentry_stat = {
 
 static void d_callback(struct rcu_head *head)
 {
-	struct dentry * dentry = container_of(head, struct dentry, d_rcu);
+	struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu);
 
 	if (dname_external(dentry))
 		kfree(dentry->d_name.name);
@@ -86,7 +86,7 @@ static void d_free(struct dentry *dentry)
 {
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
- 	call_rcu(&dentry->d_rcu, d_callback);
+ 	call_rcu(&dentry->d_u.d_rcu, d_callback);
 }
 
 /*
@@ -193,7 +193,7 @@ kill_it: {
   			list_del(&dentry->d_lru);
   			dentry_stat.nr_unused--;
   		}
-  		list_del(&dentry->d_child);
+  		list_del(&dentry->d_u.d_child);
 		dentry_stat.nr_dentry--;	/* For d_free, below */
 		/*drops the locks, at that point nobody can reach this dentry */
 		dentry_iput(dentry);
@@ -367,7 +367,7 @@ static inline void prune_one_dentry(struct dentry * dentry)
 	struct dentry * parent;
 
 	__d_drop(dentry);
-	list_del(&dentry->d_child);
+	list_del(&dentry->d_u.d_child);
 	dentry_stat.nr_dentry--;	/* For d_free, below */
 	dentry_iput(dentry);
 	parent = dentry->d_parent;
@@ -518,7 +518,7 @@ repeat:
 resume:
 	while (next != &this_parent->d_subdirs) {
 		struct list_head *tmp = next;
-		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
 		next = tmp->next;
 		/* Have we found a mount point ? */
 		if (d_mountpoint(dentry))
@@ -532,7 +532,7 @@ resume:
 	 * All done at this level ... ascend and resume the search.
 	 */
 	if (this_parent != parent) {
-		next = this_parent->d_child.next; 
+		next = this_parent->d_u.d_child.next;
 		this_parent = this_parent->d_parent;
 		goto resume;
 	}
@@ -569,7 +569,7 @@ repeat:
 resume:
 	while (next != &this_parent->d_subdirs) {
 		struct list_head *tmp = next;
-		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
 		next = tmp->next;
 
 		if (!list_empty(&dentry->d_lru)) {
@@ -610,7 +610,7 @@ dentry->d_parent->d_name.name, dentry->d_name.name, found);
 	 * All done at this level ... ascend and resume the search.
 	 */
 	if (this_parent != parent) {
-		next = this_parent->d_child.next; 
+		next = this_parent->d_u.d_child.next;
 		this_parent = this_parent->d_parent;
 #ifdef DCACHE_DEBUG
 printk(KERN_DEBUG "select_parent: ascending to %s/%s, found=%d\n",
@@ -753,12 +753,12 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 		dentry->d_parent = dget(parent);
 		dentry->d_sb = parent->d_sb;
 	} else {
-		INIT_LIST_HEAD(&dentry->d_child);
+		INIT_LIST_HEAD(&dentry->d_u.d_child);
 	}
 
 	spin_lock(&dcache_lock);
 	if (parent)
-		list_add(&dentry->d_child, &parent->d_subdirs);
+		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
 	dentry_stat.nr_dentry++;
 	spin_unlock(&dcache_lock);
 
@@ -1310,8 +1310,8 @@ already_unhashed:
 	/* Unhash the target: dput() will then get rid of it */
 	__d_drop(target);
 
-	list_del(&dentry->d_child);
-	list_del(&target->d_child);
+	list_del(&dentry->d_u.d_child);
+	list_del(&target->d_u.d_child);
 
 	/* Switch the names.. */
 	switch_names(dentry, target);
@@ -1322,15 +1322,15 @@ already_unhashed:
 	if (IS_ROOT(dentry)) {
 		dentry->d_parent = target->d_parent;
 		target->d_parent = target;
-		INIT_LIST_HEAD(&target->d_child);
+		INIT_LIST_HEAD(&target->d_u.d_child);
 	} else {
 		do_switch(dentry->d_parent, target->d_parent);
 
 		/* And add them back to the (new) parent lists */
-		list_add(&target->d_child, &target->d_parent->d_subdirs);
+		list_add(&target->d_u.d_child, &target->d_parent->d_subdirs);
 	}
 
-	list_add(&dentry->d_child, &dentry->d_parent->d_subdirs);
+	list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
 	spin_unlock(&target->d_lock);
 	spin_unlock(&dentry->d_lock);
 	write_sequnlock(&rename_lock);
@@ -1568,7 +1568,7 @@ repeat:
 resume:
 	while (next != &this_parent->d_subdirs) {
 		struct list_head *tmp = next;
-		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
 		next = tmp->next;
 		if (d_unhashed(dentry)||!dentry->d_inode)
 			continue;
@@ -1579,7 +1579,7 @@ resume:
 		atomic_dec(&dentry->d_count);
 	}
 	if (this_parent != root) {
-		next = this_parent->d_child.next; 
+		next = this_parent->d_u.d_child.next;
 		atomic_dec(&this_parent->d_count);
 		this_parent = this_parent->d_parent;
 		goto resume;
diff --git a/fs/libfs.c b/fs/libfs.c
index 58101dff2c66..9c50523382e7 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -93,16 +93,16 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
 			loff_t n = file->f_pos - 2;
 
 			spin_lock(&dcache_lock);
-			list_del(&cursor->d_child);
+			list_del(&cursor->d_u.d_child);
 			p = file->f_dentry->d_subdirs.next;
 			while (n && p != &file->f_dentry->d_subdirs) {
 				struct dentry *next;
-				next = list_entry(p, struct dentry, d_child);
+				next = list_entry(p, struct dentry, d_u.d_child);
 				if (!d_unhashed(next) && next->d_inode)
 					n--;
 				p = p->next;
 			}
-			list_add_tail(&cursor->d_child, p);
+			list_add_tail(&cursor->d_u.d_child, p);
 			spin_unlock(&dcache_lock);
 		}
 	}
@@ -126,7 +126,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
 	struct dentry *dentry = filp->f_dentry;
 	struct dentry *cursor = filp->private_data;
-	struct list_head *p, *q = &cursor->d_child;
+	struct list_head *p, *q = &cursor->d_u.d_child;
 	ino_t ino;
 	int i = filp->f_pos;
 
@@ -153,7 +153,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 			}
 			for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
 				struct dentry *next;
-				next = list_entry(p, struct dentry, d_child);
+				next = list_entry(p, struct dentry, d_u.d_child);
 				if (d_unhashed(next) || !next->d_inode)
 					continue;
 
@@ -261,7 +261,7 @@ int simple_empty(struct dentry *dentry)
 	int ret = 0;
 
 	spin_lock(&dcache_lock);
-	list_for_each_entry(child, &dentry->d_subdirs, d_child)
+	list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child)
 		if (simple_positive(child))
 			goto out;
 	ret = 1;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index a9f7a8ab1d59..cfd76f431dc0 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -365,7 +365,7 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
 	spin_lock(&dcache_lock);
 	next = parent->d_subdirs.next;
 	while (next != &parent->d_subdirs) {
-		dent = list_entry(next, struct dentry, d_child);
+		dent = list_entry(next, struct dentry, d_u.d_child);
 		if ((unsigned long)dent->d_fsdata == fpos) {
 			if (dent->d_inode)
 				dget_locked(dent);
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 9e4dc30c2435..799e5c2bec55 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -196,7 +196,7 @@ ncp_renew_dentries(struct dentry *parent)
 	spin_lock(&dcache_lock);
 	next = parent->d_subdirs.next;
 	while (next != &parent->d_subdirs) {
-		dentry = list_entry(next, struct dentry, d_child);
+		dentry = list_entry(next, struct dentry, d_u.d_child);
 
 		if (dentry->d_fsdata == NULL)
 			ncp_age_dentry(server, dentry);
@@ -218,7 +218,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
 	spin_lock(&dcache_lock);
 	next = parent->d_subdirs.next;
 	while (next != &parent->d_subdirs) {
-		dentry = list_entry(next, struct dentry, d_child);
+		dentry = list_entry(next, struct dentry, d_u.d_child);
 		dentry->d_fsdata = NULL;
 		ncp_age_dentry(server, dentry);
 		next = next->next;
diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c
index f3e6b81288ab..74b86d9725a6 100644
--- a/fs/smbfs/cache.c
+++ b/fs/smbfs/cache.c
@@ -66,7 +66,7 @@ smb_invalidate_dircache_entries(struct dentry *parent)
 	spin_lock(&dcache_lock);
 	next = parent->d_subdirs.next;
 	while (next != &parent->d_subdirs) {
-		dentry = list_entry(next, struct dentry, d_child);
+		dentry = list_entry(next, struct dentry, d_u.d_child);
 		dentry->d_fsdata = NULL;
 		smb_age_dentry(server, dentry);
 		next = next->next;
@@ -100,7 +100,7 @@ smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
 	spin_lock(&dcache_lock);
 	next = parent->d_subdirs.next;
 	while (next != &parent->d_subdirs) {
-		dent = list_entry(next, struct dentry, d_child);
+		dent = list_entry(next, struct dentry, d_u.d_child);
 		if ((unsigned long)dent->d_fsdata == fpos) {
 			if (dent->d_inode)
 				dget_locked(dent);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 46a2ba617595..a3ed5e059d47 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -95,14 +95,19 @@ struct dentry {
 	struct qstr d_name;
 
 	struct list_head d_lru;		/* LRU list */
-	struct list_head d_child;	/* child of parent list */
+	/*
+	 * d_child and d_rcu can share memory
+	 */
+	union {
+		struct list_head d_child;	/* child of parent list */
+	 	struct rcu_head d_rcu;
+	} d_u;
 	struct list_head d_subdirs;	/* our children */
 	struct list_head d_alias;	/* inode alias list */
 	unsigned long d_time;		/* used by d_revalidate */
 	struct dentry_operations *d_op;
 	struct super_block *d_sb;	/* The root of the dentry tree */
 	void *d_fsdata;			/* fs-specific data */
- 	struct rcu_head d_rcu;
 	struct dcookie_struct *d_cookie; /* cookie, if any */
 	int d_mounted;
 	unsigned char d_iname[DNAME_INLINE_LEN_MIN];	/* small names */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e04c2da9dadb..eab64e23bcae 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -331,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry)
 	spin_lock(&dcache_lock);
 	node = dentry->d_subdirs.next;
 	while (node != &dentry->d_subdirs) {
-		struct dentry *d = list_entry(node, struct dentry, d_child);
+		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
 		list_del_init(node);
 		if (d->d_inode) {
 			d = dget_locked(d);
@@ -343,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry)
 		}
 		node = dentry->d_subdirs.next;
 	}
-	list_del_init(&dentry->d_child);
+	list_del_init(&dentry->d_u.d_child);
 	spin_unlock(&dcache_lock);
 	remove_dir(dentry);
 }
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 24cc23af9b95..e14c1cae7460 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -495,7 +495,7 @@ rpc_depopulate(struct dentry *parent)
 repeat:
 	spin_lock(&dcache_lock);
 	list_for_each_safe(pos, next, &parent->d_subdirs) {
-		dentry = list_entry(pos, struct dentry, d_child);
+		dentry = list_entry(pos, struct dentry, d_u.d_child);
 		spin_lock(&dentry->d_lock);
 		if (!d_unhashed(dentry)) {
 			dget_locked(dentry);
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index e59da6398d44..b5fa02d17b1e 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -889,7 +889,7 @@ static void sel_remove_bools(struct dentry *de)
 	spin_lock(&dcache_lock);
 	node = de->d_subdirs.next;
 	while (node != &de->d_subdirs) {
-		struct dentry *d = list_entry(node, struct dentry, d_child);
+		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
 		list_del_init(node);
 
 		if (d->d_inode) {
-- 
cgit v1.2.3-71-gd317


From e78c9a004aadebe22306c81d1a7f1d1278dc37f9 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sun, 8 Jan 2006 01:03:39 -0800
Subject: [PATCH] fs: remove s_old_blocksize from struct super_block

This patch inlines the single user of struct super_block field
s_old_blocksize and removes the field.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/super.c         | 3 +--
 include/linux/fs.h | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index 5a347a4f673a..0a30e51692cf 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -700,8 +700,7 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 
 		s->s_flags = flags;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
-		s->s_old_blocksize = block_size(bdev);
-		sb_set_blocksize(s, s->s_old_blocksize);
+		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, data, flags & MS_VERBOSE ? 1 : 0);
 		if (error) {
 			up_write(&s->s_umount);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a1e28f0895c0..4c82219b0fae 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -808,7 +808,6 @@ struct super_block {
 	struct list_head	s_list;		/* Keep this first */
 	dev_t			s_dev;		/* search index; _not_ kdev_t */
 	unsigned long		s_blocksize;
-	unsigned long		s_old_blocksize;
 	unsigned char		s_blocksize_bits;
 	unsigned char		s_dirt;
 	unsigned long long	s_maxbytes;	/* Max file size */
-- 
cgit v1.2.3-71-gd317


From f867bac65419a98c9682f4409e087582d29ec5f6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sun, 8 Jan 2006 01:03:40 -0800
Subject: [PATCH] remove unused blkp field in percpu_data

I found that blkp field was not used in kernel tree.

As most of the times NR_CPUS is a power of two and kmalloc() memory blocks
too, this extra field basically doubles the memory space allocated in
__alloc_percpu() to store the 'struct percpu_data'

(for example, if NR_CPUS=8 on i386, kmalloc(4*8+4) returns a 64 bytes block
instead of a 32 bytes block after this patch)

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/percpu.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 20317d88deba..cb9039a21f2a 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -19,7 +19,6 @@
 
 struct percpu_data {
 	void *ptrs[NR_CPUS];
-	void *blkp;
 };
 
 /* 
-- 
cgit v1.2.3-71-gd317


From fd285bb54d8a3e99810090ae88cfe8ed77d1da25 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 8 Jan 2006 01:04:07 -0800
Subject: [PATCH] Abandon gcc-2.95.x

There's one scsi driver which doesn't compile due to weird __VA_ARGS__ tricks
and the rather useful scsi/sd.c is currently getting an ICE.  None of the new
SAS code compiles, due to extensive use of anonymous unions.  The V4L guys are
very good at exploiting the gcc-2.95.x macro expansion bug (_why_ does each
driver need to implement its own debug macros?) and various people keep on
sneaking in anonymous unions, which are rather nice.

Plus anonymous unions are rather useful.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/compiler-gcc2.h | 29 -----------------------------
 include/linux/compiler.h      |  2 --
 init/main.c                   |  7 +------
 3 files changed, 1 insertion(+), 37 deletions(-)
 delete mode 100644 include/linux/compiler-gcc2.h

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc2.h b/include/linux/compiler-gcc2.h
deleted file mode 100644
index ebed17660c5f..000000000000
--- a/include/linux/compiler-gcc2.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Never include this file directly.  Include <linux/compiler.h> instead.  */
-
-/* These definitions are for GCC v2.x.  */
-
-/* Somewhere in the middle of the GCC 2.96 development cycle, we implemented
-   a mechanism by which the user can annotate likely branch directions and
-   expect the blocks to be reordered appropriately.  Define __builtin_expect
-   to nothing for earlier compilers.  */
-#include <linux/compiler-gcc.h>
-
-#if __GNUC_MINOR__ < 96
-# define __builtin_expect(x, expected_value) (x)
-#endif
-
-#define __attribute_used__	__attribute__((__unused__))
-
-/*
- * The attribute `pure' is not implemented in GCC versions earlier
- * than 2.96.
- */
-#if __GNUC_MINOR__ >= 96
-# define __attribute_pure__	__attribute__((pure))
-# define __attribute_const__	__attribute__((__const__))
-#endif
-
-/* GCC 2.95.x/2.96 recognize __va_copy, but not va_copy. Actually later GCC's
- * define both va_copy and __va_copy, but the latter may go away, so limit this
- * to this header */
-#define va_copy			__va_copy
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index d7378215b851..f23d3c6fc2c0 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -42,8 +42,6 @@ extern void __chk_io_ptr(void __iomem *);
 # include <linux/compiler-gcc4.h>
 #elif __GNUC__ == 3
 # include <linux/compiler-gcc3.h>
-#elif __GNUC__ == 2
-# include <linux/compiler-gcc2.h>
 #else
 # error Sorry, your compiler is too old/not recognized.
 #endif
diff --git a/init/main.c b/init/main.c
index afe5eb84ad52..8342c2890b16 100644
--- a/init/main.c
+++ b/init/main.c
@@ -58,11 +58,6 @@
  * This is one of the first .c files built. Error out early
  * if we have compiler trouble..
  */
-#if __GNUC__ == 2 && __GNUC_MINOR__ == 96
-#ifdef CONFIG_FRAME_POINTER
-#error This compiler cannot compile correctly with frame pointers enabled
-#endif
-#endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/smp.h>
@@ -74,7 +69,7 @@
  * To avoid associated bogus bug reports, we flatly refuse to compile
  * with a gcc that is known to be too old from the very beginning.
  */
-#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 95)
+#if (__GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 2)
 #error Sorry, your GCC is too old. It builds incorrect kernels.
 #endif
 
-- 
cgit v1.2.3-71-gd317


From a1365647022eb05a5993f270a78e9bef3bf554eb Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 8 Jan 2006 01:04:09 -0800
Subject: [PATCH] remove gcc-2 checks

Remove various things which were checking for gcc-1.x and gcc-2.x compilers.

From: Adrian Bunk <bunk@stusta.de>

    Some documentation updates and removes some code paths for gcc < 3.2.

Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/Changes                 | 31 +++++--------------------------
 README                                |  7 ++-----
 arch/arm/kernel/asm-offsets.c         |  9 ++-------
 arch/arm26/kernel/asm-offsets.c       |  7 -------
 arch/i386/Kconfig                     |  4 ----
 arch/i386/Makefile                    |  5 +----
 arch/i386/Makefile.cpu                | 10 +++++-----
 arch/ia64/Makefile                    |  4 ----
 arch/ia64/kernel/head.S               |  2 +-
 arch/ia64/kernel/ia64_ksyms.c         |  2 +-
 arch/ia64/oprofile/backtrace.c        |  2 +-
 drivers/md/raid0.c                    |  6 ------
 drivers/media/video/v4l2-common.c     |  2 --
 fs/ocfs2/cluster/masklog.h            |  7 +++----
 fs/xfs/xfs_log.h                      |  8 +-------
 include/asm-alpha/compiler.h          |  2 --
 include/asm-alpha/processor.h         | 21 ---------------------
 include/asm-ia64/bug.h                |  6 +-----
 include/asm-ia64/spinlock.h           |  2 +-
 include/asm-sparc64/system.h          |  4 ----
 include/asm-um/rwsem.h                |  4 ----
 include/asm-v850/unistd.h             | 18 ------------------
 include/linux/byteorder/generic.h     |  2 +-
 include/linux/byteorder/swab.h        |  2 +-
 include/linux/byteorder/swabb.h       |  2 +-
 include/linux/compiler-gcc.h          |  9 +++++++++
 include/linux/compiler-gcc3.h         | 17 -----------------
 include/linux/compiler-gcc4.h         |  7 -------
 include/linux/kernel.h                |  2 --
 include/linux/seccomp.h               |  6 +-----
 include/linux/spinlock_types_up.h     | 14 --------------
 sound/isa/wavefront/wavefront_synth.c |  7 -------
 32 files changed, 37 insertions(+), 194 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/Changes b/Documentation/Changes
index 86b86399d61d..fe5ae0f55020 100644
--- a/Documentation/Changes
+++ b/Documentation/Changes
@@ -31,8 +31,6 @@ al espa
 Eine deutsche Version dieser Datei finden Sie unter
 <http://www.stefan-winter.de/Changes-2.4.0.txt>.
 
-Last updated: October 29th, 2002
-
 Chris Ricker (kaboom@gatech.edu or chris.ricker@genetics.utah.edu).
 
 Current Minimal Requirements
@@ -48,7 +46,7 @@ necessary on all systems; obviously, if you don't have any ISDN
 hardware, for example, you probably needn't concern yourself with
 isdn4k-utils.
 
-o  Gnu C                  2.95.3                  # gcc --version
+o  Gnu C                  3.2                     # gcc --version
 o  Gnu make               3.79.1                  # make --version
 o  binutils               2.12                    # ld -v
 o  util-linux             2.10o                   # fdformat --version
@@ -74,26 +72,7 @@ GCC
 ---
 
 The gcc version requirements may vary depending on the type of CPU in your
-computer. The next paragraph applies to users of x86 CPUs, but not
-necessarily to users of other CPUs. Users of other CPUs should obtain
-information about their gcc version requirements from another source.
-
-The recommended compiler for the kernel is gcc 2.95.x (x >= 3), and it
-should be used when you need absolute stability. You may use gcc 3.0.x
-instead if you wish, although it may cause problems. Later versions of gcc 
-have not received much testing for Linux kernel compilation, and there are 
-almost certainly bugs (mainly, but not exclusively, in the kernel) that
-will need to be fixed in order to use these compilers. In any case, using
-pgcc instead of plain gcc is just asking for trouble.
-
-The Red Hat gcc 2.96 compiler subtree can also be used to build this tree.
-You should ensure you use gcc-2.96-74 or later. gcc-2.96-54 will not build
-the kernel correctly.
-
-In addition, please pay attention to compiler optimization.  Anything
-greater than -O2 may not be wise.  Similarly, if you choose to use gcc-2.95.x
-or derivatives, be sure not to use -fstrict-aliasing (which, depending on
-your version of gcc 2.95.x, may necessitate using -fno-strict-aliasing).
+computer.
 
 Make
 ----
@@ -322,9 +301,9 @@ Getting updated software
 Kernel compilation
 ******************
 
-gcc 2.95.3
-----------
-o  <ftp://ftp.gnu.org/gnu/gcc/gcc-2.95.3.tar.gz>
+gcc
+---
+o  <ftp://ftp.gnu.org/gnu/gcc/>
 
 Make
 ----
diff --git a/README b/README
index 61c4f7429233..cd5e2eb6213b 100644
--- a/README
+++ b/README
@@ -183,11 +183,8 @@ CONFIGURING the kernel:
 
 COMPILING the kernel:
 
- - Make sure you have gcc 2.95.3 available.
-   gcc 2.91.66 (egcs-1.1.2), and gcc 2.7.2.3 are known to miscompile
-   some parts of the kernel, and are *no longer supported*.
-   Also remember to upgrade your binutils package (for as/ld/nm and company)
-   if necessary. For more information, refer to Documentation/Changes.
+ - Make sure you have at least gcc 3.2 available.
+   For more information, refer to Documentation/Changes.
 
    Please note that you can still run a.out user programs with this kernel.
 
diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
index 04d3082a7b94..0abbce8c70bc 100644
--- a/arch/arm/kernel/asm-offsets.c
+++ b/arch/arm/kernel/asm-offsets.c
@@ -23,20 +23,15 @@
 #error Sorry, your compiler targets APCS-26 but this kernel requires APCS-32
 #endif
 /*
- * GCC 2.95.1, 2.95.2: ignores register clobber list in asm().
  * GCC 3.0, 3.1: general bad code generation.
  * GCC 3.2.0: incorrect function argument offset calculation.
  * GCC 3.2.x: miscompiles NEW_AUX_ENT in fs/binfmt_elf.c
  *            (http://gcc.gnu.org/PR8896) and incorrect structure
  *	      initialisation in fs/jffs2/erase.c
  */
-#if __GNUC__ < 2 || \
-   (__GNUC__ == 2 && __GNUC_MINOR__ < 95) || \
-   (__GNUC__ == 2 && __GNUC_MINOR__ == 95 && __GNUC_PATCHLEVEL__ != 0 && \
-					     __GNUC_PATCHLEVEL__ < 3) || \
-   (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
+#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
 #error Your compiler is too buggy; it is known to miscompile kernels.
-#error    Known good compilers: 2.95.3, 2.95.4, 2.96, 3.3
+#error    Known good compilers: 3.3
 #endif
 
 /* Use marker if you need to separate the values later */
diff --git a/arch/arm26/kernel/asm-offsets.c b/arch/arm26/kernel/asm-offsets.c
index 4ccacaef94df..ac682d5fd039 100644
--- a/arch/arm26/kernel/asm-offsets.c
+++ b/arch/arm26/kernel/asm-offsets.c
@@ -25,13 +25,6 @@
 #if defined(__APCS_32__) && defined(CONFIG_CPU_26)
 #error Sorry, your compiler targets APCS-32 but this kernel requires APCS-26
 #endif
-#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 95)
-#error Sorry, your compiler is known to miscompile kernels.  Only use gcc 2.95.3 and later.
-#endif
-#if __GNUC__ == 2 && __GNUC_MINOR__ == 95
-/* shame we can't detect the .1 or .2 releases */
-#warning GCC 2.95.2 and earlier miscompiles kernels.
-#endif
 
 /* Use marker if you need to separate the values later */
 
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 968fabd8723f..486449e9e710 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -630,10 +630,6 @@ config REGPARM
 	and passes the first three arguments of a function call in registers.
 	This will probably break binary only modules.
 
-	This feature is only enabled for gcc-3.0 and later - earlier compilers
-	generate incorrect output with certain kernel constructs when
-	-mregparm=3 is used.
-
 config SECCOMP
 	bool "Enable seccomp to safely compute untrusted bytecode"
 	depends on PROC_FS
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index d121ea18460f..b84119f9cc63 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -37,10 +37,7 @@ CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
 # CPU-specific tuning. Anything which can be shared with UML should go here.
 include $(srctree)/arch/i386/Makefile.cpu
 
-# -mregparm=3 works ok on gcc-3.0 and later
-#
-GCC_VERSION			:= $(call cc-version)
-cflags-$(CONFIG_REGPARM) 	+= $(shell if [ $(GCC_VERSION) -ge 0300 ] ; then echo "-mregparm=3"; fi ;)
+cflags-$(CONFIG_REGPARM) 	+= -mregparm=3
 
 # Disable unit-at-a-time mode, it makes gcc use a lot more stack
 # due to the lack of sharing of stacklots.
diff --git a/arch/i386/Makefile.cpu b/arch/i386/Makefile.cpu
index 8e51456df23d..dcd936ef45db 100644
--- a/arch/i386/Makefile.cpu
+++ b/arch/i386/Makefile.cpu
@@ -1,7 +1,7 @@
 # CPU tuning section - shared with UML.
 # Must change only cflags-y (or [yn]), not CFLAGS! That makes a difference for UML.
 
-#-mtune exists since gcc 3.4, and some -mcpu flavors didn't exist in gcc 2.95.
+#-mtune exists since gcc 3.4
 HAS_MTUNE	:= $(call cc-option-yn, -mtune=i386)
 ifeq ($(HAS_MTUNE),y)
 tune		= $(call cc-option,-mtune=$(1),)
@@ -14,7 +14,7 @@ cflags-$(CONFIG_M386)		+= -march=i386
 cflags-$(CONFIG_M486)		+= -march=i486
 cflags-$(CONFIG_M586)		+= -march=i586
 cflags-$(CONFIG_M586TSC)	+= -march=i586
-cflags-$(CONFIG_M586MMX)	+= $(call cc-option,-march=pentium-mmx,-march=i586)
+cflags-$(CONFIG_M586MMX)	+= -march=pentium-mmx
 cflags-$(CONFIG_M686)		+= -march=i686
 cflags-$(CONFIG_MPENTIUMII)	+= -march=i686 $(call tune,pentium2)
 cflags-$(CONFIG_MPENTIUMIII)	+= -march=i686 $(call tune,pentium3)
@@ -23,8 +23,8 @@ cflags-$(CONFIG_MPENTIUM4)	+= -march=i686 $(call tune,pentium4)
 cflags-$(CONFIG_MK6)		+= -march=k6
 # Please note, that patches that add -march=athlon-xp and friends are pointless.
 # They make zero difference whatsosever to performance at this time.
-cflags-$(CONFIG_MK7)		+= $(call cc-option,-march=athlon,-march=i686 $(align)-functions=4)
-cflags-$(CONFIG_MK8)		+= $(call cc-option,-march=k8,$(call cc-option,-march=athlon,-march=i686 $(align)-functions=4))
+cflags-$(CONFIG_MK7)		+= -march=athlon
+cflags-$(CONFIG_MK8)		+= $(call cc-option,-march=k8,-march=athlon)
 cflags-$(CONFIG_MCRUSOE)	+= -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
 cflags-$(CONFIG_MEFFICEON)	+= -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
 cflags-$(CONFIG_MWINCHIPC6)	+= $(call cc-option,-march=winchip-c6,-march=i586)
@@ -37,5 +37,5 @@ cflags-$(CONFIG_MVIAC3_2)	+= $(call cc-option,-march=c3-2,-march=i686)
 cflags-$(CONFIG_X86_ELAN)	+= -march=i486
 
 # Geode GX1 support
-cflags-$(CONFIG_MGEODEGX1)		+= $(call cc-option,-march=pentium-mmx,-march=i486)
+cflags-$(CONFIG_MGEODEGX1)	+= -march=pentium-mmx
 
diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile
index 67932ad53082..57b047c27e46 100644
--- a/arch/ia64/Makefile
+++ b/arch/ia64/Makefile
@@ -37,10 +37,6 @@ $(error Sorry, you need a newer version of the assember, one that is built from
 		ftp://ftp.hpl.hp.com/pub/linux-ia64/gas-030124.tar.gz)
 endif
 
-ifneq ($(shell if [ $(GCC_VERSION) -lt 0300 ] ; then echo "bad"; fi ;),)
-$(error Sorry, your compiler is too old.  GCC v2.96 is known to generate bad code.)
-endif
-
 ifeq ($(GCC_VERSION),0304)
 	cflags-$(CONFIG_ITANIUM)	+= -mtune=merced
 	cflags-$(CONFIG_MCKINLEY)	+= -mtune=mckinley
diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S
index bfe65b2e8621..fbc7ea35dd57 100644
--- a/arch/ia64/kernel/head.S
+++ b/arch/ia64/kernel/head.S
@@ -1060,7 +1060,7 @@ SET_REG(b5);
 	 * the clobber lists for spin_lock() in include/asm-ia64/spinlock.h.
 	 */
 
-#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
+#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
 
 GLOBAL_ENTRY(ia64_spinlock_contention_pre3_4)
 	.prologue
diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c
index 5db9d3bcbbcb..e72de580ebbf 100644
--- a/arch/ia64/kernel/ia64_ksyms.c
+++ b/arch/ia64/kernel/ia64_ksyms.c
@@ -103,7 +103,7 @@ EXPORT_SYMBOL(unw_init_running);
 
 #ifdef ASM_SUPPORTED
 # ifdef CONFIG_SMP
-#  if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
+#  if (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
 /*
  * This is not a normal routine and we don't want a function descriptor for it, so we use
  * a fake declaration here.
diff --git a/arch/ia64/oprofile/backtrace.c b/arch/ia64/oprofile/backtrace.c
index b7dabbfb0d61..adb01566bd57 100644
--- a/arch/ia64/oprofile/backtrace.c
+++ b/arch/ia64/oprofile/backtrace.c
@@ -32,7 +32,7 @@ typedef struct
 	u64 *prev_pfs_loc;	/* state for WAR for old spinlock ool code */
 } ia64_backtrace_t;
 
-#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
+#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
 /*
  * Returns non-zero if the PC is in the spinlock contention out-of-line code
  * with non-standard calling sequence (on older compilers).
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index abbca150202b..d03f99cf4b7d 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -306,9 +306,6 @@ static int raid0_run (mddev_t *mddev)
 	printk("raid0 : conf->hash_spacing is %llu blocks.\n",
 		(unsigned long long)conf->hash_spacing);
 	{
-#if __GNUC__ < 3
-		volatile
-#endif
 		sector_t s = mddev->array_size;
 		sector_t space = conf->hash_spacing;
 		int round;
@@ -439,9 +436,6 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio)
  
 
 	{
-#if __GNUC__ < 3
-		volatile
-#endif
 		sector_t x = block >> conf->preshift;
 		sector_div(x, (u32)conf->hash_spacing);
 		zone = conf->hash_table[x];
diff --git a/drivers/media/video/v4l2-common.c b/drivers/media/video/v4l2-common.c
index 597b8db35a13..62a7d636ef11 100644
--- a/drivers/media/video/v4l2-common.c
+++ b/drivers/media/video/v4l2-common.c
@@ -191,9 +191,7 @@ char *v4l2_type_names[] = {
 };
 
 char *v4l2_ioctl_names[256] = {
-#if __GNUC__ >= 3
 	[0 ... 255]                      = "UNKNOWN",
-#endif
 	[_IOC_NR(VIDIOC_QUERYCAP)]       = "VIDIOC_QUERYCAP",
 	[_IOC_NR(VIDIOC_RESERVED)]       = "VIDIOC_RESERVED",
 	[_IOC_NR(VIDIOC_ENUM_FMT)]       = "VIDIOC_ENUM_FMT",
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index f5ef5ea61a05..e8c56a3d9c64 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -212,11 +212,10 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 	mlog(ML_ENTRY, "ENTRY:\n");					\
 } while (0)
 
-/* We disable this for old compilers since they don't have support for
- * __builtin_types_compatible_p.
+/*
+ * We disable this for sparse.
  */
-#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) && \
-    !defined(__CHECKER__)
+#if !defined(__CHECKER__)
 #define mlog_exit(st) do {						     \
 	if (__builtin_types_compatible_p(typeof(st), unsigned long))	     \
 		mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st));	     \
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 158829ca56f6..f40d4391fcfc 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -30,13 +30,7 @@
  * By comparing each compnent, we don't have to worry about extra
  * endian issues in treating two 32 bit numbers as one 64 bit number
  */
-static
-#if defined(__GNUC__) && (__GNUC__ == 2) && ( (__GNUC_MINOR__ == 95) || (__GNUC_MINOR__ == 96))
-__attribute__((unused))	/* gcc 2.95, 2.96 miscompile this when inlined */
-#else
-__inline__
-#endif
-xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
+static inline xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
 {
 	if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2))
 		return (CYCLE_LSN(lsn1)<CYCLE_LSN(lsn2))? -999 : 999;
diff --git a/include/asm-alpha/compiler.h b/include/asm-alpha/compiler.h
index 0a4a8b40dfcd..00c6f57ad9a7 100644
--- a/include/asm-alpha/compiler.h
+++ b/include/asm-alpha/compiler.h
@@ -98,9 +98,7 @@
 #undef inline
 #undef __inline__
 #undef __inline
-#if __GNUC__ == 3 && __GNUC_MINOR__ >= 1 || __GNUC__ > 3
 #undef __always_inline
 #define __always_inline		inline __attribute__((always_inline))
-#endif
 
 #endif /* __ALPHA_COMPILER_H */
diff --git a/include/asm-alpha/processor.h b/include/asm-alpha/processor.h
index 059780a7d3d7..bb1a7a3abb8b 100644
--- a/include/asm-alpha/processor.h
+++ b/include/asm-alpha/processor.h
@@ -77,7 +77,6 @@ unsigned long get_wchan(struct task_struct *p);
 #define spin_lock_prefetch(lock)  	do { } while (0)
 #endif
 
-#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)
 extern inline void prefetch(const void *ptr)  
 { 
 	__builtin_prefetch(ptr, 0, 3);
@@ -95,24 +94,4 @@ extern inline void spin_lock_prefetch(const void *ptr)
 }
 #endif
 
-#else
-extern inline void prefetch(const void *ptr)  
-{ 
-	__asm__ ("ldl $31,%0" : : "m"(*(char *)ptr)); 
-}
-
-extern inline void prefetchw(const void *ptr)  
-{
-	__asm__ ("ldq $31,%0" : : "m"(*(char *)ptr)); 
-}
-
-#ifdef CONFIG_SMP
-extern inline void spin_lock_prefetch(const void *ptr)  
-{
-	__asm__ ("ldq $31,%0" : : "m"(*(char *)ptr)); 
-}
-#endif
-
-#endif /* GCC 3.1 */
-
 #endif /* __ASM_ALPHA_PROCESSOR_H */
diff --git a/include/asm-ia64/bug.h b/include/asm-ia64/bug.h
index 3aa0a0a5474b..823616b5020b 100644
--- a/include/asm-ia64/bug.h
+++ b/include/asm-ia64/bug.h
@@ -2,11 +2,7 @@
 #define _ASM_IA64_BUG_H
 
 #ifdef CONFIG_BUG
-#if (__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)
-# define ia64_abort()	__builtin_trap()
-#else
-# define ia64_abort()	(*(volatile int *) 0 = 0)
-#endif
+#define ia64_abort()	__builtin_trap()
 #define BUG() do { printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); ia64_abort(); } while (0)
 
 /* should this BUG be made generic? */
diff --git a/include/asm-ia64/spinlock.h b/include/asm-ia64/spinlock.h
index 0c91a76c5ea3..9e83210dc312 100644
--- a/include/asm-ia64/spinlock.h
+++ b/include/asm-ia64/spinlock.h
@@ -34,7 +34,7 @@ __raw_spin_lock_flags (raw_spinlock_t *lock, unsigned long flags)
 {
 	register volatile unsigned int *ptr asm ("r31") = &lock->lock;
 
-#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
+#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
 # ifdef CONFIG_ITANIUM
 	/* don't use brl on Itanium... */
 	asm volatile ("{\n\t"
diff --git a/include/asm-sparc64/system.h b/include/asm-sparc64/system.h
index b5417529f6f1..309f1466b6fa 100644
--- a/include/asm-sparc64/system.h
+++ b/include/asm-sparc64/system.h
@@ -193,11 +193,7 @@ do {						\
 	 * not preserve it's value.  Hairy, but it lets us remove 2 loads
 	 * and 2 stores in this critical code path.  -DaveM
 	 */
-#if __GNUC__ >= 3
 #define EXTRA_CLOBBER ,"%l1"
-#else
-#define EXTRA_CLOBBER
-#endif
 #define switch_to(prev, next, last)					\
 do {	if (test_thread_flag(TIF_PERFCTR)) {				\
 		unsigned long __tmp;					\
diff --git a/include/asm-um/rwsem.h b/include/asm-um/rwsem.h
index 661c0e54702b..b5fc449dc86b 100644
--- a/include/asm-um/rwsem.h
+++ b/include/asm-um/rwsem.h
@@ -1,10 +1,6 @@
 #ifndef __UM_RWSEM_H__
 #define __UM_RWSEM_H__
 
-#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
-#define __builtin_expect(exp,c) (exp)
-#endif
-
 #include "asm/arch/rwsem.h"
 
 #endif
diff --git a/include/asm-v850/unistd.h b/include/asm-v850/unistd.h
index 5a86f8e976ec..82460a7bb233 100644
--- a/include/asm-v850/unistd.h
+++ b/include/asm-v850/unistd.h
@@ -241,9 +241,6 @@
 /* User programs sometimes end up including this header file
    (indirectly, via uClibc header files), so I'm a bit nervous just
    including <linux/compiler.h>.  */
-#if !defined(__builtin_expect) && __GNUC__ == 2 && __GNUC_MINOR__ < 96
-#define __builtin_expect(x, expected_value) (x)
-#endif
 
 #define __syscall_return(type, res)					      \
   do {									      \
@@ -346,20 +343,6 @@ type name (atype a, btype b, ctype c, dtype d, etype e)			      \
   __syscall_return (type, __ret);					      \
 }
 
-#if __GNUC__ < 3
-/* In older versions of gcc, `asm' statements with more than 10
-   input/output arguments produce a fatal error.  To work around this
-   problem, we use two versions, one for gcc-3.x and one for earlier
-   versions of gcc (the `earlier gcc' version doesn't work with gcc-3.x
-   because gcc-3.x doesn't allow clobbers to also be input arguments).  */
-#define __SYSCALL6_TRAP(syscall, ret, a, b, c, d, e, f)			      \
-  __asm__ __volatile__ ("trap " SYSCALL_LONG_TRAP			      \
-			: "=r" (ret), "=r" (syscall)			      \
-			: "1" (syscall),				      \
-			"r" (a), "r" (b), "r" (c), "r" (d),		      \
- 			"r" (e), "r" (f)				      \
-			: SYSCALL_CLOBBERS, SYSCALL_ARG4, SYSCALL_ARG5);
-#else /* __GNUC__ >= 3 */
 #define __SYSCALL6_TRAP(syscall, ret, a, b, c, d, e, f)			      \
   __asm__ __volatile__ ("trap " SYSCALL_LONG_TRAP			      \
 			: "=r" (ret), "=r" (syscall),			      \
@@ -368,7 +351,6 @@ type name (atype a, btype b, ctype c, dtype d, etype e)			      \
 			"r" (a), "r" (b), "r" (c), "r" (d),		      \
 			"2" (e), "3" (f)				      \
 			: SYSCALL_CLOBBERS);
-#endif
 
 #define _syscall6(type, name, atype, a, btype, b, ctype, c, dtype, d, etype, e, ftype, f) \
 type name (atype a, btype b, ctype c, dtype d, etype e, ftype f)	      \
diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h
index 04bd756efc67..e86e4a938373 100644
--- a/include/linux/byteorder/generic.h
+++ b/include/linux/byteorder/generic.h
@@ -156,7 +156,7 @@ extern __be32			htonl(__u32);
 extern __u16			ntohs(__be16);
 extern __be16			htons(__u16);
 
-#if defined(__GNUC__) && (__GNUC__ >= 2) && defined(__OPTIMIZE__)
+#if defined(__GNUC__) && defined(__OPTIMIZE__)
 
 #define ___htonl(x) __cpu_to_be32(x)
 #define ___htons(x) __cpu_to_be16(x)
diff --git a/include/linux/byteorder/swab.h b/include/linux/byteorder/swab.h
index 2f1cb775125a..25f7f32883ec 100644
--- a/include/linux/byteorder/swab.h
+++ b/include/linux/byteorder/swab.h
@@ -110,7 +110,7 @@
 /*
  * Allow constant folding
  */
-#if defined(__GNUC__) && (__GNUC__ >= 2) && defined(__OPTIMIZE__)
+#if defined(__GNUC__) && defined(__OPTIMIZE__)
 #  define __swab16(x) \
 (__builtin_constant_p((__u16)(x)) ? \
  ___swab16((x)) : \
diff --git a/include/linux/byteorder/swabb.h b/include/linux/byteorder/swabb.h
index d5f2a3205109..ae5e5f914bf4 100644
--- a/include/linux/byteorder/swabb.h
+++ b/include/linux/byteorder/swabb.h
@@ -77,7 +77,7 @@
 /*
  * Allow constant folding
  */
-#if defined(__GNUC__) && (__GNUC__ >= 2) && defined(__OPTIMIZE__)
+#if defined(__GNUC__) && defined(__OPTIMIZE__)
 #  define __swahw32(x) \
 (__builtin_constant_p((__u32)(x)) ? \
  ___swahw32((x)) : \
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 152734055403..2e05e1e6b0e6 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -15,3 +15,12 @@
   ({ unsigned long __ptr;					\
     __asm__ ("" : "=g"(__ptr) : "0"(ptr));		\
     (typeof(ptr)) (__ptr + (off)); })
+
+
+#define inline		inline		__attribute__((always_inline))
+#define __inline__	__inline__	__attribute__((always_inline))
+#define __inline	__inline	__attribute__((always_inline))
+#define __deprecated			__attribute__((deprecated))
+#define  noinline			__attribute__((noinline))
+#define __attribute_pure__		__attribute__((pure))
+#define __attribute_const__		__attribute__((__const__))
diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h
index a6fa615afab5..4209082ee934 100644
--- a/include/linux/compiler-gcc3.h
+++ b/include/linux/compiler-gcc3.h
@@ -3,29 +3,12 @@
 /* These definitions are for GCC v3.x.  */
 #include <linux/compiler-gcc.h>
 
-#if __GNUC_MINOR__ >= 1
-# define inline		inline		__attribute__((always_inline))
-# define __inline__	__inline__	__attribute__((always_inline))
-# define __inline	__inline	__attribute__((always_inline))
-#endif
-
-#if __GNUC_MINOR__ > 0
-# define __deprecated		__attribute__((deprecated))
-#endif
-
 #if __GNUC_MINOR__ >= 3
 # define __attribute_used__	__attribute__((__used__))
 #else
 # define __attribute_used__	__attribute__((__unused__))
 #endif
 
-#define __attribute_pure__	__attribute__((pure))
-#define __attribute_const__	__attribute__((__const__))
-
-#if __GNUC_MINOR__ >= 1
-#define  noinline		__attribute__((noinline))
-#endif
-
 #if __GNUC_MINOR__ >= 4
 #define __must_check		__attribute__((warn_unused_result))
 #endif
diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index 53686c037a06..e913e9beaf69 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -3,14 +3,7 @@
 /* These definitions are for GCC v4.x.  */
 #include <linux/compiler-gcc.h>
 
-#define inline			inline		__attribute__((always_inline))
-#define __inline__		__inline__	__attribute__((always_inline))
-#define __inline		__inline	__attribute__((always_inline))
-#define __deprecated		__attribute__((deprecated))
 #define __attribute_used__	__attribute__((__used__))
-#define __attribute_pure__	__attribute__((pure))
-#define __attribute_const__	__attribute__((__const__))
-#define  noinline		__attribute__((noinline))
 #define __must_check 		__attribute__((warn_unused_result))
 #define __compiler_offsetof(a,b) __builtin_offsetof(a,b)
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index b1e407a4fbda..ca7ff8fdd090 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -316,8 +316,6 @@ extern int randomize_va_space;
 #endif
 
 /* Trap pasters of __FUNCTION__ at compile-time */
-#if __GNUC__ > 2 || __GNUC_MINOR__ >= 95
 #define __FUNCTION__ (__func__)
-#endif
 
 #endif
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index dc89116bb1ca..cd2773b29a64 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -26,11 +26,7 @@ static inline int has_secure_computing(struct thread_info *ti)
 
 #else /* CONFIG_SECCOMP */
 
-#if (__GNUC__ > 2)
-  typedef struct { } seccomp_t;
-#else
-  typedef struct { int gcc_is_buggy; } seccomp_t;
-#endif
+typedef struct { } seccomp_t;
 
 #define secure_computing(x) do { } while (0)
 /* static inline to preserve typechecking */
diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h
index def2d173a8db..04135b0e198e 100644
--- a/include/linux/spinlock_types_up.h
+++ b/include/linux/spinlock_types_up.h
@@ -22,30 +22,16 @@ typedef struct {
 
 #else
 
-/*
- * All gcc 2.95 versions and early versions of 2.96 have a nasty bug
- * with empty initializers.
- */
-#if (__GNUC__ > 2)
 typedef struct { } raw_spinlock_t;
 
 #define __RAW_SPIN_LOCK_UNLOCKED { }
-#else
-typedef struct { int gcc_is_buggy; } raw_spinlock_t;
-#define __RAW_SPIN_LOCK_UNLOCKED (raw_spinlock_t) { 0 }
-#endif
 
 #endif
 
-#if (__GNUC__ > 2)
 typedef struct {
 	/* no debug version on UP */
 } raw_rwlock_t;
 
 #define __RAW_RW_LOCK_UNLOCKED { }
-#else
-typedef struct { int gcc_is_buggy; } raw_rwlock_t;
-#define __RAW_RW_LOCK_UNLOCKED (raw_rwlock_t) { 0 }
-#endif
 
 #endif /* __LINUX_SPINLOCK_TYPES_UP_H */
diff --git a/sound/isa/wavefront/wavefront_synth.c b/sound/isa/wavefront/wavefront_synth.c
index 679d0ae97e4f..ed81eec6e732 100644
--- a/sound/isa/wavefront/wavefront_synth.c
+++ b/sound/isa/wavefront/wavefront_synth.c
@@ -115,18 +115,11 @@ MODULE_PARM_DESC(osrun_time, "how many seconds to wait for the ICS2115 OS");
 
 #ifdef WF_DEBUG
 
-#if defined(NEW_MACRO_VARARGS) || __GNUC__ >= 3
 #define DPRINT(cond, ...) \
        if ((dev->debug & (cond)) == (cond)) { \
 	     snd_printk (__VA_ARGS__); \
        }
 #else
-#define DPRINT(cond, args...) \
-       if ((dev->debug & (cond)) == (cond)) { \
-	     snd_printk (args); \
-       }
-#endif
-#else
 #define DPRINT(cond, args...)
 #endif /* WF_DEBUG */
 
-- 
cgit v1.2.3-71-gd317


From 59d9136b9844d3a0376d93c945ab280decedb323 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise <bcrl@kvack.org>
Date: Sun, 8 Jan 2006 01:04:34 -0800
Subject: [PATCH] aio: reorder kiocb structure elements to make sync iocb setup
 faster

Reorder members of the kiocb structure to make sync kiocb setup faster.  By
setting the elements sequentially, the write combining buffers on the CPU
are able to combine the writes into a single burst, which results in fewer
cache cycles being consumed, freeing them up for other code.  This results
in a 10-20KB/s[*] increase on the bw_unix part of LMbench on my test
system.

* The improvement varies based on what other patches are in the system,
  as there are a number of bottlenecks, so this number is not absolutely
  accurate.

Signed-off-by: Benjamin LaHaise <benjamin.c.lahaise@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/aio.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index 49fd37629ee4..00c8efa95cc3 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -94,26 +94,27 @@ struct kiocb {
 	ssize_t			(*ki_retry)(struct kiocb *);
 	void			(*ki_dtor)(struct kiocb *);
 
-	struct list_head	ki_list;	/* the aio core uses this
-						 * for cancellation */
-
 	union {
 		void __user		*user;
 		struct task_struct	*tsk;
 	} ki_obj;
+
 	__u64			ki_user_data;	/* user's data for completion */
+	wait_queue_t		ki_wait;
 	loff_t			ki_pos;
+
+	void			*private;
 	/* State that we remember to be able to restart/retry  */
 	unsigned short		ki_opcode;
 	size_t			ki_nbytes; 	/* copy of iocb->aio_nbytes */
 	char 			__user *ki_buf;	/* remaining iocb->aio_buf */
 	size_t			ki_left; 	/* remaining bytes */
-	wait_queue_t		ki_wait;
 	long			ki_retried; 	/* just for testing */
 	long			ki_kicked; 	/* just for testing */
 	long			ki_queued; 	/* just for testing */
 
-	void			*private;
+	struct list_head	ki_list;	/* the aio core uses this
+						 * for cancellation */
 };
 
 #define is_sync_kiocb(iocb)	((iocb)->ki_key == KIOCB_SYNC_KEY)
@@ -126,6 +127,7 @@ struct kiocb {
 		(x)->ki_filp = (filp);			\
 		(x)->ki_ctx = NULL;			\
 		(x)->ki_cancel = NULL;			\
+		(x)->ki_retry = NULL;			\
 		(x)->ki_dtor = NULL;			\
 		(x)->ki_obj.tsk = tsk;			\
 		(x)->ki_user_data = 0;                  \
-- 
cgit v1.2.3-71-gd317


From 349aef0bc4c7f07d685c977e12d0e2d0b5d0e6db Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 8 Jan 2006 01:04:36 -0800
Subject: [PATCH] shrink struct page

Reduce the size of the pageframe for NR_CPUS>4, CONFIG_PREEMPT back to the
minimal size by unionising both ->private and ->mapping with the pagetable
lock.

It uses an anonymous struct and hence requires gcc-3.x.

Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7ff54242c5d7..df80e63903b5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -223,24 +223,27 @@ struct page {
 					 * & limit reverse map searches.
 					 */
 	union {
-		unsigned long private;	/* Mapping-private opaque data:
-					 * usually used for buffer_heads
-					 * if PagePrivate set; used for
-					 * swp_entry_t if PageSwapCache
-					 * When page is free, this indicates
-					 * order in the buddy system.
-					 */
+	    struct {
+		unsigned long private;		/* Mapping-private opaque data:
+					 	 * usually used for buffer_heads
+						 * if PagePrivate set; used for
+						 * swp_entry_t if PageSwapCache.
+						 * When page is free, this
+						 * indicates order in the buddy
+						 * system.
+						 */
+		struct address_space *mapping;	/* If low bit clear, points to
+						 * inode address_space, or NULL.
+						 * If page mapped as anonymous
+						 * memory, low bit is set, and
+						 * it points to anon_vma object:
+						 * see PAGE_MAPPING_ANON below.
+						 */
+	    };
 #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
-		spinlock_t ptl;
+	    spinlock_t ptl;
 #endif
-	} u;
-	struct address_space *mapping;	/* If low bit clear, points to
-					 * inode address_space, or NULL.
-					 * If page mapped as anonymous
-					 * memory, low bit is set, and
-					 * it points to anon_vma object:
-					 * see PAGE_MAPPING_ANON below.
-					 */
+	};
 	pgoff_t index;			/* Our offset within mapping. */
 	struct list_head lru;		/* Pageout list, eg. active_list
 					 * protected by zone->lru_lock !
@@ -261,8 +264,8 @@ struct page {
 #endif /* WANT_PAGE_VIRTUAL */
 };
 
-#define page_private(page)		((page)->u.private)
-#define set_page_private(page, v)	((page)->u.private = (v))
+#define page_private(page)		((page)->private)
+#define set_page_private(page, v)	((page)->private = (v))
 
 /*
  * FIXME: take this include out, include page-flags.h in
@@ -815,7 +818,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
  * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
  * When freeing, reset page->mapping so free_pages_check won't complain.
  */
-#define __pte_lockptr(page)	&((page)->u.ptl)
+#define __pte_lockptr(page)	&((page)->ptl)
 #define pte_lock_init(_page)	do {					\
 	spin_lock_init(__pte_lockptr(_page));				\
 } while (0)
-- 
cgit v1.2.3-71-gd317


From a12dea7af93ae83bd868c0dc09367090ead7cc1e Mon Sep 17 00:00:00 2001
From: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Date: Sun, 8 Jan 2006 01:04:49 -0800
Subject: [PATCH] PTRACE_SYSEMU is only for i386 and clashes with other ptrace
 codes of other archs

PTRACE_SYSEMU{,_SINGLESTEP} is actually arch specific, for now, and the
current allocated number clashes with a ptrace code of frv, i.e.
PTRACE_GETFDPIC.  I should have submitted this much earlier, anyway we get no
breakage for this.

CC: Daniel Jacobowitz <dan@debian.org>
Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/ptrace.h | 3 +++
 include/linux/ptrace.h    | 2 --
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h
index 7e0f2945d17d..f324c53b6f9a 100644
--- a/include/asm-i386/ptrace.h
+++ b/include/asm-i386/ptrace.h
@@ -54,6 +54,9 @@ struct pt_regs {
 #define PTRACE_GET_THREAD_AREA    25
 #define PTRACE_SET_THREAD_AREA    26
 
+#define PTRACE_SYSEMU		  31
+#define PTRACE_SYSEMU_SINGLESTEP  32
+
 #ifdef __KERNEL__
 
 #include <asm/vm86.h>
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 864791996b5f..9d5cd106b344 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -20,8 +20,6 @@
 #define PTRACE_DETACH		0x11
 
 #define PTRACE_SYSCALL		  24
-#define PTRACE_SYSEMU		  31
-#define PTRACE_SYSEMU_SINGLESTEP  32
 
 /* 0x4200-0x4300 are reserved for architecture-independent additions.  */
 #define PTRACE_SETOPTIONS	0x4200
-- 
cgit v1.2.3-71-gd317


From 7e7f358c8f8f836c504faa293fda0c1c0733b63c Mon Sep 17 00:00:00 2001
From: Brian Gerst <bgerst@didntduck.org>
Date: Sun, 8 Jan 2006 01:04:54 -0800
Subject: [PATCH] Split out screen_info from tty.h

This makes it possible for boot code to use screen_info without dragging in
all of tty.h.

Signed-off-by: Brian Gerst <bgerst@didntduck.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/boot/compressed/misc.c        |  2 +-
 arch/x86_64/boot/compressed/misc.c      |  2 +-
 arch/x86_64/boot/compressed/miscsetup.h | 39 -----------------
 include/linux/screen_info.h             | 77 +++++++++++++++++++++++++++++++++
 include/linux/tty.h                     | 72 +-----------------------------
 5 files changed, 80 insertions(+), 112 deletions(-)
 delete mode 100644 arch/x86_64/boot/compressed/miscsetup.h
 create mode 100644 include/linux/screen_info.h

(limited to 'include/linux')

diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c
index 82a807f9f5e6..f19f3a7492a5 100644
--- a/arch/i386/boot/compressed/misc.c
+++ b/arch/i386/boot/compressed/misc.c
@@ -11,7 +11,7 @@
 
 #include <linux/linkage.h>
 #include <linux/vmalloc.h>
-#include <linux/tty.h>
+#include <linux/screen_info.h>
 #include <asm/io.h>
 #include <asm/page.h>
 
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c
index 0e10fd84c7cc..cf4b88c416dc 100644
--- a/arch/x86_64/boot/compressed/misc.c
+++ b/arch/x86_64/boot/compressed/misc.c
@@ -9,7 +9,7 @@
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
 
-#include "miscsetup.h"
+#include <linux/screen_info.h>
 #include <asm/io.h>
 #include <asm/page.h>
 
diff --git a/arch/x86_64/boot/compressed/miscsetup.h b/arch/x86_64/boot/compressed/miscsetup.h
deleted file mode 100644
index bb1620531703..000000000000
--- a/arch/x86_64/boot/compressed/miscsetup.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#define NULL 0
-//typedef unsigned int size_t; 
-
-
-struct screen_info {
-	unsigned char  orig_x;			/* 0x00 */
-	unsigned char  orig_y;			/* 0x01 */
-	unsigned short dontuse1;		/* 0x02 -- EXT_MEM_K sits here */
-	unsigned short orig_video_page;		/* 0x04 */
-	unsigned char  orig_video_mode;		/* 0x06 */
-	unsigned char  orig_video_cols;		/* 0x07 */
-	unsigned short unused2;			/* 0x08 */
-	unsigned short orig_video_ega_bx;	/* 0x0a */
-	unsigned short unused3;			/* 0x0c */
-	unsigned char  orig_video_lines;	/* 0x0e */
-	unsigned char  orig_video_isVGA;	/* 0x0f */
-	unsigned short orig_video_points;	/* 0x10 */
-
-	/* VESA graphic mode -- linear frame buffer */
-	unsigned short lfb_width;		/* 0x12 */
-	unsigned short lfb_height;		/* 0x14 */
-	unsigned short lfb_depth;		/* 0x16 */
-	unsigned long  lfb_base;		/* 0x18 */
-	unsigned long  lfb_size;		/* 0x1c */
-	unsigned short dontuse2, dontuse3;	/* 0x20 -- CL_MAGIC and CL_OFFSET here */
-	unsigned short lfb_linelength;		/* 0x24 */
-	unsigned char  red_size;		/* 0x26 */
-	unsigned char  red_pos;			/* 0x27 */
-	unsigned char  green_size;		/* 0x28 */
-	unsigned char  green_pos;		/* 0x29 */
-	unsigned char  blue_size;		/* 0x2a */
-	unsigned char  blue_pos;		/* 0x2b */
-	unsigned char  rsvd_size;		/* 0x2c */
-	unsigned char  rsvd_pos;		/* 0x2d */
-	unsigned short vesapm_seg;		/* 0x2e */
-	unsigned short vesapm_off;		/* 0x30 */
-	unsigned short pages;			/* 0x32 */
-						/* 0x34 -- 0x3f reserved for future expansion */
-};
diff --git a/include/linux/screen_info.h b/include/linux/screen_info.h
new file mode 100644
index 000000000000..76850b75b3f6
--- /dev/null
+++ b/include/linux/screen_info.h
@@ -0,0 +1,77 @@
+#ifndef _SCREEN_INFO_H
+#define _SCREEN_INFO_H
+
+#include <linux/types.h>
+
+/*
+ * These are set up by the setup-routine at boot-time:
+ */
+
+struct screen_info {
+	u8  orig_x;		/* 0x00 */
+	u8  orig_y;		/* 0x01 */
+	u16 dontuse1;		/* 0x02 -- EXT_MEM_K sits here */
+	u16 orig_video_page;	/* 0x04 */
+	u8  orig_video_mode;	/* 0x06 */
+	u8  orig_video_cols;	/* 0x07 */
+	u16 unused2;		/* 0x08 */
+	u16 orig_video_ega_bx;	/* 0x0a */
+	u16 unused3;		/* 0x0c */
+	u8  orig_video_lines;	/* 0x0e */
+	u8  orig_video_isVGA;	/* 0x0f */
+	u16 orig_video_points;	/* 0x10 */
+
+	/* VESA graphic mode -- linear frame buffer */
+	u16 lfb_width;		/* 0x12 */
+	u16 lfb_height;		/* 0x14 */
+	u16 lfb_depth;		/* 0x16 */
+	u32 lfb_base;		/* 0x18 */
+	u32 lfb_size;		/* 0x1c */
+	u16 dontuse2, dontuse3;	/* 0x20 -- CL_MAGIC and CL_OFFSET here */
+	u16 lfb_linelength;	/* 0x24 */
+	u8  red_size;		/* 0x26 */
+	u8  red_pos;		/* 0x27 */
+	u8  green_size;		/* 0x28 */
+	u8  green_pos;		/* 0x29 */
+	u8  blue_size;		/* 0x2a */
+	u8  blue_pos;		/* 0x2b */
+	u8  rsvd_size;		/* 0x2c */
+	u8  rsvd_pos;		/* 0x2d */
+	u16 vesapm_seg;		/* 0x2e */
+	u16 vesapm_off;		/* 0x30 */
+	u16 pages;		/* 0x32 */
+	u16 vesa_attributes;	/* 0x34 */
+	u32  capabilities;      /* 0x36 */
+				/* 0x3a -- 0x3f reserved for future expansion */
+};
+
+extern struct screen_info screen_info;
+
+#define ORIG_X			(screen_info.orig_x)
+#define ORIG_Y			(screen_info.orig_y)
+#define ORIG_VIDEO_MODE		(screen_info.orig_video_mode)
+#define ORIG_VIDEO_COLS 	(screen_info.orig_video_cols)
+#define ORIG_VIDEO_EGA_BX	(screen_info.orig_video_ega_bx)
+#define ORIG_VIDEO_LINES	(screen_info.orig_video_lines)
+#define ORIG_VIDEO_ISVGA	(screen_info.orig_video_isVGA)
+#define ORIG_VIDEO_POINTS       (screen_info.orig_video_points)
+
+#define VIDEO_TYPE_MDA		0x10	/* Monochrome Text Display	*/
+#define VIDEO_TYPE_CGA		0x11	/* CGA Display 			*/
+#define VIDEO_TYPE_EGAM		0x20	/* EGA/VGA in Monochrome Mode	*/
+#define VIDEO_TYPE_EGAC		0x21	/* EGA in Color Mode		*/
+#define VIDEO_TYPE_VGAC		0x22	/* VGA+ in Color Mode		*/
+#define VIDEO_TYPE_VLFB		0x23	/* VESA VGA in graphic mode	*/
+
+#define VIDEO_TYPE_PICA_S3	0x30	/* ACER PICA-61 local S3 video	*/
+#define VIDEO_TYPE_MIPS_G364	0x31    /* MIPS Magnum 4000 G364 video  */
+#define VIDEO_TYPE_SGI          0x33    /* Various SGI graphics hardware */
+
+#define VIDEO_TYPE_TGAC		0x40	/* DEC TGA */
+
+#define VIDEO_TYPE_SUN          0x50    /* Sun frame buffer. */
+#define VIDEO_TYPE_SUNPCI       0x51    /* Sun PCI based frame buffer. */
+
+#define VIDEO_TYPE_PMAC		0x60	/* PowerMacintosh frame buffer. */
+
+#endif /* _SCREEN_INFO_H */
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 1267f88ece6e..57449704a47b 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -23,6 +23,7 @@
 #include <linux/workqueue.h>
 #include <linux/tty_driver.h>
 #include <linux/tty_ldisc.h>
+#include <linux/screen_info.h>
 
 #include <asm/system.h>
 
@@ -36,77 +37,6 @@
 #define NR_UNIX98_PTY_MAX	(1 << MINORBITS) /* Absolute limit */
 #define NR_LDISCS		16
 
-/*
- * These are set up by the setup-routine at boot-time:
- */
-
-struct screen_info {
-	u8  orig_x;		/* 0x00 */
-	u8  orig_y;		/* 0x01 */
-	u16 dontuse1;		/* 0x02 -- EXT_MEM_K sits here */
-	u16 orig_video_page;	/* 0x04 */
-	u8  orig_video_mode;	/* 0x06 */
-	u8  orig_video_cols;	/* 0x07 */
-	u16 unused2;		/* 0x08 */
-	u16 orig_video_ega_bx;	/* 0x0a */
-	u16 unused3;		/* 0x0c */
-	u8  orig_video_lines;	/* 0x0e */
-	u8  orig_video_isVGA;	/* 0x0f */
-	u16 orig_video_points;	/* 0x10 */
-
-	/* VESA graphic mode -- linear frame buffer */
-	u16 lfb_width;		/* 0x12 */
-	u16 lfb_height;		/* 0x14 */
-	u16 lfb_depth;		/* 0x16 */
-	u32 lfb_base;		/* 0x18 */
-	u32 lfb_size;		/* 0x1c */
-	u16 dontuse2, dontuse3;	/* 0x20 -- CL_MAGIC and CL_OFFSET here */
-	u16 lfb_linelength;	/* 0x24 */
-	u8  red_size;		/* 0x26 */
-	u8  red_pos;		/* 0x27 */
-	u8  green_size;		/* 0x28 */
-	u8  green_pos;		/* 0x29 */
-	u8  blue_size;		/* 0x2a */
-	u8  blue_pos;		/* 0x2b */
-	u8  rsvd_size;		/* 0x2c */
-	u8  rsvd_pos;		/* 0x2d */
-	u16 vesapm_seg;		/* 0x2e */
-	u16 vesapm_off;		/* 0x30 */
-	u16 pages;		/* 0x32 */
-	u16 vesa_attributes;	/* 0x34 */
-	u32  capabilities;      /* 0x36 */
-				/* 0x3a -- 0x3f reserved for future expansion */
-};
-
-extern struct screen_info screen_info;
-
-#define ORIG_X			(screen_info.orig_x)
-#define ORIG_Y			(screen_info.orig_y)
-#define ORIG_VIDEO_MODE		(screen_info.orig_video_mode)
-#define ORIG_VIDEO_COLS 	(screen_info.orig_video_cols)
-#define ORIG_VIDEO_EGA_BX	(screen_info.orig_video_ega_bx)
-#define ORIG_VIDEO_LINES	(screen_info.orig_video_lines)
-#define ORIG_VIDEO_ISVGA	(screen_info.orig_video_isVGA)
-#define ORIG_VIDEO_POINTS       (screen_info.orig_video_points)
-
-#define VIDEO_TYPE_MDA		0x10	/* Monochrome Text Display	*/
-#define VIDEO_TYPE_CGA		0x11	/* CGA Display 			*/
-#define VIDEO_TYPE_EGAM		0x20	/* EGA/VGA in Monochrome Mode	*/
-#define VIDEO_TYPE_EGAC		0x21	/* EGA in Color Mode		*/
-#define VIDEO_TYPE_VGAC		0x22	/* VGA+ in Color Mode		*/
-#define VIDEO_TYPE_VLFB		0x23	/* VESA VGA in graphic mode	*/
-
-#define VIDEO_TYPE_PICA_S3	0x30	/* ACER PICA-61 local S3 video	*/
-#define VIDEO_TYPE_MIPS_G364	0x31    /* MIPS Magnum 4000 G364 video  */
-#define VIDEO_TYPE_SGI          0x33    /* Various SGI graphics hardware */
-
-#define VIDEO_TYPE_TGAC		0x40	/* DEC TGA */
-
-#define VIDEO_TYPE_SUN          0x50    /* Sun frame buffer. */
-#define VIDEO_TYPE_SUNPCI       0x51    /* Sun PCI based frame buffer. */
-
-#define VIDEO_TYPE_PMAC		0x60	/* PowerMacintosh frame buffer. */
-
 /*
  * This character is the same as _POSIX_VDISABLE: it cannot be used as
  * a c_cc[] character, but indicates that a particular special character
-- 
cgit v1.2.3-71-gd317


From d8a33496671e4533aed090793436d58debea6f3a Mon Sep 17 00:00:00 2001
From: Marko Kohtala <marko.kohtala@gmail.com>
Date: Sun, 8 Jan 2006 01:05:06 -0800
Subject: [PATCH] parport: bring back an unused phase for ppdev ioctl

Earlier fix removed unused phase, but that changed the values for other
phases.  Since these are exposed to userspace through ppdev, it is safer
not to change them.  Restore the unused phase value.

Signed-off-by: Marko Kohtala <marko.kohtala@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/parport.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/parport.h b/include/linux/parport.h
index f7ff0b0c4031..f67f838a3a1f 100644
--- a/include/linux/parport.h
+++ b/include/linux/parport.h
@@ -236,12 +236,14 @@ struct pardevice {
 
 /* IEEE1284 information */
 
-/* IEEE1284 phases */
+/* IEEE1284 phases. These are exposed to userland through ppdev IOCTL
+ * PP[GS]ETPHASE, so do not change existing values. */
 enum ieee1284_phase {
 	IEEE1284_PH_FWD_DATA,
 	IEEE1284_PH_FWD_IDLE,
 	IEEE1284_PH_TERMINATE,
 	IEEE1284_PH_NEGOTIATION,
+	IEEE1284_PH_HBUSY_DNA,
 	IEEE1284_PH_REV_IDLE,
 	IEEE1284_PH_HBUSY_DAVAIL,
 	IEEE1284_PH_REV_DATA,
-- 
cgit v1.2.3-71-gd317


From 6a878184c202395ea17212f111ab9ec4b5f6d6ee Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@suse.de>
Date: Sun, 8 Jan 2006 01:05:07 -0800
Subject: [PATCH] Eliminate __attribute__ ((packed)) warnings for gcc-4.1

Since version 4.1 the gcc is warning about ignored attributes. This patch is
using the equivalent attribute on the struct instead of on each of the
structure or union members.

GCC Manual:
  "Specifying Attributes of Types

   packed
    This attribute, attached to struct or union type definition, specifies
    that
    each member of the structure or union is placed to minimize the memory
    required. When attached to an enum definition, it indicates that the
    smallest integral type should be used.

    Specifying this attribute for struct and union types is equivalent to
    specifying the packed attribute on each of the structure or union
    members."

Signed-off-by: Jan Blunck <jblunck@suse.de>
Cc: Dave Jones <davej@codemonkey.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/isdn/hisax/hisax.h          |  18 +++---
 drivers/isdn/hisax/hisax_fcpcipnp.h |  18 +++---
 drivers/net/3c527.h                 |  50 +++++++-------
 drivers/net/irda/vlsi_ir.h          |   4 +-
 drivers/net/wan/sdla.c              |   6 +-
 include/linux/atalk.h               |  18 +++---
 include/linux/cycx_x25.h            |  66 +++++++++----------
 include/linux/if_frad.h             |  12 ++--
 include/linux/isdnif.h              |  70 ++++++++++----------
 include/linux/ncp.h                 | 126 ++++++++++++++++++------------------
 include/linux/sdla.h                |  64 +++++++++---------
 include/linux/wavefront.h           |  36 +++++------
 include/net/dn_dev.h                |  84 ++++++++++++------------
 include/net/dn_nsp.h                |  74 ++++++++++-----------
 include/sound/wavefront.h           |  36 +++++------
 15 files changed, 341 insertions(+), 341 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/isdn/hisax/hisax.h b/drivers/isdn/hisax/hisax.h
index 26c545fa223b..1b85ce166af8 100644
--- a/drivers/isdn/hisax/hisax.h
+++ b/drivers/isdn/hisax/hisax.h
@@ -396,17 +396,17 @@ struct isar_hw {
 
 struct hdlc_stat_reg {
 #ifdef __BIG_ENDIAN
-	u_char fill __attribute__((packed));
-	u_char mode __attribute__((packed));
-	u_char xml  __attribute__((packed));
-	u_char cmd  __attribute__((packed));
+	u_char fill;
+	u_char mode;
+	u_char xml;
+	u_char cmd;
 #else
-	u_char cmd  __attribute__((packed));
-	u_char xml  __attribute__((packed));
-	u_char mode __attribute__((packed));
-	u_char fill __attribute__((packed));
+	u_char cmd;
+	u_char xml;
+	u_char mode;
+	u_char fill;
 #endif
-};
+} __attribute__((packed));
 
 struct hdlc_hw {
 	union {
diff --git a/drivers/isdn/hisax/hisax_fcpcipnp.h b/drivers/isdn/hisax/hisax_fcpcipnp.h
index bd8a22e4d6a2..21fbcedf3a94 100644
--- a/drivers/isdn/hisax/hisax_fcpcipnp.h
+++ b/drivers/isdn/hisax/hisax_fcpcipnp.h
@@ -12,17 +12,17 @@ enum {
 
 struct hdlc_stat_reg {
 #ifdef __BIG_ENDIAN
-	u_char fill __attribute__((packed));
-	u_char mode __attribute__((packed));
-	u_char xml  __attribute__((packed));
-	u_char cmd  __attribute__((packed));
+	u_char fill;
+	u_char mode;
+	u_char xml;
+	u_char cmd;
 #else
-	u_char cmd  __attribute__((packed));
-	u_char xml  __attribute__((packed));
-	u_char mode __attribute__((packed));
-	u_char fill __attribute__((packed));
+	u_char cmd;
+	u_char xml;
+	u_char mode;
+	u_char fill;
 #endif
-};
+} __attribute__((packed));
 
 struct fritz_bcs {
 	struct hisax_b_if b_if;
diff --git a/drivers/net/3c527.h b/drivers/net/3c527.h
index c10f009ce9b6..53b5b071df08 100644
--- a/drivers/net/3c527.h
+++ b/drivers/net/3c527.h
@@ -32,43 +32,43 @@
 
 struct mc32_mailbox
 {
-	u16	mbox __attribute((packed));
-	u16	data[1] __attribute((packed));
-};
+ 	u16 mbox;
+ 	u16 data[1];
+} __attribute((packed));
 
 struct skb_header
 {
-	u8	status __attribute((packed));
-	u8	control __attribute((packed));
-	u16	next __attribute((packed));	/* Do not change! */
-	u16	length __attribute((packed));
-	u32	data __attribute((packed));
-};
+	u8 status;
+	u8 control;
+	u16 next;	/* Do not change! */
+	u16 length;
+	u32 data;
+} __attribute((packed));
 
 struct mc32_stats
 {
 	/* RX Errors */
-	u32     rx_crc_errors       __attribute((packed)); 	
-	u32     rx_alignment_errors  __attribute((packed)); 	
-	u32     rx_overrun_errors    __attribute((packed));
-	u32     rx_tooshort_errors  __attribute((packed));
-	u32     rx_toolong_errors   __attribute((packed));
-	u32     rx_outofresource_errors  __attribute((packed)); 
+	u32 rx_crc_errors;
+	u32 rx_alignment_errors;
+	u32 rx_overrun_errors;
+	u32 rx_tooshort_errors;
+	u32 rx_toolong_errors;
+	u32 rx_outofresource_errors;
 
-	u32     rx_discarded   __attribute((packed));  /* via card pattern match filter */ 
+	u32 rx_discarded;  /* via card pattern match filter */
 
 	/* TX Errors */
-	u32     tx_max_collisions __attribute((packed)); 
-	u32     tx_carrier_errors __attribute((packed)); 
-	u32     tx_underrun_errors __attribute((packed)); 
-	u32     tx_cts_errors     __attribute((packed)); 
-	u32     tx_timeout_errors __attribute((packed)) ;
+	u32 tx_max_collisions;
+	u32 tx_carrier_errors;
+	u32 tx_underrun_errors;
+	u32 tx_cts_errors;
+	u32 tx_timeout_errors;
 	
 	/* various cruft */
-	u32     dataA[6] __attribute((packed));   
-        u16	dataB[5] __attribute((packed));   
-  	u32     dataC[14] __attribute((packed)); 	
-};
+	u32 dataA[6];
+	u16 dataB[5];
+	u32 dataC[14];
+} __attribute((packed));
 
 #define STATUS_MASK	0x0F
 #define COMPLETED	(1<<7)
diff --git a/drivers/net/irda/vlsi_ir.h b/drivers/net/irda/vlsi_ir.h
index 741aecc655df..a82a4ba8de4f 100644
--- a/drivers/net/irda/vlsi_ir.h
+++ b/drivers/net/irda/vlsi_ir.h
@@ -577,8 +577,8 @@ struct ring_descr_hw {
 		struct {
 			u8		addr_res[3];
 			volatile u8	status;		/* descriptor status */
-		} rd_s __attribute__((packed));
-	} rd_u __attribute((packed));
+		} __attribute__((packed)) rd_s;
+	} __attribute((packed)) rd_u;
 } __attribute__ ((packed));
 
 #define rd_addr		rd_u.addr
diff --git a/drivers/net/wan/sdla.c b/drivers/net/wan/sdla.c
index 036adc4f8ba7..22e794071cf4 100644
--- a/drivers/net/wan/sdla.c
+++ b/drivers/net/wan/sdla.c
@@ -329,9 +329,9 @@ static int sdla_cpuspeed(struct net_device *dev, struct ifreq *ifr)
 
 struct _dlci_stat 
 {
-	short dlci		__attribute__((packed));
-	char  flags		__attribute__((packed));
-};
+	short dlci;
+	char  flags;
+} __attribute__((packed));
 
 struct _frad_stat 
 {
diff --git a/include/linux/atalk.h b/include/linux/atalk.h
index 911c09cb9bf9..6ba3aa8a81f4 100644
--- a/include/linux/atalk.h
+++ b/include/linux/atalk.h
@@ -155,15 +155,15 @@ struct elapaarp {
 #define AARP_REQUEST			1
 #define AARP_REPLY			2
 #define AARP_PROBE			3
-	__u8	hw_src[ETH_ALEN]	__attribute__ ((packed));
-	__u8	pa_src_zero		__attribute__ ((packed));
-	__be16	pa_src_net		__attribute__ ((packed));
-	__u8	pa_src_node		__attribute__ ((packed));
-	__u8	hw_dst[ETH_ALEN]	__attribute__ ((packed));
-	__u8	pa_dst_zero		__attribute__ ((packed));
-	__be16	pa_dst_net		__attribute__ ((packed));
-	__u8	pa_dst_node		__attribute__ ((packed));	
-};
+	__u8	hw_src[ETH_ALEN];
+	__u8	pa_src_zero;
+	__be16	pa_src_net;
+	__u8	pa_src_node;
+	__u8	hw_dst[ETH_ALEN];
+	__u8	pa_dst_zero;
+	__be16	pa_dst_net;
+	__u8	pa_dst_node;
+} __attribute__ ((packed));
 
 static __inline__ struct elapaarp *aarp_hdr(struct sk_buff *skb)
 {
diff --git a/include/linux/cycx_x25.h b/include/linux/cycx_x25.h
index b10a7f3a8cac..f7a906583463 100644
--- a/include/linux/cycx_x25.h
+++ b/include/linux/cycx_x25.h
@@ -38,11 +38,11 @@ extern unsigned int cycx_debug;
 /* Data Structures */
 /* X.25 Command Block. */
 struct cycx_x25_cmd {
-	u16 command PACKED;
-	u16 link    PACKED; /* values: 0 or 1 */
-	u16 len     PACKED; /* values: 0 thru 0x205 (517) */
-	u32 buf     PACKED;
-};
+	u16 command;
+	u16 link;	/* values: 0 or 1 */
+	u16 len;	/* values: 0 thru 0x205 (517) */
+	u32 buf;
+} PACKED;
 
 /* Defines for the 'command' field. */
 #define X25_CONNECT_REQUEST             0x4401
@@ -92,34 +92,34 @@ struct cycx_x25_cmd {
  *	@flags - see dosx25.doc, in portuguese, for details
  */
 struct cycx_x25_config {
-	u8  link	PACKED;
-	u8  speed	PACKED;
-	u8  clock	PACKED;
-	u8  n2		PACKED;
-	u8  n2win	PACKED;
-	u8  n3win	PACKED;
-	u8  nvc		PACKED;
-	u8  pktlen	PACKED;
-	u8  locaddr	PACKED;
-	u8  remaddr	PACKED;
-	u16 t1		PACKED;
-	u16 t2		PACKED;
-	u8  t21		PACKED;
-	u8  npvc	PACKED;
-	u8  t23		PACKED;
-	u8  flags	PACKED;
-};
+	u8  link;
+	u8  speed;
+	u8  clock;
+	u8  n2;
+	u8  n2win;
+	u8  n3win;
+	u8  nvc;
+	u8  pktlen;
+	u8  locaddr;
+	u8  remaddr;
+	u16 t1;
+	u16 t2;
+	u8  t21;
+	u8  npvc;
+	u8  t23;
+	u8  flags;
+} PACKED;
 
 struct cycx_x25_stats {
-	u16 rx_crc_errors	PACKED;
-	u16 rx_over_errors	PACKED;
-	u16 n2_tx_frames 	PACKED;
-	u16 n2_rx_frames 	PACKED;
-	u16 tx_timeouts 	PACKED;
-	u16 rx_timeouts 	PACKED;
-	u16 n3_tx_packets 	PACKED;
-	u16 n3_rx_packets 	PACKED;
-	u16 tx_aborts	 	PACKED;
-	u16 rx_aborts	 	PACKED;
-};
+	u16 rx_crc_errors;
+	u16 rx_over_errors;
+	u16 n2_tx_frames;
+	u16 n2_rx_frames;
+	u16 tx_timeouts;
+	u16 rx_timeouts;
+	u16 n3_tx_packets;
+	u16 n3_rx_packets;
+	u16 tx_aborts;
+	u16 rx_aborts;
+} PACKED;
 #endif	/* _CYCX_X25_H */
diff --git a/include/linux/if_frad.h b/include/linux/if_frad.h
index 511999c7eeda..395f0aad9cbf 100644
--- a/include/linux/if_frad.h
+++ b/include/linux/if_frad.h
@@ -131,17 +131,17 @@ struct frad_conf
 /* these are the fields of an RFC 1490 header */
 struct frhdr
 {
-   unsigned char  control	__attribute__((packed));
+   unsigned char  control;
 
    /* for IP packets, this can be the NLPID */
-   unsigned char  pad		__attribute__((packed)); 
+   unsigned char  pad;
 
-   unsigned char  NLPID		__attribute__((packed));
-   unsigned char  OUI[3]	__attribute__((packed));
-   unsigned short PID		__attribute__((packed));
+   unsigned char  NLPID;
+   unsigned char  OUI[3];
+   unsigned short PID;
 
 #define IP_NLPID pad 
-};
+} __attribute__((packed));
 
 /* see RFC 1490 for the definition of the following */
 #define FRAD_I_UI		0x03
diff --git a/include/linux/isdnif.h b/include/linux/isdnif.h
index 7a4eacd77cb2..04e10f9f14f8 100644
--- a/include/linux/isdnif.h
+++ b/include/linux/isdnif.h
@@ -282,43 +282,43 @@ typedef struct setup_parm {
 
 typedef struct T30_s {
 	/* session parameters */
-	__u8 resolution		__attribute__ ((packed));
-	__u8 rate		__attribute__ ((packed));
-	__u8 width		__attribute__ ((packed));
-	__u8 length		__attribute__ ((packed));
-	__u8 compression	__attribute__ ((packed));
-	__u8 ecm		__attribute__ ((packed));
-	__u8 binary		__attribute__ ((packed));
-	__u8 scantime		__attribute__ ((packed));
-	__u8 id[FAXIDLEN]	__attribute__ ((packed));
+	__u8 resolution;
+	__u8 rate;
+	__u8 width;
+	__u8 length;
+	__u8 compression;
+	__u8 ecm;
+	__u8 binary;
+	__u8 scantime;
+	__u8 id[FAXIDLEN];
 	/* additional parameters */
-	__u8 phase		__attribute__ ((packed));
-	__u8 direction		__attribute__ ((packed));
-	__u8 code		__attribute__ ((packed));
-	__u8 badlin		__attribute__ ((packed));
-	__u8 badmul		__attribute__ ((packed));
-	__u8 bor		__attribute__ ((packed));
-	__u8 fet		__attribute__ ((packed));
-	__u8 pollid[FAXIDLEN]	__attribute__ ((packed));
-	__u8 cq			__attribute__ ((packed));
-	__u8 cr			__attribute__ ((packed));
-	__u8 ctcrty		__attribute__ ((packed));
-	__u8 minsp		__attribute__ ((packed));
-	__u8 phcto		__attribute__ ((packed));
-	__u8 rel		__attribute__ ((packed));
-	__u8 nbc		__attribute__ ((packed));
+	__u8 phase;
+	__u8 direction;
+	__u8 code;
+	__u8 badlin;
+	__u8 badmul;
+	__u8 bor;
+	__u8 fet;
+	__u8 pollid[FAXIDLEN];
+	__u8 cq;
+	__u8 cr;
+	__u8 ctcrty;
+	__u8 minsp;
+	__u8 phcto;
+	__u8 rel;
+	__u8 nbc;
 	/* remote station parameters */
-	__u8 r_resolution	__attribute__ ((packed));
-	__u8 r_rate		__attribute__ ((packed));
-	__u8 r_width		__attribute__ ((packed));
-	__u8 r_length		__attribute__ ((packed));
-	__u8 r_compression	__attribute__ ((packed));
-	__u8 r_ecm		__attribute__ ((packed));
-	__u8 r_binary		__attribute__ ((packed));
-	__u8 r_scantime		__attribute__ ((packed));
-	__u8 r_id[FAXIDLEN]	__attribute__ ((packed));
-	__u8 r_code		__attribute__ ((packed));
-} T30_s;
+	__u8 r_resolution;
+	__u8 r_rate;
+	__u8 r_width;
+	__u8 r_length;
+	__u8 r_compression;
+	__u8 r_ecm;
+	__u8 r_binary;
+	__u8 r_scantime;
+	__u8 r_id[FAXIDLEN];
+	__u8 r_code;
+} __attribute__((packed)) T30_s;
 
 #define ISDN_TTY_FAX_CONN_IN	0
 #define ISDN_TTY_FAX_CONN_OUT	1
diff --git a/include/linux/ncp.h b/include/linux/ncp.h
index 99f77876b716..99f0adeeb3f3 100644
--- a/include/linux/ncp.h
+++ b/include/linux/ncp.h
@@ -20,29 +20,29 @@
 #define NCP_DEALLOC_SLOT_REQUEST (0x5555)
 
 struct ncp_request_header {
-	__u16 type __attribute__((packed));
-	__u8 sequence __attribute__((packed));
-	__u8 conn_low __attribute__((packed));
-	__u8 task __attribute__((packed));
-	__u8 conn_high __attribute__((packed));
-	__u8 function __attribute__((packed));
-	__u8 data[0] __attribute__((packed));
-};
+	__u16 type;
+	__u8 sequence;
+	__u8 conn_low;
+	__u8 task;
+	__u8 conn_high;
+	__u8 function;
+	__u8 data[0];
+} __attribute__((packed));
 
 #define NCP_REPLY                (0x3333)
 #define NCP_WATCHDOG		 (0x3E3E)
 #define NCP_POSITIVE_ACK         (0x9999)
 
 struct ncp_reply_header {
-	__u16 type __attribute__((packed));
-	__u8 sequence __attribute__((packed));
-	__u8 conn_low __attribute__((packed));
-	__u8 task __attribute__((packed));
-	__u8 conn_high __attribute__((packed));
-	__u8 completion_code __attribute__((packed));
-	__u8 connection_state __attribute__((packed));
-	__u8 data[0] __attribute__((packed));
-};
+	__u16 type;
+	__u8 sequence;
+	__u8 conn_low;
+	__u8 task;
+	__u8 conn_high;
+	__u8 completion_code;
+	__u8 connection_state;
+	__u8 data[0];
+} __attribute__((packed));
 
 #define NCP_VOLNAME_LEN (16)
 #define NCP_NUMBER_OF_VOLUMES (256)
@@ -128,37 +128,37 @@ struct nw_nfs_info {
 };
 
 struct nw_info_struct {
-	__u32 spaceAlloc __attribute__((packed));
-	__le32 attributes __attribute__((packed));
-	__u16 flags __attribute__((packed));
-	__le32 dataStreamSize __attribute__((packed));
-	__le32 totalStreamSize __attribute__((packed));
-	__u16 numberOfStreams __attribute__((packed));
-	__le16 creationTime __attribute__((packed));
-	__le16 creationDate __attribute__((packed));
-	__u32 creatorID __attribute__((packed));
-	__le16 modifyTime __attribute__((packed));
-	__le16 modifyDate __attribute__((packed));
-	__u32 modifierID __attribute__((packed));
-	__le16 lastAccessDate __attribute__((packed));
-	__u16 archiveTime __attribute__((packed));
-	__u16 archiveDate __attribute__((packed));
-	__u32 archiverID __attribute__((packed));
-	__u16 inheritedRightsMask __attribute__((packed));
-	__le32 dirEntNum __attribute__((packed));
-	__le32 DosDirNum __attribute__((packed));
-	__u32 volNumber __attribute__((packed));
-	__u32 EADataSize __attribute__((packed));
-	__u32 EAKeyCount __attribute__((packed));
-	__u32 EAKeySize __attribute__((packed));
-	__u32 NSCreator __attribute__((packed));
-	__u8 nameLen __attribute__((packed));
-	__u8 entryName[256] __attribute__((packed));
+	__u32 spaceAlloc;
+	__le32 attributes;
+	__u16 flags;
+	__le32 dataStreamSize;
+	__le32 totalStreamSize;
+	__u16 numberOfStreams;
+	__le16 creationTime;
+	__le16 creationDate;
+	__u32 creatorID;
+	__le16 modifyTime;
+	__le16 modifyDate;
+	__u32 modifierID;
+	__le16 lastAccessDate;
+	__u16 archiveTime;
+	__u16 archiveDate;
+	__u32 archiverID;
+	__u16 inheritedRightsMask;
+	__le32 dirEntNum;
+	__le32 DosDirNum;
+	__u32 volNumber;
+	__u32 EADataSize;
+	__u32 EAKeyCount;
+	__u32 EAKeySize;
+	__u32 NSCreator;
+	__u8 nameLen;
+	__u8 entryName[256];
 	/* libncp may depend on there being nothing after entryName */
 #ifdef __KERNEL__
 	struct nw_nfs_info nfs;
 #endif
-};
+} __attribute__((packed));
 
 /* modify mask - use with MODIFY_DOS_INFO structure */
 #define DM_ATTRIBUTES		  (cpu_to_le32(0x02))
@@ -176,26 +176,26 @@ struct nw_info_struct {
 #define DM_MAXIMUM_SPACE	  (cpu_to_le32(0x2000))
 
 struct nw_modify_dos_info {
-	__le32 attributes __attribute__((packed));
-	__le16 creationDate __attribute__((packed));
-	__le16 creationTime __attribute__((packed));
-	__u32 creatorID __attribute__((packed));
-	__le16 modifyDate __attribute__((packed));
-	__le16 modifyTime __attribute__((packed));
-	__u32 modifierID __attribute__((packed));
-	__u16 archiveDate __attribute__((packed));
-	__u16 archiveTime __attribute__((packed));
-	__u32 archiverID __attribute__((packed));
-	__le16 lastAccessDate __attribute__((packed));
-	__u16 inheritanceGrantMask __attribute__((packed));
-	__u16 inheritanceRevokeMask __attribute__((packed));
-	__u32 maximumSpace __attribute__((packed));
-};
+	__le32 attributes;
+	__le16 creationDate;
+	__le16 creationTime;
+	__u32 creatorID;
+	__le16 modifyDate;
+	__le16 modifyTime;
+	__u32 modifierID;
+	__u16 archiveDate;
+	__u16 archiveTime;
+	__u32 archiverID;
+	__le16 lastAccessDate;
+	__u16 inheritanceGrantMask;
+	__u16 inheritanceRevokeMask;
+	__u32 maximumSpace;
+} __attribute__((packed));
 
 struct nw_search_sequence {
-	__u8 volNumber __attribute__((packed));
-	__u32 dirBase __attribute__((packed));
-	__u32 sequence __attribute__((packed));
-};
+	__u8 volNumber;
+	__u32 dirBase;
+	__u32 sequence;
+} __attribute__((packed));
 
 #endif				/* _LINUX_NCP_H */
diff --git a/include/linux/sdla.h b/include/linux/sdla.h
index 3b6afb8caa42..564acd3a71c1 100644
--- a/include/linux/sdla.h
+++ b/include/linux/sdla.h
@@ -293,46 +293,46 @@ void sdla(void *cfg_info, char *dev, struct frad_conf *conf, int quiet);
 #define SDLA_S508_INTEN			0x10
 
 struct sdla_cmd {
-   char  opp_flag		__attribute__((packed));
-   char  cmd			__attribute__((packed));
-   short length			__attribute__((packed));
-   char  retval			__attribute__((packed));
-   short dlci			__attribute__((packed));
-   char  flags			__attribute__((packed));
-   short rxlost_int		__attribute__((packed));
-   long  rxlost_app		__attribute__((packed));
-   char  reserve[2]		__attribute__((packed));
-   char  data[SDLA_MAX_DATA]	__attribute__((packed));	/* transfer data buffer */
-};
+   char  opp_flag;
+   char  cmd;
+   short length;
+   char  retval;
+   short dlci;
+   char  flags;
+   short rxlost_int;
+   long  rxlost_app;
+   char  reserve[2];
+   char  data[SDLA_MAX_DATA];	/* transfer data buffer */
+} __attribute__((packed));
 
 struct intr_info {
-   char  flags		__attribute__((packed));
-   short txlen		__attribute__((packed));
-   char  irq		__attribute__((packed));
-   char  flags2		__attribute__((packed));
-   short timeout	__attribute__((packed));
-};
+   char  flags;
+   short txlen;
+   char  irq;
+   char  flags2;
+   short timeout;
+} __attribute__((packed));
 
 /* found in the 508's control window at RXBUF_INFO */
 struct buf_info {
-   unsigned short rse_num	__attribute__((packed));
-   unsigned long  rse_base	__attribute__((packed));
-   unsigned long  rse_next	__attribute__((packed));
-   unsigned long  buf_base	__attribute__((packed));
-   unsigned short reserved	__attribute__((packed));
-   unsigned long  buf_top	__attribute__((packed));
-};
+   unsigned short rse_num;
+   unsigned long  rse_base;
+   unsigned long  rse_next;
+   unsigned long  buf_base;
+   unsigned short reserved;
+   unsigned long  buf_top;
+} __attribute__((packed));
 
 /* structure pointed to by rse_base in RXBUF_INFO struct */
 struct buf_entry {
-   char  opp_flag	__attribute__((packed));
-   short length		__attribute__((packed));
-   short dlci		__attribute__((packed));
-   char  flags		__attribute__((packed));
-   short timestamp	__attribute__((packed));
-   short reserved[2]	__attribute__((packed));
-   long  buf_addr	__attribute__((packed));
-};
+   char  opp_flag;
+   short length;
+   short dlci;
+   char  flags;
+   short timestamp;
+   short reserved[2];
+   long  buf_addr;
+} __attribute__((packed));
 
 #endif
 
diff --git a/include/linux/wavefront.h b/include/linux/wavefront.h
index 61bd0fd35240..51ab3c933acd 100644
--- a/include/linux/wavefront.h
+++ b/include/linux/wavefront.h
@@ -434,22 +434,22 @@ typedef struct wf_multisample {
 } wavefront_multisample;
 
 typedef struct wf_alias {
-    INT16 OriginalSample __attribute__ ((packed));
-
-    struct wf_sample_offset sampleStartOffset __attribute__ ((packed));
-    struct wf_sample_offset loopStartOffset __attribute__ ((packed));
-    struct wf_sample_offset sampleEndOffset __attribute__ ((packed));
-    struct wf_sample_offset loopEndOffset __attribute__ ((packed));
-
-    INT16  FrequencyBias __attribute__ ((packed));
-
-    UCHAR8 SampleResolution:2  __attribute__ ((packed));
-    UCHAR8 Unused1:1  __attribute__ ((packed));
-    UCHAR8 Loop:1 __attribute__ ((packed));
-    UCHAR8 Bidirectional:1  __attribute__ ((packed));
-    UCHAR8 Unused2:1 __attribute__ ((packed));
-    UCHAR8 Reverse:1 __attribute__ ((packed));
-    UCHAR8 Unused3:1 __attribute__ ((packed)); 
+    INT16 OriginalSample;
+
+    struct wf_sample_offset sampleStartOffset;
+    struct wf_sample_offset loopStartOffset;
+    struct wf_sample_offset sampleEndOffset;
+    struct wf_sample_offset loopEndOffset;
+
+    INT16  FrequencyBias;
+
+    UCHAR8 SampleResolution:2;
+    UCHAR8 Unused1:1;
+    UCHAR8 Loop:1;
+    UCHAR8 Bidirectional:1;
+    UCHAR8 Unused2:1;
+    UCHAR8 Reverse:1;
+    UCHAR8 Unused3:1;
     
     /* This structure is meant to be padded only to 16 bits on their
        original. Of course, whoever wrote their documentation didn't
@@ -460,8 +460,8 @@ typedef struct wf_alias {
        standard 16->32 bit issues.
     */
 
-    UCHAR8 sixteen_bit_padding __attribute__ ((packed));
-} wavefront_alias;
+    UCHAR8 sixteen_bit_padding;
+} __attribute__((packed)) wavefront_alias;
 
 typedef struct wf_drum {
     UCHAR8 PatchNumber;
diff --git a/include/net/dn_dev.h b/include/net/dn_dev.h
index 86e8e86e624a..5a86e78081bf 100644
--- a/include/net/dn_dev.h
+++ b/include/net/dn_dev.h
@@ -88,8 +88,8 @@ struct dn_dev {
 	struct net_device *dev;
 	struct dn_dev_parms parms;
 	char use_long;
-        struct timer_list timer;
-        unsigned long t3;
+	struct timer_list timer;
+	unsigned long t3;
 	struct neigh_parms *neigh_parms;
 	unsigned char addr[ETH_ALEN];
 	struct neighbour *router; /* Default router on circuit */
@@ -99,57 +99,57 @@ struct dn_dev {
 
 struct dn_short_packet
 {
-	unsigned char   msgflg          __attribute__((packed));
-        unsigned short  dstnode         __attribute__((packed));
-        unsigned short  srcnode         __attribute__((packed));
-        unsigned char   forward         __attribute__((packed));
-};
+	unsigned char   msgflg;
+	unsigned short  dstnode;
+	unsigned short  srcnode;
+	unsigned char   forward;
+} __attribute__((packed));
 
 struct dn_long_packet
 {
-	unsigned char   msgflg          __attribute__((packed));
-        unsigned char   d_area          __attribute__((packed));
-        unsigned char   d_subarea       __attribute__((packed));
-        unsigned char   d_id[6]         __attribute__((packed));
-        unsigned char   s_area          __attribute__((packed));
-        unsigned char   s_subarea       __attribute__((packed));
-        unsigned char   s_id[6]         __attribute__((packed));
-        unsigned char   nl2             __attribute__((packed));
-        unsigned char   visit_ct        __attribute__((packed));
-        unsigned char   s_class         __attribute__((packed));
-        unsigned char   pt              __attribute__((packed));
-};
+	unsigned char   msgflg;
+	unsigned char   d_area;
+	unsigned char   d_subarea;
+	unsigned char   d_id[6];
+	unsigned char   s_area;
+	unsigned char   s_subarea;
+	unsigned char   s_id[6];
+	unsigned char   nl2;
+	unsigned char   visit_ct;
+	unsigned char   s_class;
+	unsigned char   pt;
+} __attribute__((packed));
 
 /*------------------------- DRP - Routing messages ---------------------*/
 
 struct endnode_hello_message
 {
-	unsigned char   msgflg          __attribute__((packed));
-        unsigned char   tiver[3]        __attribute__((packed));
-        unsigned char   id[6]           __attribute__((packed));
-        unsigned char   iinfo           __attribute__((packed));
-        unsigned short  blksize         __attribute__((packed));
-        unsigned char   area            __attribute__((packed));
-        unsigned char   seed[8]         __attribute__((packed));
-        unsigned char   neighbor[6]     __attribute__((packed));
-        unsigned short  timer           __attribute__((packed));
-        unsigned char   mpd             __attribute__((packed));
-        unsigned char   datalen         __attribute__((packed));
-        unsigned char   data[2]         __attribute__((packed));
-};
+	unsigned char   msgflg;
+	unsigned char   tiver[3];
+	unsigned char   id[6];
+	unsigned char   iinfo;
+	unsigned short  blksize;
+	unsigned char   area;
+	unsigned char   seed[8];
+	unsigned char   neighbor[6];
+	unsigned short  timer;
+	unsigned char   mpd;
+	unsigned char   datalen;
+	unsigned char   data[2];
+} __attribute__((packed));
 
 struct rtnode_hello_message
 {
-	unsigned char   msgflg          __attribute__((packed));
-        unsigned char   tiver[3]        __attribute__((packed));
-        unsigned char   id[6]           __attribute__((packed));
-        unsigned char   iinfo           __attribute__((packed));
-        unsigned short  blksize         __attribute__((packed));
-        unsigned char   priority        __attribute__((packed));
-        unsigned char   area            __attribute__((packed));
-        unsigned short  timer           __attribute__((packed));
-        unsigned char   mpd             __attribute__((packed));
-};
+	unsigned char   msgflg;
+	unsigned char   tiver[3];
+	unsigned char   id[6];
+	unsigned char   iinfo;
+	unsigned short  blksize;
+	unsigned char   priority;
+	unsigned char   area;
+	unsigned short  timer;
+	unsigned char   mpd;
+} __attribute__((packed));
 
 
 extern void dn_dev_init(void);
diff --git a/include/net/dn_nsp.h b/include/net/dn_nsp.h
index 1ba03be0af3a..e6182b86262b 100644
--- a/include/net/dn_nsp.h
+++ b/include/net/dn_nsp.h
@@ -72,78 +72,78 @@ extern struct sk_buff *dn_alloc_send_skb(struct sock *sk, size_t *size, int nobl
 
 struct nsp_data_seg_msg
 {
-	unsigned char   msgflg          __attribute__((packed));
-	unsigned short  dstaddr         __attribute__((packed));
-	unsigned short  srcaddr         __attribute__((packed));
-};
+	unsigned char   msgflg;
+	unsigned short  dstaddr;
+	unsigned short  srcaddr;
+} __attribute__((packed));
 
 struct nsp_data_opt_msg
 {
-	unsigned short  acknum          __attribute__((packed));
-	unsigned short  segnum          __attribute__((packed));
-	unsigned short  lsflgs          __attribute__((packed));
-};
+	unsigned short  acknum;
+	unsigned short  segnum;
+	unsigned short  lsflgs;
+} __attribute__((packed));
 
 struct nsp_data_opt_msg1
 {
-	unsigned short  acknum          __attribute__((packed));
-	unsigned short  segnum          __attribute__((packed));
-};
+	unsigned short  acknum;
+	unsigned short  segnum;
+} __attribute__((packed));
 
 
 /* Acknowledgment Message (data/other data)                             */
 struct nsp_data_ack_msg
 {
-	unsigned char   msgflg          __attribute__((packed));
-	unsigned short  dstaddr         __attribute__((packed));
-	unsigned short  srcaddr         __attribute__((packed));
-	unsigned short  acknum          __attribute__((packed));
-};
+	unsigned char   msgflg;
+	unsigned short  dstaddr;
+	unsigned short  srcaddr;
+	unsigned short  acknum;
+} __attribute__((packed));
 
 /* Connect Acknowledgment Message */
 struct  nsp_conn_ack_msg
 {
-	unsigned char   msgflg          __attribute__((packed));
-	unsigned short  dstaddr         __attribute__((packed));
-};
+	unsigned char   msgflg;
+	unsigned short  dstaddr;
+} __attribute__((packed));
 
 
 /* Connect Initiate/Retransmit Initiate/Connect Confirm */
 struct  nsp_conn_init_msg
 {
-	unsigned char   msgflg          __attribute__((packed));
+	unsigned char   msgflg;
 #define NSP_CI      0x18            /* Connect Initiate     */
 #define NSP_RCI     0x68            /* Retrans. Conn Init   */
-	unsigned short  dstaddr         __attribute__((packed));
-        unsigned short  srcaddr         __attribute__((packed));
-        unsigned char   services        __attribute__((packed));
+	unsigned short  dstaddr;
+	unsigned short  srcaddr;
+	unsigned char   services;
 #define NSP_FC_NONE   0x00            /* Flow Control None    */
 #define NSP_FC_SRC    0x04            /* Seg Req. Count       */
 #define NSP_FC_SCMC   0x08            /* Sess. Control Mess   */
 #define NSP_FC_MASK   0x0c            /* FC type mask         */
-	unsigned char   info            __attribute__((packed));
-        unsigned short  segsize         __attribute__((packed));
-};
+	unsigned char   info;
+	unsigned short  segsize;
+} __attribute__((packed));
 
 /* Disconnect Initiate/Disconnect Confirm */
 struct  nsp_disconn_init_msg
 {
-	unsigned char   msgflg          __attribute__((packed));
-        unsigned short  dstaddr         __attribute__((packed));
-        unsigned short  srcaddr         __attribute__((packed));
-        unsigned short  reason          __attribute__((packed));
-};
+	unsigned char   msgflg;
+	unsigned short  dstaddr;
+	unsigned short  srcaddr;
+	unsigned short  reason;
+} __attribute__((packed));
 
 
 struct  srcobj_fmt
 {
-	char            format          __attribute__((packed));
-        unsigned char   task            __attribute__((packed));
-        unsigned short  grpcode         __attribute__((packed));
-        unsigned short  usrcode         __attribute__((packed));
-        char            dlen            __attribute__((packed));
-};
+	char            format;
+	unsigned char   task;
+	unsigned short  grpcode;
+	unsigned short  usrcode;
+	char            dlen;
+} __attribute__((packed));
 
 /*
  * A collection of functions for manipulating the sequence
diff --git a/include/sound/wavefront.h b/include/sound/wavefront.h
index 9e572aed2435..15d82e594b56 100644
--- a/include/sound/wavefront.h
+++ b/include/sound/wavefront.h
@@ -454,22 +454,22 @@ typedef struct wf_multisample {
 } wavefront_multisample;
 
 typedef struct wf_alias {
-    s16 OriginalSample __attribute__ ((packed));
-
-    struct wf_sample_offset sampleStartOffset __attribute__ ((packed));
-    struct wf_sample_offset loopStartOffset __attribute__ ((packed));
-    struct wf_sample_offset sampleEndOffset __attribute__ ((packed));
-    struct wf_sample_offset loopEndOffset __attribute__ ((packed));
-
-    s16  FrequencyBias __attribute__ ((packed));
-
-    u8 SampleResolution:2  __attribute__ ((packed));
-    u8 Unused1:1  __attribute__ ((packed));
-    u8 Loop:1 __attribute__ ((packed));
-    u8 Bidirectional:1  __attribute__ ((packed));
-    u8 Unused2:1 __attribute__ ((packed));
-    u8 Reverse:1 __attribute__ ((packed));
-    u8 Unused3:1 __attribute__ ((packed)); 
+    s16 OriginalSample;
+
+    struct wf_sample_offset sampleStartOffset;
+    struct wf_sample_offset loopStartOffset;
+    struct wf_sample_offset sampleEndOffset;
+    struct wf_sample_offset loopEndOffset;
+
+    s16  FrequencyBias;
+
+    u8 SampleResolution:2;
+    u8 Unused1:1;
+    u8 Loop:1;
+    u8 Bidirectional:1;
+    u8 Unused2:1;
+    u8 Reverse:1;
+    u8 Unused3:1;
     
     /* This structure is meant to be padded only to 16 bits on their
        original. Of course, whoever wrote their documentation didn't
@@ -480,8 +480,8 @@ typedef struct wf_alias {
        standard 16->32 bit issues.
     */
 
-    u8 sixteen_bit_padding __attribute__ ((packed));
-} wavefront_alias;
+    u8 sixteen_bit_padding;
+} __attribute__((packed)) wavefront_alias;
 
 typedef struct wf_drum {
     u8 PatchNumber;
-- 
cgit v1.2.3-71-gd317


From ef9ceab28203690a42d7d3915ccf6e208f0762bc Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 8 Jan 2006 01:05:10 -0800
Subject: [PATCH] remove semicolons from save_flags()

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/interrupt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 41f150a3d2dd..e50a95fbeb11 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -79,7 +79,7 @@ static inline void __deprecated save_flags(unsigned long *x)
 {
 	local_save_flags(*x);
 }
-#define save_flags(x) save_flags(&x);
+#define save_flags(x) save_flags(&x)
 static inline void __deprecated restore_flags(unsigned long x)
 {
 	local_irq_restore(x);
-- 
cgit v1.2.3-71-gd317


From 730745a5c45093982112ddc94cee6a9973455641 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sat, 7 Jan 2006 11:30:44 +1100
Subject: [PATCH] 1/5 powerpc: Rework PowerMac i2c part 1

This is the first part of a rework of the PowerMac i2c code. It
completely reworks the "low_i2c" layer. It is now more flexible,
supports KeyWest, SMU and PMU i2c busses, and provides functions to
match device nodes to i2c busses and adapters.

This patch also extends & fix some bugs in the SMU driver related to i2c
support and removes the clock spreading hacks from the pmac feature code
rather than adapting them to the new API since they'll be replaced by
the platform function code completely in patch 3/5

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/platforms/powermac/feature.c | 127 -----
 arch/powerpc/platforms/powermac/low_i2c.c | 853 ++++++++++++++++++++++++------
 arch/powerpc/platforms/powermac/setup.c   |  23 +-
 arch/powerpc/platforms/powermac/smp.c     |  75 +--
 drivers/i2c/busses/i2c-pmac-smu.c         |  17 +-
 drivers/macintosh/smu.c                   |  58 +-
 drivers/macintosh/via-pmu.c               | 264 +--------
 include/asm-powerpc/pmac_feature.h        |   4 -
 include/asm-powerpc/pmac_low_i2c.h        |  85 ++-
 include/asm-powerpc/smu.h                 |  12 +-
 include/linux/pmu.h                       |   8 +-
 11 files changed, 859 insertions(+), 667 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c
index d2915d64d45e..b271b11583ac 100644
--- a/arch/powerpc/platforms/powermac/feature.c
+++ b/arch/powerpc/platforms/powermac/feature.c
@@ -1677,124 +1677,6 @@ intrepid_shutdown(struct macio_chip *macio, int sleep_mode)
 }
 
 
-void pmac_tweak_clock_spreading(int enable)
-{
-	struct macio_chip *macio = &macio_chips[0];
-
-	/* Hack for doing clock spreading on some machines PowerBooks and
-	 * iBooks. This implements the "platform-do-clockspreading" OF
-	 * property as decoded manually on various models. For safety, we also
-	 * check the product ID in the device-tree in cases we'll whack the i2c
-	 * chip to make reasonably sure we won't set wrong values in there
-	 *
-	 * Of course, ultimately, we have to implement a real parser for
-	 * the platform-do-* stuff...
-	 */
-
-	if (macio->type == macio_intrepid) {
-		struct device_node *clock =
-			of_find_node_by_path("/uni-n@f8000000/hw-clock");
-		if (clock && get_property(clock, "platform-do-clockspreading",
-					  NULL)) {
-			printk(KERN_INFO "%sabling clock spreading on Intrepid"
-			       " ASIC\n", enable ? "En" : "Dis");
-			if (enable)
-				UN_OUT(UNI_N_CLOCK_SPREADING, 2);
-			else
-				UN_OUT(UNI_N_CLOCK_SPREADING, 0);
-			mdelay(40);
-		}
-		of_node_put(clock);
-	}
-
-	while (machine_is_compatible("PowerBook5,2") ||
-	       machine_is_compatible("PowerBook5,3") ||
-	       machine_is_compatible("PowerBook6,2") ||
-	       machine_is_compatible("PowerBook6,3")) {
-		struct device_node *ui2c = of_find_node_by_type(NULL, "i2c");
-		struct device_node *dt = of_find_node_by_name(NULL, "device-tree");
-		u8 buffer[9];
-		u32 *productID;
-		int i, rc, changed = 0;
-
-		if (dt == NULL)
-			break;
-		productID = (u32 *)get_property(dt, "pid#", NULL);
-		if (productID == NULL)
-			break;
-		while(ui2c) {
-			struct device_node *p = of_get_parent(ui2c);
-			if (p && !strcmp(p->name, "uni-n"))
-				break;
-			ui2c = of_find_node_by_type(ui2c, "i2c");
-		}
-		if (ui2c == NULL)
-			break;
-		DBG("Trying to bump clock speed for PID: %08x...\n", *productID);
-		rc = pmac_low_i2c_open(ui2c, 1);
-		if (rc != 0)
-			break;
-		pmac_low_i2c_setmode(ui2c, pmac_low_i2c_mode_combined);
-		rc = pmac_low_i2c_xfer(ui2c, 0xd2 | pmac_low_i2c_read, 0x80, buffer, 9);
-		DBG("read result: %d,", rc);
-		if (rc != 0) {
-			pmac_low_i2c_close(ui2c);
-			break;
-		}
-		for (i=0; i<9; i++)
-			DBG(" %02x", buffer[i]);
-		DBG("\n");
-
-		switch(*productID) {
-		case 0x1182:	/* AlBook 12" rev 2 */
-		case 0x1183:	/* iBook G4 12" */
-			buffer[0] = (buffer[0] & 0x8f) | 0x70;
-			buffer[2] = (buffer[2] & 0x7f) | 0x00;
-			buffer[5] = (buffer[5] & 0x80) | 0x31;
-			buffer[6] = (buffer[6] & 0x40) | 0xb0;
-			buffer[7] = (buffer[7] & 0x00) | (enable ? 0xc0 : 0xba);
-			buffer[8] = (buffer[8] & 0x00) | 0x30;
-			changed = 1;
-			break;
-		case 0x3142:	/* AlBook 15" (ATI M10) */
-		case 0x3143:	/* AlBook 17" (ATI M10) */
-			buffer[0] = (buffer[0] & 0xaf) | 0x50;
-			buffer[2] = (buffer[2] & 0x7f) | 0x00;
-			buffer[5] = (buffer[5] & 0x80) | 0x31;
-			buffer[6] = (buffer[6] & 0x40) | 0xb0;
-			buffer[7] = (buffer[7] & 0x00) | (enable ? 0xd0 : 0xc0);
-			buffer[8] = (buffer[8] & 0x00) | 0x30;
-			changed = 1;
-			break;
-		default:
-			DBG("i2c-hwclock: Machine model not handled\n");
-			break;
-		}
-		if (!changed) {
-			pmac_low_i2c_close(ui2c);
-			break;
-		}
-		printk(KERN_INFO "%sabling clock spreading on i2c clock chip\n",
-		       enable ? "En" : "Dis");
-
-		pmac_low_i2c_setmode(ui2c, pmac_low_i2c_mode_stdsub);
-		rc = pmac_low_i2c_xfer(ui2c, 0xd2 | pmac_low_i2c_write, 0x80, buffer, 9);
-		DBG("write result: %d,", rc);
-		pmac_low_i2c_setmode(ui2c, pmac_low_i2c_mode_combined);
-		rc = pmac_low_i2c_xfer(ui2c, 0xd2 | pmac_low_i2c_read, 0x80, buffer, 9);
-		DBG("read result: %d,", rc);
-		if (rc != 0) {
-			pmac_low_i2c_close(ui2c);
-			break;
-		}
-		for (i=0; i<9; i++)
-			DBG(" %02x", buffer[i]);
-		pmac_low_i2c_close(ui2c);
-		break;
-	}
-}
-
-
 static int
 core99_sleep(void)
 {
@@ -2980,12 +2862,6 @@ set_initial_features(void)
 		MACIO_BIC(HEATHROW_FCR, HRW_SOUND_POWER_N);
 	}
 
-	/* Some machine models need the clock chip to be properly setup for
-	 * clock spreading now. This should be a platform function but we
-	 * don't do these at the moment
-	 */
-	pmac_tweak_clock_spreading(1);
-
 #endif /* CONFIG_POWER4 */
 
 	/* On all machines, switch modem & serial ports off */
@@ -3013,9 +2889,6 @@ pmac_feature_init(void)
 		return;
 	}
 
-	/* Setup low-level i2c stuffs */
-	pmac_init_low_i2c();
-
 	/* Probe machine type */
 	if (probe_motherboard())
 		printk(KERN_WARNING "Unknown PowerMac !\n");
diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c
index 606e0ed13731..f31d6a678b9e 100644
--- a/arch/powerpc/platforms/powermac/low_i2c.c
+++ b/arch/powerpc/platforms/powermac/low_i2c.c
@@ -1,22 +1,34 @@
 /*
- *  arch/ppc/platforms/pmac_low_i2c.c
+ * arch/powerpc/platforms/powermac/low_i2c.c
  *
- *  Copyright (C) 2003 Ben. Herrenschmidt (benh@kernel.crashing.org)
+ *  Copyright (C) 2003-2005 Ben. Herrenschmidt (benh@kernel.crashing.org)
  *
  *  This program is free software; you can redistribute it and/or
  *  modify it under the terms of the GNU General Public License
  *  as published by the Free Software Foundation; either version
  *  2 of the License, or (at your option) any later version.
  *
- *  This file contains some low-level i2c access routines that
- *  need to be used by various bits of the PowerMac platform code
- *  at times where the real asynchronous & interrupt driven driver
- *  cannot be used. The API borrows some semantics from the darwin
- *  driver in order to ease the implementation of the platform
- *  properties parser
+ * The linux i2c layer isn't completely suitable for our needs for various
+ * reasons ranging from too late initialisation to semantics not perfectly
+ * matching some requirements of the apple platform functions etc...
+ *
+ * This file thus provides a simple low level unified i2c interface for
+ * powermac that covers the various types of i2c busses used in Apple machines.
+ * For now, keywest, PMU and SMU, though we could add Cuda, or other bit
+ * banging busses found on older chipstes in earlier machines if we ever need
+ * one of them.
+ *
+ * The drivers in this file are synchronous/blocking. In addition, the
+ * keywest one is fairly slow due to the use of msleep instead of interrupts
+ * as the interrupt is currently used by i2c-keywest. In the long run, we
+ * might want to get rid of those high-level interfaces to linux i2c layer
+ * either completely (converting all drivers) or replacing them all with a
+ * single stub driver on top of this one. Once done, the interrupt will be
+ * available for our use.
  */
 
 #undef DEBUG
+#undef DEBUG_LOW
 
 #include <linux/config.h>
 #include <linux/types.h>
@@ -25,15 +37,16 @@
 #include <linux/module.h>
 #include <linux/adb.h>
 #include <linux/pmu.h>
+#include <linux/delay.h>
+#include <linux/completion.h>
 #include <asm/keylargo.h>
 #include <asm/uninorth.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/machdep.h>
+#include <asm/smu.h>
 #include <asm/pmac_low_i2c.h>
 
-#define MAX_LOW_I2C_HOST	4
-
 #ifdef DEBUG
 #define DBG(x...) do {\
 		printk(KERN_DEBUG "low_i2c:" x);	\
@@ -42,49 +55,54 @@
 #define DBG(x...)
 #endif
 
-struct low_i2c_host;
-
-typedef int (*low_i2c_func_t)(struct low_i2c_host *host, u8 addr, u8 sub, u8 *data, int len);
-
-struct low_i2c_host
-{
-	struct device_node	*np;		/* OF device node */
-	struct semaphore	mutex;		/* Access mutex for use by i2c-keywest */
-	low_i2c_func_t		func;		/* Access function */
-	unsigned int		is_open : 1;	/* Poor man's access control */
-	int			mode;		/* Current mode */
-	int			channel;	/* Current channel */
-	int			num_channels;	/* Number of channels */
-	void __iomem		*base;		/* For keywest-i2c, base address */
-	int			bsteps;		/* And register stepping */
-	int			speed;		/* And speed */
-};
-
-static struct low_i2c_host	low_i2c_hosts[MAX_LOW_I2C_HOST];
+#ifdef DEBUG_LOW
+#define DBG_LOW(x...) do {\
+		printk(KERN_DEBUG "low_i2c:" x);	\
+	} while(0)
+#else
+#define DBG_LOW(x...)
+#endif
 
-/* No locking is necessary on allocation, we are running way before
- * anything can race with us
+/*
+ * A bus structure. Each bus in the system has such a structure associated.
  */
-static struct low_i2c_host *find_low_i2c_host(struct device_node *np)
+struct pmac_i2c_bus
 {
-	int i;
+	struct list_head	link;
+	struct device_node	*controller;
+	struct device_node	*busnode;
+	int			type;
+	int			flags;
+	struct i2c_adapter	*adapter;
+	void			*hostdata;
+	int			channel;	/* some hosts have multiple */
+	int			mode;		/* current mode */
+	struct semaphore	sem;
+	int			opened;
+	int			polled;		/* open mode */
+
+	/* ops */
+	int (*open)(struct pmac_i2c_bus *bus);
+	void (*close)(struct pmac_i2c_bus *bus);
+	int (*xfer)(struct pmac_i2c_bus *bus, u8 addrdir, int subsize,
+		    u32 subaddr, u8 *data, int len);
+};
 
-	for (i = 0; i < MAX_LOW_I2C_HOST; i++)
-		if (low_i2c_hosts[i].np == np)
-			return &low_i2c_hosts[i];
-	return NULL;
-}
+static LIST_HEAD(pmac_i2c_busses);
 
 /*
- *
- * i2c-keywest implementation (UniNorth, U2, U3, Keylargo's)
- *
+ * Keywest implementation
  */
 
-/*
- * Keywest i2c definitions borrowed from drivers/i2c/i2c-keywest.h,
- * should be moved somewhere in include/asm-ppc/
- */
+struct pmac_i2c_host_kw
+{
+	struct semaphore	mutex;		/* Access mutex for use by
+						 * i2c-keywest */
+	void __iomem		*base;		/* register base address */
+	int			bsteps;		/* register stepping */
+	int			speed;		/* speed */
+};
+
 /* Register indices */
 typedef enum {
 	reg_mode = 0,
@@ -153,52 +171,56 @@ static const char *__kw_state_names[] = {
 	"state_dead"
 };
 
-static inline u8 __kw_read_reg(struct low_i2c_host *host, reg_t reg)
+static inline u8 __kw_read_reg(struct pmac_i2c_bus *bus, reg_t reg)
 {
+	struct pmac_i2c_host_kw *host = bus->hostdata;
 	return readb(host->base + (((unsigned int)reg) << host->bsteps));
 }
 
-static inline void __kw_write_reg(struct low_i2c_host *host, reg_t reg, u8 val)
+static inline void __kw_write_reg(struct pmac_i2c_bus *bus, reg_t reg, u8 val)
 {
+	struct pmac_i2c_host_kw *host = bus->hostdata;
 	writeb(val, host->base + (((unsigned)reg) << host->bsteps));
-	(void)__kw_read_reg(host, reg_subaddr);
+	(void)__kw_read_reg(bus, reg_subaddr);
 }
 
-#define kw_write_reg(reg, val)	__kw_write_reg(host, reg, val) 
-#define kw_read_reg(reg)	__kw_read_reg(host, reg) 
+#define kw_write_reg(reg, val)	__kw_write_reg(bus, reg, val)
+#define kw_read_reg(reg)	__kw_read_reg(bus, reg)
 
-
-/* Don't schedule, the g5 fan controller is too
- * timing sensitive
- */
-static u8 kw_wait_interrupt(struct low_i2c_host* host)
+static u8 kw_i2c_wait_interrupt(struct pmac_i2c_bus* bus)
 {
 	int i, j;
 	u8 isr;
 	
-	for (i = 0; i < 100000; i++) {
+	for (i = 0; i < 1000; i++) {
 		isr = kw_read_reg(reg_isr) & KW_I2C_IRQ_MASK;
 		if (isr != 0)
 			return isr;
 
 		/* This code is used with the timebase frozen, we cannot rely
-		 * on udelay ! For now, just use a bogus loop
+		 * on udelay nor schedule when in polled mode !
+		 * For now, just use a bogus loop....
 		 */
-		for (j = 1; j < 10000; j++)
-			mb();
+		if (bus->polled) {
+			for (j = 1; j < 1000000; j++)
+				mb();
+		} else
+			msleep(1);
 	}
 	return isr;
 }
 
-static int kw_handle_interrupt(struct low_i2c_host *host, int state, int rw, int *rc, u8 **data, int *len, u8 isr)
+static int kw_i2c_handle_interrupt(struct pmac_i2c_bus *bus, int state, int rw,
+				   int *rc, u8 **data, int *len, u8 isr)
 {
 	u8 ack;
 
-	DBG("kw_handle_interrupt(%s, isr: %x)\n", __kw_state_names[state], isr);
+	DBG_LOW("kw_handle_interrupt(%s, isr: %x)\n",
+		__kw_state_names[state], isr);
 
 	if (isr == 0) {
 		if (state != state_stop) {
-			DBG("KW: Timeout !\n");
+			DBG_LOW("KW: Timeout !\n");
 			*rc = -EIO;
 			goto stop;
 		}
@@ -220,15 +242,16 @@ static int kw_handle_interrupt(struct low_i2c_host *host, int state, int rw, int
 			*rc = -EIO;
 			goto stop;
 		}
-		if ((ack & KW_I2C_STAT_LAST_AAK) == 0) {			
+		if ((ack & KW_I2C_STAT_LAST_AAK) == 0) {
 			*rc = -ENODEV;
-			DBG("KW: NAK on address\n");
+			DBG_LOW("KW: NAK on address\n");
 			return state_stop;		     
 		} else {
 			if (rw) {
 				state = state_read;
 				if (*len > 1)
-					kw_write_reg(reg_control, KW_I2C_CTL_AAK);
+					kw_write_reg(reg_control,
+						     KW_I2C_CTL_AAK);
 			} else {
 				state = state_write;
 				kw_write_reg(reg_data, **data);
@@ -250,7 +273,7 @@ static int kw_handle_interrupt(struct low_i2c_host *host, int state, int rw, int
 		} else if (state == state_write) {
 			ack = kw_read_reg(reg_status);
 			if ((ack & KW_I2C_STAT_LAST_AAK) == 0) {
-				DBG("KW: nack on data write\n");
+				DBG_LOW("KW: nack on data write\n");
 				*rc = -EIO;
 				goto stop;
 			} else if (*len) {
@@ -291,35 +314,57 @@ static int kw_handle_interrupt(struct low_i2c_host *host, int state, int rw, int
 	return state_stop;
 }
 
-static int keywest_low_i2c_func(struct low_i2c_host *host, u8 addr, u8 subaddr, u8 *data, int len)
+static int kw_i2c_open(struct pmac_i2c_bus *bus)
 {
+	struct pmac_i2c_host_kw *host = bus->hostdata;
+	down(&host->mutex);
+	return 0;
+}
+
+static void kw_i2c_close(struct pmac_i2c_bus *bus)
+{
+	struct pmac_i2c_host_kw *host = bus->hostdata;
+	up(&host->mutex);
+}
+
+static int kw_i2c_xfer(struct pmac_i2c_bus *bus, u8 addrdir, int subsize,
+		       u32 subaddr, u8 *data, int len)
+{
+	struct pmac_i2c_host_kw *host = bus->hostdata;
 	u8 mode_reg = host->speed;
 	int state = state_addr;
 	int rc = 0;
 
 	/* Setup mode & subaddress if any */
-	switch(host->mode) {
-	case pmac_low_i2c_mode_dumb:
-		printk(KERN_ERR "low_i2c: Dumb mode not supported !\n");
+	switch(bus->mode) {
+	case pmac_i2c_mode_dumb:
 		return -EINVAL;
-	case pmac_low_i2c_mode_std:
+	case pmac_i2c_mode_std:
 		mode_reg |= KW_I2C_MODE_STANDARD;
+		if (subsize != 0)
+			return -EINVAL;
 		break;
-	case pmac_low_i2c_mode_stdsub:
+	case pmac_i2c_mode_stdsub:
 		mode_reg |= KW_I2C_MODE_STANDARDSUB;
+		if (subsize != 1)
+			return -EINVAL;
 		break;
-	case pmac_low_i2c_mode_combined:
+	case pmac_i2c_mode_combined:
 		mode_reg |= KW_I2C_MODE_COMBINED;
+		if (subsize != 1)
+			return -EINVAL;
 		break;
 	}
 
 	/* Setup channel & clear pending irqs */
 	kw_write_reg(reg_isr, kw_read_reg(reg_isr));
-	kw_write_reg(reg_mode, mode_reg | (host->channel << 4));
+	kw_write_reg(reg_mode, mode_reg | (bus->channel << 4));
 	kw_write_reg(reg_status, 0);
 
-	/* Set up address and r/w bit */
-	kw_write_reg(reg_addr, addr);
+	/* Set up address and r/w bit, strip possible stale bus number from
+	 * address top bits
+	 */
+	kw_write_reg(reg_addr, addrdir & 0xff);
 
 	/* Set up the sub address */
 	if ((mode_reg & KW_I2C_MODE_MODE_MASK) == KW_I2C_MODE_STANDARDSUB
@@ -330,27 +375,27 @@ static int keywest_low_i2c_func(struct low_i2c_host *host, u8 addr, u8 subaddr,
 	kw_write_reg(reg_ier, 0 /*KW_I2C_IRQ_MASK*/);
 	kw_write_reg(reg_control, KW_I2C_CTL_XADDR);
 
-	/* State machine, to turn into an interrupt handler */
+	/* State machine, to turn into an interrupt handler in the future */
 	while(state != state_idle) {
-		u8 isr = kw_wait_interrupt(host);
-		state = kw_handle_interrupt(host, state, addr & 1, &rc, &data, &len, isr);
+		u8 isr = kw_i2c_wait_interrupt(bus);
+		state = kw_i2c_handle_interrupt(bus, state, addrdir & 1, &rc,
+						&data, &len, isr);
 	}
 
 	return rc;
 }
 
-static void keywest_low_i2c_add(struct device_node *np)
+static struct pmac_i2c_host_kw *__init kw_i2c_host_init(struct device_node *np)
 {
-	struct low_i2c_host	*host = find_low_i2c_host(NULL);
+	struct pmac_i2c_host_kw *host;
 	u32			*psteps, *prate, *addrp, steps;
-	struct device_node	*parent;
 
+	host = kzalloc(sizeof(struct pmac_i2c_host_kw), GFP_KERNEL);
 	if (host == NULL) {
 		printk(KERN_ERR "low_i2c: Can't allocate host for %s\n",
 		       np->full_name);
-		return;
+		return NULL;
 	}
-	memset(host, 0, sizeof(*host));
 
 	/* Apple is kind enough to provide a valid AAPL,address property
 	 * on all i2c keywest nodes so far ... we would have to fallback
@@ -360,18 +405,14 @@ static void keywest_low_i2c_add(struct device_node *np)
 	if (addrp == NULL) {
 		printk(KERN_ERR "low_i2c: Can't find address for %s\n",
 		       np->full_name);
-		return;
+		kfree(host);
+		return NULL;
 	}
 	init_MUTEX(&host->mutex);
-	host->np = of_node_get(np);	
 	psteps = (u32 *)get_property(np, "AAPL,address-step", NULL);
 	steps = psteps ? (*psteps) : 0x10;
 	for (host->bsteps = 0; (steps & 0x01) == 0; host->bsteps++)
 		steps >>= 1;
-	parent = of_get_parent(np);
-	host->num_channels = 1;
-	if (parent && parent->name[0] == 'u')
-		host->num_channels = 2;
 	/* Select interface rate */
 	host->speed = KW_I2C_MODE_25KHZ;
 	prate = (u32 *)get_property(np, "AAPL,i2c-rate", NULL);
@@ -387,148 +428,620 @@ static void keywest_low_i2c_add(struct device_node *np)
 		break;
 	}	
 
-	printk(KERN_INFO "low_i2c: Bus %s found at 0x%08x, %d channels,"
-	       " speed = %d KHz\n",
-	       np->full_name, *addrp, host->num_channels, prate ? *prate : 25);
-
-	host->mode = pmac_low_i2c_mode_std;
+	printk(KERN_INFO "KeyWest i2c @0x%08x %s\n", *addrp, np->full_name);
 	host->base = ioremap((*addrp), 0x1000);
-	host->func = keywest_low_i2c_func;
+
+	return host;
 }
 
+
+static void __init kw_i2c_add(struct pmac_i2c_host_kw *host,
+			      struct device_node *controller,
+			      struct device_node *busnode,
+			      int channel)
+{
+	struct pmac_i2c_bus *bus;
+
+	bus = kzalloc(sizeof(struct pmac_i2c_bus), GFP_KERNEL);
+	if (bus == NULL)
+		return;
+
+	bus->controller = of_node_get(controller);
+	bus->busnode = of_node_get(busnode);
+	bus->type = pmac_i2c_bus_keywest;
+	bus->hostdata = host;
+	bus->channel = channel;
+	bus->mode = pmac_i2c_mode_std;
+	bus->open = kw_i2c_open;
+	bus->close = kw_i2c_close;
+	bus->xfer = kw_i2c_xfer;
+	init_MUTEX(&bus->sem);
+	if (controller == busnode)
+		bus->flags = pmac_i2c_multibus;
+	list_add(&bus->link, &pmac_i2c_busses);
+
+	printk(KERN_INFO " channel %d bus %s\n", channel,
+	       (controller == busnode) ? "<multibus>" : busnode->full_name);
+}
+
+static void __init kw_i2c_probe(void)
+{
+	struct device_node *np, *child, *parent;
+
+	/* Probe keywest-i2c busses */
+	for (np = NULL;
+	     (np = of_find_compatible_node(np, "i2c","keywest-i2c")) != NULL;){
+		struct pmac_i2c_host_kw *host;
+		int multibus, chans, i;
+
+		/* Found one, init a host structure */
+		host = kw_i2c_host_init(np);
+		if (host == NULL)
+			continue;
+
+		/* Now check if we have a multibus setup (old style) or if we
+		 * have proper bus nodes. Note that the "new" way (proper bus
+		 * nodes) might cause us to not create some busses that are
+		 * kept hidden in the device-tree. In the future, we might
+		 * want to work around that by creating busses without a node
+		 * but not for now
+		 */
+		child = of_get_next_child(np, NULL);
+		multibus = !child || strcmp(child->name, "i2c-bus");
+		of_node_put(child);
+
+		/* For a multibus setup, we get the bus count based on the
+		 * parent type
+		 */
+		if (multibus) {
+			parent = of_get_parent(np);
+			if (parent == NULL)
+				continue;
+			chans = parent->name[0] == 'u' ? 2 : 1;
+			for (i = 0; i < chans; i++)
+				kw_i2c_add(host, np, np, i);
+		} else {
+			for (child = NULL;
+			     (child = of_get_next_child(np, child)) != NULL;) {
+				u32 *reg =
+					(u32 *)get_property(child, "reg", NULL);
+				if (reg == NULL)
+					continue;
+				kw_i2c_add(host, np, child, *reg);
+			}
+		}
+	}
+}
+
+
 /*
  *
  * PMU implementation
  *
  */
 
-
 #ifdef CONFIG_ADB_PMU
 
-static int pmu_low_i2c_func(struct low_i2c_host *host, u8 addr, u8 sub, u8 *data, int len)
+/*
+ * i2c command block to the PMU
+ */
+struct pmu_i2c_hdr {
+	u8	bus;
+	u8	mode;
+	u8	bus2;
+	u8	address;
+	u8	sub_addr;
+	u8	comb_addr;
+	u8	count;
+	u8	data[];
+};
+
+static void pmu_i2c_complete(struct adb_request *req)
 {
-	// TODO
-	return -ENODEV;
+	complete(req->arg);
 }
 
-static void pmu_low_i2c_add(struct device_node *np)
+static int pmu_i2c_xfer(struct pmac_i2c_bus *bus, u8 addrdir, int subsize,
+			u32 subaddr, u8 *data, int len)
 {
-	struct low_i2c_host	*host = find_low_i2c_host(NULL);
+	struct adb_request *req = bus->hostdata;
+	struct pmu_i2c_hdr *hdr = (struct pmu_i2c_hdr *)&req->data[1];
+	struct completion comp;
+	int read = addrdir & 1;
+	int retry;
+	int rc = 0;
 
-	if (host == NULL) {
-		printk(KERN_ERR "low_i2c: Can't allocate host for %s\n",
-		       np->full_name);
-		return;
+	/* For now, limit ourselves to 16 bytes transfers */
+	if (len > 16)
+		return -EINVAL;
+
+	init_completion(&comp);
+
+	for (retry = 0; retry < 16; retry++) {
+		memset(req, 0, sizeof(struct adb_request));
+		hdr->bus = bus->channel;
+		hdr->count = len;
+
+		switch(bus->mode) {
+		case pmac_i2c_mode_std:
+			if (subsize != 0)
+				return -EINVAL;
+			hdr->address = addrdir;
+			hdr->mode = PMU_I2C_MODE_SIMPLE;
+			break;
+		case pmac_i2c_mode_stdsub:
+		case pmac_i2c_mode_combined:
+			if (subsize != 1)
+				return -EINVAL;
+			hdr->address = addrdir & 0xfe;
+			hdr->comb_addr = addrdir;
+			hdr->sub_addr = subaddr;
+			if (bus->mode == pmac_i2c_mode_stdsub)
+				hdr->mode = PMU_I2C_MODE_STDSUB;
+			else
+				hdr->mode = PMU_I2C_MODE_COMBINED;
+			break;
+		default:
+			return -EINVAL;
+		}
+
+		INIT_COMPLETION(comp);
+		req->data[0] = PMU_I2C_CMD;
+		req->reply[0] = 0xff;
+		req->nbytes = sizeof(struct pmu_i2c_hdr) + 1;
+		req->done = pmu_i2c_complete;
+		req->arg = &comp;
+		if (!read) {
+			memcpy(hdr->data, data, len);
+			req->nbytes += len;
+		}
+		rc = pmu_queue_request(req);
+		if (rc)
+			return rc;
+		wait_for_completion(&comp);
+		if (req->reply[0] == PMU_I2C_STATUS_OK)
+			break;
+		msleep(15);
 	}
-	memset(host, 0, sizeof(*host));
+	if (req->reply[0] != PMU_I2C_STATUS_OK)
+		return -EIO;
 
-	init_MUTEX(&host->mutex);
-	host->np = of_node_get(np);	
-	host->num_channels = 3;
-	host->mode = pmac_low_i2c_mode_std;
-	host->func = pmu_low_i2c_func;
+	for (retry = 0; retry < 16; retry++) {
+		memset(req, 0, sizeof(struct adb_request));
+
+		/* I know that looks like a lot, slow as hell, but darwin
+		 * does it so let's be on the safe side for now
+		 */
+		msleep(15);
+
+		hdr->bus = PMU_I2C_BUS_STATUS;
+
+		INIT_COMPLETION(comp);
+		req->data[0] = PMU_I2C_CMD;
+		req->reply[0] = 0xff;
+		req->nbytes = 2;
+		req->done = pmu_i2c_complete;
+		req->arg = &comp;
+		rc = pmu_queue_request(req);
+		if (rc)
+			return rc;
+		wait_for_completion(&comp);
+
+		if (req->reply[0] == PMU_I2C_STATUS_OK && !read)
+			return 0;
+		if (req->reply[0] == PMU_I2C_STATUS_DATAREAD && read) {
+			int rlen = req->reply_len - 1;
+
+			if (rlen != len) {
+				printk(KERN_WARNING "low_i2c: PMU returned %d"
+				       " bytes, expected %d !\n", rlen, len);
+				return -EIO;
+			}
+			memcpy(data, &req->reply[1], len);
+			return 0;
+		}
+	}
+	return -EIO;
+}
+
+static void __init pmu_i2c_probe(void)
+{
+	struct pmac_i2c_bus *bus;
+	struct device_node *busnode;
+	int channel, sz;
+
+	if (!pmu_present())
+		return;
+
+	/* There might or might not be a "pmu-i2c" node, we use that
+	 * or via-pmu itself, whatever we find. I haven't seen a machine
+	 * with separate bus nodes, so we assume a multibus setup
+	 */
+	busnode = of_find_node_by_name(NULL, "pmu-i2c");
+	if (busnode == NULL)
+		busnode = of_find_node_by_name(NULL, "via-pmu");
+	if (busnode == NULL)
+		return;
+
+	printk(KERN_INFO "PMU i2c %s\n", busnode->full_name);
+
+	/*
+	 * We add bus 1 and 2 only for now, bus 0 is "special"
+	 */
+	for (channel = 1; channel <= 2; channel++) {
+		sz = sizeof(struct pmac_i2c_bus) + sizeof(struct adb_request);
+		bus = kzalloc(sz, GFP_KERNEL);
+		if (bus == NULL)
+			return;
+
+		bus->controller = busnode;
+		bus->busnode = busnode;
+		bus->type = pmac_i2c_bus_pmu;
+		bus->channel = channel;
+		bus->mode = pmac_i2c_mode_std;
+		bus->hostdata = bus + 1;
+		bus->xfer = pmu_i2c_xfer;
+		init_MUTEX(&bus->sem);
+		bus->flags = pmac_i2c_multibus;
+		list_add(&bus->link, &pmac_i2c_busses);
+
+		printk(KERN_INFO " channel %d bus <multibus>\n", channel);
+	}
 }
 
 #endif /* CONFIG_ADB_PMU */
 
-void __init pmac_init_low_i2c(void)
+
+/*
+ *
+ * SMU implementation
+ *
+ */
+
+#ifdef CONFIG_PMAC_SMU
+
+static void smu_i2c_complete(struct smu_i2c_cmd *cmd, void *misc)
 {
-	struct device_node *np;
+	complete(misc);
+}
 
-	/* Probe keywest-i2c busses */
-	np = of_find_compatible_node(NULL, "i2c", "keywest-i2c");
-	while(np) {
-		keywest_low_i2c_add(np);
-		np = of_find_compatible_node(np, "i2c", "keywest-i2c");
+static int smu_i2c_xfer(struct pmac_i2c_bus *bus, u8 addrdir, int subsize,
+			u32 subaddr, u8 *data, int len)
+{
+	struct smu_i2c_cmd *cmd = bus->hostdata;
+	struct completion comp;
+	int read = addrdir & 1;
+	int rc = 0;
+
+	memset(cmd, 0, sizeof(struct smu_i2c_cmd));
+	cmd->info.bus = bus->channel;
+	cmd->info.devaddr = addrdir;
+	cmd->info.datalen = len;
+
+	switch(bus->mode) {
+	case pmac_i2c_mode_std:
+		if (subsize != 0)
+			return -EINVAL;
+		cmd->info.type = SMU_I2C_TRANSFER_SIMPLE;
+		break;
+	case pmac_i2c_mode_stdsub:
+	case pmac_i2c_mode_combined:
+		if (subsize > 3 || subsize < 1)
+			return -EINVAL;
+		cmd->info.sublen = subsize;
+		/* that's big-endian only but heh ! */
+		memcpy(&cmd->info.subaddr, ((char *)&subaddr) + (4 - subsize),
+		       subsize);
+		if (bus->mode == pmac_i2c_mode_stdsub)
+			cmd->info.type = SMU_I2C_TRANSFER_STDSUB;
+		else
+			cmd->info.type = SMU_I2C_TRANSFER_COMBINED;
+		break;
+	default:
+		return -EINVAL;
 	}
+	if (!read)
+		memcpy(cmd->info.data, data, len);
+
+	init_completion(&comp);
+	cmd->done = smu_i2c_complete;
+	cmd->misc = &comp;
+	rc = smu_queue_i2c(cmd);
+	if (rc < 0)
+		return rc;
+	wait_for_completion(&comp);
+	rc = cmd->status;
+
+	if (read)
+		memcpy(data, cmd->info.data, len);
+	return rc < 0 ? rc : 0;
+}
 
-#ifdef CONFIG_ADB_PMU
-	/* Probe PMU busses */
-	np = of_find_node_by_name(NULL, "via-pmu");
-	if (np)
-		pmu_low_i2c_add(np);
-#endif /* CONFIG_ADB_PMU */
+static void __init smu_i2c_probe(void)
+{
+	struct device_node *controller, *busnode;
+	struct pmac_i2c_bus *bus;
+	u32 *reg;
+	int sz;
+
+	if (!smu_present())
+		return;
+
+	controller = of_find_node_by_name(NULL, "smu_i2c_control");
+	if (controller == NULL)
+		controller = of_find_node_by_name(NULL, "smu");
+	if (controller == NULL)
+		return;
+
+	printk(KERN_INFO "SMU i2c %s\n", controller->full_name);
+
+	/* Look for childs, note that they might not be of the right
+	 * type as older device trees mix i2c busses and other thigns
+	 * at the same level
+	 */
+	for (busnode = NULL;
+	     (busnode = of_get_next_child(controller, busnode)) != NULL;) {
+		if (strcmp(busnode->type, "i2c") &&
+		    strcmp(busnode->type, "i2c-bus"))
+			continue;
+		reg = (u32 *)get_property(busnode, "reg", NULL);
+		if (reg == NULL)
+			continue;
+
+		sz = sizeof(struct pmac_i2c_bus) + sizeof(struct smu_i2c_cmd);
+		bus = kzalloc(sz, GFP_KERNEL);
+		if (bus == NULL)
+			return;
+
+		bus->controller = controller;
+		bus->busnode = of_node_get(busnode);
+		bus->type = pmac_i2c_bus_smu;
+		bus->channel = *reg;
+		bus->mode = pmac_i2c_mode_std;
+		bus->hostdata = bus + 1;
+		bus->xfer = smu_i2c_xfer;
+		init_MUTEX(&bus->sem);
+		bus->flags = 0;
+		list_add(&bus->link, &pmac_i2c_busses);
+
+		printk(KERN_INFO " channel %x bus %s\n",
+		       bus->channel, busnode->full_name);
+	}
+}
+
+#endif /* CONFIG_PMAC_SMU */
+
+/*
+ *
+ * Core code
+ *
+ */
+
+
+struct pmac_i2c_bus *pmac_i2c_find_bus(struct device_node *node)
+{
+	struct device_node *p = of_node_get(node);
+	struct device_node *prev = NULL;
+	struct pmac_i2c_bus *bus;
+
+	while(p) {
+		list_for_each_entry(bus, &pmac_i2c_busses, link) {
+			if (p == bus->busnode) {
+				if (prev && bus->flags & pmac_i2c_multibus) {
+					u32 *reg;
+					reg = (u32 *)get_property(prev, "reg",
+								  NULL);
+					if (!reg)
+						continue;
+					if (((*reg) >> 8) != bus->channel)
+						continue;
+				}
+				of_node_put(p);
+				of_node_put(prev);
+				return bus;
+			}
+		}
+		of_node_put(prev);
+		prev = p;
+		p = of_get_parent(p);
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(pmac_i2c_find_bus);
+
+u8 pmac_i2c_get_dev_addr(struct device_node *device)
+{
+	u32 *reg = (u32 *)get_property(device, "reg", NULL);
+
+	if (reg == NULL)
+		return 0;
+
+	return (*reg) & 0xff;
+}
+EXPORT_SYMBOL_GPL(pmac_i2c_get_dev_addr);
+
+struct device_node *pmac_i2c_get_controller(struct pmac_i2c_bus *bus)
+{
+	return bus->controller;
+}
+EXPORT_SYMBOL_GPL(pmac_i2c_get_controller);
+
+struct device_node *pmac_i2c_get_bus_node(struct pmac_i2c_bus *bus)
+{
+	return bus->busnode;
+}
+EXPORT_SYMBOL_GPL(pmac_i2c_get_bus_node);
+
+int pmac_i2c_get_type(struct pmac_i2c_bus *bus)
+{
+	return bus->type;
+}
+EXPORT_SYMBOL_GPL(pmac_i2c_get_type);
+
+int pmac_i2c_get_flags(struct pmac_i2c_bus *bus)
+{
+	return bus->flags;
+}
+EXPORT_SYMBOL_GPL(pmac_i2c_get_flags);
 
-	/* TODO: Add CUDA support as well */
+void pmac_i2c_attach_adapter(struct pmac_i2c_bus *bus,
+			     struct i2c_adapter *adapter)
+{
+	WARN_ON(bus->adapter != NULL);
+	bus->adapter = adapter;
 }
+EXPORT_SYMBOL_GPL(pmac_i2c_attach_adapter);
+
+void pmac_i2c_detach_adapter(struct pmac_i2c_bus *bus,
+			     struct i2c_adapter *adapter)
+{
+	WARN_ON(bus->adapter != adapter);
+	bus->adapter = NULL;
+}
+EXPORT_SYMBOL_GPL(pmac_i2c_detach_adapter);
+
+struct i2c_adapter *pmac_i2c_get_adapter(struct pmac_i2c_bus *bus)
+{
+	return bus->adapter;
+}
+EXPORT_SYMBOL_GPL(pmac_i2c_get_adapter);
+
+extern int pmac_i2c_match_adapter(struct device_node *dev,
+				  struct i2c_adapter *adapter)
+{
+	struct pmac_i2c_bus *bus = pmac_i2c_find_bus(dev);
+
+	if (bus == NULL)
+		return 0;
+	return (bus->adapter == adapter);
+}
+EXPORT_SYMBOL_GPL(pmac_i2c_match_adapter);
 
 int pmac_low_i2c_lock(struct device_node *np)
 {
-	struct low_i2c_host *host = find_low_i2c_host(np);
+	struct pmac_i2c_bus *bus, *found = NULL;
 
-	if (!host)
+	list_for_each_entry(bus, &pmac_i2c_busses, link) {
+		if (np == bus->controller) {
+			found = bus;
+			break;
+		}
+	}
+	if (!found)
 		return -ENODEV;
-	down(&host->mutex);
-	return 0;
+	return pmac_i2c_open(bus, 0);
 }
-EXPORT_SYMBOL(pmac_low_i2c_lock);
+EXPORT_SYMBOL_GPL(pmac_low_i2c_lock);
 
 int pmac_low_i2c_unlock(struct device_node *np)
 {
-	struct low_i2c_host *host = find_low_i2c_host(np);
+	struct pmac_i2c_bus *bus, *found = NULL;
 
-	if (!host)
+	list_for_each_entry(bus, &pmac_i2c_busses, link) {
+		if (np == bus->controller) {
+			found = bus;
+			break;
+		}
+	}
+	if (!found)
 		return -ENODEV;
-	up(&host->mutex);
+	pmac_i2c_close(bus);
 	return 0;
 }
-EXPORT_SYMBOL(pmac_low_i2c_unlock);
+EXPORT_SYMBOL_GPL(pmac_low_i2c_unlock);
 
 
-int pmac_low_i2c_open(struct device_node *np, int channel)
+int pmac_i2c_open(struct pmac_i2c_bus *bus, int polled)
 {
-	struct low_i2c_host *host = find_low_i2c_host(np);
+	int rc;
+
+	down(&bus->sem);
+	bus->polled = polled;
+	bus->opened = 1;
+	bus->mode = pmac_i2c_mode_std;
+	if (bus->open && (rc = bus->open(bus)) != 0) {
+		bus->opened = 0;
+		up(&bus->sem);
+		return rc;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pmac_i2c_open);
 
-	if (!host)
-		return -ENODEV;
+void pmac_i2c_close(struct pmac_i2c_bus *bus)
+{
+	WARN_ON(!bus->opened);
+	if (bus->close)
+		bus->close(bus);
+	bus->opened = 0;
+	up(&bus->sem);
+}
+EXPORT_SYMBOL_GPL(pmac_i2c_close);
 
-	if (channel >= host->num_channels)
-		return -EINVAL;
+int pmac_i2c_setmode(struct pmac_i2c_bus *bus, int mode)
+{
+	WARN_ON(!bus->opened);
 
-	down(&host->mutex);
-	host->is_open = 1;
-	host->channel = channel;
+	/* Report me if you see the error below as there might be a new
+	 * "combined4" mode that I need to implement for the SMU bus
+	 */
+	if (mode < pmac_i2c_mode_dumb || mode > pmac_i2c_mode_combined) {
+		printk(KERN_ERR "low_i2c: Invalid mode %d requested on"
+		       " bus %s !\n", mode, bus->busnode->full_name);
+		return -EINVAL;
+	}
+	bus->mode = mode;
 
 	return 0;
 }
-EXPORT_SYMBOL(pmac_low_i2c_open);
+EXPORT_SYMBOL_GPL(pmac_i2c_setmode);
 
-int pmac_low_i2c_close(struct device_node *np)
+int pmac_i2c_xfer(struct pmac_i2c_bus *bus, u8 addrdir, int subsize,
+		  u32 subaddr, u8 *data, int len)
 {
-	struct low_i2c_host *host = find_low_i2c_host(np);
+	int rc;
 
-	if (!host)
-		return -ENODEV;
+	WARN_ON(!bus->opened);
 
-	host->is_open = 0;
-	up(&host->mutex);
+	DBG("xfer() chan=%d, addrdir=0x%x, mode=%d, subsize=%d, subaddr=0x%x,"
+	    " %d bytes, bus %s\n", bus->channel, addrdir, bus->mode, subsize,
+	    subaddr, len, bus->busnode->full_name);
 
-	return 0;
+	rc = bus->xfer(bus, addrdir, subsize, subaddr, data, len);
+
+#ifdef DEBUG
+	if (rc)
+		DBG("xfer error %d\n", rc);
+#endif
+	return rc;
 }
-EXPORT_SYMBOL(pmac_low_i2c_close);
+EXPORT_SYMBOL_GPL(pmac_i2c_xfer);
 
-int pmac_low_i2c_setmode(struct device_node *np, int mode)
+/*
+ * Initialize us: probe all i2c busses on the machine and instantiate
+ * busses.
+ */
+/* This is non-static as it might be called early by smp code */
+int __init pmac_i2c_init(void)
 {
-	struct low_i2c_host *host = find_low_i2c_host(np);
+	static int i2c_inited;
 
-	if (!host)
-		return -ENODEV;
-	WARN_ON(!host->is_open);
-	host->mode = mode;
+	if (i2c_inited)
+		return 0;
+	i2c_inited = 1;
 
-	return 0;
-}
-EXPORT_SYMBOL(pmac_low_i2c_setmode);
+	/* Probe keywest-i2c busses */
+	kw_i2c_probe();
 
-int pmac_low_i2c_xfer(struct device_node *np, u8 addrdir, u8 subaddr, u8 *data, int len)
-{
-	struct low_i2c_host *host = find_low_i2c_host(np);
+#ifdef CONFIG_ADB_PMU
+	pmu_i2c_probe();
+#endif
 
-	if (!host)
-		return -ENODEV;
-	WARN_ON(!host->is_open);
+#ifdef CONFIG_PMAC_SMU
+	smu_i2c_probe();
+#endif
 
-	return host->func(host, addrdir, subaddr, data, len);
+	return 0;
 }
-EXPORT_SYMBOL(pmac_low_i2c_xfer);
+arch_initcall(pmac_i2c_init);
 
diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c
index e5a5bdbdda7a..dc5cdc1484e8 100644
--- a/arch/powerpc/platforms/powermac/setup.c
+++ b/arch/powerpc/platforms/powermac/setup.c
@@ -652,27 +652,22 @@ static int __init pmac_declare_of_platform_devices(void)
 {
 	struct device_node *np, *npp;
 
-	np = find_devices("uni-n");
-	if (np) {
-		for (np = np->child; np != NULL; np = np->sibling)
-			if (strncmp(np->name, "i2c", 3) == 0) {
-				of_platform_device_create(np, "uni-n-i2c",
-							  NULL);
-				break;
-			}
-	}
-	np = find_devices("valkyrie");
+	np = of_find_node_by_name(NULL, "valkyrie");
 	if (np)
 		of_platform_device_create(np, "valkyrie", NULL);
-	np = find_devices("platinum");
+	np = of_find_node_by_name(NULL, "platinum");
 	if (np)
 		of_platform_device_create(np, "platinum", NULL);
-
-	npp = of_find_node_by_name(NULL, "u3");
+	npp = of_find_node_by_name(NULL, "uni-n");
+	if (npp == NULL)
+		npp = of_find_node_by_name(NULL, "u3");
+	if (npp == NULL)
+		npp = of_find_node_by_name(NULL, "u4");
 	if (npp) {
 		for (np = NULL; (np = of_get_next_child(npp, np)) != NULL;) {
 			if (strncmp(np->name, "i2c", 3) == 0) {
-				of_platform_device_create(np, "u3-i2c", NULL);
+				of_platform_device_create(np, "uni-n-i2c",
+							  NULL);
 				of_node_put(np);
 				break;
 			}
diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index df01bb8feb16..ab72ba86be1e 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -482,7 +482,7 @@ static void __devinit smp_core99_take_timebase(void)
 /*
  * G5s enable/disable the timebase via an i2c-connected clock chip.
  */
-static struct device_node *pmac_tb_clock_chip_host;
+static struct pmac_i2c_bus *pmac_tb_clock_chip_host;
 static u8 pmac_tb_pulsar_addr;
 
 static void smp_core99_cypress_tb_freeze(int freeze)
@@ -493,20 +493,20 @@ static void smp_core99_cypress_tb_freeze(int freeze)
 	/* Strangely, the device-tree says address is 0xd2, but darwin
 	 * accesses 0xd0 ...
 	 */
-	pmac_low_i2c_setmode(pmac_tb_clock_chip_host,
-			     pmac_low_i2c_mode_combined);
-	rc = pmac_low_i2c_xfer(pmac_tb_clock_chip_host,
-			       0xd0 | pmac_low_i2c_read,
-			       0x81, &data, 1);
+	pmac_i2c_setmode(pmac_tb_clock_chip_host,
+			 pmac_i2c_mode_combined);
+	rc = pmac_i2c_xfer(pmac_tb_clock_chip_host,
+			   0xd0 | pmac_i2c_read,
+			   1, 0x81, &data, 1);
 	if (rc != 0)
 		goto bail;
 
 	data = (data & 0xf3) | (freeze ? 0x00 : 0x0c);
 
-       	pmac_low_i2c_setmode(pmac_tb_clock_chip_host, pmac_low_i2c_mode_stdsub);
-	rc = pmac_low_i2c_xfer(pmac_tb_clock_chip_host,
-			       0xd0 | pmac_low_i2c_write,
-			       0x81, &data, 1);
+       	pmac_i2c_setmode(pmac_tb_clock_chip_host, pmac_i2c_mode_stdsub);
+	rc = pmac_i2c_xfer(pmac_tb_clock_chip_host,
+			   0xd0 | pmac_i2c_write,
+			   1, 0x81, &data, 1);
 
  bail:
 	if (rc != 0) {
@@ -522,20 +522,20 @@ static void smp_core99_pulsar_tb_freeze(int freeze)
 	u8 data;
 	int rc;
 
-	pmac_low_i2c_setmode(pmac_tb_clock_chip_host,
-			     pmac_low_i2c_mode_combined);
-	rc = pmac_low_i2c_xfer(pmac_tb_clock_chip_host,
-			       pmac_tb_pulsar_addr | pmac_low_i2c_read,
-			       0x2e, &data, 1);
+	pmac_i2c_setmode(pmac_tb_clock_chip_host,
+			 pmac_i2c_mode_combined);
+	rc = pmac_i2c_xfer(pmac_tb_clock_chip_host,
+			   pmac_tb_pulsar_addr | pmac_i2c_read,
+			   1, 0x2e, &data, 1);
 	if (rc != 0)
 		goto bail;
 
 	data = (data & 0x88) | (freeze ? 0x11 : 0x22);
 
-	pmac_low_i2c_setmode(pmac_tb_clock_chip_host, pmac_low_i2c_mode_stdsub);
-	rc = pmac_low_i2c_xfer(pmac_tb_clock_chip_host,
-			       pmac_tb_pulsar_addr | pmac_low_i2c_write,
-			       0x2e, &data, 1);
+	pmac_i2c_setmode(pmac_tb_clock_chip_host, pmac_i2c_mode_stdsub);
+	rc = pmac_i2c_xfer(pmac_tb_clock_chip_host,
+			   pmac_tb_pulsar_addr | pmac_i2c_write,
+			   1, 0x2e, &data, 1);
  bail:
 	if (rc != 0) {
 		printk(KERN_ERR "Pulsar Timebase %s rc: %d\n",
@@ -560,13 +560,15 @@ static void __init smp_core99_setup_i2c_hwsync(int ncpus)
 		if (!ok)
 			continue;
 
+		pmac_tb_clock_chip_host = pmac_i2c_find_bus(cc);
+		if (pmac_tb_clock_chip_host == NULL)
+			continue;
 		reg = (u32 *)get_property(cc, "reg", NULL);
 		if (reg == NULL)
 			continue;
-
 		switch (*reg) {
 		case 0xd2:
-			if (device_is_compatible(cc, "pulsar-legacy-slewing")) {
+			if (device_is_compatible(cc,"pulsar-legacy-slewing")) {
 				pmac_tb_freeze = smp_core99_pulsar_tb_freeze;
 				pmac_tb_pulsar_addr = 0xd2;
 				name = "Pulsar";
@@ -585,30 +587,19 @@ static void __init smp_core99_setup_i2c_hwsync(int ncpus)
 			break;
 	}
 	if (pmac_tb_freeze != NULL) {
-		struct device_node *p = of_get_parent(cc);
-		of_node_put(cc);
-		while(p && strcmp(p->type, "i2c")) {
-			cc = of_get_parent(p);
-			of_node_put(p);
-			p = cc;
-		}
-		if (p == NULL)
-			goto no_i2c_sync;
 		/* Open i2c bus for synchronous access */
-		if (pmac_low_i2c_open(p, 0)) {
-			printk(KERN_ERR "Failed top open i2c bus %s for clock"
-			       " sync, fallback to software sync !\n",
-			       p->full_name);
-			of_node_put(p);
+		if (pmac_i2c_open(pmac_tb_clock_chip_host, 1)) {
+			printk(KERN_ERR "Failed top open i2c bus for clock"
+			       " sync, fallback to software sync !\n");
 			goto no_i2c_sync;
 		}
-		pmac_tb_clock_chip_host = p;
 		printk(KERN_INFO "Processor timebase sync using %s i2c clock\n",
 		       name);
 		return;
 	}
  no_i2c_sync:
 	pmac_tb_freeze = NULL;
+	pmac_tb_clock_chip_host = NULL;
 }
 
 #endif /* CONFIG_PPC64 */
@@ -752,8 +743,18 @@ static int __init smp_core99_probe(void)
 	if (ncpus <= 1)
 		return 1;
 
+	/* We need to perform some early initialisations before we can start
+	 * setting up SMP as we are running before initcalls
+	 */
+	pmac_i2c_init();
+
+	/* Setup various bits like timebase sync method, ability to nap, ... */
 	smp_core99_setup(ncpus);
+
+	/* Install IPIs */
 	mpic_request_ipis();
+
+	/* Collect l2cr and l3cr values from CPU 0 */
 	core99_init_caches(0);
 
 	return ncpus;
@@ -817,7 +818,7 @@ static void __devinit smp_core99_setup_cpu(int cpu_nr)
 
 		/* Close i2c bus if it was used for tb sync */
 		if (pmac_tb_clock_chip_host) {
-			pmac_low_i2c_close(pmac_tb_clock_chip_host);
+			pmac_i2c_close(pmac_tb_clock_chip_host);
 			pmac_tb_clock_chip_host	= NULL;
 		}
 
diff --git a/drivers/i2c/busses/i2c-pmac-smu.c b/drivers/i2c/busses/i2c-pmac-smu.c
index bfefe7f7a53d..7d925be3fd4b 100644
--- a/drivers/i2c/busses/i2c-pmac-smu.c
+++ b/drivers/i2c/busses/i2c-pmac-smu.c
@@ -103,8 +103,8 @@ static s32 smu_smbus_xfer(	struct i2c_adapter*	adap,
 		cmd.info.subaddr[1] = 0;
 		cmd.info.subaddr[2] = 0;
 		if (!read) {
-			cmd.info.data[0] = data->byte & 0xff;
-			cmd.info.data[1] = (data->byte >> 8) & 0xff;
+			cmd.info.data[0] = data->word & 0xff;
+			cmd.info.data[1] = (data->word >> 8) & 0xff;
 		}
 		break;
 	/* Note that these are broken vs. the expected smbus API where
@@ -116,7 +116,7 @@ static s32 smu_smbus_xfer(	struct i2c_adapter*	adap,
         case I2C_SMBUS_BLOCK_DATA:
 		cmd.info.type = SMU_I2C_TRANSFER_STDSUB;
 		cmd.info.datalen = data->block[0] + 1;
-		if (cmd.info.datalen > 6)
+		if (cmd.info.datalen > (SMU_I2C_WRITE_MAX + 1))
 			return -EINVAL;
 		if (!read)
 			memcpy(cmd.info.data, data->block, cmd.info.datalen);
@@ -273,7 +273,13 @@ static int dispose_iface(struct device *dev)
 static int create_iface_of_platform(struct of_device* dev,
 				    const struct of_device_id *match)
 {
-	return create_iface(dev->node, &dev->dev);
+	struct device_node *node = dev->node;
+
+	if (device_is_compatible(node, "smu-i2c") ||
+	    (node->parent != NULL &&
+	     device_is_compatible(node->parent, "smu-i2c-control")))
+		return create_iface(node, &dev->dev);
+	return -ENODEV;
 }
 
 
@@ -288,6 +294,9 @@ static struct of_device_id i2c_smu_match[] =
 	{
 		.compatible	= "smu-i2c",
 	},
+	{
+		.compatible	= "i2c-bus",
+	},
 	{},
 };
 static struct of_platform_driver i2c_smu_of_platform_driver =
diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c
index 96226116a646..9ecd76849e35 100644
--- a/drivers/macintosh/smu.c
+++ b/drivers/macintosh/smu.c
@@ -94,6 +94,8 @@ struct smu_device {
 static struct smu_device	*smu;
 static DECLARE_MUTEX(smu_part_access);
 
+static void smu_i2c_retry(unsigned long data);
+
 /*
  * SMU driver low level stuff
  */
@@ -469,7 +471,6 @@ int __init smu_init (void)
 	smu->of_node = np;
 	smu->db_irq = NO_IRQ;
 	smu->msg_irq = NO_IRQ;
-	init_timer(&smu->i2c_timer);
 
 	/* smu_cmdbuf_abs is in the low 2G of RAM, can be converted to a
 	 * 32 bits value safely
@@ -544,6 +545,10 @@ static int smu_late_init(void)
 	if (!smu)
 		return 0;
 
+	init_timer(&smu->i2c_timer);
+	smu->i2c_timer.function = smu_i2c_retry;
+	smu->i2c_timer.data = (unsigned long)smu;
+
 	/*
 	 * Try to request the interrupts
 	 */
@@ -570,28 +575,41 @@ static int smu_late_init(void)
 
 	return 0;
 }
-arch_initcall(smu_late_init);
+/* This has to be before arch_initcall as the low i2c stuff relies on the
+ * above having been done before we reach arch_initcalls
+ */
+core_initcall(smu_late_init);
 
 /*
  * sysfs visibility
  */
 
+static void smu_create_i2c(struct device_node *np)
+{
+	char name[32];
+	u32 *reg = (u32 *)get_property(np, "reg", NULL);
+
+	if (reg != NULL) {
+		sprintf(name, "smu-i2c-%02x", *reg);
+		of_platform_device_create(np, name, &smu->of_dev->dev);
+	}
+}
+
 static void smu_expose_childs(void *unused)
 {
-	struct device_node *np;
+	struct device_node *np, *gp;
 
 	for (np = NULL; (np = of_get_next_child(smu->of_node, np)) != NULL;) {
-		if (device_is_compatible(np, "smu-i2c")) {
-			char name[32];
-			u32 *reg = (u32 *)get_property(np, "reg", NULL);
-
-			if (reg == NULL)
-				continue;
-			sprintf(name, "smu-i2c-%02x", *reg);
-			of_platform_device_create(np, name, &smu->of_dev->dev);
-		}
+		if (device_is_compatible(np, "smu-i2c-control")) {
+			gp = NULL;
+			while ((gp = of_get_next_child(np, gp)) != NULL)
+				if (device_is_compatible(gp, "i2c-bus"))
+					smu_create_i2c(gp);
+		} else if (device_is_compatible(np, "smu-i2c"))
+			smu_create_i2c(np);
 		if (device_is_compatible(np, "smu-sensors"))
-			of_platform_device_create(np, "smu-sensors", &smu->of_dev->dev);
+			of_platform_device_create(np, "smu-sensors",
+						  &smu->of_dev->dev);
 	}
 
 }
@@ -712,13 +730,13 @@ static void smu_i2c_complete_command(struct smu_i2c_cmd *cmd, int fail)
 
 static void smu_i2c_retry(unsigned long data)
 {
-	struct smu_i2c_cmd	*cmd = (struct smu_i2c_cmd *)data;
+	struct smu_i2c_cmd	*cmd = smu->cmd_i2c_cur;
 
 	DPRINTK("SMU: i2c failure, requeuing...\n");
 
 	/* requeue command simply by resetting reply_len */
 	cmd->pdata[0] = 0xff;
-	cmd->scmd.reply_len = 0x10;
+	cmd->scmd.reply_len = sizeof(cmd->pdata);
 	smu_queue_cmd(&cmd->scmd);
 }
 
@@ -747,10 +765,8 @@ static void smu_i2c_low_completion(struct smu_cmd *scmd, void *misc)
 	 */
 	if (fail && --cmd->retries > 0) {
 		DPRINTK("SMU: i2c failure, starting timer...\n");
-		smu->i2c_timer.function = smu_i2c_retry;
-		smu->i2c_timer.data = (unsigned long)cmd;
-		smu->i2c_timer.expires = jiffies + msecs_to_jiffies(5);
-		add_timer(&smu->i2c_timer);
+		BUG_ON(cmd != smu->cmd_i2c_cur);
+		mod_timer(&smu->i2c_timer, jiffies + msecs_to_jiffies(5));
 		return;
 	}
 
@@ -764,7 +780,7 @@ static void smu_i2c_low_completion(struct smu_cmd *scmd, void *misc)
 
 	/* Ok, initial command complete, now poll status */
 	scmd->reply_buf = cmd->pdata;
-	scmd->reply_len = 0x10;
+	scmd->reply_len = sizeof(cmd->pdata);
 	scmd->data_buf = cmd->pdata;
 	scmd->data_len = 1;
 	cmd->pdata[0] = 0;
@@ -786,7 +802,7 @@ int smu_queue_i2c(struct smu_i2c_cmd *cmd)
 	cmd->scmd.done = smu_i2c_low_completion;
 	cmd->scmd.misc = cmd;
 	cmd->scmd.reply_buf = cmd->pdata;
-	cmd->scmd.reply_len = 0x10;
+	cmd->scmd.reply_len = sizeof(cmd->pdata);
 	cmd->scmd.data_buf = (u8 *)(char *)&cmd->info;
 	cmd->scmd.status = 1;
 	cmd->stage = 0;
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 3c0552016b91..aa481a88ccab 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -197,7 +197,6 @@ static int pmu_adb_reset_bus(void);
 #endif /* CONFIG_ADB */
 
 static int init_pmu(void);
-static int pmu_queue_request(struct adb_request *req);
 static void pmu_start(void);
 static irqreturn_t via_pmu_interrupt(int irq, void *arg, struct pt_regs *regs);
 static irqreturn_t gpio1_interrupt(int irq, void *arg, struct pt_regs *regs);
@@ -1802,258 +1801,6 @@ pmu_present(void)
 	return via != 0;
 }
 
-struct pmu_i2c_hdr {
-	u8	bus;
-	u8	mode;
-	u8	bus2;
-	u8	address;
-	u8	sub_addr;
-	u8	comb_addr;
-	u8	count;
-};
-
-int
-pmu_i2c_combined_read(int bus, int addr, int subaddr,  u8* data, int len)
-{
-	struct adb_request	req;
-	struct pmu_i2c_hdr	*hdr = (struct pmu_i2c_hdr *)&req.data[1];
-	int retry;
-	int rc;
-
-	for (retry=0; retry<16; retry++) {
-		memset(&req, 0, sizeof(req));
-
-		hdr->bus = bus;
-		hdr->address = addr & 0xfe;
-		hdr->mode = PMU_I2C_MODE_COMBINED;
-		hdr->bus2 = 0;
-		hdr->sub_addr = subaddr;
-		hdr->comb_addr = addr | 1;
-		hdr->count = len;
-		
-		req.nbytes = sizeof(struct pmu_i2c_hdr) + 1;
-		req.reply_expected = 0;
-		req.reply_len = 0;
-		req.data[0] = PMU_I2C_CMD;
-		req.reply[0] = 0xff;
-		rc = pmu_queue_request(&req);
-		if (rc)
-			return rc;
-		while(!req.complete)
-			pmu_poll();
-		if (req.reply[0] == PMU_I2C_STATUS_OK)
-			break;
-		mdelay(15);
-	}
-	if (req.reply[0] != PMU_I2C_STATUS_OK)
-		return -1;
-
-	for (retry=0; retry<16; retry++) {
-		memset(&req, 0, sizeof(req));
-
-		mdelay(15);
-
-		hdr->bus = PMU_I2C_BUS_STATUS;
-		req.reply[0] = 0xff;
-		
-		req.nbytes = 2;
-		req.reply_expected = 0;
-		req.reply_len = 0;
-		req.data[0] = PMU_I2C_CMD;
-		rc = pmu_queue_request(&req);
-		if (rc)
-			return rc;
-		while(!req.complete)
-			pmu_poll();
-		if (req.reply[0] == PMU_I2C_STATUS_DATAREAD) {
-			memcpy(data, &req.reply[1], req.reply_len - 1);
-			return req.reply_len - 1;
-		}
-	}
-	return -1;
-}
-
-int
-pmu_i2c_stdsub_write(int bus, int addr, int subaddr,  u8* data, int len)
-{
-	struct adb_request	req;
-	struct pmu_i2c_hdr	*hdr = (struct pmu_i2c_hdr *)&req.data[1];
-	int retry;
-	int rc;
-
-	for (retry=0; retry<16; retry++) {
-		memset(&req, 0, sizeof(req));
-
-		hdr->bus = bus;
-		hdr->address = addr & 0xfe;
-		hdr->mode = PMU_I2C_MODE_STDSUB;
-		hdr->bus2 = 0;
-		hdr->sub_addr = subaddr;
-		hdr->comb_addr = addr & 0xfe;
-		hdr->count = len;
-
-		req.data[0] = PMU_I2C_CMD;
-		memcpy(&req.data[sizeof(struct pmu_i2c_hdr) + 1], data, len);
-		req.nbytes = sizeof(struct pmu_i2c_hdr) + len + 1;
-		req.reply_expected = 0;
-		req.reply_len = 0;
-		req.reply[0] = 0xff;
-		rc = pmu_queue_request(&req);
-		if (rc)
-			return rc;
-		while(!req.complete)
-			pmu_poll();
-		if (req.reply[0] == PMU_I2C_STATUS_OK)
-			break;
-		mdelay(15);
-	}
-	if (req.reply[0] != PMU_I2C_STATUS_OK)
-		return -1;
-
-	for (retry=0; retry<16; retry++) {
-		memset(&req, 0, sizeof(req));
-
-		mdelay(15);
-
-		hdr->bus = PMU_I2C_BUS_STATUS;
-		req.reply[0] = 0xff;
-		
-		req.nbytes = 2;
-		req.reply_expected = 0;
-		req.reply_len = 0;
-		req.data[0] = PMU_I2C_CMD;
-		rc = pmu_queue_request(&req);
-		if (rc)
-			return rc;
-		while(!req.complete)
-			pmu_poll();
-		if (req.reply[0] == PMU_I2C_STATUS_OK)
-			return len;
-	}
-	return -1;
-}
-
-int
-pmu_i2c_simple_read(int bus, int addr,  u8* data, int len)
-{
-	struct adb_request	req;
-	struct pmu_i2c_hdr	*hdr = (struct pmu_i2c_hdr *)&req.data[1];
-	int retry;
-	int rc;
-
-	for (retry=0; retry<16; retry++) {
-		memset(&req, 0, sizeof(req));
-
-		hdr->bus = bus;
-		hdr->address = addr | 1;
-		hdr->mode = PMU_I2C_MODE_SIMPLE;
-		hdr->bus2 = 0;
-		hdr->sub_addr = 0;
-		hdr->comb_addr = 0;
-		hdr->count = len;
-
-		req.data[0] = PMU_I2C_CMD;
-		req.nbytes = sizeof(struct pmu_i2c_hdr) + 1;
-		req.reply_expected = 0;
-		req.reply_len = 0;
-		req.reply[0] = 0xff;
-		rc = pmu_queue_request(&req);
-		if (rc)
-			return rc;
-		while(!req.complete)
-			pmu_poll();
-		if (req.reply[0] == PMU_I2C_STATUS_OK)
-			break;
-		mdelay(15);
-	}
-	if (req.reply[0] != PMU_I2C_STATUS_OK)
-		return -1;
-
-	for (retry=0; retry<16; retry++) {
-		memset(&req, 0, sizeof(req));
-
-		mdelay(15);
-
-		hdr->bus = PMU_I2C_BUS_STATUS;
-		req.reply[0] = 0xff;
-		
-		req.nbytes = 2;
-		req.reply_expected = 0;
-		req.reply_len = 0;
-		req.data[0] = PMU_I2C_CMD;
-		rc = pmu_queue_request(&req);
-		if (rc)
-			return rc;
-		while(!req.complete)
-			pmu_poll();
-		if (req.reply[0] == PMU_I2C_STATUS_DATAREAD) {
-			memcpy(data, &req.reply[1], req.reply_len - 1);
-			return req.reply_len - 1;
-		}
-	}
-	return -1;
-}
-
-int
-pmu_i2c_simple_write(int bus, int addr,  u8* data, int len)
-{
-	struct adb_request	req;
-	struct pmu_i2c_hdr	*hdr = (struct pmu_i2c_hdr *)&req.data[1];
-	int retry;
-	int rc;
-
-	for (retry=0; retry<16; retry++) {
-		memset(&req, 0, sizeof(req));
-
-		hdr->bus = bus;
-		hdr->address = addr & 0xfe;
-		hdr->mode = PMU_I2C_MODE_SIMPLE;
-		hdr->bus2 = 0;
-		hdr->sub_addr = 0;
-		hdr->comb_addr = 0;
-		hdr->count = len;
-
-		req.data[0] = PMU_I2C_CMD;
-		memcpy(&req.data[sizeof(struct pmu_i2c_hdr) + 1], data, len);
-		req.nbytes = sizeof(struct pmu_i2c_hdr) + len + 1;
-		req.reply_expected = 0;
-		req.reply_len = 0;
-		req.reply[0] = 0xff;
-		rc = pmu_queue_request(&req);
-		if (rc)
-			return rc;
-		while(!req.complete)
-			pmu_poll();
-		if (req.reply[0] == PMU_I2C_STATUS_OK)
-			break;
-		mdelay(15);
-	}
-	if (req.reply[0] != PMU_I2C_STATUS_OK)
-		return -1;
-
-	for (retry=0; retry<16; retry++) {
-		memset(&req, 0, sizeof(req));
-
-		mdelay(15);
-
-		hdr->bus = PMU_I2C_BUS_STATUS;
-		req.reply[0] = 0xff;
-		
-		req.nbytes = 2;
-		req.reply_expected = 0;
-		req.reply_len = 0;
-		req.data[0] = PMU_I2C_CMD;
-		rc = pmu_queue_request(&req);
-		if (rc)
-			return rc;
-		while(!req.complete)
-			pmu_poll();
-		if (req.reply[0] == PMU_I2C_STATUS_OK)
-			return len;
-	}
-	return -1;
-}
-
 #ifdef CONFIG_PM
 
 static LIST_HEAD(sleep_notifiers);
@@ -2358,9 +2105,6 @@ pmac_suspend_devices(void)
 		return -EBUSY;
 	}
 
-	/* Disable clock spreading on some machines */
-	pmac_tweak_clock_spreading(0);
-
 	/* Stop preemption */
 	preempt_disable();
 
@@ -2431,9 +2175,6 @@ pmac_wakeup_devices(void)
 	mdelay(10);
 	preempt_enable();
 
-	/* Re-enable clock spreading on some machines */
-	pmac_tweak_clock_spreading(1);
-
 	/* Resume devices */
 	device_resume();
 
@@ -3150,16 +2891,13 @@ static int __init init_pmu_sysfs(void)
 subsys_initcall(init_pmu_sysfs);
 
 EXPORT_SYMBOL(pmu_request);
+EXPORT_SYMBOL(pmu_queue_request);
 EXPORT_SYMBOL(pmu_poll);
 EXPORT_SYMBOL(pmu_poll_adb);
 EXPORT_SYMBOL(pmu_wait_complete);
 EXPORT_SYMBOL(pmu_suspend);
 EXPORT_SYMBOL(pmu_resume);
 EXPORT_SYMBOL(pmu_unlock);
-EXPORT_SYMBOL(pmu_i2c_combined_read);
-EXPORT_SYMBOL(pmu_i2c_stdsub_write);
-EXPORT_SYMBOL(pmu_i2c_simple_read);
-EXPORT_SYMBOL(pmu_i2c_simple_write);
 #if defined(CONFIG_PM) && defined(CONFIG_PPC32)
 EXPORT_SYMBOL(pmu_enable_irled);
 EXPORT_SYMBOL(pmu_battery_count);
diff --git a/include/asm-powerpc/pmac_feature.h b/include/asm-powerpc/pmac_feature.h
index f6997ed5179e..e654ad0e5b42 100644
--- a/include/asm-powerpc/pmac_feature.h
+++ b/include/asm-powerpc/pmac_feature.h
@@ -318,10 +318,6 @@ extern void pmac_register_agp_pm(struct pci_dev *bridge,
 extern void pmac_suspend_agp_for_card(struct pci_dev *dev);
 extern void pmac_resume_agp_for_card(struct pci_dev *dev);
 
-/* Used by the via-pmu driver for suspend/resume
- */
-extern void pmac_tweak_clock_spreading(int enable);
-
 /*
  * The part below is for use by macio_asic.c only, do not rely
  * on the data structures or constants below in a normal driver
diff --git a/include/asm-powerpc/pmac_low_i2c.h b/include/asm-powerpc/pmac_low_i2c.h
index 3fb8d51540dd..adf4fa956572 100644
--- a/include/asm-powerpc/pmac_low_i2c.h
+++ b/include/asm-powerpc/pmac_low_i2c.h
@@ -15,30 +15,87 @@
 
 /* i2c mode (based on the platform functions format) */
 enum {
-	pmac_low_i2c_mode_dumb		= 1,
-	pmac_low_i2c_mode_std		= 2,
-	pmac_low_i2c_mode_stdsub	= 3,
-	pmac_low_i2c_mode_combined	= 4,
+	pmac_i2c_mode_dumb	= 1,
+	pmac_i2c_mode_std	= 2,
+	pmac_i2c_mode_stdsub	= 3,
+	pmac_i2c_mode_combined	= 4,
 };
 
 /* RW bit in address */
 enum {
-	pmac_low_i2c_read		= 0x01,
-	pmac_low_i2c_write		= 0x00
+	pmac_i2c_read		= 0x01,
+	pmac_i2c_write		= 0x00
 };
 
+/* i2c bus type */
+enum {
+	pmac_i2c_bus_keywest	= 0,
+	pmac_i2c_bus_pmu	= 1,
+	pmac_i2c_bus_smu	= 2,
+};
+
+/* i2c bus features */
+enum {
+	/* can_largesub : supports >1 byte subaddresses (SMU only) */
+	pmac_i2c_can_largesub	= 0x00000001u,
+
+	/* multibus : device node holds multiple busses, bus number is
+	 * encoded in bits 0xff00 of "reg" of a given device
+	 */
+	pmac_i2c_multibus	= 0x00000002u,
+};
+
+/* i2c busses in the system */
+struct pmac_i2c_bus;
+struct i2c_adapter;
+
 /* Init, called early during boot */
-extern void pmac_init_low_i2c(void);
+extern int pmac_i2c_init(void);
+
+/* Lookup an i2c bus for a device-node. The node can be either the bus
+ * node itself or a device below it. In the case of a multibus, the bus
+ * node itself is the controller node, else, it's a child of the controller
+ * node
+ */
+extern struct pmac_i2c_bus *pmac_i2c_find_bus(struct device_node *node);
+
+/* Get the address for an i2c device. This strips the bus number if
+ * necessary. The 7 bits address is returned 1 bit right shifted so that the
+ * direction can be directly ored in
+ */
+extern u8 pmac_i2c_get_dev_addr(struct device_node *device);
+
+/* Get infos about a bus */
+extern struct device_node *pmac_i2c_get_controller(struct pmac_i2c_bus *bus);
+extern struct device_node *pmac_i2c_get_bus_node(struct pmac_i2c_bus *bus);
+extern int pmac_i2c_get_type(struct pmac_i2c_bus *bus);
+extern int pmac_i2c_get_flags(struct pmac_i2c_bus *bus);
+
+/* i2c layer adapter attach/detach */
+extern void pmac_i2c_attach_adapter(struct pmac_i2c_bus *bus,
+				    struct i2c_adapter *adapter);
+extern void pmac_i2c_detach_adapter(struct pmac_i2c_bus *bus,
+				    struct i2c_adapter *adapter);
+extern struct i2c_adapter *pmac_i2c_get_adapter(struct pmac_i2c_bus *bus);
+
+/* March a device or bus with an i2c adapter structure, to be used by drivers
+ * to match device-tree nodes with i2c adapters during adapter discovery
+ * callbacks
+ */
+extern int pmac_i2c_match_adapter(struct device_node *dev,
+				  struct i2c_adapter *adapter);
+
 
-/* Locking functions exposed to i2c-keywest */
-int pmac_low_i2c_lock(struct device_node *np);
-int pmac_low_i2c_unlock(struct device_node *np);
+/* (legacy) Locking functions exposed to i2c-keywest */
+extern int pmac_low_i2c_lock(struct device_node *np);
+extern int pmac_low_i2c_unlock(struct device_node *np);
 
 /* Access functions for platform code */
-int pmac_low_i2c_open(struct device_node *np, int channel);
-int pmac_low_i2c_close(struct device_node *np);
-int pmac_low_i2c_setmode(struct device_node *np, int mode);
-int pmac_low_i2c_xfer(struct device_node *np, u8 addrdir, u8 subaddr, u8 *data, int len);
+extern int pmac_i2c_open(struct pmac_i2c_bus *bus, int polled);
+extern void pmac_i2c_close(struct pmac_i2c_bus *bus);
+extern int pmac_i2c_setmode(struct pmac_i2c_bus *bus, int mode);
+extern int pmac_i2c_xfer(struct pmac_i2c_bus *bus, u8 addrdir, int subsize,
+			 u32 subaddr, u8 *data,  int len);
 
 
 #endif /* __KERNEL__ */
diff --git a/include/asm-powerpc/smu.h b/include/asm-powerpc/smu.h
index 7fae3ce9a8c1..134c2b5be0f2 100644
--- a/include/asm-powerpc/smu.h
+++ b/include/asm-powerpc/smu.h
@@ -358,6 +358,9 @@ extern unsigned long smu_cmdbuf_abs;
  * Kenrel asynchronous i2c interface
  */
 
+#define SMU_I2C_READ_MAX	0x1d
+#define SMU_I2C_WRITE_MAX	0x15
+
 /* SMU i2c header, exactly matches i2c header on wire */
 struct smu_i2c_param
 {
@@ -368,12 +371,9 @@ struct smu_i2c_param
 	u8	subaddr[3];	/* subaddress */
 	u8	caddr;		/* combined address, filled by SMU driver */
 	u8	datalen;	/* length of transfer */
-	u8	data[7];	/* data */
+	u8	data[SMU_I2C_READ_MAX];	/* data */
 };
 
-#define SMU_I2C_READ_MAX	0x0d
-#define SMU_I2C_WRITE_MAX	0x05
-
 struct smu_i2c_cmd
 {
 	/* public */
@@ -387,7 +387,7 @@ struct smu_i2c_cmd
 	int			read;
 	int			stage;
 	int			retries;
-	u8			pdata[0x10];
+	u8			pdata[32];
 	struct list_head	link;
 };
 
@@ -519,7 +519,7 @@ struct smu_sdbp_cpupiddata {
  * if not found. The data format is described below
  */
 extern struct smu_sdbp_header *smu_get_sdb_partition(int id,
-						     unsigned int *size);
+					unsigned int *size);
 
 #endif /* __KERNEL__ */
 
diff --git a/include/linux/pmu.h b/include/linux/pmu.h
index 373bd3b9b330..217d3daf7336 100644
--- a/include/linux/pmu.h
+++ b/include/linux/pmu.h
@@ -140,7 +140,7 @@ extern int find_via_pmu(void);
 
 extern int pmu_request(struct adb_request *req,
 		void (*done)(struct adb_request *), int nbytes, ...);
-
+extern int pmu_queue_request(struct adb_request *req);
 extern void pmu_poll(void);
 extern void pmu_poll_adb(void); /* For use by xmon */
 extern void pmu_wait_complete(struct adb_request *req);
@@ -160,12 +160,6 @@ extern void pmu_unlock(void);
 extern int pmu_present(void);
 extern int pmu_get_model(void);
 
-extern int pmu_i2c_combined_read(int bus, int addr, int subaddr,  u8* data, int len);
-extern int pmu_i2c_stdsub_write(int bus, int addr, int subaddr,  u8* data, int len);
-extern int pmu_i2c_simple_read(int bus, int addr,  u8* data, int len);
-extern int pmu_i2c_simple_write(int bus, int addr,  u8* data, int len);
-
-
 #ifdef CONFIG_PM
 /*
  * Stuff for putting the powerbook to sleep and waking it again.
-- 
cgit v1.2.3-71-gd317


From 769db45b73896a88d6b40e3e648dfc50a155ec93 Mon Sep 17 00:00:00 2001
From: Coywolf Qi Hunt <qiyong@fc-cn.com>
Date: Wed, 28 Dec 2005 10:55:49 +0100
Subject: make elv_try_merge() static, kill the dead declaration of

elv_try_last_merge().

Signed-off-by: Coywolf Qi Hunt <qiyong@fc-cn.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/elevator.c         | 3 +--
 include/linux/elevator.h | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/block/elevator.c b/block/elevator.c
index 39dcccc82ada..99a4d7b2f8ad 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -64,7 +64,7 @@ inline int elv_rq_merge_ok(struct request *rq, struct bio *bio)
 }
 EXPORT_SYMBOL(elv_rq_merge_ok);
 
-inline int elv_try_merge(struct request *__rq, struct bio *bio)
+static inline int elv_try_merge(struct request *__rq, struct bio *bio)
 {
 	int ret = ELEVATOR_NO_MERGE;
 
@@ -80,7 +80,6 @@ inline int elv_try_merge(struct request *__rq, struct bio *bio)
 
 	return ret;
 }
-EXPORT_SYMBOL(elv_try_merge);
 
 static struct elevator_type *elevator_find(const char *name)
 {
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index fb80fa44c4dd..4a6f50e31c73 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -114,8 +114,6 @@ extern ssize_t elv_iosched_store(request_queue_t *, const char *, size_t);
 extern int elevator_init(request_queue_t *, char *);
 extern void elevator_exit(elevator_t *);
 extern int elv_rq_merge_ok(struct request *, struct bio *);
-extern int elv_try_merge(struct request *, struct bio *);
-extern int elv_try_last_merge(request_queue_t *, struct bio *);
 
 /*
  * Return values from elevator merger
-- 
cgit v1.2.3-71-gd317


From 356cebea1123804e4aa85b43ab39bbd0ac8e667c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Mon, 9 Jan 2006 15:30:20 +0100
Subject: [BLOCK] Kill blk_attempt_remerge()

It's a broken interface, it's done way too late. And apparently it triggers
slab problems in recent kernels as well (most likely after the generic dispatch
code was merged). So kill it, ide-cd is the only user of it.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/ll_rw_blk.c      | 24 ------------------------
 drivers/ide/ide-cd.c   | 10 ----------
 include/linux/blkdev.h |  1 -
 3 files changed, 35 deletions(-)

(limited to 'include/linux')

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index c182c9f4b2c4..c44d6fe9f6ce 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -2734,30 +2734,6 @@ static inline int attempt_front_merge(request_queue_t *q, struct request *rq)
 	return 0;
 }
 
-/**
- * blk_attempt_remerge  - attempt to remerge active head with next request
- * @q:    The &request_queue_t belonging to the device
- * @rq:   The head request (usually)
- *
- * Description:
- *    For head-active devices, the queue can easily be unplugged so quickly
- *    that proper merging is not done on the front request. This may hurt
- *    performance greatly for some devices. The block layer cannot safely
- *    do merging on that first request for these queues, but the driver can
- *    call this function and make it happen any way. Only the driver knows
- *    when it is safe to do so.
- **/
-void blk_attempt_remerge(request_queue_t *q, struct request *rq)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	attempt_back_merge(q, rq);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-EXPORT_SYMBOL(blk_attempt_remerge);
-
 static void init_request_from_bio(struct request *req, struct bio *bio)
 {
 	req->flags |= REQ_CMD;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index d31117eb95aa..e4d55ad32d2f 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1332,8 +1332,6 @@ static ide_startstop_t cdrom_start_read (ide_drive_t *drive, unsigned int block)
 	if (cdrom_read_from_buffer(drive))
 		return ide_stopped;
 
-	blk_attempt_remerge(drive->queue, rq);
-
 	/* Clear the local sector buffer. */
 	info->nsectors_buffered = 0;
 
@@ -1874,14 +1872,6 @@ static ide_startstop_t cdrom_start_write(ide_drive_t *drive, struct request *rq)
 		return ide_stopped;
 	}
 
-	/*
-	 * for dvd-ram and such media, it's a really big deal to get
-	 * big writes all the time. so scour the queue and attempt to
-	 * remerge requests, often the plugging will not have had time
-	 * to do this properly
-	 */
-	blk_attempt_remerge(drive->queue, rq);
-
 	info->nsectors_buffered = 0;
 
 	/* use dma, if possible. we don't need to check more, since we
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fb0985377421..96b233991685 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -592,7 +592,6 @@ extern void generic_make_request(struct bio *bio);
 extern void blk_put_request(struct request *);
 extern void __blk_put_request(request_queue_t *, struct request *);
 extern void blk_end_sync_rq(struct request *rq, int error);
-extern void blk_attempt_remerge(request_queue_t *, struct request *);
 extern struct request *blk_get_request(request_queue_t *, int, gfp_t);
 extern void blk_insert_request(request_queue_t *, struct request *, int, void *);
 extern void blk_requeue_request(request_queue_t *, struct request *);
-- 
cgit v1.2.3-71-gd317


From ff856bad67cb65cb4dc4ef88b808804fc4265782 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Mon, 9 Jan 2006 16:02:34 +0100
Subject: [BLOCK] ll_rw_blk: Enable out-of-order request completions through
 softirq

Request completion can be a quite heavy process, since it needs to
iterate through the entire request and complete the bio's it holds.
This patch adds blk_complete_request() which moves this processing
into a dedicated block softirq.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/ll_rw_blk.c         | 106 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/blkdev.h    |  21 +++++++--
 include/linux/interrupt.h |   1 +
 3 files changed, 124 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 91d3b4828c49..8e136450abc2 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -27,6 +27,8 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
 
 /*
  * for max sense size
@@ -62,13 +64,15 @@ static wait_queue_head_t congestion_wqh[2] = {
 /*
  * Controlling structure to kblockd
  */
-static struct workqueue_struct *kblockd_workqueue; 
+static struct workqueue_struct *kblockd_workqueue;
 
 unsigned long blk_max_low_pfn, blk_max_pfn;
 
 EXPORT_SYMBOL(blk_max_low_pfn);
 EXPORT_SYMBOL(blk_max_pfn);
 
+static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+
 /* Amount of time in which a process may batch requests */
 #define BLK_BATCH_TIME	(HZ/50UL)
 
@@ -207,6 +211,13 @@ void blk_queue_merge_bvec(request_queue_t *q, merge_bvec_fn *mbfn)
 
 EXPORT_SYMBOL(blk_queue_merge_bvec);
 
+void blk_queue_softirq_done(request_queue_t *q, softirq_done_fn *fn)
+{
+	q->softirq_done_fn = fn;
+}
+
+EXPORT_SYMBOL(blk_queue_softirq_done);
+
 /**
  * blk_queue_make_request - define an alternate make_request function for a device
  * @q:  the request queue for the device to be affected
@@ -270,6 +281,7 @@ EXPORT_SYMBOL(blk_queue_make_request);
 static inline void rq_init(request_queue_t *q, struct request *rq)
 {
 	INIT_LIST_HEAD(&rq->queuelist);
+	INIT_LIST_HEAD(&rq->donelist);
 
 	rq->errors = 0;
 	rq->rq_status = RQ_ACTIVE;
@@ -286,6 +298,7 @@ static inline void rq_init(request_queue_t *q, struct request *rq)
 	rq->sense = NULL;
 	rq->end_io = NULL;
 	rq->end_io_data = NULL;
+	rq->completion_data = NULL;
 }
 
 /**
@@ -3286,6 +3299,87 @@ int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes)
 
 EXPORT_SYMBOL(end_that_request_chunk);
 
+/*
+ * splice the completion data to a local structure and hand off to
+ * process_completion_queue() to complete the requests
+ */
+static void blk_done_softirq(struct softirq_action *h)
+{
+	struct list_head *cpu_list;
+	LIST_HEAD(local_list);
+
+	local_irq_disable();
+	cpu_list = &__get_cpu_var(blk_cpu_done);
+	list_splice_init(cpu_list, &local_list);
+	local_irq_enable();
+
+	while (!list_empty(&local_list)) {
+		struct request *rq = list_entry(local_list.next, struct request, donelist);
+
+		list_del_init(&rq->donelist);
+		rq->q->softirq_done_fn(rq);
+	}
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
+			  void *hcpu)
+{
+	/*
+	 * If a CPU goes away, splice its entries to the current CPU
+	 * and trigger a run of the softirq
+	 */
+	if (action == CPU_DEAD) {
+		int cpu = (unsigned long) hcpu;
+
+		local_irq_disable();
+		list_splice_init(&per_cpu(blk_cpu_done, cpu),
+				 &__get_cpu_var(blk_cpu_done));
+		raise_softirq_irqoff(BLOCK_SOFTIRQ);
+		local_irq_enable();
+	}
+
+	return NOTIFY_OK;
+}
+
+
+static struct notifier_block __devinitdata blk_cpu_notifier = {
+	.notifier_call	= blk_cpu_notify,
+};
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/**
+ * blk_complete_request - end I/O on a request
+ * @req:      the request being processed
+ *
+ * Description:
+ *     Ends all I/O on a request. It does not handle partial completions,
+ *     unless the driver actually implements this in its completionc callback
+ *     through requeueing. Theh actual completion happens out-of-order,
+ *     through a softirq handler. The user must have registered a completion
+ *     callback through blk_queue_softirq_done().
+ **/
+
+void blk_complete_request(struct request *req)
+{
+	struct list_head *cpu_list;
+	unsigned long flags;
+
+	BUG_ON(!req->q->softirq_done_fn);
+		
+	local_irq_save(flags);
+
+	cpu_list = &__get_cpu_var(blk_cpu_done);
+	list_add_tail(&req->donelist, cpu_list);
+	raise_softirq_irqoff(BLOCK_SOFTIRQ);
+
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(blk_complete_request);
+	
 /*
  * queue lock must be held
  */
@@ -3364,6 +3458,8 @@ EXPORT_SYMBOL(kblockd_flush);
 
 int __init blk_dev_init(void)
 {
+	int i;
+
 	kblockd_workqueue = create_workqueue("kblockd");
 	if (!kblockd_workqueue)
 		panic("Failed to create kblockd\n");
@@ -3377,6 +3473,14 @@ int __init blk_dev_init(void)
 	iocontext_cachep = kmem_cache_create("blkdev_ioc",
 			sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL);
 
+	for (i = 0; i < NR_CPUS; i++)
+		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
+
+	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
+#ifdef CONFIG_HOTPLUG_CPU
+	register_cpu_notifier(&blk_cpu_notifier);
+#endif
+
 	blk_max_low_pfn = max_low_pfn;
 	blk_max_pfn = max_pfn;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fb0985377421..804cc4ec9533 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -118,9 +118,9 @@ struct request_list {
  * try to put the fields that are referenced together in the same cacheline
  */
 struct request {
-	struct list_head queuelist; /* looking for ->queue? you must _not_
-				     * access it directly, use
-				     * blkdev_dequeue_request! */
+	struct list_head queuelist;
+	struct list_head donelist;
+
 	unsigned long flags;		/* see REQ_ bits below */
 
 	/* Maintain bio traversal state for part by part I/O submission.
@@ -141,6 +141,7 @@ struct request {
 	struct bio *biotail;
 
 	void *elevator_private;
+	void *completion_data;
 
 	unsigned short ioprio;
 
@@ -291,6 +292,7 @@ typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *);
 typedef void (activity_fn) (void *data, int rw);
 typedef int (issue_flush_fn) (request_queue_t *, struct gendisk *, sector_t *);
 typedef void (prepare_flush_fn) (request_queue_t *, struct request *);
+typedef void (softirq_done_fn)(struct request *);
 
 enum blk_queue_state {
 	Queue_down,
@@ -332,6 +334,7 @@ struct request_queue
 	activity_fn		*activity_fn;
 	issue_flush_fn		*issue_flush_fn;
 	prepare_flush_fn	*prepare_flush_fn;
+	softirq_done_fn		*softirq_done_fn;
 
 	/*
 	 * Dispatch queue sorting
@@ -646,6 +649,17 @@ extern int end_that_request_first(struct request *, int, int);
 extern int end_that_request_chunk(struct request *, int, int);
 extern void end_that_request_last(struct request *, int);
 extern void end_request(struct request *req, int uptodate);
+extern void blk_complete_request(struct request *);
+
+static inline int rq_all_done(struct request *rq, unsigned int nr_bytes)
+{
+	if (blk_fs_request(rq))
+		return (nr_bytes >= (rq->hard_nr_sectors << 9));
+	else if (blk_pc_request(rq))
+		return nr_bytes >= rq->data_len;
+
+	return 0;
+}
 
 /*
  * end_that_request_first/chunk() takes an uptodate argument. we account
@@ -694,6 +708,7 @@ extern void blk_queue_segment_boundary(request_queue_t *, unsigned long);
 extern void blk_queue_prep_rq(request_queue_t *, prep_rq_fn *pfn);
 extern void blk_queue_merge_bvec(request_queue_t *, merge_bvec_fn *);
 extern void blk_queue_dma_alignment(request_queue_t *, int);
+extern void blk_queue_softirq_done(request_queue_t *, softirq_done_fn *);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 extern int blk_queue_ordered(request_queue_t *, unsigned, prepare_flush_fn *);
 extern void blk_queue_issue_flush_fn(request_queue_t *, issue_flush_fn *);
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index e50a95fbeb11..f02204706984 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -112,6 +112,7 @@ enum
 	TIMER_SOFTIRQ,
 	NET_TX_SOFTIRQ,
 	NET_RX_SOFTIRQ,
+	BLOCK_SOFTIRQ,
 	SCSI_SOFTIRQ,
 	TASKLET_SOFTIRQ
 };
-- 
cgit v1.2.3-71-gd317


From 1aea6434eebd25e532d2e5ddabf2733af4e1ff0b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Mon, 9 Jan 2006 16:03:03 +0100
Subject: [SCSI] Kill the SCSI softirq handling

This patch moves the SCSI softirq handling to the block layer version.
There should be no functional changes.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/scsi/scsi.c       | 109 ++++------------------------------------------
 drivers/scsi/scsi_lib.c   |  36 +++++++++++++++
 drivers/scsi/scsi_priv.h  |   1 +
 include/linux/interrupt.h |   1 -
 4 files changed, 45 insertions(+), 102 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 180676d7115a..ee5f4dfdab14 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -69,7 +69,6 @@
 #include "scsi_logging.h"
 
 static void scsi_done(struct scsi_cmnd *cmd);
-static int scsi_retry_command(struct scsi_cmnd *cmd);
 
 /*
  * Definitions and constants.
@@ -752,7 +751,7 @@ static void scsi_done(struct scsi_cmnd *cmd)
  * isn't running --- used by scsi_times_out */
 void __scsi_done(struct scsi_cmnd *cmd)
 {
-	unsigned long flags;
+	struct request *rq = cmd->request;
 
 	/*
 	 * Set the serial numbers back to zero
@@ -763,71 +762,14 @@ void __scsi_done(struct scsi_cmnd *cmd)
 	if (cmd->result)
 		atomic_inc(&cmd->device->ioerr_cnt);
 
+	BUG_ON(!rq);
+
 	/*
-	 * Next, enqueue the command into the done queue.
-	 * It is a per-CPU queue, so we just disable local interrupts
-	 * and need no spinlock.
+	 * The uptodate/nbytes values don't matter, as we allow partial
+	 * completes and thus will check this in the softirq callback
 	 */
-	local_irq_save(flags);
-	list_add_tail(&cmd->eh_entry, &__get_cpu_var(scsi_done_q));
-	raise_softirq_irqoff(SCSI_SOFTIRQ);
-	local_irq_restore(flags);
-}
-
-/**
- * scsi_softirq - Perform post-interrupt processing of finished SCSI commands.
- *
- * This is the consumer of the done queue.
- *
- * This is called with all interrupts enabled.  This should reduce
- * interrupt latency, stack depth, and reentrancy of the low-level
- * drivers.
- */
-static void scsi_softirq(struct softirq_action *h)
-{
-	int disposition;
-	LIST_HEAD(local_q);
-
-	local_irq_disable();
-	list_splice_init(&__get_cpu_var(scsi_done_q), &local_q);
-	local_irq_enable();
-
-	while (!list_empty(&local_q)) {
-		struct scsi_cmnd *cmd = list_entry(local_q.next,
-						   struct scsi_cmnd, eh_entry);
-		/* The longest time any command should be outstanding is the
-		 * per command timeout multiplied by the number of retries.
-		 *
-		 * For a typical command, this is 2.5 minutes */
-		unsigned long wait_for 
-			= cmd->allowed * cmd->timeout_per_command;
-		list_del_init(&cmd->eh_entry);
-
-		disposition = scsi_decide_disposition(cmd);
-		if (disposition != SUCCESS &&
-		    time_before(cmd->jiffies_at_alloc + wait_for, jiffies)) {
-			sdev_printk(KERN_ERR, cmd->device,
-				    "timing out command, waited %lus\n",
-				    wait_for/HZ);
-			disposition = SUCCESS;
-		}
-			
-		scsi_log_completion(cmd, disposition);
-		switch (disposition) {
-		case SUCCESS:
-			scsi_finish_command(cmd);
-			break;
-		case NEEDS_RETRY:
-			scsi_retry_command(cmd);
-			break;
-		case ADD_TO_MLQUEUE:
-			scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
-			break;
-		default:
-			if (!scsi_eh_scmd_add(cmd, 0))
-				scsi_finish_command(cmd);
-		}
-	}
+	rq->completion_data = cmd;
+	blk_complete_request(rq);
 }
 
 /*
@@ -840,7 +782,7 @@ static void scsi_softirq(struct softirq_action *h)
  *              level drivers should not become re-entrant as a result of
  *              this.
  */
-static int scsi_retry_command(struct scsi_cmnd *cmd)
+int scsi_retry_command(struct scsi_cmnd *cmd)
 {
 	/*
 	 * Restore the SCSI command state.
@@ -1273,38 +1215,6 @@ int scsi_device_cancel(struct scsi_device *sdev, int recovery)
 }
 EXPORT_SYMBOL(scsi_device_cancel);
 
-#ifdef CONFIG_HOTPLUG_CPU
-static int scsi_cpu_notify(struct notifier_block *self,
-			   unsigned long action, void *hcpu)
-{
-	int cpu = (unsigned long)hcpu;
-
-	switch(action) {
-	case CPU_DEAD:
-		/* Drain scsi_done_q. */
-		local_irq_disable();
-		list_splice_init(&per_cpu(scsi_done_q, cpu),
-				 &__get_cpu_var(scsi_done_q));
-		raise_softirq_irqoff(SCSI_SOFTIRQ);
-		local_irq_enable();
-		break;
-	default:
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-static struct notifier_block __devinitdata scsi_cpu_nb = {
-	.notifier_call	= scsi_cpu_notify,
-};
-
-#define register_scsi_cpu() register_cpu_notifier(&scsi_cpu_nb)
-#define unregister_scsi_cpu() unregister_cpu_notifier(&scsi_cpu_nb)
-#else
-#define register_scsi_cpu()
-#define unregister_scsi_cpu()
-#endif /* CONFIG_HOTPLUG_CPU */
-
 MODULE_DESCRIPTION("SCSI core");
 MODULE_LICENSE("GPL");
 
@@ -1338,8 +1248,6 @@ static int __init init_scsi(void)
 		INIT_LIST_HEAD(&per_cpu(scsi_done_q, i));
 
 	devfs_mk_dir("scsi");
-	open_softirq(SCSI_SOFTIRQ, scsi_softirq, NULL);
-	register_scsi_cpu();
 	printk(KERN_NOTICE "SCSI subsystem initialized\n");
 	return 0;
 
@@ -1367,7 +1275,6 @@ static void __exit exit_scsi(void)
 	devfs_remove("scsi");
 	scsi_exit_procfs();
 	scsi_exit_queue();
-	unregister_scsi_cpu();
 }
 
 subsys_initcall(init_scsi);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index ba93d6e66d48..00c9bf383e23 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1493,6 +1493,41 @@ static void scsi_kill_request(struct request *req, request_queue_t *q)
 	__scsi_done(cmd);
 }
 
+static void scsi_softirq_done(struct request *rq)
+{
+	struct scsi_cmnd *cmd = rq->completion_data;
+	unsigned long wait_for = cmd->allowed * cmd->timeout_per_command;
+	int disposition;
+
+	INIT_LIST_HEAD(&cmd->eh_entry);
+
+	disposition = scsi_decide_disposition(cmd);
+	if (disposition != SUCCESS &&
+	    time_before(cmd->jiffies_at_alloc + wait_for, jiffies)) {
+		sdev_printk(KERN_ERR, cmd->device,
+			    "timing out command, waited %lus\n",
+			    wait_for/HZ);
+		disposition = SUCCESS;
+	}
+			
+	scsi_log_completion(cmd, disposition);
+
+	switch (disposition) {
+		case SUCCESS:
+			scsi_finish_command(cmd);
+			break;
+		case NEEDS_RETRY:
+			scsi_retry_command(cmd);
+			break;
+		case ADD_TO_MLQUEUE:
+			scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
+			break;
+		default:
+			if (!scsi_eh_scmd_add(cmd, 0))
+				scsi_finish_command(cmd);
+	}
+}
+
 /*
  * Function:    scsi_request_fn()
  *
@@ -1667,6 +1702,7 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
 	blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
 	blk_queue_segment_boundary(q, shost->dma_boundary);
 	blk_queue_issue_flush_fn(q, scsi_issue_flush_fn);
+	blk_queue_softirq_done(q, scsi_softirq_done);
 
 	if (!shost->use_clustering)
 		clear_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index f04e7e11f57a..14a6198cb8d2 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -44,6 +44,7 @@ extern void scsi_init_cmd_from_req(struct scsi_cmnd *cmd,
 		struct scsi_request *sreq);
 extern void __scsi_release_request(struct scsi_request *sreq);
 extern void __scsi_done(struct scsi_cmnd *cmd);
+extern int scsi_retry_command(struct scsi_cmnd *cmd);
 #ifdef CONFIG_SCSI_LOGGING
 void scsi_log_send(struct scsi_cmnd *cmd);
 void scsi_log_completion(struct scsi_cmnd *cmd, int disposition);
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index f02204706984..2c08fdc2bdf7 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -113,7 +113,6 @@ enum
 	NET_TX_SOFTIRQ,
 	NET_RX_SOFTIRQ,
 	BLOCK_SOFTIRQ,
-	SCSI_SOFTIRQ,
 	TASKLET_SOFTIRQ
 };
 
-- 
cgit v1.2.3-71-gd317


From 8672d57138b34447719cd7749f3d21070e1175a1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Mon, 9 Jan 2006 16:03:35 +0100
Subject: [IDE] Use the block layer deferred softirq request completion

This patch makes IDE use the new blk_complete_request() interface.
There's still room for improvement, as __ide_end_request() really
could drop the lock after getting HWGROUP->rq (why does it need to
hold it in the first place? If ->rq access isn't serialized, we are
screwed anyways).

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/ide/ide-io.c    | 42 +++++++++++++++++++++++++++++++++++-------
 drivers/ide/ide-probe.c |  2 ++
 include/linux/ide.h     |  1 +
 3 files changed, 38 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index b5dc6df8e67d..dea2d4dcc698 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -55,9 +55,22 @@
 #include <asm/io.h>
 #include <asm/bitops.h>
 
+void ide_softirq_done(struct request *rq)
+{
+	request_queue_t *q = rq->q;
+
+	add_disk_randomness(rq->rq_disk);
+	end_that_request_chunk(rq, rq->errors, rq->data_len);
+
+	spin_lock_irq(q->queue_lock);
+	end_that_request_last(rq, rq->errors);
+	spin_unlock_irq(q->queue_lock);
+}
+
 int __ide_end_request(ide_drive_t *drive, struct request *rq, int uptodate,
 		      int nr_sectors)
 {
+	unsigned int nbytes;
 	int ret = 1;
 
 	BUG_ON(!(rq->flags & REQ_STARTED));
@@ -81,17 +94,28 @@ int __ide_end_request(ide_drive_t *drive, struct request *rq, int uptodate,
 		HWGROUP(drive)->hwif->ide_dma_on(drive);
 	}
 
-	if (!end_that_request_first(rq, uptodate, nr_sectors)) {
-		add_disk_randomness(rq->rq_disk);
-
-		if (blk_rq_tagged(rq))
-			blk_queue_end_tag(drive->queue, rq);
-
+	/*
+	 * For partial completions (or non fs/pc requests), use the regular
+	 * direct completion path.
+	 */
+	nbytes = nr_sectors << 9;
+	if (rq_all_done(rq, nbytes)) {
+		rq->errors = uptodate;
+		rq->data_len = nbytes;
 		blkdev_dequeue_request(rq);
 		HWGROUP(drive)->rq = NULL;
-		end_that_request_last(rq, uptodate);
+		blk_complete_request(rq);
 		ret = 0;
+	} else {
+		if (!end_that_request_first(rq, uptodate, nr_sectors)) {
+			add_disk_randomness(rq->rq_disk);
+			blkdev_dequeue_request(rq);
+			HWGROUP(drive)->rq = NULL;
+			end_that_request_last(rq, uptodate);
+			ret = 0;
+		}
 	}
+
 	return ret;
 }
 EXPORT_SYMBOL(__ide_end_request);
@@ -113,6 +137,10 @@ int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors)
 	unsigned long flags;
 	int ret = 1;
 
+	/*
+	 * room for locking improvements here, the calls below don't
+	 * need the queue lock held at all
+	 */
 	spin_lock_irqsave(&ide_lock, flags);
 	rq = HWGROUP(drive)->rq;
 
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 02167a5b751d..1ddaa71a8f45 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -1011,6 +1011,8 @@ static int ide_init_queue(ide_drive_t *drive)
 	blk_queue_max_hw_segments(q, max_sg_entries);
 	blk_queue_max_phys_segments(q, max_sg_entries);
 
+	blk_queue_softirq_done(q, ide_softirq_done);
+
 	/* assign drive queue */
 	drive->queue = q;
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 4dd6694963c0..ef8d0cbb832f 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1001,6 +1001,7 @@ extern int noautodma;
 
 extern int ide_end_request (ide_drive_t *drive, int uptodate, int nrsecs);
 extern int __ide_end_request (ide_drive_t *drive, struct request *rq, int uptodate, int nrsecs);
+extern void ide_softirq_done(struct request *rq);
 
 /*
  * This is used on exit from the driver to designate the next irq handler
-- 
cgit v1.2.3-71-gd317


From 0d0fbf8152fb3bb4393be11e8df7f70e1fbbd738 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 9 Jan 2006 15:24:57 -0200
Subject: V4L (926_2): Moves compat32 functions from fs to v4l subsystem

This moves the 32 bit ioctl compatibility handlers for
Video4Linux into a new file and adds explicit calls to them
to each v4l device driver.

Unfortunately, there does not seem to be any code handling
the v4l2 ioctls, so quite often the code goes through two
separate conversions, first from 32 bit v4l to 64 bit v4l,
and from there to 64 bit v4l2. My patch does not change
that, so there is still much room for improvement.

Also, some drivers have additional ioctl numbers, for
which the conversion should be handled internally to
that driver.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Mauro Carvalho Chehab <mchehab@brturbo.com.br>
---
 drivers/media/radio/miropcm20-radio.c       |   1 +
 drivers/media/radio/radio-aimslab.c         |   1 +
 drivers/media/radio/radio-aztech.c          |   1 +
 drivers/media/radio/radio-cadet.c           |   1 +
 drivers/media/radio/radio-gemtek-pci.c      |   1 +
 drivers/media/radio/radio-gemtek.c          |   1 +
 drivers/media/radio/radio-maestro.c         |   1 +
 drivers/media/radio/radio-maxiradio.c       |   1 +
 drivers/media/radio/radio-rtrack2.c         |   1 +
 drivers/media/radio/radio-sf16fmi.c         |   1 +
 drivers/media/radio/radio-sf16fmr2.c        |   1 +
 drivers/media/radio/radio-terratec.c        |   1 +
 drivers/media/radio/radio-trust.c           |   1 +
 drivers/media/radio/radio-typhoon.c         |   1 +
 drivers/media/radio/radio-zoltrix.c         |   1 +
 drivers/media/video/Makefile                |   3 +-
 drivers/media/video/arv.c                   |   1 +
 drivers/media/video/bttv-driver.c           |   1 +
 drivers/media/video/bw-qcam.c               |   1 +
 drivers/media/video/c-qcam.c                |   1 +
 drivers/media/video/compat_ioctl32.c        | 318 ++++++++++++++++++++++++++++
 drivers/media/video/cpia.c                  |   1 +
 drivers/media/video/cx88/cx88-video.c       |   2 +
 drivers/media/video/meye.c                  |   1 +
 drivers/media/video/pms.c                   |   1 +
 drivers/media/video/saa5249.c               |   1 +
 drivers/media/video/saa7134/saa7134-video.c |   2 +
 drivers/media/video/stradis.c               |   1 +
 drivers/media/video/w9966.c                 |   1 +
 drivers/media/video/zoran_driver.c          |   1 +
 drivers/media/video/zr36120.c               |   1 +
 drivers/usb/media/dsbr100.c                 |   1 +
 drivers/usb/media/ov511.c                   |   1 +
 drivers/usb/media/pwc/pwc-if.c              |   1 +
 drivers/usb/media/se401.c                   |   1 +
 drivers/usb/media/stv680.c                  |   1 +
 drivers/usb/media/usbvideo.c                |   1 +
 drivers/usb/media/vicam.c                   |   1 +
 drivers/usb/media/w9968cf.c                 |   1 +
 fs/compat_ioctl.c                           | 246 ---------------------
 include/linux/compat_ioctl.h                |  26 ---
 include/linux/videodev2.h                   |   3 +
 42 files changed, 362 insertions(+), 273 deletions(-)
 create mode 100644 drivers/media/video/compat_ioctl32.c

(limited to 'include/linux')

diff --git a/drivers/media/radio/miropcm20-radio.c b/drivers/media/radio/miropcm20-radio.c
index c2ebe8754a95..dc292da2605f 100644
--- a/drivers/media/radio/miropcm20-radio.c
+++ b/drivers/media/radio/miropcm20-radio.c
@@ -220,6 +220,7 @@ static struct file_operations pcm20_fops = {
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl		= pcm20_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek         = no_llseek,
 };
 
diff --git a/drivers/media/radio/radio-aimslab.c b/drivers/media/radio/radio-aimslab.c
index 877c770558e9..914deab4e044 100644
--- a/drivers/media/radio/radio-aimslab.c
+++ b/drivers/media/radio/radio-aimslab.c
@@ -299,6 +299,7 @@ static struct file_operations rtrack_fops = {
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl	        = rt_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek         = no_llseek,
 };
 
diff --git a/drivers/media/radio/radio-aztech.c b/drivers/media/radio/radio-aztech.c
index 5319a9c9a979..523be820f9c6 100644
--- a/drivers/media/radio/radio-aztech.c
+++ b/drivers/media/radio/radio-aztech.c
@@ -256,6 +256,7 @@ static struct file_operations aztech_fops = {
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl		= az_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek         = no_llseek,
 };
 
diff --git a/drivers/media/radio/radio-cadet.c b/drivers/media/radio/radio-cadet.c
index 9b0406318f2d..f1b5ac81e9d2 100644
--- a/drivers/media/radio/radio-cadet.c
+++ b/drivers/media/radio/radio-cadet.c
@@ -490,6 +490,7 @@ static struct file_operations cadet_fops = {
 	.release       	= cadet_release,
 	.read		= cadet_read,
 	.ioctl		= cadet_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek         = no_llseek,
 };
 
diff --git a/drivers/media/radio/radio-gemtek-pci.c b/drivers/media/radio/radio-gemtek-pci.c
index 630cc786d0a4..42c8fce04aa2 100644
--- a/drivers/media/radio/radio-gemtek-pci.c
+++ b/drivers/media/radio/radio-gemtek-pci.c
@@ -301,6 +301,7 @@ static struct file_operations gemtek_pci_fops = {
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl		= gemtek_pci_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek         = no_llseek,
 };
 
diff --git a/drivers/media/radio/radio-gemtek.c b/drivers/media/radio/radio-gemtek.c
index 6418f03b9ce4..47173be97b9f 100644
--- a/drivers/media/radio/radio-gemtek.c
+++ b/drivers/media/radio/radio-gemtek.c
@@ -233,6 +233,7 @@ static struct file_operations gemtek_fops = {
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl		= gemtek_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek         = no_llseek,
 };
 
diff --git a/drivers/media/radio/radio-maestro.c b/drivers/media/radio/radio-maestro.c
index e5e2021a7312..c30effdf711f 100644
--- a/drivers/media/radio/radio-maestro.c
+++ b/drivers/media/radio/radio-maestro.c
@@ -72,6 +72,7 @@ static struct file_operations maestro_fops = {
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl		= radio_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek         = no_llseek,
 };
 
diff --git a/drivers/media/radio/radio-maxiradio.c b/drivers/media/radio/radio-maxiradio.c
index 02d39a50d5ed..30869308332a 100644
--- a/drivers/media/radio/radio-maxiradio.c
+++ b/drivers/media/radio/radio-maxiradio.c
@@ -80,6 +80,7 @@ static struct file_operations maxiradio_fops = {
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl	        = radio_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek         = no_llseek,
 };
 static struct video_device maxiradio_radio =
diff --git a/drivers/media/radio/radio-rtrack2.c b/drivers/media/radio/radio-rtrack2.c
index b2256d675b44..28a47c9e7a81 100644
--- a/drivers/media/radio/radio-rtrack2.c
+++ b/drivers/media/radio/radio-rtrack2.c
@@ -199,6 +199,7 @@ static struct file_operations rtrack2_fops = {
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl		= rt_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek         = no_llseek,
 };
 
diff --git a/drivers/media/radio/radio-sf16fmi.c b/drivers/media/radio/radio-sf16fmi.c
index 6f03ce4dd7b0..0229f792a059 100644
--- a/drivers/media/radio/radio-sf16fmi.c
+++ b/drivers/media/radio/radio-sf16fmi.c
@@ -225,6 +225,7 @@ static struct file_operations fmi_fops = {
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl		= fmi_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek         = no_llseek,
 };
 
diff --git a/drivers/media/radio/radio-sf16fmr2.c b/drivers/media/radio/radio-sf16fmr2.c
index 71971e9bb342..26632cead09a 100644
--- a/drivers/media/radio/radio-sf16fmr2.c
+++ b/drivers/media/radio/radio-sf16fmr2.c
@@ -356,6 +356,7 @@ static struct file_operations fmr2_fops = {
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl          = fmr2_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek         = no_llseek,
 };
 
diff --git a/drivers/media/radio/radio-terratec.c b/drivers/media/radio/radio-terratec.c
index b03573c6840e..fcfde2e4f195 100644
--- a/drivers/media/radio/radio-terratec.c
+++ b/drivers/media/radio/radio-terratec.c
@@ -276,6 +276,7 @@ static struct file_operations terratec_fops = {
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl		= tt_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek         = no_llseek,
 };
 
diff --git a/drivers/media/radio/radio-trust.c b/drivers/media/radio/radio-trust.c
index b300bedf7c74..5a099a50d4d0 100644
--- a/drivers/media/radio/radio-trust.c
+++ b/drivers/media/radio/radio-trust.c
@@ -255,6 +255,7 @@ static struct file_operations trust_fops = {
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl		= tr_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek         = no_llseek,
 };
 
diff --git a/drivers/media/radio/radio-typhoon.c b/drivers/media/radio/radio-typhoon.c
index f304f3c14763..8ac9a8ef9094 100644
--- a/drivers/media/radio/radio-typhoon.c
+++ b/drivers/media/radio/radio-typhoon.c
@@ -261,6 +261,7 @@ static struct file_operations typhoon_fops = {
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl		= typhoon_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek         = no_llseek,
 };
 
diff --git a/drivers/media/radio/radio-zoltrix.c b/drivers/media/radio/radio-zoltrix.c
index 4c6d6fb49034..d590e80c922e 100644
--- a/drivers/media/radio/radio-zoltrix.c
+++ b/drivers/media/radio/radio-zoltrix.c
@@ -313,6 +313,7 @@ static struct file_operations zoltrix_fops =
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl		= zol_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek         = no_llseek,
 };
 
diff --git a/drivers/media/video/Makefile b/drivers/media/video/Makefile
index 82060f9909d8..618a08ab940a 100644
--- a/drivers/media/video/Makefile
+++ b/drivers/media/video/Makefile
@@ -8,7 +8,8 @@ zoran-objs      :=	zr36120.o zr36120_i2c.o zr36120_mem.o
 zr36067-objs	:=	zoran_procfs.o zoran_device.o \
 			zoran_driver.o zoran_card.o
 tuner-objs	:=	tuner-core.o tuner-simple.o mt20xx.o tda8290.o tea5767.o
-obj-$(CONFIG_VIDEO_DEV) += videodev.o v4l2-common.o v4l1-compat.o
+
+obj-$(CONFIG_VIDEO_DEV) += videodev.o v4l2-common.o v4l1-compat.o compat_ioctl32.o
 
 obj-$(CONFIG_VIDEO_BT848) += bttv.o msp3400.o tvaudio.o \
 	tda7432.o tda9875.o ir-kbd-i2c.o ir-kbd-gpio.o
diff --git a/drivers/media/video/arv.c b/drivers/media/video/arv.c
index 881cdcb1875d..7d5a068353f2 100644
--- a/drivers/media/video/arv.c
+++ b/drivers/media/video/arv.c
@@ -749,6 +749,7 @@ static struct file_operations ar_fops = {
 	.release	= video_exclusive_release,
 	.read		= ar_read,
 	.ioctl		= ar_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek		= no_llseek,
 };
 
diff --git a/drivers/media/video/bttv-driver.c b/drivers/media/video/bttv-driver.c
index 1ddf9ba613ef..03f925724ce9 100644
--- a/drivers/media/video/bttv-driver.c
+++ b/drivers/media/video/bttv-driver.c
@@ -3120,6 +3120,7 @@ static struct file_operations bttv_fops =
 	.open	  = bttv_open,
 	.release  = bttv_release,
 	.ioctl	  = bttv_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek	  = no_llseek,
 	.read	  = bttv_read,
 	.mmap	  = bttv_mmap,
diff --git a/drivers/media/video/bw-qcam.c b/drivers/media/video/bw-qcam.c
index 0065d0c240d1..6bad93ef969f 100644
--- a/drivers/media/video/bw-qcam.c
+++ b/drivers/media/video/bw-qcam.c
@@ -875,6 +875,7 @@ static struct file_operations qcam_fops = {
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl          = qcam_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.read		= qcam_read,
 	.llseek         = no_llseek,
 };
diff --git a/drivers/media/video/c-qcam.c b/drivers/media/video/c-qcam.c
index 75442ec49f35..9976db4f6da8 100644
--- a/drivers/media/video/c-qcam.c
+++ b/drivers/media/video/c-qcam.c
@@ -687,6 +687,7 @@ static struct file_operations qcam_fops = {
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl          = qcam_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.read		= qcam_read,
 	.llseek         = no_llseek,
 };
diff --git a/drivers/media/video/compat_ioctl32.c b/drivers/media/video/compat_ioctl32.c
new file mode 100644
index 000000000000..42dc11c63c0d
--- /dev/null
+++ b/drivers/media/video/compat_ioctl32.c
@@ -0,0 +1,318 @@
+#include <linux/config.h>
+#include <linux/compat.h>
+#include <linux/videodev.h>
+
+#ifdef CONFIG_COMPAT
+struct video_tuner32 {
+	compat_int_t tuner;
+	char name[32];
+	compat_ulong_t rangelow, rangehigh;
+	u32 flags;	/* It is really u32 in videodev.h */
+	u16 mode, signal;
+};
+
+static int get_video_tuner32(struct video_tuner *kp, struct video_tuner32 __user *up)
+{
+	int i;
+
+	if(get_user(kp->tuner, &up->tuner))
+		return -EFAULT;
+	for(i = 0; i < 32; i++)
+		__get_user(kp->name[i], &up->name[i]);
+	__get_user(kp->rangelow, &up->rangelow);
+	__get_user(kp->rangehigh, &up->rangehigh);
+	__get_user(kp->flags, &up->flags);
+	__get_user(kp->mode, &up->mode);
+	__get_user(kp->signal, &up->signal);
+	return 0;
+}
+
+static int put_video_tuner32(struct video_tuner *kp, struct video_tuner32 __user *up)
+{
+	int i;
+
+	if(put_user(kp->tuner, &up->tuner))
+		return -EFAULT;
+	for(i = 0; i < 32; i++)
+		__put_user(kp->name[i], &up->name[i]);
+	__put_user(kp->rangelow, &up->rangelow);
+	__put_user(kp->rangehigh, &up->rangehigh);
+	__put_user(kp->flags, &up->flags);
+	__put_user(kp->mode, &up->mode);
+	__put_user(kp->signal, &up->signal);
+	return 0;
+}
+
+struct video_buffer32 {
+	compat_caddr_t base;
+	compat_int_t height, width, depth, bytesperline;
+};
+
+static int get_video_buffer32(struct video_buffer *kp, struct video_buffer32 __user *up)
+{
+	u32 tmp;
+
+	if (get_user(tmp, &up->base))
+		return -EFAULT;
+
+	/* This is actually a physical address stored
+	 * as a void pointer.
+	 */
+	kp->base = (void *)(unsigned long) tmp;
+
+	__get_user(kp->height, &up->height);
+	__get_user(kp->width, &up->width);
+	__get_user(kp->depth, &up->depth);
+	__get_user(kp->bytesperline, &up->bytesperline);
+	return 0;
+}
+
+static int put_video_buffer32(struct video_buffer *kp, struct video_buffer32 __user *up)
+{
+	u32 tmp = (u32)((unsigned long)kp->base);
+
+	if(put_user(tmp, &up->base))
+		return -EFAULT;
+	__put_user(kp->height, &up->height);
+	__put_user(kp->width, &up->width);
+	__put_user(kp->depth, &up->depth);
+	__put_user(kp->bytesperline, &up->bytesperline);
+	return 0;
+}
+
+struct video_clip32 {
+	s32 x, y, width, height;	/* Its really s32 in videodev.h */
+	compat_caddr_t next;
+};
+
+struct video_window32 {
+	u32 x, y, width, height, chromakey, flags;
+	compat_caddr_t clips;
+	compat_int_t clipcount;
+};
+
+static int native_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int ret = -ENOIOCTLCMD;
+
+	if (file->f_ops->unlocked_ioctl)
+		ret = file->f_ops->unlocked_ioctl(file, cmd, arg);
+	else if (file->f_ops->ioctl) {
+		lock_kernel();
+		ret = file->f_ops->ioctl(file->f_dentry->d_inode, file, cmd, arg);
+		unlock_kernel();
+	}
+
+	return ret;
+}
+
+
+/* You get back everything except the clips... */
+static int put_video_window32(struct video_window *kp, struct video_window32 __user *up)
+{
+	if(put_user(kp->x, &up->x))
+		return -EFAULT;
+	__put_user(kp->y, &up->y);
+	__put_user(kp->width, &up->width);
+	__put_user(kp->height, &up->height);
+	__put_user(kp->chromakey, &up->chromakey);
+	__put_user(kp->flags, &up->flags);
+	__put_user(kp->clipcount, &up->clipcount);
+	return 0;
+}
+
+#define VIDIOCGTUNER32		_IOWR('v',4, struct video_tuner32)
+#define VIDIOCSTUNER32		_IOW('v',5, struct video_tuner32)
+#define VIDIOCGWIN32		_IOR('v',9, struct video_window32)
+#define VIDIOCSWIN32		_IOW('v',10, struct video_window32)
+#define VIDIOCGFBUF32		_IOR('v',11, struct video_buffer32)
+#define VIDIOCSFBUF32		_IOW('v',12, struct video_buffer32)
+#define VIDIOCGFREQ32		_IOR('v',14, u32)
+#define VIDIOCSFREQ32		_IOW('v',15, u32)
+
+enum {
+	MaxClips = (~0U-sizeof(struct video_window))/sizeof(struct video_clip)
+};
+
+static int do_set_window(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct video_window32 __user *up = compat_ptr(arg);
+	struct video_window __user *vw;
+	struct video_clip __user *p;
+	int nclips;
+	u32 n;
+
+	if (get_user(nclips, &up->clipcount))
+		return -EFAULT;
+
+	/* Peculiar interface... */
+	if (nclips < 0)
+		nclips = VIDEO_CLIPMAP_SIZE;
+
+	if (nclips > MaxClips)
+		return -ENOMEM;
+
+	vw = compat_alloc_user_space(sizeof(struct video_window) +
+				    nclips * sizeof(struct video_clip));
+
+	p = nclips ? (struct video_clip __user *)(vw + 1) : NULL;
+
+	if (get_user(n, &up->x) || put_user(n, &vw->x) ||
+	    get_user(n, &up->y) || put_user(n, &vw->y) ||
+	    get_user(n, &up->width) || put_user(n, &vw->width) ||
+	    get_user(n, &up->height) || put_user(n, &vw->height) ||
+	    get_user(n, &up->chromakey) || put_user(n, &vw->chromakey) ||
+	    get_user(n, &up->flags) || put_user(n, &vw->flags) ||
+	    get_user(n, &up->clipcount) || put_user(n, &vw->clipcount) ||
+	    get_user(n, &up->clips) || put_user(p, &vw->clips))
+		return -EFAULT;
+
+	if (nclips) {
+		struct video_clip32 __user *u = compat_ptr(n);
+		int i;
+		if (!u)
+			return -EINVAL;
+		for (i = 0; i < nclips; i++, u++, p++) {
+			s32 v;
+			if (get_user(v, &u->x) ||
+			    put_user(v, &p->x) ||
+			    get_user(v, &u->y) ||
+			    put_user(v, &p->y) ||
+			    get_user(v, &u->width) ||
+			    put_user(v, &p->width) ||
+			    get_user(v, &u->height) ||
+			    put_user(v, &p->height) ||
+			    put_user(NULL, &p->next))
+				return -EFAULT;
+		}
+	}
+
+	return native_ioctl(file, VIDIOCSWIN, (unsigned long)p);
+}
+
+static int do_video_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	union {
+		struct video_tuner vt;
+		struct video_buffer vb;
+		struct video_window vw;
+		unsigned long vx;
+	} karg;
+	mm_segment_t old_fs = get_fs();
+	void __user *up = compat_ptr(arg);
+	int err = 0;
+
+	/* First, convert the command. */
+	switch(cmd) {
+	case VIDIOCGTUNER32: cmd = VIDIOCGTUNER; break;
+	case VIDIOCSTUNER32: cmd = VIDIOCSTUNER; break;
+	case VIDIOCGWIN32: cmd = VIDIOCGWIN; break;
+	case VIDIOCGFBUF32: cmd = VIDIOCGFBUF; break;
+	case VIDIOCSFBUF32: cmd = VIDIOCSFBUF; break;
+	case VIDIOCGFREQ32: cmd = VIDIOCGFREQ; break;
+	case VIDIOCSFREQ32: cmd = VIDIOCSFREQ; break;
+	};
+
+	switch(cmd) {
+	case VIDIOCSTUNER:
+	case VIDIOCGTUNER:
+		err = get_video_tuner32(&karg.vt, up);
+		break;
+
+	case VIDIOCSFBUF:
+		err = get_video_buffer32(&karg.vb, up);
+		break;
+
+	case VIDIOCSFREQ:
+		err = get_user(karg.vx, (u32 __user *)up);
+		break;
+	};
+	if(err)
+		goto out;
+
+	set_fs(KERNEL_DS);
+	err = native_ioctl(file, cmd, (unsigned long)&karg);
+	set_fs(old_fs);
+
+	if(err == 0) {
+		switch(cmd) {
+		case VIDIOCGTUNER:
+			err = put_video_tuner32(&karg.vt, up);
+			break;
+
+		case VIDIOCGWIN:
+			err = put_video_window32(&karg.vw, up);
+			break;
+
+		case VIDIOCGFBUF:
+			err = put_video_buffer32(&karg.vb, up);
+			break;
+
+		case VIDIOCGFREQ:
+			err = put_user(((u32)karg.vx), (u32 __user *)up);
+			break;
+		};
+	}
+out:
+	return err;
+}
+
+long v4l_compat_ioctl32(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int ret = -ENOIOCTLCMD;
+
+	if (!file->f_ops->ioctl)
+		return ret;
+
+	switch (cmd) {
+	case VIDIOCSWIN32:
+		ret = do_set_window(file, cmd, arg);
+		break;
+	case VIDIOCGTUNER32:
+	case VIDIOCSTUNER32:
+	case VIDIOCGWIN32:
+	case VIDIOCGFBUF32:
+	case VIDIOCSFBUF32:
+	case VIDIOCGFREQ32:
+	case VIDIOCSFREQ32
+		ret = do_video_ioctl(file, cmd, arg);
+		break;
+
+	/* Little v, the video4linux ioctls (conflict?) */
+	case VIDIOCGCAP:
+	case VIDIOCGCHAN:
+	case VIDIOCSCHAN:
+	case VIDIOCGPICT:
+	case VIDIOCSPICT:
+	case VIDIOCCAPTURE:
+	case VIDIOCKEY:
+	case VIDIOCGAUDIO:
+	case VIDIOCSAUDIO:
+	case VIDIOCSYNC:
+	case VIDIOCMCAPTURE:
+	case VIDIOCGMBUF:
+	case VIDIOCGUNIT:
+	case VIDIOCGCAPTURE:
+	case VIDIOCSCAPTURE:
+
+	/* BTTV specific... */
+	case _IOW('v',  BASE_VIDIOCPRIVATE+0, char [256]):
+	case _IOR('v',  BASE_VIDIOCPRIVATE+1, char [256]):
+	case _IOR('v' , BASE_VIDIOCPRIVATE+2, unsigned int):
+	case _IOW('v' , BASE_VIDIOCPRIVATE+3, char [16]): /* struct bttv_pll_info */
+	case _IOR('v' , BASE_VIDIOCPRIVATE+4, int):
+	case _IOR('v' , BASE_VIDIOCPRIVATE+5, int):
+	case _IOR('v' , BASE_VIDIOCPRIVATE+6, int):
+	case _IOR('v' , BASE_VIDIOCPRIVATE+7, int):
+		ret = native_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+		break;
+
+	return ret;
+}
+#else
+long v4l_compat_ioctl32(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	return -ENOIOCTLCMD;
+}
+#endif
+EXPORT_SYMBOL_GPL(v4l_compat_ioctl32);
diff --git a/drivers/media/video/cpia.c b/drivers/media/video/cpia.c
index b7ec9bf45085..9f59541155d9 100644
--- a/drivers/media/video/cpia.c
+++ b/drivers/media/video/cpia.c
@@ -3807,6 +3807,7 @@ static struct file_operations cpia_fops = {
 	.read		= cpia_read,
 	.mmap		= cpia_mmap,
 	.ioctl          = cpia_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek         = no_llseek,
 };
 
diff --git a/drivers/media/video/cx88/cx88-video.c b/drivers/media/video/cx88/cx88-video.c
index 24a48f8a48c1..bc025c46aedf 100644
--- a/drivers/media/video/cx88/cx88-video.c
+++ b/drivers/media/video/cx88/cx88-video.c
@@ -1740,6 +1740,7 @@ static struct file_operations video_fops =
 	.poll          = video_poll,
 	.mmap	       = video_mmap,
 	.ioctl	       = video_ioctl,
+	.compat_ioctl  = v4l_compat_ioctl32,
 	.llseek        = no_llseek,
 };
 
@@ -1767,6 +1768,7 @@ static struct file_operations radio_fops =
 	.open          = video_open,
 	.release       = video_release,
 	.ioctl         = radio_ioctl,
+	.compat_ioctl  = v4l_compat_ioctl32,
 	.llseek        = no_llseek,
 };
 
diff --git a/drivers/media/video/meye.c b/drivers/media/video/meye.c
index 3f2a882bc20a..2869464aee0d 100644
--- a/drivers/media/video/meye.c
+++ b/drivers/media/video/meye.c
@@ -1754,6 +1754,7 @@ static struct file_operations meye_fops = {
 	.release	= meye_release,
 	.mmap		= meye_mmap,
 	.ioctl		= meye_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.poll		= meye_poll,
 	.llseek		= no_llseek,
 };
diff --git a/drivers/media/video/pms.c b/drivers/media/video/pms.c
index 2504207b2e3d..9e6448639480 100644
--- a/drivers/media/video/pms.c
+++ b/drivers/media/video/pms.c
@@ -883,6 +883,7 @@ static struct file_operations pms_fops = {
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl          = pms_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.read           = pms_read,
 	.llseek         = no_llseek,
 };
diff --git a/drivers/media/video/saa5249.c b/drivers/media/video/saa5249.c
index a51c7bd96618..73b4f0e2abf0 100644
--- a/drivers/media/video/saa5249.c
+++ b/drivers/media/video/saa5249.c
@@ -702,6 +702,7 @@ static struct file_operations saa_fops = {
 	.open		= saa5249_open,
 	.release       	= saa5249_release,
 	.ioctl          = saa5249_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek         = no_llseek,
 };
 
diff --git a/drivers/media/video/saa7134/saa7134-video.c b/drivers/media/video/saa7134/saa7134-video.c
index 45c852df13ed..9b9e1e7f05ef 100644
--- a/drivers/media/video/saa7134/saa7134-video.c
+++ b/drivers/media/video/saa7134/saa7134-video.c
@@ -2262,6 +2262,7 @@ static struct file_operations video_fops =
 	.poll     = video_poll,
 	.mmap	  = video_mmap,
 	.ioctl	  = video_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek   = no_llseek,
 };
 
@@ -2271,6 +2272,7 @@ static struct file_operations radio_fops =
 	.open	  = video_open,
 	.release  = video_release,
 	.ioctl	  = radio_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek   = no_llseek,
 };
 
diff --git a/drivers/media/video/stradis.c b/drivers/media/video/stradis.c
index d4497dbae05c..6ee54a45411f 100644
--- a/drivers/media/video/stradis.c
+++ b/drivers/media/video/stradis.c
@@ -1974,6 +1974,7 @@ static struct file_operations saa_fops =
 	.open		= saa_open,
 	.release	= saa_release,
 	.ioctl		= saa_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.read		= saa_read,
 	.llseek		= no_llseek,
 	.write		= saa_write,
diff --git a/drivers/media/video/w9966.c b/drivers/media/video/w9966.c
index c318ba32fbaf..b7b0afffd214 100644
--- a/drivers/media/video/w9966.c
+++ b/drivers/media/video/w9966.c
@@ -187,6 +187,7 @@ static struct file_operations w9966_fops = {
 	.open           = video_exclusive_open,
 	.release        = video_exclusive_release,
 	.ioctl          = w9966_v4l_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.read           = w9966_v4l_read,
 	.llseek         = no_llseek,
 };
diff --git a/drivers/media/video/zoran_driver.c b/drivers/media/video/zoran_driver.c
index 4034f1b45366..15283f44e79f 100644
--- a/drivers/media/video/zoran_driver.c
+++ b/drivers/media/video/zoran_driver.c
@@ -4678,6 +4678,7 @@ static struct file_operations zoran_fops = {
 	.open = zoran_open,
 	.release = zoran_close,
 	.ioctl = zoran_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek = no_llseek,
 	.read = zoran_read,
 	.write = zoran_write,
diff --git a/drivers/media/video/zr36120.c b/drivers/media/video/zr36120.c
index 07286816d7df..d4c633b8a7f5 100644
--- a/drivers/media/video/zr36120.c
+++ b/drivers/media/video/zr36120.c
@@ -1490,6 +1490,7 @@ static struct video_device zr36120_template=
 	.write		= zoran_write,
 	.poll		= zoran_poll,
 	.ioctl		= zoran_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.mmap		= zoran_mmap,
 	.minor		= -1,
 };
diff --git a/drivers/usb/media/dsbr100.c b/drivers/usb/media/dsbr100.c
index 6a5700e9d428..25646804d5be 100644
--- a/drivers/usb/media/dsbr100.c
+++ b/drivers/usb/media/dsbr100.c
@@ -127,6 +127,7 @@ static struct file_operations usb_dsbr100_fops = {
 	.open =		usb_dsbr100_open,
 	.release =     	usb_dsbr100_close,
 	.ioctl =        usb_dsbr100_ioctl,
+	.compat_ioctl = v4l_compat_ioctl32,
 	.llseek =       no_llseek,
 };
 
diff --git a/drivers/usb/media/ov511.c b/drivers/usb/media/ov511.c
index 3a0e8ce67ebe..8af665bbe330 100644
--- a/drivers/usb/media/ov511.c
+++ b/drivers/usb/media/ov511.c
@@ -4774,6 +4774,7 @@ static struct file_operations ov511_fops = {
 	.read =		ov51x_v4l1_read,
 	.mmap =		ov51x_v4l1_mmap,
 	.ioctl =	ov51x_v4l1_ioctl,
+	.compat_ioctl = v4l_compat_ioctl32,
 	.llseek =	no_llseek,
 };
 
diff --git a/drivers/usb/media/pwc/pwc-if.c b/drivers/usb/media/pwc/pwc-if.c
index 09ca6128ac20..4f9b0dc6fd7b 100644
--- a/drivers/usb/media/pwc/pwc-if.c
+++ b/drivers/usb/media/pwc/pwc-if.c
@@ -154,6 +154,7 @@ static struct file_operations pwc_fops = {
 	.poll =		pwc_video_poll,
 	.mmap =		pwc_video_mmap,
 	.ioctl =        pwc_video_ioctl,
+	.compat_ioctl = v4l_compat_ioctl32,
 	.llseek =       no_llseek,
 };
 static struct video_device pwc_template = {
diff --git a/drivers/usb/media/se401.c b/drivers/usb/media/se401.c
index b2ae29af5940..2ba562285fda 100644
--- a/drivers/usb/media/se401.c
+++ b/drivers/usb/media/se401.c
@@ -1193,6 +1193,7 @@ static struct file_operations se401_fops = {
         .read =         se401_read,
         .mmap =         se401_mmap,
 	.ioctl =        se401_ioctl,
+	.compat_ioctl = v4l_compat_ioctl32,
 	.llseek =       no_llseek,
 };
 static struct video_device se401_template = {
diff --git a/drivers/usb/media/stv680.c b/drivers/usb/media/stv680.c
index 774038b352cd..b497a6a0a206 100644
--- a/drivers/usb/media/stv680.c
+++ b/drivers/usb/media/stv680.c
@@ -1343,6 +1343,7 @@ static struct file_operations stv680_fops = {
 	.read =		stv680_read,
 	.mmap =		stv680_mmap,
 	.ioctl =        stv680_ioctl,
+	.compat_ioctl = v4l_compat_ioctl32,
 	.llseek =       no_llseek,
 };
 static struct video_device stv680_template = {
diff --git a/drivers/usb/media/usbvideo.c b/drivers/usb/media/usbvideo.c
index 4bd113325ef9..63a72e550a1b 100644
--- a/drivers/usb/media/usbvideo.c
+++ b/drivers/usb/media/usbvideo.c
@@ -953,6 +953,7 @@ static struct file_operations usbvideo_fops = {
 	.read =   usbvideo_v4l_read,
 	.mmap =   usbvideo_v4l_mmap,
 	.ioctl =  usbvideo_v4l_ioctl,
+	.compat_ioctl = v4l_compat_ioctl32,
 	.llseek = no_llseek,
 };
 static const struct video_device usbvideo_template = {
diff --git a/drivers/usb/media/vicam.c b/drivers/usb/media/vicam.c
index 1c73155c8d77..5df144073871 100644
--- a/drivers/usb/media/vicam.c
+++ b/drivers/usb/media/vicam.c
@@ -1236,6 +1236,7 @@ static struct file_operations vicam_fops = {
 	.read		= vicam_read,
 	.mmap		= vicam_mmap,
 	.ioctl		= vicam_ioctl,
+	.compat_ioctl	= v4l_compat_ioctl32,
 	.llseek		= no_llseek,
 };
 
diff --git a/drivers/usb/media/w9968cf.c b/drivers/usb/media/w9968cf.c
index 3605a6f3067b..bff9434c8e55 100644
--- a/drivers/usb/media/w9968cf.c
+++ b/drivers/usb/media/w9968cf.c
@@ -3490,6 +3490,7 @@ static struct file_operations w9968cf_fops = {
 	.release = w9968cf_release,
 	.read =    w9968cf_read,
 	.ioctl =   w9968cf_ioctl,
+	.compat_ioctl = v4l_compat_ioctl32,
 	.mmap =    w9968cf_mmap,
 	.llseek =  no_llseek,
 };
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 43a2508ac696..55d9a3a954cf 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -207,244 +207,6 @@ static int do_ext3_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg));
 }
 
-struct video_tuner32 {
-	compat_int_t tuner;
-	char name[32];
-	compat_ulong_t rangelow, rangehigh;
-	u32 flags;	/* It is really u32 in videodev.h */
-	u16 mode, signal;
-};
-
-static int get_video_tuner32(struct video_tuner *kp, struct video_tuner32 __user *up)
-{
-	int i;
-
-	if(get_user(kp->tuner, &up->tuner))
-		return -EFAULT;
-	for(i = 0; i < 32; i++)
-		__get_user(kp->name[i], &up->name[i]);
-	__get_user(kp->rangelow, &up->rangelow);
-	__get_user(kp->rangehigh, &up->rangehigh);
-	__get_user(kp->flags, &up->flags);
-	__get_user(kp->mode, &up->mode);
-	__get_user(kp->signal, &up->signal);
-	return 0;
-}
-
-static int put_video_tuner32(struct video_tuner *kp, struct video_tuner32 __user *up)
-{
-	int i;
-
-	if(put_user(kp->tuner, &up->tuner))
-		return -EFAULT;
-	for(i = 0; i < 32; i++)
-		__put_user(kp->name[i], &up->name[i]);
-	__put_user(kp->rangelow, &up->rangelow);
-	__put_user(kp->rangehigh, &up->rangehigh);
-	__put_user(kp->flags, &up->flags);
-	__put_user(kp->mode, &up->mode);
-	__put_user(kp->signal, &up->signal);
-	return 0;
-}
-
-struct video_buffer32 {
-	compat_caddr_t base;
-	compat_int_t height, width, depth, bytesperline;
-};
-
-static int get_video_buffer32(struct video_buffer *kp, struct video_buffer32 __user *up)
-{
-	u32 tmp;
-
-	if (get_user(tmp, &up->base))
-		return -EFAULT;
-
-	/* This is actually a physical address stored
-	 * as a void pointer.
-	 */
-	kp->base = (void *)(unsigned long) tmp;
-
-	__get_user(kp->height, &up->height);
-	__get_user(kp->width, &up->width);
-	__get_user(kp->depth, &up->depth);
-	__get_user(kp->bytesperline, &up->bytesperline);
-	return 0;
-}
-
-static int put_video_buffer32(struct video_buffer *kp, struct video_buffer32 __user *up)
-{
-	u32 tmp = (u32)((unsigned long)kp->base);
-
-	if(put_user(tmp, &up->base))
-		return -EFAULT;
-	__put_user(kp->height, &up->height);
-	__put_user(kp->width, &up->width);
-	__put_user(kp->depth, &up->depth);
-	__put_user(kp->bytesperline, &up->bytesperline);
-	return 0;
-}
-
-struct video_clip32 {
-	s32 x, y, width, height;	/* Its really s32 in videodev.h */
-	compat_caddr_t next;
-};
-
-struct video_window32 {
-	u32 x, y, width, height, chromakey, flags;
-	compat_caddr_t clips;
-	compat_int_t clipcount;
-};
-
-/* You get back everything except the clips... */
-static int put_video_window32(struct video_window *kp, struct video_window32 __user *up)
-{
-	if(put_user(kp->x, &up->x))
-		return -EFAULT;
-	__put_user(kp->y, &up->y);
-	__put_user(kp->width, &up->width);
-	__put_user(kp->height, &up->height);
-	__put_user(kp->chromakey, &up->chromakey);
-	__put_user(kp->flags, &up->flags);
-	__put_user(kp->clipcount, &up->clipcount);
-	return 0;
-}
-
-#define VIDIOCGTUNER32		_IOWR('v',4, struct video_tuner32)
-#define VIDIOCSTUNER32		_IOW('v',5, struct video_tuner32)
-#define VIDIOCGWIN32		_IOR('v',9, struct video_window32)
-#define VIDIOCSWIN32		_IOW('v',10, struct video_window32)
-#define VIDIOCGFBUF32		_IOR('v',11, struct video_buffer32)
-#define VIDIOCSFBUF32		_IOW('v',12, struct video_buffer32)
-#define VIDIOCGFREQ32		_IOR('v',14, u32)
-#define VIDIOCSFREQ32		_IOW('v',15, u32)
-
-enum {
-	MaxClips = (~0U-sizeof(struct video_window))/sizeof(struct video_clip)
-};
-
-static int do_set_window(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct video_window32 __user *up = compat_ptr(arg);
-	struct video_window __user *vw;
-	struct video_clip __user *p;
-	int nclips;
-	u32 n;
-
-	if (get_user(nclips, &up->clipcount))
-		return -EFAULT;
-
-	/* Peculiar interface... */
-	if (nclips < 0)
-		nclips = VIDEO_CLIPMAP_SIZE;
-
-	if (nclips > MaxClips)
-		return -ENOMEM;
-
-	vw = compat_alloc_user_space(sizeof(struct video_window) +
-				    nclips * sizeof(struct video_clip));
-
-	p = nclips ? (struct video_clip __user *)(vw + 1) : NULL;
-
-	if (get_user(n, &up->x) || put_user(n, &vw->x) ||
-	    get_user(n, &up->y) || put_user(n, &vw->y) ||
-	    get_user(n, &up->width) || put_user(n, &vw->width) ||
-	    get_user(n, &up->height) || put_user(n, &vw->height) ||
-	    get_user(n, &up->chromakey) || put_user(n, &vw->chromakey) ||
-	    get_user(n, &up->flags) || put_user(n, &vw->flags) ||
-	    get_user(n, &up->clipcount) || put_user(n, &vw->clipcount) ||
-	    get_user(n, &up->clips) || put_user(p, &vw->clips))
-		return -EFAULT;
-
-	if (nclips) {
-		struct video_clip32 __user *u = compat_ptr(n);
-		int i;
-		if (!u)
-			return -EINVAL;
-		for (i = 0; i < nclips; i++, u++, p++) {
-			s32 v;
-			if (get_user(v, &u->x) ||
-			    put_user(v, &p->x) ||
-			    get_user(v, &u->y) ||
-			    put_user(v, &p->y) ||
-			    get_user(v, &u->width) ||
-			    put_user(v, &p->width) ||
-			    get_user(v, &u->height) ||
-			    put_user(v, &p->height) ||
-			    put_user(NULL, &p->next))
-				return -EFAULT;
-		}
-	}
-
-	return sys_ioctl(fd, VIDIOCSWIN, (unsigned long)p);
-}
-
-static int do_video_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	union {
-		struct video_tuner vt;
-		struct video_buffer vb;
-		struct video_window vw;
-		unsigned long vx;
-	} karg;
-	mm_segment_t old_fs = get_fs();
-	void __user *up = compat_ptr(arg);
-	int err = 0;
-
-	/* First, convert the command. */
-	switch(cmd) {
-	case VIDIOCGTUNER32: cmd = VIDIOCGTUNER; break;
-	case VIDIOCSTUNER32: cmd = VIDIOCSTUNER; break;
-	case VIDIOCGWIN32: cmd = VIDIOCGWIN; break;
-	case VIDIOCGFBUF32: cmd = VIDIOCGFBUF; break;
-	case VIDIOCSFBUF32: cmd = VIDIOCSFBUF; break;
-	case VIDIOCGFREQ32: cmd = VIDIOCGFREQ; break;
-	case VIDIOCSFREQ32: cmd = VIDIOCSFREQ; break;
-	};
-
-	switch(cmd) {
-	case VIDIOCSTUNER:
-	case VIDIOCGTUNER:
-		err = get_video_tuner32(&karg.vt, up);
-		break;
-
-	case VIDIOCSFBUF:
-		err = get_video_buffer32(&karg.vb, up);
-		break;
-
-	case VIDIOCSFREQ:
-		err = get_user(karg.vx, (u32 __user *)up);
-		break;
-	};
-	if(err)
-		goto out;
-
-	set_fs(KERNEL_DS);
-	err = sys_ioctl(fd, cmd, (unsigned long)&karg);
-	set_fs(old_fs);
-
-	if(err == 0) {
-		switch(cmd) {
-		case VIDIOCGTUNER:
-			err = put_video_tuner32(&karg.vt, up);
-			break;
-
-		case VIDIOCGWIN:
-			err = put_video_window32(&karg.vw, up);
-			break;
-
-		case VIDIOCGFBUF:
-			err = put_video_buffer32(&karg.vb, up);
-			break;
-
-		case VIDIOCGFREQ:
-			err = put_user(((u32)karg.vx), (u32 __user *)up);
-			break;
-		};
-	}
-out:
-	return err;
-}
-
 struct compat_dmx_event {
 	dmx_event_t	event;
 	compat_time_t	timeStamp;
@@ -3015,14 +2777,6 @@ COMPATIBLE_IOCTL(EXT3_IOC_GROUP_ADD)
 #ifdef CONFIG_JBD_DEBUG
 HANDLE_IOCTL(EXT3_IOC32_WAIT_FOR_READONLY, do_ext3_ioctl)
 #endif
-HANDLE_IOCTL(VIDIOCGTUNER32, do_video_ioctl)
-HANDLE_IOCTL(VIDIOCSTUNER32, do_video_ioctl)
-HANDLE_IOCTL(VIDIOCGWIN32, do_video_ioctl)
-HANDLE_IOCTL(VIDIOCSWIN32, do_set_window)
-HANDLE_IOCTL(VIDIOCGFBUF32, do_video_ioctl)
-HANDLE_IOCTL(VIDIOCSFBUF32, do_video_ioctl)
-HANDLE_IOCTL(VIDIOCGFREQ32, do_video_ioctl)
-HANDLE_IOCTL(VIDIOCSFREQ32, do_video_ioctl)
 /* One SMB ioctl needs translations. */
 #define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
 HANDLE_IOCTL(SMB_IOC_GETMOUNTUID_32, do_smb_getmountuid)
diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h
index 119f9d064cc6..339878952f12 100644
--- a/include/linux/compat_ioctl.h
+++ b/include/linux/compat_ioctl.h
@@ -218,32 +218,6 @@ COMPATIBLE_IOCTL(VT_RESIZE)
 COMPATIBLE_IOCTL(VT_RESIZEX)
 COMPATIBLE_IOCTL(VT_LOCKSWITCH)
 COMPATIBLE_IOCTL(VT_UNLOCKSWITCH)
-/* Little v */
-/* Little v, the video4linux ioctls (conflict?) */
-COMPATIBLE_IOCTL(VIDIOCGCAP)
-COMPATIBLE_IOCTL(VIDIOCGCHAN)
-COMPATIBLE_IOCTL(VIDIOCSCHAN)
-COMPATIBLE_IOCTL(VIDIOCGPICT)
-COMPATIBLE_IOCTL(VIDIOCSPICT)
-COMPATIBLE_IOCTL(VIDIOCCAPTURE)
-COMPATIBLE_IOCTL(VIDIOCKEY)
-COMPATIBLE_IOCTL(VIDIOCGAUDIO)
-COMPATIBLE_IOCTL(VIDIOCSAUDIO)
-COMPATIBLE_IOCTL(VIDIOCSYNC)
-COMPATIBLE_IOCTL(VIDIOCMCAPTURE)
-COMPATIBLE_IOCTL(VIDIOCGMBUF)
-COMPATIBLE_IOCTL(VIDIOCGUNIT)
-COMPATIBLE_IOCTL(VIDIOCGCAPTURE)
-COMPATIBLE_IOCTL(VIDIOCSCAPTURE)
-/* BTTV specific... */
-COMPATIBLE_IOCTL(_IOW('v',  BASE_VIDIOCPRIVATE+0, char [256]))
-COMPATIBLE_IOCTL(_IOR('v',  BASE_VIDIOCPRIVATE+1, char [256]))
-COMPATIBLE_IOCTL(_IOR('v' , BASE_VIDIOCPRIVATE+2, unsigned int))
-COMPATIBLE_IOCTL(_IOW('v' , BASE_VIDIOCPRIVATE+3, char [16])) /* struct bttv_pll_info */
-COMPATIBLE_IOCTL(_IOR('v' , BASE_VIDIOCPRIVATE+4, int))
-COMPATIBLE_IOCTL(_IOR('v' , BASE_VIDIOCPRIVATE+5, int))
-COMPATIBLE_IOCTL(_IOR('v' , BASE_VIDIOCPRIVATE+6, int))
-COMPATIBLE_IOCTL(_IOR('v' , BASE_VIDIOCPRIVATE+7, int))
 /* Little p (/dev/rtc, /dev/envctrl, etc.) */
 COMPATIBLE_IOCTL(RTC_AIE_ON)
 COMPATIBLE_IOCTL(RTC_AIE_OFF)
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 1cded681eb6d..13f78ec4bf76 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -1117,6 +1117,9 @@ typedef int (*v4l2_kioctl)(struct inode *inode, struct file *file,
 			   unsigned int cmd, void *arg);
 int v4l_compat_translate_ioctl(struct inode *inode, struct file *file,
 			       int cmd, void *arg, v4l2_kioctl driver_ioctl);
+/* 32 Bits compatibility layer for 64 bits processors */
+extern long v4l_compat_ioctl32(struct file *file, unsigned int cmd,
+				unsigned long arg);
 
 #endif /* __KERNEL__ */
 #endif /* __LINUX_VIDEODEV2_H */
-- 
cgit v1.2.3-71-gd317


From f3c5987a386300abea9854b32814d0eab7af7841 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@brturbo.com.br>
Date: Mon, 9 Jan 2006 15:25:00 -0200
Subject: V4L (0987): Added Secam L' std on tda9887 and common macros moved to
 videodev2.h

- Added SECAM L' video standard
- Common std macros moved to videodev2.h

Signed-off-by: Mauro Carvalho Chehab <mchehab@brturbo.com.br>
---
 drivers/media/video/tda8290.c |  6 ------
 drivers/media/video/tda9887.c |  9 +++++++++
 include/linux/videodev2.h     | 11 ++++++++++-
 3 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/tda8290.c b/drivers/media/video/tda8290.c
index 61d94ddaff41..2498b76df429 100644
--- a/drivers/media/video/tda8290.c
+++ b/drivers/media/video/tda8290.c
@@ -398,14 +398,8 @@ static int tda8290_tune(struct i2c_client *c, u16 ifc, unsigned int freq)
 	return 0;
 }
 
-
 /*---------------------------------------------------------------------*/
 
-#define V4L2_STD_MN	(V4L2_STD_PAL_M|V4L2_STD_PAL_N|V4L2_STD_PAL_Nc|V4L2_STD_NTSC)
-#define V4L2_STD_B	(V4L2_STD_PAL_B|V4L2_STD_PAL_B1|V4L2_STD_SECAM_B)
-#define V4L2_STD_GH	(V4L2_STD_PAL_G|V4L2_STD_PAL_H|V4L2_STD_SECAM_G|V4L2_STD_SECAM_H)
-#define V4L2_STD_DK	(V4L2_STD_PAL_DK|V4L2_STD_SECAM_DK)
-
 static void set_audio(struct tuner *t)
 {
 	char* mode;
diff --git a/drivers/media/video/tda9887.c b/drivers/media/video/tda9887.c
index 7165a1b9625a..93bf10436b82 100644
--- a/drivers/media/video/tda9887.c
+++ b/drivers/media/video/tda9887.c
@@ -189,6 +189,15 @@ static struct tvnorm tvnorms[] = {
 		.e     = ( cGating_36	  |
 			   cAudioIF_6_5   |
 			   cVideoIF_38_90 ),
+	},{
+		.std   = V4L2_STD_SECAM_LC,
+		.name  = "SECAM-L'",
+		.b     = ( cOutputPort2Inactive |
+			   cPositiveAmTV  |
+			   cQSS           ),
+		.e     = ( cGating_36	  |
+			   cAudioIF_6_5   |
+			   cVideoIF_33_90 ),
 	},{
 		.std   = V4L2_STD_SECAM_DK,
 		.name  = "SECAM-DK",
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 13f78ec4bf76..b2f5e864b397 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -642,6 +642,12 @@ typedef __u64 v4l2_std_id;
 #define V4L2_STD_ATSC_8_VSB     ((v4l2_std_id)0x01000000)
 #define V4L2_STD_ATSC_16_VSB    ((v4l2_std_id)0x02000000)
 
+/* some merged standards */
+#define V4L2_STD_MN	(V4L2_STD_PAL_M|V4L2_STD_PAL_N|V4L2_STD_PAL_Nc|V4L2_STD_NTSC)
+#define V4L2_STD_B	(V4L2_STD_PAL_B|V4L2_STD_PAL_B1|V4L2_STD_SECAM_B)
+#define V4L2_STD_GH	(V4L2_STD_PAL_G|V4L2_STD_PAL_H|V4L2_STD_SECAM_G|V4L2_STD_SECAM_H)
+#define V4L2_STD_DK	(V4L2_STD_PAL_DK|V4L2_STD_SECAM_DK)
+
 /* some common needed stuff */
 #define V4L2_STD_PAL_BG		(V4L2_STD_PAL_B		|\
 				 V4L2_STD_PAL_B1	|\
@@ -662,7 +668,8 @@ typedef __u64 v4l2_std_id;
 				 V4L2_STD_SECAM_G	|\
 				 V4L2_STD_SECAM_H	|\
 				 V4L2_STD_SECAM_DK	|\
-				 V4L2_STD_SECAM_L)
+				 V4L2_STD_SECAM_L       |\
+				 V4L2_STD_SECAM_LC)
 
 #define V4L2_STD_525_60		(V4L2_STD_PAL_M		|\
 				 V4L2_STD_PAL_60	|\
@@ -1117,10 +1124,12 @@ typedef int (*v4l2_kioctl)(struct inode *inode, struct file *file,
 			   unsigned int cmd, void *arg);
 int v4l_compat_translate_ioctl(struct inode *inode, struct file *file,
 			       int cmd, void *arg, v4l2_kioctl driver_ioctl);
+
 /* 32 Bits compatibility layer for 64 bits processors */
 extern long v4l_compat_ioctl32(struct file *file, unsigned int cmd,
 				unsigned long arg);
 
+
 #endif /* __KERNEL__ */
 #endif /* __LINUX_VIDEODEV2_H */
 
-- 
cgit v1.2.3-71-gd317


From 36cb557a2f64513e2fdc1a542167e5e8a6c1c67e Mon Sep 17 00:00:00 2001
From: Andrew de Quincey <adq_dvb@lidskialf.net>
Date: Mon, 9 Jan 2006 15:25:07 -0200
Subject: DVB (2444): Implement frontend-specific tuning and the ability to
 disable zigzag

- Implement frontend-specific tuning and the ability to disable zigzag

Signed-off-by: Andrew de Quincey <adq_dvb@lidskialf.net>
Signed-off-by: Mauro Carvalho Chehab <mchehab@brturbo.com.br>
---
 drivers/media/dvb/bt8xx/dst.c             |  52 +++--
 drivers/media/dvb/dvb-core/dvb_frontend.c | 307 +++++++++++++++++-------------
 drivers/media/dvb/dvb-core/dvb_frontend.h |  11 +-
 include/linux/dvb/frontend.h              |  10 +
 4 files changed, 228 insertions(+), 152 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/dvb/bt8xx/dst.c b/drivers/media/dvb/bt8xx/dst.c
index 8977c7a313df..3a2ff1cc24b7 100644
--- a/drivers/media/dvb/bt8xx/dst.c
+++ b/drivers/media/dvb/bt8xx/dst.c
@@ -1341,30 +1341,40 @@ static int dst_read_snr(struct dvb_frontend *fe, u16 *snr)
 	return 0;
 }
 
-static int dst_set_frontend(struct dvb_frontend *fe, struct dvb_frontend_parameters *p)
+static int dst_set_frontend(struct dvb_frontend* fe,
+			    struct dvb_frontend_parameters* p,
+			    unsigned int mode_flags,
+			    int *delay,
+			    fe_status_t *status)
 {
 	struct dst_state *state = fe->demodulator_priv;
 
-	dst_set_freq(state, p->frequency);
-	dprintk(verbose, DST_DEBUG, 1, "Set Frequency=[%d]", p->frequency);
+	if (p != NULL) {
+		dst_set_freq(state, p->frequency);
+		dprintk(verbose, DST_DEBUG, 1, "Set Frequency=[%d]", p->frequency);
 
-	if (state->dst_type == DST_TYPE_IS_SAT) {
-		if (state->type_flags & DST_TYPE_HAS_OBS_REGS)
-			dst_set_inversion(state, p->inversion);
-		dst_set_fec(state, p->u.qpsk.fec_inner);
-		dst_set_symbolrate(state, p->u.qpsk.symbol_rate);
-		dst_set_polarization(state);
-		dprintk(verbose, DST_DEBUG, 1, "Set Symbolrate=[%d]", p->u.qpsk.symbol_rate);
-
-	} else if (state->dst_type == DST_TYPE_IS_TERR)
-		dst_set_bandwidth(state, p->u.ofdm.bandwidth);
-	else if (state->dst_type == DST_TYPE_IS_CABLE) {
-		dst_set_fec(state, p->u.qam.fec_inner);
-		dst_set_symbolrate(state, p->u.qam.symbol_rate);
-		dst_set_modulation(state, p->u.qam.modulation);
+		if (state->dst_type == DST_TYPE_IS_SAT) {
+			if (state->type_flags & DST_TYPE_HAS_OBS_REGS)
+				dst_set_inversion(state, p->inversion);
+			dst_set_fec(state, p->u.qpsk.fec_inner);
+			dst_set_symbolrate(state, p->u.qpsk.symbol_rate);
+			dst_set_polarization(state);
+			dprintk(verbose, DST_DEBUG, 1, "Set Symbolrate=[%d]", p->u.qpsk.symbol_rate);
+
+		} else if (state->dst_type == DST_TYPE_IS_TERR)
+			dst_set_bandwidth(state, p->u.ofdm.bandwidth);
+		else if (state->dst_type == DST_TYPE_IS_CABLE) {
+			dst_set_fec(state, p->u.qam.fec_inner);
+			dst_set_symbolrate(state, p->u.qam.symbol_rate);
+			dst_set_modulation(state, p->u.qam.modulation);
+		}
+		dst_write_tuna(fe);
 	}
-	dst_write_tuna(fe);
 
+	if (!(mode_flags & FE_TUNE_MODE_ONESHOT))
+		dst_read_status(fe, status);
+
+	*delay = HZ/10;
 	return 0;
 }
 
@@ -1445,7 +1455,7 @@ static struct dvb_frontend_ops dst_dvbt_ops = {
 
 	.release = dst_release,
 	.init = dst_init,
-	.set_frontend = dst_set_frontend,
+	.tune = dst_set_frontend,
 	.get_frontend = dst_get_frontend,
 	.read_status = dst_read_status,
 	.read_signal_strength = dst_read_signal_strength,
@@ -1469,7 +1479,7 @@ static struct dvb_frontend_ops dst_dvbs_ops = {
 
 	.release = dst_release,
 	.init = dst_init,
-	.set_frontend = dst_set_frontend,
+	.tune = dst_set_frontend,
 	.get_frontend = dst_get_frontend,
 	.read_status = dst_read_status,
 	.read_signal_strength = dst_read_signal_strength,
@@ -1496,7 +1506,7 @@ static struct dvb_frontend_ops dst_dvbc_ops = {
 
 	.release = dst_release,
 	.init = dst_init,
-	.set_frontend = dst_set_frontend,
+	.tune = dst_set_frontend,
 	.get_frontend = dst_get_frontend,
 	.read_status = dst_read_status,
 	.read_signal_strength = dst_read_signal_strength,
diff --git a/drivers/media/dvb/dvb-core/dvb_frontend.c b/drivers/media/dvb/dvb-core/dvb_frontend.c
index 95ea5095e07e..9b5fa540e1e7 100644
--- a/drivers/media/dvb/dvb-core/dvb_frontend.c
+++ b/drivers/media/dvb/dvb-core/dvb_frontend.c
@@ -92,6 +92,7 @@ static DECLARE_MUTEX(frontend_mutex);
 
 struct dvb_frontend_private {
 
+	/* thread/frontend values */
 	struct dvb_device *dvbdev;
 	struct dvb_frontend_parameters parameters;
 	struct dvb_fe_events events;
@@ -100,20 +101,25 @@ struct dvb_frontend_private {
 	wait_queue_head_t wait_queue;
 	pid_t thread_pid;
 	unsigned long release_jiffies;
-	int state;
-	int bending;
-	int lnb_drift;
-	int inversion;
-	int auto_step;
-	int auto_sub_step;
-	int started_auto_step;
-	int min_delay;
-	int max_drift;
-	int step_size;
-	int exit;
-	int wakeup;
+	unsigned int exit;
+	unsigned int wakeup;
 	fe_status_t status;
-	fe_sec_tone_mode_t tone;
+	unsigned int tune_mode_flags;
+	unsigned int delay;
+
+	/* swzigzag values */
+	unsigned int state;
+	unsigned int bending;
+	int lnb_drift;
+	unsigned int inversion;
+	unsigned int auto_step;
+	unsigned int auto_sub_step;
+	unsigned int started_auto_step;
+	unsigned int min_delay;
+	unsigned int max_drift;
+	unsigned int step_size;
+	int quality;
+	unsigned int check_wrapped;
 };
 
 
@@ -208,21 +214,21 @@ static void dvb_frontend_init(struct dvb_frontend *fe)
 		fe->ops->init(fe);
 }
 
-static void update_delay(int *quality, int *delay, int min_delay, int locked)
+static void dvb_frontend_swzigzag_update_delay(struct dvb_frontend_private *fepriv, int locked)
 {
-	    int q2;
+	int q2;
 
-	    dprintk ("%s\n", __FUNCTION__);
+	dprintk ("%s\n", __FUNCTION__);
 
-	    if (locked)
-		      (*quality) = (*quality * 220 + 36*256) / 256;
-	    else
-		      (*quality) = (*quality * 220 + 0) / 256;
+	if (locked)
+		(fepriv->quality) = (fepriv->quality * 220 + 36*256) / 256;
+	else
+		(fepriv->quality) = (fepriv->quality * 220 + 0) / 256;
 
-	    q2 = *quality - 128;
-	    q2 *= q2;
+	q2 = fepriv->quality - 128;
+	q2 *= q2;
 
-	    *delay = min_delay + q2 * HZ / (128*128);
+	fepriv->delay = fepriv->min_delay + q2 * HZ / (128*128);
 }
 
 /**
@@ -232,7 +238,7 @@ static void update_delay(int *quality, int *delay, int min_delay, int locked)
  * @param check_wrapped Checks if an iteration has completed. DO NOT SET ON THE FIRST ATTEMPT
  * @returns Number of complete iterations that have been performed.
  */
-static int dvb_frontend_autotune(struct dvb_frontend *fe, int check_wrapped)
+static int dvb_frontend_swzigzag_autotune(struct dvb_frontend *fe, int check_wrapped)
 {
 	int autoinversion;
 	int ready = 0;
@@ -321,6 +327,129 @@ static int dvb_frontend_autotune(struct dvb_frontend *fe, int check_wrapped)
 	return 0;
 }
 
+static void dvb_frontend_swzigzag(struct dvb_frontend *fe)
+{
+	fe_status_t s;
+	struct dvb_frontend_private *fepriv = fe->frontend_priv;
+
+	/* if we've got no parameters, just keep idling */
+	if (fepriv->state & FESTATE_IDLE) {
+		fepriv->delay = 3*HZ;
+		fepriv->quality = 0;
+		return;
+	}
+
+	/* in SCAN mode, we just set the frontend when asked and leave it alone */
+	if (fepriv->tune_mode_flags & FE_TUNE_MODE_ONESHOT) {
+		if (fepriv->state & FESTATE_RETUNE) {
+			if (fe->ops->set_frontend)
+				fe->ops->set_frontend(fe, &fepriv->parameters);
+			fepriv->state = FESTATE_TUNED;
+		}
+		fepriv->delay = 3*HZ;
+		fepriv->quality = 0;
+		return;
+	}
+
+	/* get the frontend status */
+	if (fepriv->state & FESTATE_RETUNE) {
+		s = 0;
+	} else {
+		if (fe->ops->read_status)
+			fe->ops->read_status(fe, &s);
+		if (s != fepriv->status) {
+			dvb_frontend_add_event(fe, s);
+			fepriv->status = s;
+		}
+	}
+
+	/* if we're not tuned, and we have a lock, move to the TUNED state */
+	if ((fepriv->state & FESTATE_WAITFORLOCK) && (s & FE_HAS_LOCK)) {
+		dvb_frontend_swzigzag_update_delay(fepriv, s & FE_HAS_LOCK);
+		fepriv->state = FESTATE_TUNED;
+
+		/* if we're tuned, then we have determined the correct inversion */
+		if ((!(fe->ops->info.caps & FE_CAN_INVERSION_AUTO)) &&
+		    (fepriv->parameters.inversion == INVERSION_AUTO)) {
+			fepriv->parameters.inversion = fepriv->inversion;
+		}
+		return;
+	}
+
+	/* if we are tuned already, check we're still locked */
+	if (fepriv->state & FESTATE_TUNED) {
+		dvb_frontend_swzigzag_update_delay(fepriv, s & FE_HAS_LOCK);
+
+		/* we're tuned, and the lock is still good... */
+		if (s & FE_HAS_LOCK) {
+			return;
+		} else { /* if we _WERE_ tuned, but now don't have a lock */
+			fepriv->state = FESTATE_ZIGZAG_FAST;
+			fepriv->started_auto_step = fepriv->auto_step;
+			fepriv->check_wrapped = 0;
+		}
+	}
+
+	/* don't actually do anything if we're in the LOSTLOCK state,
+	 * the frontend is set to FE_CAN_RECOVER, and the max_drift is 0 */
+	if ((fepriv->state & FESTATE_LOSTLOCK) &&
+	    (fe->ops->info.caps & FE_CAN_RECOVER) && (fepriv->max_drift == 0)) {
+		dvb_frontend_swzigzag_update_delay(fepriv, s & FE_HAS_LOCK);
+		return;
+	}
+
+	/* don't do anything if we're in the DISEQC state, since this
+	 * might be someone with a motorized dish controlled by DISEQC.
+	 * If its actually a re-tune, there will be a SET_FRONTEND soon enough.	*/
+	if (fepriv->state & FESTATE_DISEQC) {
+		dvb_frontend_swzigzag_update_delay(fepriv, s & FE_HAS_LOCK);
+		return;
+	}
+
+	/* if we're in the RETUNE state, set everything up for a brand
+	 * new scan, keeping the current inversion setting, as the next
+	 * tune is _very_ likely to require the same */
+	if (fepriv->state & FESTATE_RETUNE) {
+		fepriv->lnb_drift = 0;
+		fepriv->auto_step = 0;
+		fepriv->auto_sub_step = 0;
+		fepriv->started_auto_step = 0;
+		fepriv->check_wrapped = 0;
+	}
+
+	/* fast zigzag. */
+	if ((fepriv->state & FESTATE_SEARCHING_FAST) || (fepriv->state & FESTATE_RETUNE)) {
+		fepriv->delay = fepriv->min_delay;
+
+		/* peform a tune */
+		if (dvb_frontend_swzigzag_autotune(fe, fepriv->check_wrapped)) {
+			/* OK, if we've run out of trials at the fast speed.
+			 * Drop back to slow for the _next_ attempt */
+			fepriv->state = FESTATE_SEARCHING_SLOW;
+			fepriv->started_auto_step = fepriv->auto_step;
+			return;
+		}
+		fepriv->check_wrapped = 1;
+
+		/* if we've just retuned, enter the ZIGZAG_FAST state.
+		 * This ensures we cannot return from an
+		 * FE_SET_FRONTEND ioctl before the first frontend tune
+		 * occurs */
+		if (fepriv->state & FESTATE_RETUNE) {
+			fepriv->state = FESTATE_TUNING_FAST;
+		}
+	}
+
+	/* slow zigzag */
+	if (fepriv->state & FESTATE_SEARCHING_SLOW) {
+		dvb_frontend_swzigzag_update_delay(fepriv, s & FE_HAS_LOCK);
+
+		/* Note: don't bother checking for wrapping; we stay in this
+		 * state until we get a lock */
+		dvb_frontend_swzigzag_autotune(fe, 0);
+	}
+}
+
 static int dvb_frontend_is_exiting(struct dvb_frontend *fe)
 {
 	struct dvb_frontend_private *fepriv = fe->frontend_priv;
@@ -330,7 +459,7 @@ static int dvb_frontend_is_exiting(struct dvb_frontend *fe)
 
 	if (fepriv->dvbdev->writers == 1)
 		if (time_after(jiffies, fepriv->release_jiffies +
-					dvb_shutdown_timeout * HZ))
+				  dvb_shutdown_timeout * HZ))
 			return 1;
 
 	return 0;
@@ -355,18 +484,14 @@ static void dvb_frontend_wakeup(struct dvb_frontend *fe)
 	wake_up_interruptible(&fepriv->wait_queue);
 }
 
-/*
- * FIXME: use linux/kthread.h
- */
 static int dvb_frontend_thread(void *data)
 {
 	struct dvb_frontend *fe = data;
 	struct dvb_frontend_private *fepriv = fe->frontend_priv;
 	unsigned long timeout;
 	char name [15];
-	int quality = 0, delay = 3*HZ;
 	fe_status_t s;
-	int check_wrapped = 0;
+	struct dvb_frontend_parameters *params;
 
 	dprintk("%s\n", __FUNCTION__);
 
@@ -377,6 +502,9 @@ static int dvb_frontend_thread(void *data)
 	sigfillset(&current->blocked);
 	unlock_kernel();
 
+	fepriv->check_wrapped = 0;
+	fepriv->quality = 0;
+	fepriv->delay = 3*HZ;
 	fepriv->status = 0;
 	dvb_frontend_init(fe);
 	fepriv->wakeup = 0;
@@ -386,7 +514,7 @@ static int dvb_frontend_thread(void *data)
 
 		timeout = wait_event_interruptible_timeout(fepriv->wait_queue,
 							   dvb_frontend_should_wakeup(fe),
-							   delay);
+							   fepriv->delay);
 		if (0 != dvb_frontend_is_exiting(fe)) {
 			/* got signal or quitting */
 			break;
@@ -397,108 +525,22 @@ static int dvb_frontend_thread(void *data)
 		if (down_interruptible(&fepriv->sem))
 			break;
 
-		/* if we've got no parameters, just keep idling */
-		if (fepriv->state & FESTATE_IDLE) {
-			delay = 3*HZ;
-			quality = 0;
-			continue;
-		}
+		/* do an iteration of the tuning loop */
+		if (fe->ops->tune) {
+			/* have we been asked to retune? */
+			params = NULL;
+			if (fepriv->state & FESTATE_RETUNE) {
+				params = &fepriv->parameters;
+				fepriv->state = FESTATE_TUNED;
+			}
 
-		/* get the frontend status */
-		if (fepriv->state & FESTATE_RETUNE) {
-			s = 0;
-		} else {
-			if (fe->ops->read_status)
-				fe->ops->read_status(fe, &s);
+			fe->ops->tune(fe, params, fepriv->tune_mode_flags, &fepriv->delay, &s);
 			if (s != fepriv->status) {
 				dvb_frontend_add_event(fe, s);
 				fepriv->status = s;
 			}
-		}
-		/* if we're not tuned, and we have a lock, move to the TUNED state */
-		if ((fepriv->state & FESTATE_WAITFORLOCK) && (s & FE_HAS_LOCK)) {
-			update_delay(&quality, &delay, fepriv->min_delay, s & FE_HAS_LOCK);
-			fepriv->state = FESTATE_TUNED;
-
-			/* if we're tuned, then we have determined the correct inversion */
-			if ((!(fe->ops->info.caps & FE_CAN_INVERSION_AUTO)) &&
-			    (fepriv->parameters.inversion == INVERSION_AUTO)) {
-				fepriv->parameters.inversion = fepriv->inversion;
-			}
-			continue;
-		}
-
-		/* if we are tuned already, check we're still locked */
-		if (fepriv->state & FESTATE_TUNED) {
-			update_delay(&quality, &delay, fepriv->min_delay, s & FE_HAS_LOCK);
-
-			/* we're tuned, and the lock is still good... */
-			if (s & FE_HAS_LOCK)
-				continue;
-			else { /* if we _WERE_ tuned, but now don't have a lock */
-				fepriv->state = FESTATE_ZIGZAG_FAST;
-				fepriv->started_auto_step = fepriv->auto_step;
-				check_wrapped = 0;
-			}
-		}
-
-		/* don't actually do anything if we're in the LOSTLOCK state,
-		 * the frontend is set to FE_CAN_RECOVER, and the max_drift is 0 */
-		if ((fepriv->state & FESTATE_LOSTLOCK) &&
-		    (fe->ops->info.caps & FE_CAN_RECOVER) && (fepriv->max_drift == 0)) {
-			update_delay(&quality, &delay, fepriv->min_delay, s & FE_HAS_LOCK);
-			continue;
-		}
-
-		/* don't do anything if we're in the DISEQC state, since this
-		 * might be someone with a motorized dish controlled by DISEQC.
-		 * If its actually a re-tune, there will be a SET_FRONTEND soon enough.	*/
-		if (fepriv->state & FESTATE_DISEQC) {
-			update_delay(&quality, &delay, fepriv->min_delay, s & FE_HAS_LOCK);
-			continue;
-		}
-
-		/* if we're in the RETUNE state, set everything up for a brand
-		 * new scan, keeping the current inversion setting, as the next
-		 * tune is _very_ likely to require the same */
-		if (fepriv->state & FESTATE_RETUNE) {
-			fepriv->lnb_drift = 0;
-			fepriv->auto_step = 0;
-			fepriv->auto_sub_step = 0;
-			fepriv->started_auto_step = 0;
-			check_wrapped = 0;
-		}
-
-		/* fast zigzag. */
-		if ((fepriv->state & FESTATE_SEARCHING_FAST) || (fepriv->state & FESTATE_RETUNE)) {
-			delay = fepriv->min_delay;
-
-			/* peform a tune */
-			if (dvb_frontend_autotune(fe, check_wrapped)) {
-				/* OK, if we've run out of trials at the fast speed.
-				 * Drop back to slow for the _next_ attempt */
-				fepriv->state = FESTATE_SEARCHING_SLOW;
-				fepriv->started_auto_step = fepriv->auto_step;
-				continue;
-			}
-			check_wrapped = 1;
-
-			/* if we've just retuned, enter the ZIGZAG_FAST state.
-			 * This ensures we cannot return from an
-			 * FE_SET_FRONTEND ioctl before the first frontend tune
-			 * occurs */
-			if (fepriv->state & FESTATE_RETUNE) {
-				fepriv->state = FESTATE_TUNING_FAST;
-			}
-		}
-
-		/* slow zigzag */
-		if (fepriv->state & FESTATE_SEARCHING_SLOW) {
-			update_delay(&quality, &delay, fepriv->min_delay, s & FE_HAS_LOCK);
-
-			/* Note: don't bother checking for wrapping; we stay in this
-			 * state until we get a lock */
-			dvb_frontend_autotune(fe, 0);
+		} else {
+			dvb_frontend_swzigzag(fe);
 		}
 	}
 
@@ -733,7 +775,6 @@ static int dvb_frontend_ioctl(struct inode *inode, struct file *file,
 			err = fe->ops->set_tone(fe, (fe_sec_tone_mode_t) parg);
 			fepriv->state = FESTATE_DISEQC;
 			fepriv->status = 0;
-			fepriv->tone = (fe_sec_tone_mode_t) parg;
 		}
 		break;
 
@@ -891,6 +932,10 @@ static int dvb_frontend_ioctl(struct inode *inode, struct file *file,
 			err = fe->ops->get_frontend(fe, (struct dvb_frontend_parameters*) parg);
 		}
 		break;
+
+	case FE_SET_FRONTEND_TUNE_MODE:
+		fepriv->tune_mode_flags = (unsigned int) parg;
+		break;
 	};
 
 	up (&fepriv->sem);
@@ -932,6 +977,9 @@ static int dvb_frontend_open(struct inode *inode, struct file *file)
 
 		/*  empty event queue */
 		fepriv->events.eventr = fepriv->events.eventw = 0;
+
+		/* normal tune mode when opened R/W */
+		fepriv->tune_mode_flags &= ~FE_TUNE_MODE_ONESHOT;
 	}
 
 	return ret;
@@ -990,7 +1038,6 @@ int dvb_register_frontend(struct dvb_adapter* dvb,
 	init_MUTEX (&fepriv->events.sem);
 	fe->dvb = dvb;
 	fepriv->inversion = INVERSION_OFF;
-	fepriv->tone = SEC_TONE_OFF;
 
 	printk ("DVB: registering frontend %i (%s)...\n",
 		fe->dvb->num,
diff --git a/drivers/media/dvb/dvb-core/dvb_frontend.h b/drivers/media/dvb/dvb-core/dvb_frontend.h
index 1e0840d02f1f..48c3f81be912 100644
--- a/drivers/media/dvb/dvb-core/dvb_frontend.h
+++ b/drivers/media/dvb/dvb-core/dvb_frontend.h
@@ -58,10 +58,19 @@ struct dvb_frontend_ops {
 	int (*init)(struct dvb_frontend* fe);
 	int (*sleep)(struct dvb_frontend* fe);
 
+	/* if this is set, it overrides the default swzigzag */
+	int (*tune)(struct dvb_frontend* fe,
+		    struct dvb_frontend_parameters* params,
+		    unsigned int mode_flags,
+		    int *delay,
+		    fe_status_t *status);
+
+	/* these two are only used for the swzigzag code */
 	int (*set_frontend)(struct dvb_frontend* fe, struct dvb_frontend_parameters* params);
-	int (*get_frontend)(struct dvb_frontend* fe, struct dvb_frontend_parameters* params);
 	int (*get_tune_settings)(struct dvb_frontend* fe, struct dvb_frontend_tune_settings* settings);
 
+	int (*get_frontend)(struct dvb_frontend* fe, struct dvb_frontend_parameters* params);
+
 	int (*read_status)(struct dvb_frontend* fe, fe_status_t* status);
 	int (*read_ber)(struct dvb_frontend* fe, u32* ber);
 	int (*read_signal_strength)(struct dvb_frontend* fe, u16* strength);
diff --git a/include/linux/dvb/frontend.h b/include/linux/dvb/frontend.h
index d41df7047ed7..c8cbd90ba375 100644
--- a/include/linux/dvb/frontend.h
+++ b/include/linux/dvb/frontend.h
@@ -240,6 +240,15 @@ struct dvb_frontend_event {
 };
 
 
+/**
+ * When set, this flag will disable any zigzagging or other "normal" tuning
+ * behaviour. Additionally, there will be no automatic monitoring of the lock
+ * status, and hence no frontend events will be generated. If a frontend device
+ * is closed, this flag will be automatically turned off when the device is
+ * reopened read-write.
+ */
+#define FE_TUNE_MODE_ONESHOT 0x01
+
 
 #define FE_GET_INFO		   _IOR('o', 61, struct dvb_frontend_info)
 
@@ -260,6 +269,7 @@ struct dvb_frontend_event {
 
 #define FE_SET_FRONTEND		   _IOW('o', 76, struct dvb_frontend_parameters)
 #define FE_GET_FRONTEND		   _IOR('o', 77, struct dvb_frontend_parameters)
+#define FE_SET_FRONTEND_TUNE_MODE  _IO('o', 81) /* unsigned int */
 #define FE_GET_EVENT		   _IOR('o', 78, struct dvb_frontend_event)
 
 #define FE_DISHNETWORK_SEND_LEGACY_CMD _IO('o', 80) /* unsigned int */
-- 
cgit v1.2.3-71-gd317


From 9bb13a6dc3a6f68c990264838ff0493d900c48d7 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@brturbo.com.br>
Date: Mon, 9 Jan 2006 15:25:37 -0200
Subject: V4L/DVB (3233): Fixed API to set I2S speed control

- Created a new ioctl to control I2S speed. Old calls to an
inadequate V4L2 API replaced.

Signed-off-by: Mauro Carvalho Chehab <mchehab@brturbo.com.br>
---
 drivers/media/video/em28xx/em28xx-cards.c |  8 ++------
 drivers/media/video/em28xx/em28xx-video.c |  2 ++
 drivers/media/video/em28xx/em28xx.h       |  2 ++
 drivers/media/video/msp3400.c             | 30 ++++++++++++++++++++++--------
 include/linux/videodev2.h                 |  1 -
 include/media/v4l2-common.h               |  7 +++++++
 6 files changed, 35 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/em28xx/em28xx-cards.c b/drivers/media/video/em28xx/em28xx-cards.c
index 57779e63f35d..58f7b4194a0d 100644
--- a/drivers/media/video/em28xx/em28xx-cards.c
+++ b/drivers/media/video/em28xx/em28xx-cards.c
@@ -30,6 +30,7 @@
 #include <media/tuner.h>
 #include <media/audiochip.h>
 #include <media/tveeprom.h>
+#include <media/v4l2-common.h>
 #include "msp3400.h"
 
 #include "em28xx.h"
@@ -261,7 +262,6 @@ void em28xx_card_setup(struct em28xx *dev)
 	/* request some modules */
 	if (dev->model == EM2820_BOARD_HAUPPAUGE_WINTV_USB_2) {
 		struct tveeprom tv;
-		struct v4l2_audioout ao;
 #ifdef CONFIG_MODULES
 		request_module("tveeprom");
 		request_module("ir-kbd-i2c");
@@ -274,12 +274,8 @@ void em28xx_card_setup(struct em28xx *dev)
 
 		dev->tuner_type= tv.tuner_type;
 		if (tv.audio_processor == AUDIO_CHIP_MSP34XX) {
+			dev->i2s_speed=2048000;
 			dev->has_msp34xx=1;
-			memset (&ao,0,sizeof(ao));
-
-			ao.index=2;
-			ao.mode=V4L2_AUDMODE_32BITS;
-			em28xx_i2c_call_clients(dev, VIDIOC_S_AUDOUT, &ao);
 		} else
 			dev->has_msp34xx=0;
 	}
diff --git a/drivers/media/video/em28xx/em28xx-video.c b/drivers/media/video/em28xx/em28xx-video.c
index 5e831fccf3fd..0b5557c479ae 100644
--- a/drivers/media/video/em28xx/em28xx-video.c
+++ b/drivers/media/video/em28xx/em28xx-video.c
@@ -287,6 +287,8 @@ static void video_mux(struct em28xx *dev, int index)
 	em28xx_videodbg("Setting input index=%d, vmux=%d, amux=%d\n",index,input,dev->ctl_ainput);
 
 	if (dev->has_msp34xx) {
+		if (dev->i2s_speed)
+			em28xx_i2c_call_clients(dev, VIDIOC_INT_I2S_CLOCK_FREQ, &dev->i2s_speed);
 		em28xx_i2c_call_clients(dev, VIDIOC_S_AUDIO, &dev->ctl_ainput);
 		ainput = EM28XX_AUDIO_SRC_TUNER;
 		em28xx_audio_source(dev, ainput);
diff --git a/drivers/media/video/em28xx/em28xx.h b/drivers/media/video/em28xx/em28xx.h
index 5c7a41ce69f3..ffa9acc9be37 100644
--- a/drivers/media/video/em28xx/em28xx.h
+++ b/drivers/media/video/em28xx/em28xx.h
@@ -216,6 +216,8 @@ struct em28xx {
 	unsigned int has_msp34xx:1;
 	unsigned int has_tda9887:1;
 
+	u32 i2s_speed;		/* I2S speed for audio digital stream */
+
 	enum em28xx_decoder decoder;
 
 	int tuner_type;		/* type of the tuner */
diff --git a/drivers/media/video/msp3400.c b/drivers/media/video/msp3400.c
index fd0589352822..11235c1ac5c6 100644
--- a/drivers/media/video/msp3400.c
+++ b/drivers/media/video/msp3400.c
@@ -54,6 +54,7 @@
 
 #include <linux/videodev.h>
 #include <media/audiochip.h>
+#include <media/v4l2-common.h>
 #include "msp3400.h"
 
 /* ---------------------------------------------------------------------- */
@@ -2104,23 +2105,36 @@ static int msp_command(struct i2c_client *client, unsigned int cmd, void *arg)
 		if (a->index<0||a->index>2)
 			return -EINVAL;
 
-		if (a->index==2) {
-			if (a->mode == V4L2_AUDMODE_32BITS)
-				msp->i2s_mode=1;
-			else
-				msp->i2s_mode=0;
-		}
-		msp3400_dbg("Setting audio out on msp34xx to input %i, mode %i\n",a->index,msp->i2s_mode);
+		msp3400_dbg("Setting audio out on msp34xx to input %i\n",a->index);
 		msp3400c_set_scart(client,msp->in_scart,a->index+1);
 
 		break;
 	}
+	case VIDIOC_INT_I2S_CLOCK_FREQ:
+	{
+		u32 *a=(u32 *)arg;
+
+		msp3400_dbg("Setting I2S speed to %d\n",*a);
+
+		switch (*a) {
+			case 1024000:
+				msp->i2s_mode=0;
+				break;
+			case 2048000:
+				msp->i2s_mode=1;
+				break;
+			default:
+				return -EINVAL;
+		}
+		break;
+	}
+
 	case VIDIOC_QUERYCTRL:
 	{
 		struct v4l2_queryctrl *qc = arg;
 		int i;
 
-		msp3400_dbg("VIDIOC_QUERYCTRL");
+		msp3400_dbg("VIDIOC_QUERYCTRL\n");
 
 		for (i = 0; i < ARRAY_SIZE(msp34xx_qctrl); i++)
 			if (qc->id && qc->id ==  msp34xx_qctrl[i].id) {
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index b2f5e864b397..6ac7c1f7902f 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -895,7 +895,6 @@ struct v4l2_audio
 
 /*  Flags for the 'mode' field */
 #define V4L2_AUDMODE_AVL		0x00001
-#define V4L2_AUDMODE_32BITS		0x00002
 
 struct v4l2_audioout
 {
diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h
index d3fd48157eb8..2f2402996409 100644
--- a/include/media/v4l2-common.h
+++ b/include/media/v4l2-common.h
@@ -107,4 +107,11 @@ enum v4l2_chip_ident {
    be made. */
 #define VIDIOC_INT_G_CHIP_IDENT		_IOR ('d', 107, enum v4l2_chip_ident *)
 
+/* Sets I2S speed in bps. This is used to provide a standard way to select I2S
+   clock used by driving digital audio streams at some board designs.
+   Usual values for the frequency are 1024000 and 2048000.
+   If the frequency is not supported, then -EINVAL is returned. */
+#define VIDIOC_INT_I2S_CLOCK_FREQ 	_IOW ('d', 108, u32)
+
+
 #endif /* V4L2_COMMON_H_ */
-- 
cgit v1.2.3-71-gd317


From 21dcd8ccd76e80118f524b1a730c35ab1c46c09e Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@brturbo.com.br>
Date: Mon, 9 Jan 2006 15:25:37 -0200
Subject: V4L/DVB (3234): Included advanced debug option to tvp5150.c

- Included advanced debug option to tvp5150.c
- Now, advanced debug info is the first item at V4L menu.

Signed-off-by: Mauro Carvalho Chehab <mchehab@brturbo.com.br>
---
 drivers/media/video/Kconfig   | 16 +++++++++-------
 drivers/media/video/tvp5150.c | 24 ++++++++++++++++++++++++
 include/linux/i2c-id.h        |  1 +
 3 files changed, 34 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/Kconfig b/drivers/media/video/Kconfig
index c89cc0a922ee..2fe260fff85d 100644
--- a/drivers/media/video/Kconfig
+++ b/drivers/media/video/Kconfig
@@ -7,6 +7,15 @@ menu "Video For Linux"
 
 comment "Video Adapters"
 
+config VIDEO_ADV_DEBUG
+	bool "Enable advanced debug functionality"
+	depends on VIDEO_DEV
+	default n
+	---help---
+	  Say Y here to enable advanced debugging functionality on some
+	  V4L devices.
+	  In doubt, say N.
+
 config VIDEO_BT848
 	tristate "BT848 Video For Linux"
 	depends on VIDEO_DEV && PCI && I2C
@@ -344,11 +353,4 @@ config VIDEO_DECODER
 	  Say Y here to compile drivers for SAA7115, SAA7127 and CX25840
 	  video decoders.
 
-config VIDEO_ADV_DEBUG
-	bool "Enable advanced debug functionality"
-	depends on VIDEO_DEV && VIDEO_DECODER && EXPERIMENTAL
-	---help---
-	  Say Y here to enable advanced debugging functionality in the
-	  SAA7115, SAA7127 and CX25840 video decoders.
-
 endmenu
diff --git a/drivers/media/video/tvp5150.c b/drivers/media/video/tvp5150.c
index 9ed839d688eb..07ad675cd58e 100644
--- a/drivers/media/video/tvp5150.c
+++ b/drivers/media/video/tvp5150.c
@@ -850,6 +850,30 @@ static int tvp5150_command(struct i2c_client *c,
 		*(v4l2_std_id *)arg = decoder->norm;
 		break;
 
+#ifdef CONFIG_VIDEO_ADV_DEBUG
+	case VIDIOC_INT_G_REGISTER:
+	{
+		struct v4l2_register *reg = arg;
+
+		if (reg->i2c_id != I2C_DRIVERID_TVP5150)
+			return -EINVAL;
+		reg->val = tvp5150_read(c, reg->reg & 0xff);
+		break;
+	}
+
+	case VIDIOC_INT_S_REGISTER:
+	{
+		struct v4l2_register *reg = arg;
+
+		if (reg->i2c_id != I2C_DRIVERID_TVP5150)
+			return -EINVAL;
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		tvp5150_write(c, reg->reg & 0xff, reg->val & 0xff);
+		break;
+	}
+#endif
+
 	case DECODER_DUMP:
 		dump_reg(c);
 		break;
diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h
index fb46f8d56999..6ff2d365895f 100644
--- a/include/linux/i2c-id.h
+++ b/include/linux/i2c-id.h
@@ -103,6 +103,7 @@
 #define I2C_DRIVERID_SAA711X	73	/* saa711x video encoders	*/
 #define I2C_DRIVERID_AKITAIOEXP	74	/* IO Expander on Sharp SL-C1000 */
 #define I2C_DRIVERID_INFRARED	75	/* I2C InfraRed on Video boards */
+#define I2C_DRIVERID_TVP5150	76	/* TVP5150 video decoder        */
 
 #define I2C_DRIVERID_I2CDEV	900
 #define I2C_DRIVERID_ARP        902    /* SMBus ARP Client              */
-- 
cgit v1.2.3-71-gd317


From 5e453dc757385ec892a818e4e3b5de027987ced9 Mon Sep 17 00:00:00 2001
From: Michael Krufky <mkrufky@m1k.net>
Date: Mon, 9 Jan 2006 15:32:31 -0200
Subject: V4L/DVB (3269): ioctls cleanups.

- Now, all internal ioctls are at v4l2-common.h
- removed unused ioctl at saa6752hs.h
- all debug ioctl code moved to v4l2-common.c
- removed duplicated stuff from other cards

Signed-off-by: Michael Krufky <mkrufky@m1k.net>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@brturbo.com.br>
---
 drivers/media/video/bttv-cards.c              |   1 +
 drivers/media/video/bttv-driver.c             |  28 +---
 drivers/media/video/bttv-i2c.c                |   1 +
 drivers/media/video/cx88/cx88-blackbird.c     |   3 +-
 drivers/media/video/cx88/cx88-core.c          |  56 +------
 drivers/media/video/cx88/cx88-dvb.c           |   1 +
 drivers/media/video/cx88/cx88-i2c.c           |   1 +
 drivers/media/video/cx88/cx88-video.c         |   7 +-
 drivers/media/video/cx88/cx88.h               |   1 -
 drivers/media/video/em28xx/em28xx-core.c      |  53 -------
 drivers/media/video/em28xx/em28xx-i2c.c       |   1 +
 drivers/media/video/em28xx/em28xx-video.c     |   2 +-
 drivers/media/video/em28xx/em28xx.h           |   2 -
 drivers/media/video/mxb.c                     |   1 +
 drivers/media/video/saa7134/saa7134-cards.c   |   1 +
 drivers/media/video/saa7134/saa7134-core.c    |  72 ---------
 drivers/media/video/saa7134/saa7134-dvb.c     |   1 +
 drivers/media/video/saa7134/saa7134-empress.c |   3 +-
 drivers/media/video/saa7134/saa7134-i2c.c     |   1 +
 drivers/media/video/saa7134/saa7134-oss.c     |  40 ++++-
 drivers/media/video/saa7134/saa7134-video.c   |   5 +-
 drivers/media/video/saa7134/saa7134.h         |   1 -
 drivers/media/video/tda9887.c                 |   2 +-
 drivers/media/video/tuner-core.c              |  12 +-
 drivers/media/video/tvaudio.c                 |   1 +
 drivers/media/video/v4l2-common.c             | 219 ++++++++++++++++++++------
 include/linux/video_decoder.h                 |   2 +
 include/linux/videodev2.h                     |   1 -
 include/media/audiochip.h                     |   5 -
 include/media/tuner.h                         |   4 -
 include/media/v4l2-common.h                   |  25 +++
 31 files changed, 267 insertions(+), 286 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/bttv-cards.c b/drivers/media/video/bttv-cards.c
index 440f635e020f..1621ab133d23 100644
--- a/drivers/media/video/bttv-cards.c
+++ b/drivers/media/video/bttv-cards.c
@@ -38,6 +38,7 @@
 #include <asm/io.h>
 
 #include "bttvp.h"
+#include <media/v4l2-common.h>
 
 /* fwd decl */
 static void boot_msp34xx(struct bttv *btv, int pin);
diff --git a/drivers/media/video/bttv-driver.c b/drivers/media/video/bttv-driver.c
index 69a147b85f1a..f3de85251719 100644
--- a/drivers/media/video/bttv-driver.c
+++ b/drivers/media/video/bttv-driver.c
@@ -35,6 +35,7 @@
 #include <linux/interrupt.h>
 #include <linux/kdev_t.h>
 #include "bttvp.h"
+#include <media/v4l2-common.h>
 
 #include <linux/dma-mapping.h>
 
@@ -1520,14 +1521,6 @@ static struct videobuf_queue_ops bttv_video_qops = {
 	.buf_release  = buffer_release,
 };
 
-static const char *v4l1_ioctls[] = {
-	"?", "CGAP", "GCHAN", "SCHAN", "GTUNER", "STUNER", "GPICT", "SPICT",
-	"CCAPTURE", "GWIN", "SWIN", "GFBUF", "SFBUF", "KEY", "GFREQ",
-	"SFREQ", "GAUDIO", "SAUDIO", "SYNC", "MCAPTURE", "GMBUF", "GUNIT",
-	"GCAPTURE", "SCAPTURE", "SPLAYMODE", "SWRITEMODE", "GPLAYINFO",
-	"SMICROCODE", "GVBIFMT", "SVBIFMT" };
-#define V4L1_IOCTLS ARRAY_SIZE(v4l1_ioctls)
-
 static int bttv_common_ioctls(struct bttv *btv, unsigned int cmd, void *arg)
 {
 	switch (cmd) {
@@ -2216,22 +2209,9 @@ static int bttv_do_ioctl(struct inode *inode, struct file *file,
 	unsigned long flags;
 	int retval = 0;
 
-	if (bttv_debug > 1) {
-		switch (_IOC_TYPE(cmd)) {
-		case 'v':
-			printk("bttv%d: ioctl 0x%x (v4l1, VIDIOC%s)\n",
-			       btv->c.nr, cmd, (_IOC_NR(cmd) < V4L1_IOCTLS) ?
-			       v4l1_ioctls[_IOC_NR(cmd)] : "???");
-			break;
-		case 'V':
-			printk("bttv%d: ioctl 0x%x (v4l2, %s)\n",
-			       btv->c.nr, cmd,  v4l2_ioctl_names[_IOC_NR(cmd)]);
-			break;
-		default:
-			printk("bttv%d: ioctl 0x%x (???)\n",
-			       btv->c.nr, cmd);
-		}
-	}
+	if (bttv_debug > 1)
+		v4l_print_ioctl(btv->c.name, cmd);
+
 	if (btv->errors)
 		bttv_reinit_bt848(btv);
 
diff --git a/drivers/media/video/bttv-i2c.c b/drivers/media/video/bttv-i2c.c
index a8873f48c808..fd66d386fa7d 100644
--- a/drivers/media/video/bttv-i2c.c
+++ b/drivers/media/video/bttv-i2c.c
@@ -30,6 +30,7 @@
 #include <linux/delay.h>
 
 #include "bttvp.h"
+#include <media/v4l2-common.h>
 #include <linux/jiffies.h>
 #include <asm/io.h>
 
diff --git a/drivers/media/video/cx88/cx88-blackbird.c b/drivers/media/video/cx88/cx88-blackbird.c
index 5a7f940565cc..a49062119313 100644
--- a/drivers/media/video/cx88/cx88-blackbird.c
+++ b/drivers/media/video/cx88/cx88-blackbird.c
@@ -32,6 +32,7 @@
 #include <linux/firmware.h>
 
 #include "cx88.h"
+#include <media/v4l2-common.h>
 
 MODULE_DESCRIPTION("driver for cx2388x/cx23416 based mpeg encoder cards");
 MODULE_AUTHOR("Jelle Foks <jelle@foks.8m.com>, Gerd Knorr <kraxel@bytesex.org> [SuSE Labs]");
@@ -1374,7 +1375,7 @@ static int mpeg_do_ioctl(struct inode *inode, struct file *file,
 	struct cx88_core  *core = dev->core;
 
 	if (debug > 1)
-		cx88_print_ioctl(core->name,cmd);
+		v4l_print_ioctl(core->name,cmd);
 
 	switch (cmd) {
 
diff --git a/drivers/media/video/cx88/cx88-core.c b/drivers/media/video/cx88/cx88-core.c
index bb6eb54e19ce..fc814d198694 100644
--- a/drivers/media/video/cx88/cx88-core.c
+++ b/drivers/media/video/cx88/cx88-core.c
@@ -34,6 +34,7 @@
 #include <linux/videodev2.h>
 
 #include "cx88.h"
+#include <media/v4l2-common.h>
 
 MODULE_DESCRIPTION("v4l2 driver module for cx2388x based TV cards");
 MODULE_AUTHOR("Gerd Knorr <kraxel@bytesex.org> [SuSE Labs]");
@@ -76,60 +77,6 @@ static unsigned int cx88_devcount;
 static LIST_HEAD(cx88_devlist);
 static DECLARE_MUTEX(devlist);
 
-/* ------------------------------------------------------------------ */
-/* debug help functions                                               */
-
-static const char *v4l1_ioctls[] = {
-	"0", "CGAP", "GCHAN", "SCHAN", "GTUNER", "STUNER", "GPICT", "SPICT",
-	"CCAPTURE", "GWIN", "SWIN", "GFBUF", "SFBUF", "KEY", "GFREQ",
-	"SFREQ", "GAUDIO", "SAUDIO", "SYNC", "MCAPTURE", "GMBUF", "GUNIT",
-	"GCAPTURE", "SCAPTURE", "SPLAYMODE", "SWRITEMODE", "GPLAYINFO",
-	"SMICROCODE", "GVBIFMT", "SVBIFMT" };
-#define V4L1_IOCTLS ARRAY_SIZE(v4l1_ioctls)
-
-static const char *v4l2_ioctls[] = {
-	"QUERYCAP", "1", "ENUM_PIXFMT", "ENUM_FBUFFMT", "G_FMT", "S_FMT",
-	"G_COMP", "S_COMP", "REQBUFS", "QUERYBUF", "G_FBUF", "S_FBUF",
-	"G_WIN", "S_WIN", "PREVIEW", "QBUF", "16", "DQBUF", "STREAMON",
-	"STREAMOFF", "G_PERF", "G_PARM", "S_PARM", "G_STD", "S_STD",
-	"ENUMSTD", "ENUMINPUT", "G_CTRL", "S_CTRL", "G_TUNER", "S_TUNER",
-	"G_FREQ", "S_FREQ", "G_AUDIO", "S_AUDIO", "35", "QUERYCTRL",
-	"QUERYMENU", "G_INPUT", "S_INPUT", "ENUMCVT", "41", "42", "43",
-	"44", "45",  "G_OUTPUT", "S_OUTPUT", "ENUMOUTPUT", "G_AUDOUT",
-	"S_AUDOUT", "ENUMFX", "G_EFFECT", "S_EFFECT", "G_MODULATOR",
-	"S_MODULATOR"
-};
-#define V4L2_IOCTLS ARRAY_SIZE(v4l2_ioctls)
-
-void cx88_print_ioctl(char *name, unsigned int cmd)
-{
-	char *dir;
-
-	switch (_IOC_DIR(cmd)) {
-	case _IOC_NONE:              dir = "--"; break;
-	case _IOC_READ:              dir = "r-"; break;
-	case _IOC_WRITE:             dir = "-w"; break;
-	case _IOC_READ | _IOC_WRITE: dir = "rw"; break;
-	default:                     dir = "??"; break;
-	}
-	switch (_IOC_TYPE(cmd)) {
-	case 'v':
-		printk(KERN_DEBUG "%s: ioctl 0x%08x (v4l1, %s, VIDIOC%s)\n",
-		       name, cmd, dir, (_IOC_NR(cmd) < V4L1_IOCTLS) ?
-		       v4l1_ioctls[_IOC_NR(cmd)] : "???");
-		break;
-	case 'V':
-		printk(KERN_DEBUG "%s: ioctl 0x%08x (v4l2, %s, VIDIOC_%s)\n",
-		       name, cmd, dir, (_IOC_NR(cmd) < V4L2_IOCTLS) ?
-		       v4l2_ioctls[_IOC_NR(cmd)] : "???");
-		break;
-	default:
-		printk(KERN_DEBUG "%s: ioctl 0x%08x (???, %s, #%d)\n",
-		       name, cmd, dir, _IOC_NR(cmd));
-	}
-}
-
-/* ------------------------------------------------------------------ */
 #define NO_SYNC_LINE (-1U)
 
 static u32* cx88_risc_field(u32 *rp, struct scatterlist *sglist,
@@ -1208,7 +1155,6 @@ void cx88_core_put(struct cx88_core *core, struct pci_dev *pci)
 
 /* ------------------------------------------------------------------ */
 
-EXPORT_SYMBOL(cx88_print_ioctl);
 EXPORT_SYMBOL(cx88_print_irqbits);
 
 EXPORT_SYMBOL(cx88_core_irq);
diff --git a/drivers/media/video/cx88/cx88-dvb.c b/drivers/media/video/cx88/cx88-dvb.c
index 201050478711..c63f20fdff48 100644
--- a/drivers/media/video/cx88/cx88-dvb.c
+++ b/drivers/media/video/cx88/cx88-dvb.c
@@ -31,6 +31,7 @@
 
 #include "cx88.h"
 #include "dvb-pll.h"
+#include <media/v4l2-common.h>
 
 #ifdef HAVE_MT352
 # include "mt352.h"
diff --git a/drivers/media/video/cx88/cx88-i2c.c b/drivers/media/video/cx88/cx88-i2c.c
index c6492089ee1a..f720901e9638 100644
--- a/drivers/media/video/cx88/cx88-i2c.c
+++ b/drivers/media/video/cx88/cx88-i2c.c
@@ -30,6 +30,7 @@
 #include <asm/io.h>
 
 #include "cx88.h"
+#include <media/v4l2-common.h>
 
 static unsigned int i2c_debug = 0;
 module_param(i2c_debug, int, 0644);
diff --git a/drivers/media/video/cx88/cx88-video.c b/drivers/media/video/cx88/cx88-video.c
index b76abb9b8961..9a02515fe18b 100644
--- a/drivers/media/video/cx88/cx88-video.c
+++ b/drivers/media/video/cx88/cx88-video.c
@@ -33,6 +33,7 @@
 #include <asm/div64.h>
 
 #include "cx88.h"
+#include <media/v4l2-common.h>
 
 /* Include V4L1 specific functions. Should be removed soon */
 #include <linux/videodev.h>
@@ -1118,7 +1119,7 @@ static int video_do_ioctl(struct inode *inode, struct file *file,
 	int err;
 
 	if (video_debug > 1)
-		cx88_print_ioctl(core->name,cmd);
+		v4l_print_ioctl(core->name,cmd);
 	switch (cmd) {
 
 	/* --- capabilities ------------------------------------------ */
@@ -1254,7 +1255,7 @@ int cx88_do_ioctl(struct inode *inode, struct file *file, int radio,
 
 	dprintk( 1, "CORE IOCTL: 0x%x\n", cmd );
 	if (video_debug > 1)
-		cx88_print_ioctl(core->name,cmd);
+		v4l_print_ioctl(core->name,cmd);
 
 	switch (cmd) {
 	/* ---------- tv norms ---------- */
@@ -1474,7 +1475,7 @@ static int radio_do_ioctl(struct inode *inode, struct file *file,
 	struct cx88_core  *core = dev->core;
 
 	if (video_debug > 1)
-		cx88_print_ioctl(core->name,cmd);
+		v4l_print_ioctl(core->name,cmd);
 
 	switch (cmd) {
 	case VIDIOC_QUERYCAP:
diff --git a/drivers/media/video/cx88/cx88.h b/drivers/media/video/cx88/cx88.h
index 6d370d1b333f..022ef13c45bc 100644
--- a/drivers/media/video/cx88/cx88.h
+++ b/drivers/media/video/cx88/cx88.h
@@ -461,7 +461,6 @@ struct cx8802_dev {
 
 extern void cx88_print_irqbits(char *name, char *tag, char **strings,
 			       u32 bits, u32 mask);
-extern void cx88_print_ioctl(char *name, unsigned int cmd);
 
 extern int cx88_core_irq(struct cx88_core *core, u32 status);
 extern void cx88_wakeup(struct cx88_core *core,
diff --git a/drivers/media/video/em28xx/em28xx-core.c b/drivers/media/video/em28xx/em28xx-core.c
index c0db0e9d2cea..dff3893f32fd 100644
--- a/drivers/media/video/em28xx/em28xx-core.c
+++ b/drivers/media/video/em28xx/em28xx-core.c
@@ -63,59 +63,6 @@ static int alt = EM28XX_PINOUT;
 module_param(alt, int, 0644);
 MODULE_PARM_DESC(alt, "alternate setting to use for video endpoint");
 
-/* ------------------------------------------------------------------ */
-/* debug help functions                                               */
-
-static const char *v4l1_ioctls[] = {
-	"0", "CGAP", "GCHAN", "SCHAN", "GTUNER", "STUNER", "GPICT", "SPICT",
-	"CCAPTURE", "GWIN", "SWIN", "GFBUF", "SFBUF", "KEY", "GFREQ",
-	"SFREQ", "GAUDIO", "SAUDIO", "SYNC", "MCAPTURE", "GMBUF", "GUNIT",
-	"GCAPTURE", "SCAPTURE", "SPLAYMODE", "SWRITEMODE", "GPLAYINFO",
-	"SMICROCODE", "GVBIFMT", "SVBIFMT" };
-#define V4L1_IOCTLS ARRAY_SIZE(v4l1_ioctls)
-
-static const char *v4l2_ioctls[] = {
-	"QUERYCAP", "1", "ENUM_PIXFMT", "ENUM_FBUFFMT", "G_FMT", "S_FMT",
-	"G_COMP", "S_COMP", "REQBUFS", "QUERYBUF", "G_FBUF", "S_FBUF",
-	"G_WIN", "S_WIN", "PREVIEW", "QBUF", "16", "DQBUF", "STREAMON",
-	"STREAMOFF", "G_PERF", "G_PARM", "S_PARM", "G_STD", "S_STD",
-	"ENUMSTD", "ENUMINPUT", "G_CTRL", "S_CTRL", "G_TUNER", "S_TUNER",
-	"G_FREQ", "S_FREQ", "G_AUDIO", "S_AUDIO", "35", "QUERYCTRL",
-	"QUERYMENU", "G_INPUT", "S_INPUT", "ENUMCVT", "41", "42", "43",
-	"44", "45",  "G_OUTPUT", "S_OUTPUT", "ENUMOUTPUT", "G_AUDOUT",
-	"S_AUDOUT", "ENUMFX", "G_EFFECT", "S_EFFECT", "G_MODULATOR",
-	"S_MODULATOR"
-};
-#define V4L2_IOCTLS ARRAY_SIZE(v4l2_ioctls)
-
-void em28xx_print_ioctl(char *name, unsigned int cmd)
-{
-	char *dir;
-
-	switch (_IOC_DIR(cmd)) {
-	case _IOC_NONE:              dir = "--"; break;
-	case _IOC_READ:              dir = "r-"; break;
-	case _IOC_WRITE:             dir = "-w"; break;
-	case _IOC_READ | _IOC_WRITE: dir = "rw"; break;
-	default:                     dir = "??"; break;
-	}
-	switch (_IOC_TYPE(cmd)) {
-	case 'v':
-		printk(KERN_DEBUG "%s: ioctl 0x%08x (v4l1, %s, VIDIOC%s)\n",
-		       name, cmd, dir, (_IOC_NR(cmd) < V4L1_IOCTLS) ?
-		       v4l1_ioctls[_IOC_NR(cmd)] : "???");
-		break;
-	case 'V':
-		printk(KERN_DEBUG "%s: ioctl 0x%08x (v4l2, %s, VIDIOC_%s)\n",
-		       name, cmd, dir, (_IOC_NR(cmd) < V4L2_IOCTLS) ?
-		       v4l2_ioctls[_IOC_NR(cmd)] : "???");
-		break;
-	default:
-		printk(KERN_DEBUG "%s: ioctl 0x%08x (???, %s, #%d)\n",
-		       name, cmd, dir, _IOC_NR(cmd));
-	}
-}
-
 
 /*
  * em28xx_request_buffers()
diff --git a/drivers/media/video/em28xx/em28xx-i2c.c b/drivers/media/video/em28xx/em28xx-i2c.c
index 5385338efbf4..0591a705b7a1 100644
--- a/drivers/media/video/em28xx/em28xx-i2c.c
+++ b/drivers/media/video/em28xx/em28xx-i2c.c
@@ -28,6 +28,7 @@
 #include <linux/video_decoder.h>
 
 #include "em28xx.h"
+#include <media/v4l2-common.h>
 #include <media/tuner.h>
 
 /* ----------------------------------------------------------- */
diff --git a/drivers/media/video/em28xx/em28xx-video.c b/drivers/media/video/em28xx/em28xx-video.c
index 0b5557c479ae..fdc255918dde 100644
--- a/drivers/media/video/em28xx/em28xx-video.c
+++ b/drivers/media/video/em28xx/em28xx-video.c
@@ -1269,7 +1269,7 @@ static int em28xx_video_do_ioctl(struct inode *inode, struct file *filp,
 		return -ENODEV;
 
 	if (video_debug > 1)
-		em28xx_print_ioctl(dev->name,cmd);
+		v4l_print_ioctl(dev->name,cmd);
 
 	switch (cmd) {
 
diff --git a/drivers/media/video/em28xx/em28xx.h b/drivers/media/video/em28xx/em28xx.h
index ffa9acc9be37..f99ee8eb5577 100644
--- a/drivers/media/video/em28xx/em28xx.h
+++ b/drivers/media/video/em28xx/em28xx.h
@@ -295,8 +295,6 @@ void em28xx_set_ir(struct em28xx * dev,struct IR_i2c *ir);
 
 /* Provided by em28xx-core.c */
 
-void em28xx_print_ioctl(char *name, unsigned int cmd);
-
 u32 em28xx_request_buffers(struct em28xx *dev, u32 count);
 void em28xx_queue_unusedframes(struct em28xx *dev);
 void em28xx_release_buffers(struct em28xx *dev);
diff --git a/drivers/media/video/mxb.c b/drivers/media/video/mxb.c
index d04793fb80fc..91681aa6c657 100644
--- a/drivers/media/video/mxb.c
+++ b/drivers/media/video/mxb.c
@@ -26,6 +26,7 @@
 #include <media/saa7146_vv.h>
 #include <media/tuner.h>
 #include <linux/video_decoder.h>
+#include <media/v4l2-common.h>
 
 #include "mxb.h"
 #include "tea6415c.h"
diff --git a/drivers/media/video/saa7134/saa7134-cards.c b/drivers/media/video/saa7134/saa7134-cards.c
index 73f2525bc764..991829eb15da 100644
--- a/drivers/media/video/saa7134/saa7134-cards.c
+++ b/drivers/media/video/saa7134/saa7134-cards.c
@@ -25,6 +25,7 @@
 
 #include "saa7134-reg.h"
 #include "saa7134.h"
+#include <media/v4l2-common.h>
 
 /* commly used strings */
 static char name_mute[]    = "mute";
diff --git a/drivers/media/video/saa7134/saa7134-core.c b/drivers/media/video/saa7134/saa7134-core.c
index 0bdbd99d0ae6..d4be1fd20a36 100644
--- a/drivers/media/video/saa7134/saa7134-core.c
+++ b/drivers/media/video/saa7134/saa7134-core.c
@@ -95,77 +95,6 @@ int (*dmasound_exit)(struct saa7134_dev *dev);
 #define dprintk(fmt, arg...)	if (core_debug) \
 	printk(KERN_DEBUG "%s/core: " fmt, dev->name , ## arg)
 
-/* ------------------------------------------------------------------ */
-/* debug help functions                                               */
-
-static const char *v4l1_ioctls[] = {
-	"0", "GCAP", "GCHAN", "SCHAN", "GTUNER", "STUNER", "GPICT", "SPICT",
-	"CCAPTURE", "GWIN", "SWIN", "GFBUF", "SFBUF", "KEY", "GFREQ",
-	"SFREQ", "GAUDIO", "SAUDIO", "SYNC", "MCAPTURE", "GMBUF", "GUNIT",
-	"GCAPTURE", "SCAPTURE", "SPLAYMODE", "SWRITEMODE", "GPLAYINFO",
-	"SMICROCODE", "GVBIFMT", "SVBIFMT" };
-#define V4L1_IOCTLS ARRAY_SIZE(v4l1_ioctls)
-
-static const char *v4l2_ioctls[] = {
-	"QUERYCAP", "1", "ENUM_PIXFMT", "ENUM_FBUFFMT", "G_FMT", "S_FMT",
-	"G_COMP", "S_COMP", "REQBUFS", "QUERYBUF", "G_FBUF", "S_FBUF",
-	"G_WIN", "S_WIN", "PREVIEW", "QBUF", "16", "DQBUF", "STREAMON",
-	"STREAMOFF", "G_PERF", "G_PARM", "S_PARM", "G_STD", "S_STD",
-	"ENUMSTD", "ENUMINPUT", "G_CTRL", "S_CTRL", "G_TUNER", "S_TUNER",
-	"G_FREQ", "S_FREQ", "G_AUDIO", "S_AUDIO", "35", "QUERYCTRL",
-	"QUERYMENU", "G_INPUT", "S_INPUT", "ENUMCVT", "41", "42", "43",
-	"44", "45",  "G_OUTPUT", "S_OUTPUT", "ENUMOUTPUT", "G_AUDOUT",
-	"S_AUDOUT", "ENUMFX", "G_EFFECT", "S_EFFECT", "G_MODULATOR",
-	"S_MODULATOR"
-};
-#define V4L2_IOCTLS ARRAY_SIZE(v4l2_ioctls)
-
-static const char *osspcm_ioctls[] = {
-	"RESET", "SYNC", "SPEED", "STEREO", "GETBLKSIZE", "SETFMT",
-	"CHANNELS", "?", "POST", "SUBDIVIDE", "SETFRAGMENT", "GETFMTS",
-	"GETOSPACE", "GETISPACE", "NONBLOCK", "GETCAPS", "GET/SETTRIGGER",
-	"GETIPTR", "GETOPTR", "MAPINBUF", "MAPOUTBUF", "SETSYNCRO",
-	"SETDUPLEX", "GETODELAY"
-};
-#define OSSPCM_IOCTLS ARRAY_SIZE(v4l2_ioctls)
-
-void saa7134_print_ioctl(char *name, unsigned int cmd)
-{
-	char *dir;
-
-	switch (_IOC_DIR(cmd)) {
-	case _IOC_NONE:              dir = "--"; break;
-	case _IOC_READ:              dir = "r-"; break;
-	case _IOC_WRITE:             dir = "-w"; break;
-	case _IOC_READ | _IOC_WRITE: dir = "rw"; break;
-	default:                     dir = "??"; break;
-	}
-	switch (_IOC_TYPE(cmd)) {
-	case 'v':
-		printk(KERN_DEBUG "%s: ioctl 0x%08x (v4l1, %s, VIDIOC%s)\n",
-		       name, cmd, dir, (_IOC_NR(cmd) < V4L1_IOCTLS) ?
-		       v4l1_ioctls[_IOC_NR(cmd)] : "???");
-		break;
-	case 'V':
-		printk(KERN_DEBUG "%s: ioctl 0x%08x (v4l2, %s, VIDIOC_%s)\n",
-		       name, cmd, dir, (_IOC_NR(cmd) < V4L2_IOCTLS) ?
-		       v4l2_ioctls[_IOC_NR(cmd)] : "???");
-		break;
-	case 'P':
-		printk(KERN_DEBUG "%s: ioctl 0x%08x (oss dsp, %s, SNDCTL_DSP_%s)\n",
-		       name, cmd, dir, (_IOC_NR(cmd) < OSSPCM_IOCTLS) ?
-		       osspcm_ioctls[_IOC_NR(cmd)] : "???");
-		break;
-	case 'M':
-		printk(KERN_DEBUG "%s: ioctl 0x%08x (oss mixer, %s, #%d)\n",
-		       name, cmd, dir, _IOC_NR(cmd));
-		break;
-	default:
-		printk(KERN_DEBUG "%s: ioctl 0x%08x (???, %s, #%d)\n",
-		       name, cmd, dir, _IOC_NR(cmd));
-	}
-}
-
 void saa7134_track_gpio(struct saa7134_dev *dev, char *msg)
 {
 	unsigned long mode,status;
@@ -1173,7 +1102,6 @@ module_exit(saa7134_fini);
 
 /* ----------------------------------------------------------- */
 
-EXPORT_SYMBOL(saa7134_print_ioctl);
 EXPORT_SYMBOL(saa7134_i2c_call_clients);
 EXPORT_SYMBOL(saa7134_devlist);
 EXPORT_SYMBOL(saa7134_boards);
diff --git a/drivers/media/video/saa7134/saa7134-dvb.c b/drivers/media/video/saa7134/saa7134-dvb.c
index e016480c3468..399f9952596c 100644
--- a/drivers/media/video/saa7134/saa7134-dvb.c
+++ b/drivers/media/video/saa7134/saa7134-dvb.c
@@ -31,6 +31,7 @@
 
 #include "saa7134-reg.h"
 #include "saa7134.h"
+#include <media/v4l2-common.h>
 
 #ifdef HAVE_MT352
 # include "mt352.h"
diff --git a/drivers/media/video/saa7134/saa7134-empress.c b/drivers/media/video/saa7134/saa7134-empress.c
index 575f3e835f91..bd4c389d4c37 100644
--- a/drivers/media/video/saa7134/saa7134-empress.c
+++ b/drivers/media/video/saa7134/saa7134-empress.c
@@ -29,6 +29,7 @@
 #include "saa7134.h"
 
 #include <media/saa6752hs.h>
+#include <media/v4l2-common.h>
 
 /* ------------------------------------------------------------------ */
 
@@ -163,7 +164,7 @@ static int ts_do_ioctl(struct inode *inode, struct file *file,
 	struct saa7134_dev *dev = file->private_data;
 
 	if (debug > 1)
-		saa7134_print_ioctl(dev->name,cmd);
+		v4l_print_ioctl(dev->name,cmd);
 	switch (cmd) {
 	case VIDIOC_QUERYCAP:
 	{
diff --git a/drivers/media/video/saa7134/saa7134-i2c.c b/drivers/media/video/saa7134/saa7134-i2c.c
index 7283caa0484b..6162550c4136 100644
--- a/drivers/media/video/saa7134/saa7134-i2c.c
+++ b/drivers/media/video/saa7134/saa7134-i2c.c
@@ -30,6 +30,7 @@
 
 #include "saa7134-reg.h"
 #include "saa7134.h"
+#include <media/v4l2-common.h>
 
 /* ----------------------------------------------------------- */
 
diff --git a/drivers/media/video/saa7134/saa7134-oss.c b/drivers/media/video/saa7134/saa7134-oss.c
index 8badd2a9cb2f..7448e386a804 100644
--- a/drivers/media/video/saa7134/saa7134-oss.c
+++ b/drivers/media/video/saa7134/saa7134-oss.c
@@ -373,6 +373,42 @@ static ssize_t dsp_write(struct file *file, const char __user *buffer,
 	return -EINVAL;
 }
 
+static const char *osspcm_ioctls[] = {
+	"RESET", "SYNC", "SPEED", "STEREO", "GETBLKSIZE", "SETFMT",
+	"CHANNELS", "?", "POST", "SUBDIVIDE", "SETFRAGMENT", "GETFMTS",
+	"GETOSPACE", "GETISPACE", "NONBLOCK", "GETCAPS", "GET/SETTRIGGER",
+	"GETIPTR", "GETOPTR", "MAPINBUF", "MAPOUTBUF", "SETSYNCRO",
+	"SETDUPLEX", "GETODELAY"
+};
+#define OSSPCM_IOCTLS ARRAY_SIZE(osspcm_ioctls)
+
+static void saa7134_oss_print_ioctl(char *name, unsigned int cmd)
+{
+	char *dir;
+
+	switch (_IOC_DIR(cmd)) {
+	case _IOC_NONE:              dir = "--"; break;
+	case _IOC_READ:              dir = "r-"; break;
+	case _IOC_WRITE:             dir = "-w"; break;
+	case _IOC_READ | _IOC_WRITE: dir = "rw"; break;
+	default:                     dir = "??"; break;
+	}
+	switch (_IOC_TYPE(cmd)) {
+	case 'P':
+		printk(KERN_DEBUG "%s: ioctl 0x%08x (oss dsp, %s, SNDCTL_DSP_%s)\n",
+		       name, cmd, dir, (_IOC_NR(cmd) < OSSPCM_IOCTLS) ?
+		       osspcm_ioctls[_IOC_NR(cmd)] : "???");
+		break;
+	case 'M':
+		printk(KERN_DEBUG "%s: ioctl 0x%08x (oss mixer, %s, #%d)\n",
+		       name, cmd, dir, _IOC_NR(cmd));
+		break;
+	default:
+		printk(KERN_DEBUG "%s: ioctl 0x%08x (???, %s, #%d)\n",
+		       name, cmd, dir, _IOC_NR(cmd));
+	}
+}
+
 static int dsp_ioctl(struct inode *inode, struct file *file,
 		     unsigned int cmd, unsigned long arg)
 {
@@ -382,7 +418,7 @@ static int dsp_ioctl(struct inode *inode, struct file *file,
 	int val = 0;
 
 	if (debug > 1)
-		saa7134_print_ioctl(dev->name,cmd);
+		saa7134_oss_print_ioctl(dev->name,cmd);
 	switch (cmd) {
 	case OSS_GETVERSION:
 		return put_user(SOUND_VERSION, p);
@@ -678,7 +714,7 @@ static int mixer_ioctl(struct inode *inode, struct file *file,
 	int __user *p = argp;
 
 	if (debug > 1)
-		saa7134_print_ioctl(dev->name,cmd);
+		saa7134_oss_print_ioctl(dev->name,cmd);
 	switch (cmd) {
 	case OSS_GETVERSION:
 		return put_user(SOUND_VERSION, p);
diff --git a/drivers/media/video/saa7134/saa7134-video.c b/drivers/media/video/saa7134/saa7134-video.c
index 9b9e1e7f05ef..adfa8fe49a11 100644
--- a/drivers/media/video/saa7134/saa7134-video.c
+++ b/drivers/media/video/saa7134/saa7134-video.c
@@ -29,6 +29,7 @@
 
 #include "saa7134-reg.h"
 #include "saa7134.h"
+#include <media/v4l2-common.h>
 
 /* Include V4L1 specific functions. Should be removed soon */
 #include <linux/videodev.h>
@@ -1689,7 +1690,7 @@ static int video_do_ioctl(struct inode *inode, struct file *file,
 	int err;
 
 	if (video_debug > 1)
-		saa7134_print_ioctl(dev->name,cmd);
+		v4l_print_ioctl(dev->name,cmd);
 
 	switch (cmd) {
 	case VIDIOC_S_CTRL:
@@ -2142,7 +2143,7 @@ static int radio_do_ioctl(struct inode *inode, struct file *file,
 	struct saa7134_dev *dev = fh->dev;
 
 	if (video_debug > 1)
-		saa7134_print_ioctl(dev->name,cmd);
+		v4l_print_ioctl(dev->name,cmd);
 	switch (cmd) {
 	case VIDIOC_QUERYCAP:
 	{
diff --git a/drivers/media/video/saa7134/saa7134.h b/drivers/media/video/saa7134/saa7134.h
index 2f28e83102fd..18978a484ddb 100644
--- a/drivers/media/video/saa7134/saa7134.h
+++ b/drivers/media/video/saa7134/saa7134.h
@@ -546,7 +546,6 @@ struct saa7134_dev {
 
 extern struct list_head  saa7134_devlist;
 
-void saa7134_print_ioctl(char *name, unsigned int cmd);
 void saa7134_track_gpio(struct saa7134_dev *dev, char *msg);
 
 #define SAA7134_PGTABLE_SIZE 4096
diff --git a/drivers/media/video/tda9887.c b/drivers/media/video/tda9887.c
index 9ae43a8cea52..f64baa4b0025 100644
--- a/drivers/media/video/tda9887.c
+++ b/drivers/media/video/tda9887.c
@@ -9,7 +9,7 @@
 #include <linux/slab.h>
 #include <linux/delay.h>
 
-#include <media/audiochip.h>
+#include <media/v4l2-common.h>
 #include <media/tuner.h>
 
 
diff --git a/drivers/media/video/tuner-core.c b/drivers/media/video/tuner-core.c
index df994311251e..fd18a882668e 100644
--- a/drivers/media/video/tuner-core.c
+++ b/drivers/media/video/tuner-core.c
@@ -20,6 +20,7 @@
 #include <linux/init.h>
 
 #include <media/tuner.h>
+#include <media/v4l2-common.h>
 #include <media/audiochip.h>
 
 #include "msp3400.h"
@@ -545,6 +546,9 @@ static int tuner_command(struct i2c_client *client, unsigned int cmd, void *arg)
 {
 	struct tuner *t = i2c_get_clientdata(client);
 
+	if (tuner_debug>1)
+		v4l_i2c_print_ioctl(&(t->i2c),cmd);
+
 	switch (cmd) {
 	/* --- configuration --- */
 	case TUNER_SET_TYPE_ADDR:
@@ -575,9 +579,6 @@ static int tuner_command(struct i2c_client *client, unsigned int cmd, void *arg)
 		/* Should be implemented, since bttv calls it */
 		tuner_dbg("VIDIOCSAUDIO not implemented.\n");
 
-		break;
-	case MSP_SET_MATRIX:
-	case TDA9887_SET_CONFIG:
 		break;
 	/* --- v4l ioctls --- */
 	/* take care: bttv does userspace copying, we'll get a
@@ -764,11 +765,6 @@ static int tuner_command(struct i2c_client *client, unsigned int cmd, void *arg)
 	case VIDIOC_LOG_STATUS:
 		tuner_status(client);
 		break;
-	default:
-		tuner_dbg("Unimplemented IOCTL 0x%08x(dir=%d,tp='%c',nr=%d,sz=%d)\n",
-					 cmd, _IOC_DIR(cmd), _IOC_TYPE(cmd),
-					_IOC_NR(cmd), _IOC_SIZE(cmd));
-		break;
 	}
 
 	return 0;
diff --git a/drivers/media/video/tvaudio.c b/drivers/media/video/tvaudio.c
index ed6a843dd34a..fec620073aa3 100644
--- a/drivers/media/video/tvaudio.c
+++ b/drivers/media/video/tvaudio.c
@@ -31,6 +31,7 @@
 #include <linux/smp_lock.h>
 
 #include <media/audiochip.h>
+#include <media/v4l2-common.h>
 
 #include "tvaudio.h"
 
diff --git a/drivers/media/video/v4l2-common.c b/drivers/media/video/v4l2-common.c
index 62a7d636ef11..5dbd7c1b362a 100644
--- a/drivers/media/video/v4l2-common.c
+++ b/drivers/media/video/v4l2-common.c
@@ -58,6 +58,8 @@
 #include <asm/pgtable.h>
 #include <asm/io.h>
 #include <asm/div64.h>
+#include <linux/video_decoder.h>
+#include <media/v4l2-common.h>
 
 #ifdef CONFIG_KMOD
 #include <linux/kmod.h>
@@ -190,55 +192,174 @@ char *v4l2_type_names[] = {
 	[V4L2_BUF_TYPE_VBI_OUTPUT]    = "vbi-out",
 };
 
-char *v4l2_ioctl_names[256] = {
-	[0 ... 255]                      = "UNKNOWN",
-	[_IOC_NR(VIDIOC_QUERYCAP)]       = "VIDIOC_QUERYCAP",
-	[_IOC_NR(VIDIOC_RESERVED)]       = "VIDIOC_RESERVED",
-	[_IOC_NR(VIDIOC_ENUM_FMT)]       = "VIDIOC_ENUM_FMT",
-	[_IOC_NR(VIDIOC_G_FMT)]          = "VIDIOC_G_FMT",
-	[_IOC_NR(VIDIOC_S_FMT)]          = "VIDIOC_S_FMT",
-	[_IOC_NR(VIDIOC_REQBUFS)]        = "VIDIOC_REQBUFS",
-	[_IOC_NR(VIDIOC_QUERYBUF)]       = "VIDIOC_QUERYBUF",
-	[_IOC_NR(VIDIOC_G_FBUF)]         = "VIDIOC_G_FBUF",
-	[_IOC_NR(VIDIOC_S_FBUF)]         = "VIDIOC_S_FBUF",
-	[_IOC_NR(VIDIOC_OVERLAY)]        = "VIDIOC_OVERLAY",
-	[_IOC_NR(VIDIOC_QBUF)]           = "VIDIOC_QBUF",
-	[_IOC_NR(VIDIOC_DQBUF)]          = "VIDIOC_DQBUF",
-	[_IOC_NR(VIDIOC_STREAMON)]       = "VIDIOC_STREAMON",
-	[_IOC_NR(VIDIOC_STREAMOFF)]      = "VIDIOC_STREAMOFF",
-	[_IOC_NR(VIDIOC_G_PARM)]         = "VIDIOC_G_PARM",
-	[_IOC_NR(VIDIOC_S_PARM)]         = "VIDIOC_S_PARM",
-	[_IOC_NR(VIDIOC_G_STD)]          = "VIDIOC_G_STD",
-	[_IOC_NR(VIDIOC_S_STD)]          = "VIDIOC_S_STD",
-	[_IOC_NR(VIDIOC_ENUMSTD)]        = "VIDIOC_ENUMSTD",
-	[_IOC_NR(VIDIOC_ENUMINPUT)]      = "VIDIOC_ENUMINPUT",
-	[_IOC_NR(VIDIOC_G_CTRL)]         = "VIDIOC_G_CTRL",
-	[_IOC_NR(VIDIOC_S_CTRL)]         = "VIDIOC_S_CTRL",
-	[_IOC_NR(VIDIOC_G_TUNER)]        = "VIDIOC_G_TUNER",
-	[_IOC_NR(VIDIOC_S_TUNER)]        = "VIDIOC_S_TUNER",
-	[_IOC_NR(VIDIOC_G_AUDIO)]        = "VIDIOC_G_AUDIO",
-	[_IOC_NR(VIDIOC_S_AUDIO)]        = "VIDIOC_S_AUDIO",
-	[_IOC_NR(VIDIOC_QUERYCTRL)]      = "VIDIOC_QUERYCTRL",
-	[_IOC_NR(VIDIOC_QUERYMENU)]      = "VIDIOC_QUERYMENU",
-	[_IOC_NR(VIDIOC_G_INPUT)]        = "VIDIOC_G_INPUT",
-	[_IOC_NR(VIDIOC_S_INPUT)]        = "VIDIOC_S_INPUT",
-	[_IOC_NR(VIDIOC_G_OUTPUT)]       = "VIDIOC_G_OUTPUT",
-	[_IOC_NR(VIDIOC_S_OUTPUT)]       = "VIDIOC_S_OUTPUT",
-	[_IOC_NR(VIDIOC_ENUMOUTPUT)]     = "VIDIOC_ENUMOUTPUT",
-	[_IOC_NR(VIDIOC_G_AUDOUT)]       = "VIDIOC_G_AUDOUT",
-	[_IOC_NR(VIDIOC_S_AUDOUT)]       = "VIDIOC_S_AUDOUT",
-	[_IOC_NR(VIDIOC_G_MODULATOR)]    = "VIDIOC_G_MODULATOR",
-	[_IOC_NR(VIDIOC_S_MODULATOR)]    = "VIDIOC_S_MODULATOR",
-	[_IOC_NR(VIDIOC_G_FREQUENCY)]    = "VIDIOC_G_FREQUENCY",
-	[_IOC_NR(VIDIOC_S_FREQUENCY)]    = "VIDIOC_S_FREQUENCY",
-	[_IOC_NR(VIDIOC_CROPCAP)]        = "VIDIOC_CROPCAP",
-	[_IOC_NR(VIDIOC_G_CROP)]         = "VIDIOC_G_CROP",
-	[_IOC_NR(VIDIOC_S_CROP)]         = "VIDIOC_S_CROP",
-	[_IOC_NR(VIDIOC_G_JPEGCOMP)]     = "VIDIOC_G_JPEGCOMP",
-	[_IOC_NR(VIDIOC_S_JPEGCOMP)]     = "VIDIOC_S_JPEGCOMP",
-	[_IOC_NR(VIDIOC_QUERYSTD)]       = "VIDIOC_QUERYSTD",
-	[_IOC_NR(VIDIOC_TRY_FMT)]        = "VIDIOC_TRY_FMT",
+/* ------------------------------------------------------------------ */
+/* debug help functions                                               */
+
+#ifdef HAVE_V4L1
+static const char *v4l1_ioctls[] = {
+	[_IOC_NR(VIDIOCGCAP)]       = "VIDIOCGCAP",
+	[_IOC_NR(VIDIOCGCHAN)]      = "VIDIOCGCHAN",
+	[_IOC_NR(VIDIOCSCHAN)]      = "VIDIOCSCHAN",
+	[_IOC_NR(VIDIOCGTUNER)]     = "VIDIOCGTUNER",
+	[_IOC_NR(VIDIOCSTUNER)]     = "VIDIOCSTUNER",
+	[_IOC_NR(VIDIOCGPICT)]      = "VIDIOCGPICT",
+	[_IOC_NR(VIDIOCSPICT)]      = "VIDIOCSPICT",
+	[_IOC_NR(VIDIOCCAPTURE)]    = "VIDIOCCAPTURE",
+	[_IOC_NR(VIDIOCGWIN)]       = "VIDIOCGWIN",
+	[_IOC_NR(VIDIOCSWIN)]       = "VIDIOCSWIN",
+	[_IOC_NR(VIDIOCGFBUF)]      = "VIDIOCGFBUF",
+	[_IOC_NR(VIDIOCSFBUF)]      = "VIDIOCSFBUF",
+	[_IOC_NR(VIDIOCKEY)]        = "VIDIOCKEY",
+	[_IOC_NR(VIDIOCGFREQ)]      = "VIDIOCGFREQ",
+	[_IOC_NR(VIDIOCSFREQ)]      = "VIDIOCSFREQ",
+	[_IOC_NR(VIDIOCGAUDIO)]     = "VIDIOCGAUDIO",
+	[_IOC_NR(VIDIOCSAUDIO)]     = "VIDIOCSAUDIO",
+	[_IOC_NR(VIDIOCSYNC)]       = "VIDIOCSYNC",
+	[_IOC_NR(VIDIOCMCAPTURE)]   = "VIDIOCMCAPTURE",
+	[_IOC_NR(VIDIOCGMBUF)]      = "VIDIOCGMBUF",
+	[_IOC_NR(VIDIOCGUNIT)]      = "VIDIOCGUNIT",
+	[_IOC_NR(VIDIOCGCAPTURE)]   = "VIDIOCGCAPTURE",
+	[_IOC_NR(VIDIOCSCAPTURE)]   = "VIDIOCSCAPTURE",
+	[_IOC_NR(VIDIOCSPLAYMODE)]  = "VIDIOCSPLAYMODE",
+	[_IOC_NR(VIDIOCSWRITEMODE)] = "VIDIOCSWRITEMODE",
+	[_IOC_NR(VIDIOCGPLAYINFO)]  = "VIDIOCGPLAYINFO",
+	[_IOC_NR(VIDIOCSMICROCODE)] = "VIDIOCSMICROCODE",
+	[_IOC_NR(VIDIOCGVBIFMT)]    = "VIDIOCGVBIFMT",
+	[_IOC_NR(VIDIOCSVBIFMT)]    = "VIDIOCSVBIFMT"
 };
+#define V4L1_IOCTLS ARRAY_SIZE(v4l1_ioctls)
+#endif
+
+static const char *v4l2_ioctls[] = {
+	[_IOC_NR(VIDIOC_QUERYCAP)]         = "VIDIOC_QUERYCAP",
+	[_IOC_NR(VIDIOC_RESERVED)]         = "VIDIOC_RESERVED",
+	[_IOC_NR(VIDIOC_ENUM_FMT)]         = "VIDIOC_ENUM_FMT",
+	[_IOC_NR(VIDIOC_G_FMT)]            = "VIDIOC_G_FMT",
+	[_IOC_NR(VIDIOC_S_FMT)]            = "VIDIOC_S_FMT",
+	[_IOC_NR(VIDIOC_G_MPEGCOMP)]       = "VIDIOC_G_MPEGCOMP",
+	[_IOC_NR(VIDIOC_S_MPEGCOMP)]       = "VIDIOC_S_MPEGCOMP",
+	[_IOC_NR(VIDIOC_REQBUFS)]          = "VIDIOC_REQBUFS",
+	[_IOC_NR(VIDIOC_QUERYBUF)]         = "VIDIOC_QUERYBUF",
+	[_IOC_NR(VIDIOC_G_FBUF)]           = "VIDIOC_G_FBUF",
+	[_IOC_NR(VIDIOC_S_FBUF)]           = "VIDIOC_S_FBUF",
+	[_IOC_NR(VIDIOC_OVERLAY)]          = "VIDIOC_OVERLAY",
+	[_IOC_NR(VIDIOC_QBUF)]             = "VIDIOC_QBUF",
+	[_IOC_NR(VIDIOC_DQBUF)]            = "VIDIOC_DQBUF",
+	[_IOC_NR(VIDIOC_STREAMON)]         = "VIDIOC_STREAMON",
+	[_IOC_NR(VIDIOC_STREAMOFF)]        = "VIDIOC_STREAMOFF",
+	[_IOC_NR(VIDIOC_G_PARM)]           = "VIDIOC_G_PARM",
+	[_IOC_NR(VIDIOC_S_PARM)]           = "VIDIOC_S_PARM",
+	[_IOC_NR(VIDIOC_G_STD)]            = "VIDIOC_G_STD",
+	[_IOC_NR(VIDIOC_S_STD)]            = "VIDIOC_S_STD",
+	[_IOC_NR(VIDIOC_ENUMSTD)]          = "VIDIOC_ENUMSTD",
+	[_IOC_NR(VIDIOC_ENUMINPUT)]        = "VIDIOC_ENUMINPUT",
+	[_IOC_NR(VIDIOC_G_CTRL)]           = "VIDIOC_G_CTRL",
+	[_IOC_NR(VIDIOC_S_CTRL)]           = "VIDIOC_S_CTRL",
+	[_IOC_NR(VIDIOC_G_TUNER)]          = "VIDIOC_G_TUNER",
+	[_IOC_NR(VIDIOC_S_TUNER)]          = "VIDIOC_S_TUNER",
+	[_IOC_NR(VIDIOC_G_AUDIO)]          = "VIDIOC_G_AUDIO",
+	[_IOC_NR(VIDIOC_S_AUDIO)]          = "VIDIOC_S_AUDIO",
+	[_IOC_NR(VIDIOC_QUERYCTRL)]        = "VIDIOC_QUERYCTRL",
+	[_IOC_NR(VIDIOC_QUERYMENU)]        = "VIDIOC_QUERYMENU",
+	[_IOC_NR(VIDIOC_G_INPUT)]          = "VIDIOC_G_INPUT",
+	[_IOC_NR(VIDIOC_S_INPUT)]          = "VIDIOC_S_INPUT",
+	[_IOC_NR(VIDIOC_G_OUTPUT)]         = "VIDIOC_G_OUTPUT",
+	[_IOC_NR(VIDIOC_S_OUTPUT)]         = "VIDIOC_S_OUTPUT",
+	[_IOC_NR(VIDIOC_ENUMOUTPUT)]       = "VIDIOC_ENUMOUTPUT",
+	[_IOC_NR(VIDIOC_G_AUDOUT)]         = "VIDIOC_G_AUDOUT",
+	[_IOC_NR(VIDIOC_S_AUDOUT)]         = "VIDIOC_S_AUDOUT",
+	[_IOC_NR(VIDIOC_G_MODULATOR)]      = "VIDIOC_G_MODULATOR",
+	[_IOC_NR(VIDIOC_S_MODULATOR)]      = "VIDIOC_S_MODULATOR",
+	[_IOC_NR(VIDIOC_G_FREQUENCY)]      = "VIDIOC_G_FREQUENCY",
+	[_IOC_NR(VIDIOC_S_FREQUENCY)]      = "VIDIOC_S_FREQUENCY",
+	[_IOC_NR(VIDIOC_CROPCAP)]          = "VIDIOC_CROPCAP",
+	[_IOC_NR(VIDIOC_G_CROP)]           = "VIDIOC_G_CROP",
+	[_IOC_NR(VIDIOC_S_CROP)]           = "VIDIOC_S_CROP",
+	[_IOC_NR(VIDIOC_G_JPEGCOMP)]       = "VIDIOC_G_JPEGCOMP",
+	[_IOC_NR(VIDIOC_S_JPEGCOMP)]       = "VIDIOC_S_JPEGCOMP",
+	[_IOC_NR(VIDIOC_QUERYSTD)]         = "VIDIOC_QUERYSTD",
+	[_IOC_NR(VIDIOC_TRY_FMT)]          = "VIDIOC_TRY_FMT",
+	[_IOC_NR(VIDIOC_ENUMAUDIO)]        = "VIDIOC_ENUMAUDIO",
+	[_IOC_NR(VIDIOC_ENUMAUDOUT)]       = "VIDIOC_ENUMAUDOUT",
+	[_IOC_NR(VIDIOC_G_PRIORITY)]       = "VIDIOC_G_PRIORITY",
+	[_IOC_NR(VIDIOC_S_PRIORITY)]       = "VIDIOC_S_PRIORITY",
+#if 1
+	[_IOC_NR(VIDIOC_G_SLICED_VBI_CAP)] = "VIDIOC_G_SLICED_VBI_CAP",
+#endif
+	[_IOC_NR(VIDIOC_LOG_STATUS)]       = "VIDIOC_LOG_STATUS"
+};
+#define V4L2_IOCTLS ARRAY_SIZE(v4l2_ioctls)
+
+static const char *v4l2_int_ioctls[] = {
+#ifdef HAVE_VIDEO_DECODER
+	[_IOC_NR(DECODER_GET_CAPABILITIES)]    = "DECODER_GET_CAPABILITIES",
+	[_IOC_NR(DECODER_GET_STATUS)]          = "DECODER_GET_STATUS",
+	[_IOC_NR(DECODER_SET_NORM)]            = "DECODER_SET_NORM",
+	[_IOC_NR(DECODER_SET_INPUT)]           = "DECODER_SET_INPUT",
+	[_IOC_NR(DECODER_SET_OUTPUT)]          = "DECODER_SET_OUTPUT",
+	[_IOC_NR(DECODER_ENABLE_OUTPUT)]       = "DECODER_ENABLE_OUTPUT",
+	[_IOC_NR(DECODER_SET_PICTURE)]         = "DECODER_SET_PICTURE",
+	[_IOC_NR(DECODER_SET_GPIO)]            = "DECODER_SET_GPIO",
+	[_IOC_NR(DECODER_INIT)]                = "DECODER_INIT",
+	[_IOC_NR(DECODER_SET_VBI_BYPASS)]      = "DECODER_SET_VBI_BYPASS",
+	[_IOC_NR(DECODER_DUMP)]                = "DECODER_DUMP",
+#endif
+	[_IOC_NR(AUDC_SET_RADIO)]              = "AUDC_SET_RADIO",
+	[_IOC_NR(AUDC_SET_INPUT)]              = "AUDC_SET_INPUT",
+
+	[_IOC_NR(TUNER_SET_TYPE_ADDR)]         = "TUNER_SET_TYPE_ADDR",
+	[_IOC_NR(TUNER_SET_STANDBY)]           = "TUNER_SET_STANDBY",
+	[_IOC_NR(TDA9887_SET_CONFIG)]          = "TDA9887_SET_CONFIG",
+
+	[_IOC_NR(VIDIOC_INT_S_REGISTER)]       = "VIDIOC_INT_S_REGISTER",
+	[_IOC_NR(VIDIOC_INT_G_REGISTER)]       = "VIDIOC_INT_G_REGISTER",
+	[_IOC_NR(VIDIOC_INT_RESET)]            = "VIDIOC_INT_RESET",
+	[_IOC_NR(VIDIOC_INT_AUDIO_CLOCK_FREQ)] = "VIDIOC_INT_AUDIO_CLOCK_FREQ",
+	[_IOC_NR(VIDIOC_INT_DECODE_VBI_LINE)]  = "VIDIOC_INT_DECODE_VBI_LINE",
+	[_IOC_NR(VIDIOC_INT_S_VBI_DATA)]       = "VIDIOC_INT_S_VBI_DATA",
+	[_IOC_NR(VIDIOC_INT_G_VBI_DATA)]       = "VIDIOC_INT_G_VBI_DATA",
+	[_IOC_NR(VIDIOC_INT_G_CHIP_IDENT)]     = "VIDIOC_INT_G_CHIP_IDENT",
+	[_IOC_NR(VIDIOC_INT_I2S_CLOCK_FREQ)]   = "VIDIOC_INT_I2S_CLOCK_FREQ"
+};
+#define V4L2_INT_IOCTLS ARRAY_SIZE(v4l2_int_ioctls)
+
+/* Common ioctl debug function. This function can be used by
+   external ioctl messages as well as internal V4L ioctl */
+void v4l_printk_ioctl(unsigned int cmd)
+{
+	char *dir;
+
+	switch (_IOC_DIR(cmd)) {
+	case _IOC_NONE:              dir = "--"; break;
+	case _IOC_READ:              dir = "r-"; break;
+	case _IOC_WRITE:             dir = "-w"; break;
+	case _IOC_READ | _IOC_WRITE: dir = "rw"; break;
+	default:                     dir = "*ERR*"; break;
+	}
+	switch (_IOC_TYPE(cmd)) {
+	case 'd':
+		printk("v4l2_int ioctl %s, dir=%s (0x%08x)\n",
+		       (_IOC_NR(cmd) < V4L2_INT_IOCTLS) ?
+		       v4l2_int_ioctls[_IOC_NR(cmd)] : "UNKNOWN", dir, cmd);
+		break;
+#ifdef HAVE_V4L1
+	case 'v':
+		printk("v4l1 ioctl %s, dir=%s (0x%08x)\n",
+		       (_IOC_NR(cmd) < V4L1_IOCTLS) ?
+		       v4l1_ioctls[_IOC_NR(cmd)] : "UNKNOWN", dir, cmd);
+		break;
+#endif
+	case 'V':
+		printk("v4l2 ioctl %s, dir=%s (0x%08x)\n",
+		       (_IOC_NR(cmd) < V4L2_IOCTLS) ?
+		       v4l2_ioctls[_IOC_NR(cmd)] : "UNKNOWN", dir, cmd);
+		break;
+
+	default:
+		printk("unknown ioctl '%c', dir=%s, #%d (0x%08x)\n",
+		       _IOC_TYPE(cmd), dir, _IOC_NR(cmd), cmd);
+	}
+}
 
 /* ----------------------------------------------------------------- */
 
@@ -253,7 +374,7 @@ EXPORT_SYMBOL(v4l2_prio_check);
 
 EXPORT_SYMBOL(v4l2_field_names);
 EXPORT_SYMBOL(v4l2_type_names);
-EXPORT_SYMBOL(v4l2_ioctl_names);
+EXPORT_SYMBOL(v4l_printk_ioctl);
 
 /*
  * Local variables:
diff --git a/include/linux/video_decoder.h b/include/linux/video_decoder.h
index 0e9e48b83e3b..121e26da2c18 100644
--- a/include/linux/video_decoder.h
+++ b/include/linux/video_decoder.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_VIDEO_DECODER_H
 #define _LINUX_VIDEO_DECODER_H
 
+#define HAVE_VIDEO_DECODER 1
+
 struct video_decoder_capability { /* this name is too long */
 	__u32	flags;
 #define	VIDEO_DECODER_PAL	1	/* can decode PAL signal */
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 6ac7c1f7902f..ce40675324bd 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -1116,7 +1116,6 @@ int v4l2_prio_check(struct v4l2_prio_state *global, enum v4l2_priority *local);
 /* names for fancy debug output */
 extern char *v4l2_field_names[];
 extern char *v4l2_type_names[];
-extern char *v4l2_ioctl_names[];
 
 /*  Compatibility layer interface  --  v4l1-compat module */
 typedef int (*v4l2_kioctl)(struct inode *inode, struct file *file,
diff --git a/include/media/audiochip.h b/include/media/audiochip.h
index 411f09fc4574..295d256ee811 100644
--- a/include/media/audiochip.h
+++ b/include/media/audiochip.h
@@ -23,11 +23,6 @@ enum audiochip {
 
 /* ---------------------------------------------------------------------- */
 
-/* v4l device was opened in Radio mode */
-#define AUDC_SET_RADIO        _IO('m',2)
-/* select from TV,radio,extern,MUTE */
-#define AUDC_SET_INPUT        _IOW('m',17,int)
-
 /* audio inputs */
 #define AUDIO_TUNER        0x00
 #define AUDIO_RADIO        0x01
diff --git a/include/media/tuner.h b/include/media/tuner.h
index c5f034ecb002..6a8ef189e5fa 100644
--- a/include/media/tuner.h
+++ b/include/media/tuner.h
@@ -115,10 +115,6 @@
 #define TUNER_PHILIPS_TUV1236D		68	/* ATI HDTV Wonder */
 #define TUNER_TNF_5335MF                69	/* Sabrent Bt848   */
 
-#define TUNER_SET_TYPE_ADDR          _IOW('T',3,int)
-#define TUNER_SET_STANDBY            _IOW('T',4,int)
-#define TDA9887_SET_CONFIG           _IOW('t',5,int)
-
 /* tv card specific */
 #define TDA9887_PRESENT 		(1<<0)
 #define TDA9887_PORT1_INACTIVE 		(1<<1)
diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h
index 90248d29ed0a..9ee616261d66 100644
--- a/include/media/v4l2-common.h
+++ b/include/media/v4l2-common.h
@@ -63,6 +63,20 @@ enum v4l2_chip_ident {
 	V4L2_IDENT_CX25843 = 243,
 };
 
+/* audio ioctls */
+/* v4l device was opened in Radio mode */
+#define AUDC_SET_RADIO        _IO('d',88)
+/* select from TV,radio,extern,MUTE */
+#define AUDC_SET_INPUT        _IOW('d',89,int)
+
+/* tuner ioctls */
+/* Sets tuner type and its I2C addr */
+#define TUNER_SET_TYPE_ADDR          _IOW('d',90,int)
+/* Puts tuner on powersaving state, disabling it, except for i2c */
+#define TUNER_SET_STANDBY            _IOW('d',91,int)
+/* Sets tda9887 specific stuff, like port1, port2 and qss */
+#define TDA9887_SET_CONFIG           _IOW('d',92,int)
+
 /* only implemented if CONFIG_VIDEO_ADV_DEBUG is defined */
 #define	VIDIOC_INT_S_REGISTER 		_IOR ('d', 100, struct v4l2_register)
 #define	VIDIOC_INT_G_REGISTER 		_IOWR('d', 101, struct v4l2_register)
@@ -108,5 +122,16 @@ enum v4l2_chip_ident {
    If the frequency is not supported, then -EINVAL is returned. */
 #define VIDIOC_INT_I2S_CLOCK_FREQ 	_IOW ('d', 108, u32)
 
+/* Prints used ioctl */
+extern void v4l_printk_ioctl(unsigned int cmd);
+
+#define v4l_print_ioctl(name,cmd) do {\
+	printk(KERN_DEBUG "%s: ", name); \
+	v4l_printk_ioctl(cmd); } while (0)
+
+#define v4l_i2c_print_ioctl(client,cmd) do {\
+	printk(KERN_DEBUG "%s %d-%04x: ", (client)->driver->name, \
+			i2c_adapter_id((client)->adapter),(client)->addr); \
+	v4l_printk_ioctl(cmd); } while (0)
 
 #endif /* V4L2_COMMON_H_ */
-- 
cgit v1.2.3-71-gd317


From 677517771b7b6efaf8617e70f655b16f3cafcc9b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 8 Jan 2006 22:19:16 +0300
Subject: [PATCH] rcu: uninline __rcu_pending()

__rcu_pending() is rather fat and called twice from rcu_pending().

rcu_pending() has multiple callers, and not that small too.

This patch uninlines both of them.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/rcupdate.h | 31 +------------------------------
 kernel/rcupdate.c        | 30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 51747cd88d1a..a1d26cb28925 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -125,36 +125,7 @@ static inline void rcu_bh_qsctr_inc(int cpu)
 	rdp->passed_quiesc = 1;
 }
 
-static inline int __rcu_pending(struct rcu_ctrlblk *rcp,
-						struct rcu_data *rdp)
-{
-	/* This cpu has pending rcu entries and the grace period
-	 * for them has completed.
-	 */
-	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
-		return 1;
-
-	/* This cpu has no pending entries, but there are new entries */
-	if (!rdp->curlist && rdp->nxtlist)
-		return 1;
-
-	/* This cpu has finished callbacks to invoke */
-	if (rdp->donelist)
-		return 1;
-
-	/* The rcu core waits for a quiescent state from the cpu */
-	if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
-		return 1;
-
-	/* nothing to do */
-	return 0;
-}
-
-static inline int rcu_pending(int cpu)
-{
-	return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
-		__rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
-}
+extern int rcu_pending(int cpu);
 
 /**
  * rcu_read_lock - mark the beginning of an RCU read-side critical section.
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 30b0bba03859..ccc45d49ce71 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -429,6 +429,36 @@ static void rcu_process_callbacks(unsigned long unused)
 				&__get_cpu_var(rcu_bh_data));
 }
 
+static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+{
+	/* This cpu has pending rcu entries and the grace period
+	 * for them has completed.
+	 */
+	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
+		return 1;
+
+	/* This cpu has no pending entries, but there are new entries */
+	if (!rdp->curlist && rdp->nxtlist)
+		return 1;
+
+	/* This cpu has finished callbacks to invoke */
+	if (rdp->donelist)
+		return 1;
+
+	/* The rcu core waits for a quiescent state from the cpu */
+	if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
+		return 1;
+
+	/* nothing to do */
+	return 0;
+}
+
+int rcu_pending(int cpu)
+{
+	return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
+		__rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
+}
+
 void rcu_check_callbacks(int cpu, int user)
 {
 	if (user || 
-- 
cgit v1.2.3-71-gd317


From ffeff788d6b10e342b4a887f28d339dfec1737f6 Mon Sep 17 00:00:00 2001
From: Kristen Accardi <kristen.c.accardi@intel.com>
Date: Wed, 2 Nov 2005 16:24:32 -0800
Subject: [PATCH] pci: store PCI_INTERRUPT_PIN in pci_dev

Store the value of the INTERRUPT_PIN in the pci_dev structure
so that it can be retrieved later.

Signed-off-by: Kristen Carlson Accardi <kristen.c.accardi@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/pci/probe.c | 1 +
 include/linux/pci.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index fce2cb2112d8..2f82e63323f3 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -571,6 +571,7 @@ static void pci_read_irq(struct pci_dev *dev)
 	unsigned char irq;
 
 	pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &irq);
+	dev->pin = irq;
 	if (irq)
 		pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);
 	dev->irq = irq;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index de690ca73d58..b32f70fe2dbd 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -98,6 +98,7 @@ struct pci_dev {
 	unsigned int	class;		/* 3 bytes: (base,sub,prog-if) */
 	u8		hdr_type;	/* PCI header type (`multi' flag masked out) */
 	u8		rom_base_reg;	/* which config register controls the ROM */
+	u8		pin;  		/* which interrupt pin this device uses */
 
 	struct pci_driver *driver;	/* which driver has allocated this device */
 	u64		dma_mask;	/* Mask of the bits of bus address this
-- 
cgit v1.2.3-71-gd317


From ac7dc65ac0b945270548414491efa9c4357417d9 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 13 Dec 2005 18:09:16 +1100
Subject: [PATCH] PCI: Export pci_cfg_space_size

The powerpc PCI code sets up the PCI tree without doing config space
accesses in most cases, from the firmware tree. However, it still wants
to call pci_cfg_space_size() under some conditions, thus it needs to
be made non-static (though I don't see a point to export it to modules).

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/pci/probe.c | 2 +-
 include/linux/pci.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 3c9834d80850..adfad4fd6a13 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -717,7 +717,7 @@ static void pci_release_dev(struct device *dev)
  * reading the dword at 0x100 which must either be 0 or a valid extended
  * capability header.
  */
-static int pci_cfg_space_size(struct pci_dev *dev)
+int pci_cfg_space_size(struct pci_dev *dev)
 {
 	int pos;
 	u32 status;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b32f70fe2dbd..d0e003926744 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -449,6 +449,7 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev * dev, int max, int pass
 
 void pci_walk_bus(struct pci_bus *top, void (*cb)(struct pci_dev *, void *),
 		  void *userdata);
+int pci_cfg_space_size(struct pci_dev *dev);
 
 /* kmem_cache style wrapper around pci_alloc_consistent() */
 
-- 
cgit v1.2.3-71-gd317


From 392a1ce761bc3b3a5d642ee341c1ff082cbb71f0 Mon Sep 17 00:00:00 2001
From: linas <linas@austin.ibm.com>
Date: Wed, 16 Nov 2005 17:10:41 -0600
Subject: [PATCH] PCI Error Recovery: header file patch

Various PCI bus errors can be signaled by newer PCI controllers.
Recovering from those errors requires an infrastructure to notify
affected device drivers of the error, and a way of walking through a
reset sequence.  This patch adds a set of callbacks to be used by error
recovery routines to notify device drivers of the various stages of
recovery.

Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/pci.h | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index d0e003926744..0a44072383ec 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -78,6 +78,23 @@ typedef int __bitwise pci_power_t;
 #define PCI_UNKNOWN	((pci_power_t __force) 5)
 #define PCI_POWER_ERROR	((pci_power_t __force) -1)
 
+/** The pci_channel state describes connectivity between the CPU and
+ *  the pci device.  If some PCI bus between here and the pci device
+ *  has crashed or locked up, this info is reflected here.
+ */
+typedef unsigned int __bitwise pci_channel_state_t;
+
+enum pci_channel_state {
+	/* I/O channel is in normal state */
+	pci_channel_io_normal = (__force pci_channel_state_t) 1,
+
+	/* I/O to channel is blocked */
+	pci_channel_io_frozen = (__force pci_channel_state_t) 2,
+
+	/* PCI card is dead */
+	pci_channel_io_perm_failure = (__force pci_channel_state_t) 3,
+};
+
 /*
  * The pci_dev structure is used to describe PCI devices.
  */
@@ -111,6 +128,7 @@ struct pci_dev {
 					   this is D0-D3, D0 being fully functional,
 					   and D3 being off. */
 
+	pci_channel_state_t error_state;	/* current connectivity state */
 	struct	device	dev;		/* Generic device interface */
 
 	/* device is compatible with these IDs */
@@ -233,6 +251,54 @@ struct pci_dynids {
 	unsigned int use_driver_data:1; /* pci_driver->driver_data is used */
 };
 
+/* ---------------------------------------------------------------- */
+/** PCI Error Recovery System (PCI-ERS).  If a PCI device driver provides
+ *  a set fof callbacks in struct pci_error_handlers, then that device driver
+ *  will be notified of PCI bus errors, and will be driven to recovery
+ *  when an error occurs.
+ */
+
+typedef unsigned int __bitwise pci_ers_result_t;
+
+enum pci_ers_result {
+	/* no result/none/not supported in device driver */
+	PCI_ERS_RESULT_NONE = (__force pci_ers_result_t) 1,
+
+	/* Device driver can recover without slot reset */
+	PCI_ERS_RESULT_CAN_RECOVER = (__force pci_ers_result_t) 2,
+
+	/* Device driver wants slot to be reset. */
+	PCI_ERS_RESULT_NEED_RESET = (__force pci_ers_result_t) 3,
+
+	/* Device has completely failed, is unrecoverable */
+	PCI_ERS_RESULT_DISCONNECT = (__force pci_ers_result_t) 4,
+
+	/* Device driver is fully recovered and operational */
+	PCI_ERS_RESULT_RECOVERED = (__force pci_ers_result_t) 5,
+};
+
+/* PCI bus error event callbacks */
+struct pci_error_handlers
+{
+	/* PCI bus error detected on this device */
+	pci_ers_result_t (*error_detected)(struct pci_dev *dev,
+	                      enum pci_channel_state error);
+
+	/* MMIO has been re-enabled, but not DMA */
+	pci_ers_result_t (*mmio_enabled)(struct pci_dev *dev);
+
+	/* PCI Express link has been reset */
+	pci_ers_result_t (*link_reset)(struct pci_dev *dev);
+
+	/* PCI slot has been reset */
+	pci_ers_result_t (*slot_reset)(struct pci_dev *dev);
+
+	/* Device driver may resume normal operations */
+	void (*resume)(struct pci_dev *dev);
+};
+
+/* ---------------------------------------------------------------- */
+
 struct module;
 struct pci_driver {
 	struct list_head node;
@@ -245,6 +311,7 @@ struct pci_driver {
 	int  (*enable_wake) (struct pci_dev *dev, pci_power_t state, int enable);   /* Enable wake event */
 	void (*shutdown) (struct pci_dev *dev);
 
+	struct pci_error_handlers *err_handler;
 	struct device_driver	driver;
 	struct pci_dynids dynids;
 };
-- 
cgit v1.2.3-71-gd317


From 788ee7b09883515f3a72a8f2a980df5e94f37e2c Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Mon, 9 Jan 2006 21:12:17 +0000
Subject: [MMC] Add DATA_MULTI flag

Some hosts need to know that a transfer will be multi-block.
Add a data flag to indicate multiple data block transfers.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/mmc/mmc_block.c | 8 +++++++-
 include/linux/mmc/mmc.h | 1 +
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/mmc_block.c b/drivers/mmc/mmc_block.c
index 198561d21710..f43cdfcbfd2c 100644
--- a/drivers/mmc/mmc_block.c
+++ b/drivers/mmc/mmc_block.c
@@ -200,7 +200,13 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 			brq.data.flags |= MMC_DATA_WRITE;
 			brq.data.blocks = 1;
 		}
-		brq.mrq.stop = brq.data.blocks > 1 ? &brq.stop : NULL;
+
+		if (brq.data.blocks > 1) {
+			brq.data.flags |= MMC_DATA_MULTI;
+			brq.mrq.stop = &brq.stop;
+		} else {
+			brq.mrq.stop = NULL;
+		}
 
 		brq.data.sg = mq->sg;
 		brq.data.sg_len = blk_rq_map_sg(req->q, req, brq.data.sg);
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index aef6042f8f0b..86d491b3b73f 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -64,6 +64,7 @@ struct mmc_data {
 #define MMC_DATA_WRITE	(1 << 8)
 #define MMC_DATA_READ	(1 << 9)
 #define MMC_DATA_STREAM	(1 << 10)
+#define MMC_DATA_MULTI	(1 << 11)
 
 	unsigned int		bytes_xfered;
 
-- 
cgit v1.2.3-71-gd317


From 5cb1454b862ab3040b78364d58330262fea1ddba Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 5 Nov 2005 16:58:14 +1100
Subject: [CRYPTO] Allow multiple implementations of the same algorithm

This is the first step on the road towards asynchronous support in
the Crypto API.  It adds support for having multiple crypto_alg objects
for the same algorithm registered in the system.

For example, each device driver would register a crypto_alg object
for each algorithm that it supports.  While at the same time the
user may load software implementations of those same algorithms.

Users of the Crypto API may then select a specific implementation
by name, or choose any implementation for a given algorithm with
the highest priority.

The priority field is a 32-bit signed integer.  In future it will be
possible to modify it from user-space.

This also provides a solution to the problem of selecting amongst
various AES implementations, that is, aes vs. aes-i586 vs. aes-padlock.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/api.c           | 52 ++++++++++++++++++++++++++++++++++++++++++++------
 crypto/internal.h      |  6 ++++++
 crypto/proc.c          |  6 +++---
 include/linux/crypto.h |  5 +++++
 4 files changed, 60 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/api.c b/crypto/api.c
index 40ae42e9b6a6..2715afdf678c 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -3,6 +3,7 @@
  *
  * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
  * Copyright (c) 2002 David S. Miller (davem@redhat.com)
+ * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
  *
  * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no>
  * and Nettle, by Niels M�ller.
@@ -18,9 +19,11 @@
 #include <linux/init.h>
 #include <linux/crypto.h>
 #include <linux/errno.h>
+#include <linux/kernel.h>
 #include <linux/kmod.h>
 #include <linux/rwsem.h>
 #include <linux/slab.h>
+#include <linux/string.h>
 #include "internal.h"
 
 LIST_HEAD(crypto_alg_list);
@@ -39,6 +42,7 @@ static inline void crypto_alg_put(struct crypto_alg *alg)
 static struct crypto_alg *crypto_alg_lookup(const char *name)
 {
 	struct crypto_alg *q, *alg = NULL;
+	int best = -1;
 
 	if (!name)
 		return NULL;
@@ -46,11 +50,23 @@ static struct crypto_alg *crypto_alg_lookup(const char *name)
 	down_read(&crypto_alg_sem);
 	
 	list_for_each_entry(q, &crypto_alg_list, cra_list) {
-		if (!(strcmp(q->cra_name, name))) {
-			if (crypto_alg_get(q))
-				alg = q;
+		int exact, fuzzy;
+
+		exact = !strcmp(q->cra_driver_name, name);
+		fuzzy = !strcmp(q->cra_name, name);
+		if (!exact && !(fuzzy && q->cra_priority > best))
+			continue;
+
+		if (unlikely(!crypto_alg_get(q)))
+			continue;
+
+		best = q->cra_priority;
+		if (alg)
+			crypto_alg_put(alg);
+		alg = q;
+
+		if (exact)
 			break;
-		}
 	}
 	
 	up_read(&crypto_alg_sem);
@@ -207,9 +223,26 @@ void crypto_free_tfm(struct crypto_tfm *tfm)
 	kfree(tfm);
 }
 
+static inline int crypto_set_driver_name(struct crypto_alg *alg)
+{
+	static const char suffix[] = "-generic";
+	char *driver_name = (char *)alg->cra_driver_name;
+	int len;
+
+	if (*driver_name)
+		return 0;
+
+	len = strlcpy(driver_name, alg->cra_name, CRYPTO_MAX_ALG_NAME);
+	if (len + sizeof(suffix) > CRYPTO_MAX_ALG_NAME)
+		return -ENAMETOOLONG;
+
+	memcpy(driver_name + len, suffix, sizeof(suffix));
+	return 0;
+}
+
 int crypto_register_alg(struct crypto_alg *alg)
 {
-	int ret = 0;
+	int ret;
 	struct crypto_alg *q;
 
 	if (alg->cra_alignmask & (alg->cra_alignmask + 1))
@@ -220,11 +253,18 @@ int crypto_register_alg(struct crypto_alg *alg)
 
 	if (alg->cra_blocksize > PAGE_SIZE)
 		return -EINVAL;
+
+	if (alg->cra_priority < 0)
+		return -EINVAL;
 	
+	ret = crypto_set_driver_name(alg);
+	if (unlikely(ret))
+		return ret;
+
 	down_write(&crypto_alg_sem);
 	
 	list_for_each_entry(q, &crypto_alg_list, cra_list) {
-		if (!(strcmp(q->cra_name, alg->cra_name))) {
+		if (!strcmp(q->cra_driver_name, alg->cra_driver_name)) {
 			ret = -EEXIST;
 			goto out;
 		}
diff --git a/crypto/internal.h b/crypto/internal.h
index 37aa652ce5ce..959e602909a6 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -2,6 +2,7 @@
  * Cryptographic API.
  *
  * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -16,10 +17,15 @@
 #include <linux/highmem.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
+#include <linux/list.h>
 #include <linux/kernel.h>
+#include <linux/rwsem.h>
 #include <linux/slab.h>
 #include <asm/kmap_types.h>
 
+extern struct list_head crypto_alg_list;
+extern struct rw_semaphore crypto_alg_sem;
+
 extern enum km_type crypto_km_types[];
 
 static inline enum km_type crypto_kmap_type(int out)
diff --git a/crypto/proc.c b/crypto/proc.c
index 630ba91c08f1..c0a5dd7ce2cc 100644
--- a/crypto/proc.c
+++ b/crypto/proc.c
@@ -4,6 +4,7 @@
  * Procfs information.
  *
  * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -18,9 +19,6 @@
 #include <linux/seq_file.h>
 #include "internal.h"
 
-extern struct list_head crypto_alg_list;
-extern struct rw_semaphore crypto_alg_sem;
-
 static void *c_start(struct seq_file *m, loff_t *pos)
 {
 	struct list_head *v;
@@ -53,7 +51,9 @@ static int c_show(struct seq_file *m, void *p)
 	struct crypto_alg *alg = (struct crypto_alg *)p;
 	
 	seq_printf(m, "name         : %s\n", alg->cra_name);
+	seq_printf(m, "driver       : %s\n", alg->cra_driver_name);
 	seq_printf(m, "module       : %s\n", module_name(alg->cra_module));
+	seq_printf(m, "priority     : %d\n", alg->cra_priority);
 	
 	switch (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) {
 	case CRYPTO_ALG_TYPE_CIPHER:
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 3c89df6e7768..d88bf8aa8b47 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -3,6 +3,7 @@
  *
  * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
  * Copyright (c) 2002 David S. Miller (davem@redhat.com)
+ * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
  *
  * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no>
  * and Nettle, by Niels M�ller.
@@ -126,7 +127,11 @@ struct crypto_alg {
 	unsigned int cra_blocksize;
 	unsigned int cra_ctxsize;
 	unsigned int cra_alignmask;
+
+	int cra_priority;
+
 	const char cra_name[CRYPTO_MAX_ALG_NAME];
+	const char cra_driver_name[CRYPTO_MAX_ALG_NAME];
 
 	union {
 		struct cipher_alg cipher;
-- 
cgit v1.2.3-71-gd317


From 7225b3fd0b6e224235fc50a69f70479ff96d5602 Mon Sep 17 00:00:00 2001
From: Pierre Ossman <drzeus@drzeus.cx>
Date: Mon, 9 Jan 2006 22:51:46 +0000
Subject: [MMC] Indicate that R1/R1b contains command opcode

Some controllers actually check the first byte of the response (most
don't).  This byte contains the command opcode for R1/R1b and all 1:s
for other types. The difference must be indicated to the controller
so it knows which reply to expect.

Signed-off-by: Pierre Ossman <drzeus@drzeus.cx>
---
 include/linux/mmc/mmc.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index 86d491b3b73f..ccd3e13de1e8 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -27,14 +27,15 @@ struct mmc_command {
 #define MMC_RSP_MASK	(3 << 0)
 #define MMC_RSP_CRC	(1 << 3)		/* expect valid crc */
 #define MMC_RSP_BUSY	(1 << 4)		/* card may send busy */
+#define MMC_RSP_OPCODE	(1 << 5)		/* response contains opcode */
 
 /*
  * These are the response types, and correspond to valid bit
  * patterns of the above flags.  One additional valid pattern
  * is all zeros, which means we don't expect a response.
  */
-#define MMC_RSP_R1	(MMC_RSP_SHORT|MMC_RSP_CRC)
-#define MMC_RSP_R1B	(MMC_RSP_SHORT|MMC_RSP_CRC|MMC_RSP_BUSY)
+#define MMC_RSP_R1	(MMC_RSP_SHORT|MMC_RSP_CRC|MMC_RSP_OPCODE)
+#define MMC_RSP_R1B	(MMC_RSP_SHORT|MMC_RSP_CRC|MMC_RSP_OPCODE|MMC_RSP_BUSY)
 #define MMC_RSP_R2	(MMC_RSP_LONG|MMC_RSP_CRC)
 #define MMC_RSP_R3	(MMC_RSP_SHORT)
 #define MMC_RSP_R6	(MMC_RSP_SHORT|MMC_RSP_CRC)
-- 
cgit v1.2.3-71-gd317


From 540695886075964c0e5295bea8e4793e8765d010 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 10 Jan 2006 00:09:36 +0100
Subject: s/assoicated/associated/

Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 include/linux/elevator.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 4a6f50e31c73..23fe746a1d51 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -66,7 +66,7 @@ struct elevator_type
 };
 
 /*
- * each queue has an elevator_queue assoicated with it
+ * each queue has an elevator_queue associated with it
  */
 struct elevator_queue
 {
-- 
cgit v1.2.3-71-gd317


From 711a660dc2064013a2b0167ee67389707fc9cac3 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <76306.1226@compuserve.com>
Date: Mon, 9 Jan 2006 15:59:17 -0800
Subject: [PATCH] mutex subsystem, add typecheck_fn(type, function)

add typecheck_fn(type, function) to do type-checking of function
pointers.

Modified-by: Ingo Molnar <mingo@elte.hu>

(made it typeof() based, instead of typedef based.)

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index ca7ff8fdd090..d0e6ca3b00ef 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -286,6 +286,15 @@ extern void dump_stack(void);
 	1; \
 })
 
+/*
+ * Check at compile time that 'function' is a certain type, or is a pointer
+ * to that type (needs to use typedef for the function type.)
+ */
+#define typecheck_fn(type,function) \
+({	typeof(type) __tmp = function; \
+	(void)__tmp; \
+})
+
 #endif /* __KERNEL__ */
 
 #define SI_LOAD_SHIFT	16
-- 
cgit v1.2.3-71-gd317


From 6053ee3b32e3437e8c1e72687850f436e779bd49 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 9 Jan 2006 15:59:19 -0800
Subject: [PATCH] mutex subsystem, core

mutex implementation, core files: just the basic subsystem, no users of it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
---
 include/linux/mutex.h | 119 ++++++++++++++++++
 kernel/Makefile       |   2 +-
 kernel/mutex.c        | 325 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/mutex.h        |  35 ++++++
 4 files changed, 480 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/mutex.h
 create mode 100644 kernel/mutex.c
 create mode 100644 kernel/mutex.h

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
new file mode 100644
index 000000000000..9bce0fee68d4
--- /dev/null
+++ b/include/linux/mutex.h
@@ -0,0 +1,119 @@
+/*
+ * Mutexes: blocking mutual exclusion locks
+ *
+ * started by Ingo Molnar:
+ *
+ *  Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * This file contains the main data structure and API definitions.
+ */
+#ifndef __LINUX_MUTEX_H
+#define __LINUX_MUTEX_H
+
+#include <linux/list.h>
+#include <linux/spinlock_types.h>
+
+#include <asm/atomic.h>
+
+/*
+ * Simple, straightforward mutexes with strict semantics:
+ *
+ * - only one task can hold the mutex at a time
+ * - only the owner can unlock the mutex
+ * - multiple unlocks are not permitted
+ * - recursive locking is not permitted
+ * - a mutex object must be initialized via the API
+ * - a mutex object must not be initialized via memset or copying
+ * - task may not exit with mutex held
+ * - memory areas where held locks reside must not be freed
+ * - held mutexes must not be reinitialized
+ * - mutexes may not be used in irq contexts
+ *
+ * These semantics are fully enforced when DEBUG_MUTEXES is
+ * enabled. Furthermore, besides enforcing the above rules, the mutex
+ * debugging code also implements a number of additional features
+ * that make lock debugging easier and faster:
+ *
+ * - uses symbolic names of mutexes, whenever they are printed in debug output
+ * - point-of-acquire tracking, symbolic lookup of function names
+ * - list of all locks held in the system, printout of them
+ * - owner tracking
+ * - detects self-recursing locks and prints out all relevant info
+ * - detects multi-task circular deadlocks and prints out all affected
+ *   locks and tasks (and only those tasks)
+ */
+struct mutex {
+	/* 1: unlocked, 0: locked, negative: locked, possible waiters */
+	atomic_t		count;
+	spinlock_t		wait_lock;
+	struct list_head	wait_list;
+#ifdef CONFIG_DEBUG_MUTEXES
+	struct thread_info	*owner;
+	struct list_head	held_list;
+	unsigned long		acquire_ip;
+	const char 		*name;
+	void			*magic;
+#endif
+};
+
+/*
+ * This is the control structure for tasks blocked on mutex,
+ * which resides on the blocked task's kernel stack:
+ */
+struct mutex_waiter {
+	struct list_head	list;
+	struct task_struct	*task;
+#ifdef CONFIG_DEBUG_MUTEXES
+	struct mutex		*lock;
+	void			*magic;
+#endif
+};
+
+#ifdef CONFIG_DEBUG_MUTEXES
+# include <linux/mutex-debug.h>
+#else
+# define __DEBUG_MUTEX_INITIALIZER(lockname)
+# define mutex_init(mutex)			__mutex_init(mutex, NULL)
+# define mutex_destroy(mutex)				do { } while (0)
+# define mutex_debug_show_all_locks()			do { } while (0)
+# define mutex_debug_show_held_locks(p)			do { } while (0)
+# define mutex_debug_check_no_locks_held(task)		do { } while (0)
+# define mutex_debug_check_no_locks_freed(from, to)	do { } while (0)
+#endif
+
+#define __MUTEX_INITIALIZER(lockname) \
+		{ .count = ATOMIC_INIT(1) \
+		, .wait_lock = SPIN_LOCK_UNLOCKED \
+		, .wait_list = LIST_HEAD_INIT(lockname.wait_list) \
+		__DEBUG_MUTEX_INITIALIZER(lockname) }
+
+#define DEFINE_MUTEX(mutexname) \
+	struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
+
+extern void fastcall __mutex_init(struct mutex *lock, const char *name);
+
+/***
+ * mutex_is_locked - is the mutex locked
+ * @lock: the mutex to be queried
+ *
+ * Returns 1 if the mutex is locked, 0 if unlocked.
+ */
+static inline int fastcall mutex_is_locked(struct mutex *lock)
+{
+	return atomic_read(&lock->count) != 1;
+}
+
+/*
+ * See kernel/mutex.c for detailed documentation of these APIs.
+ * Also see Documentation/mutex-design.txt.
+ */
+extern void fastcall mutex_lock(struct mutex *lock);
+extern int fastcall mutex_lock_interruptible(struct mutex *lock);
+/*
+ * NOTE: mutex_trylock() follows the spin_trylock() convention,
+ *       not the down_trylock() convention!
+ */
+extern int fastcall mutex_trylock(struct mutex *lock);
+extern void fastcall mutex_unlock(struct mutex *lock);
+
+#endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 4f5a1453093a..de580b4d54a4 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    sysctl.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o intermodule.o extable.o params.o posix-timers.o \
-	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o
+	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o
 
 obj-$(CONFIG_FUTEX) += futex.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
diff --git a/kernel/mutex.c b/kernel/mutex.c
new file mode 100644
index 000000000000..7eb960661441
--- /dev/null
+++ b/kernel/mutex.c
@@ -0,0 +1,325 @@
+/*
+ * kernel/mutex.c
+ *
+ * Mutexes: blocking mutual exclusion locks
+ *
+ * Started by Ingo Molnar:
+ *
+ *  Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
+ * David Howells for suggestions and improvements.
+ *
+ * Also see Documentation/mutex-design.txt.
+ */
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+
+/*
+ * In the DEBUG case we are using the "NULL fastpath" for mutexes,
+ * which forces all calls into the slowpath:
+ */
+#ifdef CONFIG_DEBUG_MUTEXES
+# include "mutex-debug.h"
+# include <asm-generic/mutex-null.h>
+#else
+# include "mutex.h"
+# include <asm/mutex.h>
+#endif
+
+/***
+ * mutex_init - initialize the mutex
+ * @lock: the mutex to be initialized
+ *
+ * Initialize the mutex to unlocked state.
+ *
+ * It is not allowed to initialize an already locked mutex.
+ */
+void fastcall __mutex_init(struct mutex *lock, const char *name)
+{
+	atomic_set(&lock->count, 1);
+	spin_lock_init(&lock->wait_lock);
+	INIT_LIST_HEAD(&lock->wait_list);
+
+	debug_mutex_init(lock, name);
+}
+
+EXPORT_SYMBOL(__mutex_init);
+
+/*
+ * We split the mutex lock/unlock logic into separate fastpath and
+ * slowpath functions, to reduce the register pressure on the fastpath.
+ * We also put the fastpath first in the kernel image, to make sure the
+ * branch is predicted by the CPU as default-untaken.
+ */
+static void fastcall noinline __sched
+__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__);
+
+/***
+ * mutex_lock - acquire the mutex
+ * @lock: the mutex to be acquired
+ *
+ * Lock the mutex exclusively for this task. If the mutex is not
+ * available right now, it will sleep until it can get it.
+ *
+ * The mutex must later on be released by the same task that
+ * acquired it. Recursive locking is not allowed. The task
+ * may not exit without first unlocking the mutex. Also, kernel
+ * memory where the mutex resides mutex must not be freed with
+ * the mutex still locked. The mutex must first be initialized
+ * (or statically defined) before it can be locked. memset()-ing
+ * the mutex to 0 is not allowed.
+ *
+ * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging
+ *   checks that will enforce the restrictions and will also do
+ *   deadlock debugging. )
+ *
+ * This function is similar to (but not equivalent to) down().
+ */
+void fastcall __sched mutex_lock(struct mutex *lock)
+{
+	/*
+	 * The locking fastpath is the 1->0 transition from
+	 * 'unlocked' into 'locked' state.
+	 *
+	 * NOTE: if asm/mutex.h is included, then some architectures
+	 * rely on mutex_lock() having _no other code_ here but this
+	 * fastpath. That allows the assembly fastpath to do
+	 * tail-merging optimizations. (If you want to put testcode
+	 * here, do it under #ifndef CONFIG_MUTEX_DEBUG.)
+	 */
+	__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
+}
+
+EXPORT_SYMBOL(mutex_lock);
+
+static void fastcall noinline __sched
+__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__);
+
+/***
+ * mutex_unlock - release the mutex
+ * @lock: the mutex to be released
+ *
+ * Unlock a mutex that has been locked by this task previously.
+ *
+ * This function must not be used in interrupt context. Unlocking
+ * of a not locked mutex is not allowed.
+ *
+ * This function is similar to (but not equivalent to) up().
+ */
+void fastcall __sched mutex_unlock(struct mutex *lock)
+{
+	/*
+	 * The unlocking fastpath is the 0->1 transition from 'locked'
+	 * into 'unlocked' state:
+	 *
+	 * NOTE: no other code must be here - see mutex_lock() .
+	 */
+	__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
+}
+
+EXPORT_SYMBOL(mutex_unlock);
+
+/*
+ * Lock a mutex (possibly interruptible), slowpath:
+ */
+static inline int __sched
+__mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
+{
+	struct task_struct *task = current;
+	struct mutex_waiter waiter;
+	unsigned int old_val;
+
+	debug_mutex_init_waiter(&waiter);
+
+	spin_lock_mutex(&lock->wait_lock);
+
+	debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip);
+
+	/* add waiting tasks to the end of the waitqueue (FIFO): */
+	list_add_tail(&waiter.list, &lock->wait_list);
+	waiter.task = task;
+
+	for (;;) {
+		/*
+		 * Lets try to take the lock again - this is needed even if
+		 * we get here for the first time (shortly after failing to
+		 * acquire the lock), to make sure that we get a wakeup once
+		 * it's unlocked. Later on, if we sleep, this is the
+		 * operation that gives us the lock. We xchg it to -1, so
+		 * that when we release the lock, we properly wake up the
+		 * other waiters:
+		 */
+		old_val = atomic_xchg(&lock->count, -1);
+		if (old_val == 1)
+			break;
+
+		/*
+		 * got a signal? (This code gets eliminated in the
+		 * TASK_UNINTERRUPTIBLE case.)
+		 */
+		if (unlikely(state == TASK_INTERRUPTIBLE &&
+						signal_pending(task))) {
+			mutex_remove_waiter(lock, &waiter, task->thread_info);
+			spin_unlock_mutex(&lock->wait_lock);
+
+			debug_mutex_free_waiter(&waiter);
+			return -EINTR;
+		}
+		__set_task_state(task, state);
+
+		/* didnt get the lock, go to sleep: */
+		spin_unlock_mutex(&lock->wait_lock);
+		schedule();
+		spin_lock_mutex(&lock->wait_lock);
+	}
+
+	/* got the lock - rejoice! */
+	mutex_remove_waiter(lock, &waiter, task->thread_info);
+	debug_mutex_set_owner(lock, task->thread_info __IP__);
+
+	/* set it to 0 if there are no waiters left: */
+	if (likely(list_empty(&lock->wait_list)))
+		atomic_set(&lock->count, 0);
+
+	spin_unlock_mutex(&lock->wait_lock);
+
+	debug_mutex_free_waiter(&waiter);
+
+	DEBUG_WARN_ON(list_empty(&lock->held_list));
+	DEBUG_WARN_ON(lock->owner != task->thread_info);
+
+	return 0;
+}
+
+static void fastcall noinline __sched
+__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__)
+{
+	struct mutex *lock = container_of(lock_count, struct mutex, count);
+
+	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE __IP__);
+}
+
+/*
+ * Release the lock, slowpath:
+ */
+static fastcall noinline void
+__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
+{
+        struct mutex *lock = container_of(lock_count, struct mutex, count);
+
+	DEBUG_WARN_ON(lock->owner != current_thread_info());
+
+	spin_lock_mutex(&lock->wait_lock);
+
+	/*
+	 * some architectures leave the lock unlocked in the fastpath failure
+	 * case, others need to leave it locked. In the later case we have to
+	 * unlock it here
+	 */
+	if (__mutex_slowpath_needs_to_unlock())
+		atomic_set(&lock->count, 1);
+
+	debug_mutex_unlock(lock);
+
+	if (!list_empty(&lock->wait_list)) {
+		/* get the first entry from the wait-list: */
+		struct mutex_waiter *waiter =
+				list_entry(lock->wait_list.next,
+					   struct mutex_waiter, list);
+
+		debug_mutex_wake_waiter(lock, waiter);
+
+		wake_up_process(waiter->task);
+	}
+
+	debug_mutex_clear_owner(lock);
+
+	spin_unlock_mutex(&lock->wait_lock);
+}
+
+/*
+ * Here come the less common (and hence less performance-critical) APIs:
+ * mutex_lock_interruptible() and mutex_trylock().
+ */
+static int fastcall noinline __sched
+__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__);
+
+/***
+ * mutex_lock_interruptible - acquire the mutex, interruptable
+ * @lock: the mutex to be acquired
+ *
+ * Lock the mutex like mutex_lock(), and return 0 if the mutex has
+ * been acquired or sleep until the mutex becomes available. If a
+ * signal arrives while waiting for the lock then this function
+ * returns -EINTR.
+ *
+ * This function is similar to (but not equivalent to) down_interruptible().
+ */
+int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
+{
+	/* NOTE: no other code must be here - see mutex_lock() */
+	return __mutex_fastpath_lock_retval
+			(&lock->count, __mutex_lock_interruptible_slowpath);
+}
+
+EXPORT_SYMBOL(mutex_lock_interruptible);
+
+static int fastcall noinline __sched
+__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__)
+{
+	struct mutex *lock = container_of(lock_count, struct mutex, count);
+
+	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE __IP__);
+}
+
+/*
+ * Spinlock based trylock, we take the spinlock and check whether we
+ * can get the lock:
+ */
+static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
+{
+	struct mutex *lock = container_of(lock_count, struct mutex, count);
+	int prev;
+
+	spin_lock_mutex(&lock->wait_lock);
+
+	prev = atomic_xchg(&lock->count, -1);
+	if (likely(prev == 1))
+		debug_mutex_set_owner(lock, current_thread_info() __RET_IP__);
+	/* Set it back to 0 if there are no waiters: */
+	if (likely(list_empty(&lock->wait_list)))
+		atomic_set(&lock->count, 0);
+
+	spin_unlock_mutex(&lock->wait_lock);
+
+	return prev == 1;
+}
+
+/***
+ * mutex_trylock - try acquire the mutex, without waiting
+ * @lock: the mutex to be acquired
+ *
+ * Try to acquire the mutex atomically. Returns 1 if the mutex
+ * has been acquired successfully, and 0 on contention.
+ *
+ * NOTE: this function follows the spin_trylock() convention, so
+ * it is negated to the down_trylock() return values! Be careful
+ * about this when converting semaphore users to mutexes.
+ *
+ * This function must not be used in interrupt context. The
+ * mutex must be released by the same task that acquired it.
+ */
+int fastcall mutex_trylock(struct mutex *lock)
+{
+	return __mutex_fastpath_trylock(&lock->count,
+					__mutex_trylock_slowpath);
+}
+
+EXPORT_SYMBOL(mutex_trylock);
+
+
+
diff --git a/kernel/mutex.h b/kernel/mutex.h
new file mode 100644
index 000000000000..00fe84e7b672
--- /dev/null
+++ b/kernel/mutex.h
@@ -0,0 +1,35 @@
+/*
+ * Mutexes: blocking mutual exclusion locks
+ *
+ * started by Ingo Molnar:
+ *
+ *  Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * This file contains mutex debugging related internal prototypes, for the
+ * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
+ */
+
+#define spin_lock_mutex(lock)			spin_lock(lock)
+#define spin_unlock_mutex(lock)			spin_unlock(lock)
+#define mutex_remove_waiter(lock, waiter, ti) \
+		__list_del((waiter)->list.prev, (waiter)->list.next)
+
+#define DEBUG_WARN_ON(c)				do { } while (0)
+#define debug_mutex_set_owner(lock, new_owner)		do { } while (0)
+#define debug_mutex_clear_owner(lock)			do { } while (0)
+#define debug_mutex_init_waiter(waiter)			do { } while (0)
+#define debug_mutex_wake_waiter(lock, waiter)		do { } while (0)
+#define debug_mutex_free_waiter(waiter)			do { } while (0)
+#define debug_mutex_add_waiter(lock, waiter, ti, ip)	do { } while (0)
+#define debug_mutex_unlock(lock)			do { } while (0)
+#define debug_mutex_init(lock, name)			do { } while (0)
+
+/*
+ * Return-address parameters/declarations. They are very useful for
+ * debugging, but add overhead in the !DEBUG case - so we go the
+ * trouble of using this not too elegant but zero-cost solution:
+ */
+#define __IP_DECL__
+#define __IP__
+#define __RET_IP__
+
-- 
cgit v1.2.3-71-gd317


From 408894ee4dd4debfdedd472eb4d8414892fc90f6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 9 Jan 2006 15:59:20 -0800
Subject: [PATCH] mutex subsystem, debugging code

mutex implementation - add debugging code.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
---
 include/linux/mutex-debug.h |  21 ++
 include/linux/sched.h       |   5 +
 kernel/Makefile             |   1 +
 kernel/fork.c               |   4 +
 kernel/mutex-debug.c        | 464 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/mutex-debug.h        | 134 +++++++++++++
 lib/Kconfig.debug           |   8 +
 7 files changed, 637 insertions(+)
 create mode 100644 include/linux/mutex-debug.h
 create mode 100644 kernel/mutex-debug.c
 create mode 100644 kernel/mutex-debug.h

(limited to 'include/linux')

diff --git a/include/linux/mutex-debug.h b/include/linux/mutex-debug.h
new file mode 100644
index 000000000000..0ccd8f983b50
--- /dev/null
+++ b/include/linux/mutex-debug.h
@@ -0,0 +1,21 @@
+#ifndef __LINUX_MUTEX_DEBUG_H
+#define __LINUX_MUTEX_DEBUG_H
+
+/*
+ * Mutexes - debugging helpers:
+ */
+
+#define __DEBUG_MUTEX_INITIALIZER(lockname) \
+	, .held_list = LIST_HEAD_INIT(lockname.held_list), \
+	  .name = #lockname , .magic = &lockname
+
+#define mutex_init(sem)		__mutex_init(sem, __FUNCTION__)
+
+extern void FASTCALL(mutex_destroy(struct mutex *lock));
+
+extern void mutex_debug_show_all_locks(void);
+extern void mutex_debug_show_held_locks(struct task_struct *filter);
+extern void mutex_debug_check_no_locks_held(struct task_struct *task);
+extern void mutex_debug_check_no_locks_freed(const void *from, const void *to);
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78eb92ae4d94..85b53f87c703 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -817,6 +817,11 @@ struct task_struct {
 /* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */
 	spinlock_t proc_lock;
 
+#ifdef CONFIG_DEBUG_MUTEXES
+	/* mutex deadlock detection */
+	struct mutex_waiter *blocked_on;
+#endif
+
 /* journalling filesystem info */
 	void *journal_info;
 
diff --git a/kernel/Makefile b/kernel/Makefile
index de580b4d54a4..a940bac02837 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,6 +9,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    rcupdate.o intermodule.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o
 
+obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_FUTEX) += futex.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
diff --git a/kernel/fork.c b/kernel/fork.c
index 72e3252c6763..b18d64554feb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -979,6 +979,10 @@ static task_t *copy_process(unsigned long clone_flags,
  	}
 #endif
 
+#ifdef CONFIG_DEBUG_MUTEXES
+	p->blocked_on = NULL; /* not blocked yet */
+#endif
+
 	p->tgid = p->pid;
 	if (clone_flags & CLONE_THREAD)
 		p->tgid = current->tgid;
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
new file mode 100644
index 000000000000..4fcb051a8b9e
--- /dev/null
+++ b/kernel/mutex-debug.c
@@ -0,0 +1,464 @@
+/*
+ * kernel/mutex-debug.c
+ *
+ * Debugging code for mutexes
+ *
+ * Started by Ingo Molnar:
+ *
+ *  Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * lock debugging, locking tree, deadlock detection started by:
+ *
+ *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ *  Released under the General Public License (GPL).
+ */
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+
+#include <asm/mutex.h>
+
+#include "mutex-debug.h"
+
+/*
+ * We need a global lock when we walk through the multi-process
+ * lock tree. Only used in the deadlock-debugging case.
+ */
+DEFINE_SPINLOCK(debug_mutex_lock);
+
+/*
+ * All locks held by all tasks, in a single global list:
+ */
+LIST_HEAD(debug_mutex_held_locks);
+
+/*
+ * In the debug case we carry the caller's instruction pointer into
+ * other functions, but we dont want the function argument overhead
+ * in the nondebug case - hence these macros:
+ */
+#define __IP_DECL__		, unsigned long ip
+#define __IP__			, ip
+#define __RET_IP__		, (unsigned long)__builtin_return_address(0)
+
+/*
+ * "mutex debugging enabled" flag. We turn it off when we detect
+ * the first problem because we dont want to recurse back
+ * into the tracing code when doing error printk or
+ * executing a BUG():
+ */
+int debug_mutex_on = 1;
+
+static void printk_task(struct task_struct *p)
+{
+	if (p)
+		printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
+	else
+		printk("<none>");
+}
+
+static void printk_ti(struct thread_info *ti)
+{
+	if (ti)
+		printk_task(ti->task);
+	else
+		printk("<none>");
+}
+
+static void printk_task_short(struct task_struct *p)
+{
+	if (p)
+		printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
+	else
+		printk("<none>");
+}
+
+static void printk_lock(struct mutex *lock, int print_owner)
+{
+	printk(" [%p] {%s}\n", lock, lock->name);
+
+	if (print_owner && lock->owner) {
+		printk(".. held by:  ");
+		printk_ti(lock->owner);
+		printk("\n");
+	}
+	if (lock->owner) {
+		printk("... acquired at:               ");
+		print_symbol("%s\n", lock->acquire_ip);
+	}
+}
+
+/*
+ * printk locks held by a task:
+ */
+static void show_task_locks(struct task_struct *p)
+{
+	switch (p->state) {
+	case TASK_RUNNING:		printk("R"); break;
+	case TASK_INTERRUPTIBLE:	printk("S"); break;
+	case TASK_UNINTERRUPTIBLE:	printk("D"); break;
+	case TASK_STOPPED:		printk("T"); break;
+	case EXIT_ZOMBIE:		printk("Z"); break;
+	case EXIT_DEAD:			printk("X"); break;
+	default:			printk("?"); break;
+	}
+	printk_task(p);
+	if (p->blocked_on) {
+		struct mutex *lock = p->blocked_on->lock;
+
+		printk(" blocked on mutex:");
+		printk_lock(lock, 1);
+	} else
+		printk(" (not blocked on mutex)\n");
+}
+
+/*
+ * printk all locks held in the system (if filter == NULL),
+ * or all locks belonging to a single task (if filter != NULL):
+ */
+void show_held_locks(struct task_struct *filter)
+{
+	struct list_head *curr, *cursor = NULL;
+	struct mutex *lock;
+	struct thread_info *t;
+	unsigned long flags;
+	int count = 0;
+
+	if (filter) {
+		printk("------------------------------\n");
+		printk("| showing all locks held by: |  (");
+		printk_task_short(filter);
+		printk("):\n");
+		printk("------------------------------\n");
+	} else {
+		printk("---------------------------\n");
+		printk("| showing all locks held: |\n");
+		printk("---------------------------\n");
+	}
+
+	/*
+	 * Play safe and acquire the global trace lock. We
+	 * cannot printk with that lock held so we iterate
+	 * very carefully:
+	 */
+next:
+	debug_spin_lock_save(&debug_mutex_lock, flags);
+	list_for_each(curr, &debug_mutex_held_locks) {
+		if (cursor && curr != cursor)
+			continue;
+		lock = list_entry(curr, struct mutex, held_list);
+		t = lock->owner;
+		if (filter && (t != filter->thread_info))
+			continue;
+		count++;
+		cursor = curr->next;
+		debug_spin_lock_restore(&debug_mutex_lock, flags);
+
+		printk("\n#%03d:            ", count);
+		printk_lock(lock, filter ? 0 : 1);
+		goto next;
+	}
+	debug_spin_lock_restore(&debug_mutex_lock, flags);
+	printk("\n");
+}
+
+void mutex_debug_show_all_locks(void)
+{
+	struct task_struct *g, *p;
+	int count = 10;
+	int unlock = 1;
+
+	printk("\nShowing all blocking locks in the system:\n");
+
+	/*
+	 * Here we try to get the tasklist_lock as hard as possible,
+	 * if not successful after 2 seconds we ignore it (but keep
+	 * trying). This is to enable a debug printout even if a
+	 * tasklist_lock-holding task deadlocks or crashes.
+	 */
+retry:
+	if (!read_trylock(&tasklist_lock)) {
+		if (count == 10)
+			printk("hm, tasklist_lock locked, retrying... ");
+		if (count) {
+			count--;
+			printk(" #%d", 10-count);
+			mdelay(200);
+			goto retry;
+		}
+		printk(" ignoring it.\n");
+		unlock = 0;
+	}
+	if (count != 10)
+		printk(" locked it.\n");
+
+	do_each_thread(g, p) {
+		show_task_locks(p);
+		if (!unlock)
+			if (read_trylock(&tasklist_lock))
+				unlock = 1;
+	} while_each_thread(g, p);
+
+	printk("\n");
+	show_held_locks(NULL);
+	printk("=============================================\n\n");
+
+	if (unlock)
+		read_unlock(&tasklist_lock);
+}
+
+static void report_deadlock(struct task_struct *task, struct mutex *lock,
+			    struct mutex *lockblk, unsigned long ip)
+{
+	printk("\n%s/%d is trying to acquire this lock:\n",
+		current->comm, current->pid);
+	printk_lock(lock, 1);
+	printk("... trying at:                 ");
+	print_symbol("%s\n", ip);
+	show_held_locks(current);
+
+	if (lockblk) {
+		printk("but %s/%d is deadlocking current task %s/%d!\n\n",
+			task->comm, task->pid, current->comm, current->pid);
+		printk("\n%s/%d is blocked on this lock:\n",
+			task->comm, task->pid);
+		printk_lock(lockblk, 1);
+
+		show_held_locks(task);
+
+		printk("\n%s/%d's [blocked] stackdump:\n\n",
+			task->comm, task->pid);
+		show_stack(task, NULL);
+	}
+
+	printk("\n%s/%d's [current] stackdump:\n\n",
+		current->comm, current->pid);
+	dump_stack();
+	mutex_debug_show_all_locks();
+	printk("[ turning off deadlock detection. Please report this. ]\n\n");
+	local_irq_disable();
+}
+
+/*
+ * Recursively check for mutex deadlocks:
+ */
+static int check_deadlock(struct mutex *lock, int depth,
+			  struct thread_info *ti, unsigned long ip)
+{
+	struct mutex *lockblk;
+	struct task_struct *task;
+
+	if (!debug_mutex_on)
+		return 0;
+
+	ti = lock->owner;
+	if (!ti)
+		return 0;
+
+	task = ti->task;
+	lockblk = NULL;
+	if (task->blocked_on)
+		lockblk = task->blocked_on->lock;
+
+	/* Self-deadlock: */
+	if (current == task) {
+		DEBUG_OFF();
+		if (depth)
+			return 1;
+		printk("\n==========================================\n");
+		printk(  "[ BUG: lock recursion deadlock detected! |\n");
+		printk(  "------------------------------------------\n");
+		report_deadlock(task, lock, NULL, ip);
+		return 0;
+	}
+
+	/* Ugh, something corrupted the lock data structure? */
+	if (depth > 20) {
+		DEBUG_OFF();
+		printk("\n===========================================\n");
+		printk(  "[ BUG: infinite lock dependency detected!? |\n");
+		printk(  "-------------------------------------------\n");
+		report_deadlock(task, lock, lockblk, ip);
+		return 0;
+	}
+
+	/* Recursively check for dependencies: */
+	if (lockblk && check_deadlock(lockblk, depth+1, ti, ip)) {
+		printk("\n============================================\n");
+		printk(  "[ BUG: circular locking deadlock detected! ]\n");
+		printk(  "--------------------------------------------\n");
+		report_deadlock(task, lock, lockblk, ip);
+		return 0;
+	}
+	return 0;
+}
+
+/*
+ * Called when a task exits, this function checks whether the
+ * task is holding any locks, and reports the first one if so:
+ */
+void mutex_debug_check_no_locks_held(struct task_struct *task)
+{
+	struct list_head *curr, *next;
+	struct thread_info *t;
+	unsigned long flags;
+	struct mutex *lock;
+
+	if (!debug_mutex_on)
+		return;
+
+	debug_spin_lock_save(&debug_mutex_lock, flags);
+	list_for_each_safe(curr, next, &debug_mutex_held_locks) {
+		lock = list_entry(curr, struct mutex, held_list);
+		t = lock->owner;
+		if (t != task->thread_info)
+			continue;
+		list_del_init(curr);
+		DEBUG_OFF();
+		debug_spin_lock_restore(&debug_mutex_lock, flags);
+
+		printk("BUG: %s/%d, lock held at task exit time!\n",
+			task->comm, task->pid);
+		printk_lock(lock, 1);
+		if (lock->owner != task->thread_info)
+			printk("exiting task is not even the owner??\n");
+		return;
+	}
+	debug_spin_lock_restore(&debug_mutex_lock, flags);
+}
+
+/*
+ * Called when kernel memory is freed (or unmapped), or if a mutex
+ * is destroyed or reinitialized - this code checks whether there is
+ * any held lock in the memory range of <from> to <to>:
+ */
+void mutex_debug_check_no_locks_freed(const void *from, const void *to)
+{
+	struct list_head *curr, *next;
+	unsigned long flags;
+	struct mutex *lock;
+	void *lock_addr;
+
+	if (!debug_mutex_on)
+		return;
+
+	debug_spin_lock_save(&debug_mutex_lock, flags);
+	list_for_each_safe(curr, next, &debug_mutex_held_locks) {
+		lock = list_entry(curr, struct mutex, held_list);
+		lock_addr = lock;
+		if (lock_addr < from || lock_addr >= to)
+			continue;
+		list_del_init(curr);
+		DEBUG_OFF();
+		debug_spin_lock_restore(&debug_mutex_lock, flags);
+
+		printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
+			current->comm, current->pid, lock, from, to);
+		dump_stack();
+		printk_lock(lock, 1);
+		if (lock->owner != current_thread_info())
+			printk("freeing task is not even the owner??\n");
+		return;
+	}
+	debug_spin_lock_restore(&debug_mutex_lock, flags);
+}
+
+/*
+ * Must be called with lock->wait_lock held.
+ */
+void debug_mutex_set_owner(struct mutex *lock,
+			   struct thread_info *new_owner __IP_DECL__)
+{
+	lock->owner = new_owner;
+	DEBUG_WARN_ON(!list_empty(&lock->held_list));
+	if (debug_mutex_on) {
+		list_add_tail(&lock->held_list, &debug_mutex_held_locks);
+		lock->acquire_ip = ip;
+	}
+}
+
+void debug_mutex_init_waiter(struct mutex_waiter *waiter)
+{
+	memset(waiter, 0x11, sizeof(*waiter));
+	waiter->magic = waiter;
+	INIT_LIST_HEAD(&waiter->list);
+}
+
+void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
+{
+	SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock));
+	DEBUG_WARN_ON(list_empty(&lock->wait_list));
+	DEBUG_WARN_ON(waiter->magic != waiter);
+	DEBUG_WARN_ON(list_empty(&waiter->list));
+}
+
+void debug_mutex_free_waiter(struct mutex_waiter *waiter)
+{
+	DEBUG_WARN_ON(!list_empty(&waiter->list));
+	memset(waiter, 0x22, sizeof(*waiter));
+}
+
+void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+			    struct thread_info *ti __IP_DECL__)
+{
+	SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock));
+	check_deadlock(lock, 0, ti, ip);
+	/* Mark the current thread as blocked on the lock: */
+	ti->task->blocked_on = waiter;
+	waiter->lock = lock;
+}
+
+void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+			 struct thread_info *ti)
+{
+	DEBUG_WARN_ON(list_empty(&waiter->list));
+	DEBUG_WARN_ON(waiter->task != ti->task);
+	DEBUG_WARN_ON(ti->task->blocked_on != waiter);
+	ti->task->blocked_on = NULL;
+
+	list_del_init(&waiter->list);
+	waiter->task = NULL;
+}
+
+void debug_mutex_unlock(struct mutex *lock)
+{
+	DEBUG_WARN_ON(lock->magic != lock);
+	DEBUG_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
+	DEBUG_WARN_ON(lock->owner != current_thread_info());
+	if (debug_mutex_on) {
+		DEBUG_WARN_ON(list_empty(&lock->held_list));
+		list_del_init(&lock->held_list);
+	}
+}
+
+void debug_mutex_init(struct mutex *lock, const char *name)
+{
+	/*
+	 * Make sure we are not reinitializing a held lock:
+	 */
+	mutex_debug_check_no_locks_freed((void *)lock, (void *)(lock + 1));
+	lock->owner = NULL;
+	INIT_LIST_HEAD(&lock->held_list);
+	lock->name = name;
+	lock->magic = lock;
+}
+
+/***
+ * mutex_destroy - mark a mutex unusable
+ * @lock: the mutex to be destroyed
+ *
+ * This function marks the mutex uninitialized, and any subsequent
+ * use of the mutex is forbidden. The mutex must not be locked when
+ * this function is called.
+ */
+void fastcall mutex_destroy(struct mutex *lock)
+{
+	DEBUG_WARN_ON(mutex_is_locked(lock));
+	lock->magic = NULL;
+}
+
+EXPORT_SYMBOL_GPL(mutex_destroy);
+
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
new file mode 100644
index 000000000000..fd384050acb1
--- /dev/null
+++ b/kernel/mutex-debug.h
@@ -0,0 +1,134 @@
+/*
+ * Mutexes: blocking mutual exclusion locks
+ *
+ * started by Ingo Molnar:
+ *
+ *  Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * This file contains mutex debugging related internal declarations,
+ * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case.
+ * More details are in kernel/mutex-debug.c.
+ */
+
+extern spinlock_t debug_mutex_lock;
+extern struct list_head debug_mutex_held_locks;
+extern int debug_mutex_on;
+
+/*
+ * In the debug case we carry the caller's instruction pointer into
+ * other functions, but we dont want the function argument overhead
+ * in the nondebug case - hence these macros:
+ */
+#define __IP_DECL__		, unsigned long ip
+#define __IP__			, ip
+#define __RET_IP__		, (unsigned long)__builtin_return_address(0)
+
+/*
+ * This must be called with lock->wait_lock held.
+ */
+extern void debug_mutex_set_owner(struct mutex *lock,
+				  struct thread_info *new_owner __IP_DECL__);
+
+static inline void debug_mutex_clear_owner(struct mutex *lock)
+{
+	lock->owner = NULL;
+}
+
+extern void debug_mutex_init_waiter(struct mutex_waiter *waiter);
+extern void debug_mutex_wake_waiter(struct mutex *lock,
+				    struct mutex_waiter *waiter);
+extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
+extern void debug_mutex_add_waiter(struct mutex *lock,
+				   struct mutex_waiter *waiter,
+				   struct thread_info *ti __IP_DECL__);
+extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+				struct thread_info *ti);
+extern void debug_mutex_unlock(struct mutex *lock);
+extern void debug_mutex_init(struct mutex *lock, const char *name);
+
+#define debug_spin_lock(lock)				\
+	do {						\
+		local_irq_disable();			\
+		if (debug_mutex_on)			\
+			spin_lock(lock);		\
+	} while (0)
+
+#define debug_spin_unlock(lock)				\
+	do {						\
+		if (debug_mutex_on)			\
+			spin_unlock(lock);		\
+		local_irq_enable();			\
+		preempt_check_resched();		\
+	} while (0)
+
+#define debug_spin_lock_save(lock, flags)		\
+	do {						\
+		local_irq_save(flags);			\
+		if (debug_mutex_on)			\
+			spin_lock(lock);		\
+	} while (0)
+
+#define debug_spin_lock_restore(lock, flags)		\
+	do {						\
+		if (debug_mutex_on)			\
+			spin_unlock(lock);		\
+		local_irq_restore(flags);		\
+		preempt_check_resched();		\
+	} while (0)
+
+#define spin_lock_mutex(lock)				\
+	do {						\
+		struct mutex *l = container_of(lock, struct mutex, wait_lock); \
+							\
+		DEBUG_WARN_ON(in_interrupt());		\
+		debug_spin_lock(&debug_mutex_lock);	\
+		spin_lock(lock);			\
+		DEBUG_WARN_ON(l->magic != l);		\
+	} while (0)
+
+#define spin_unlock_mutex(lock)				\
+	do {						\
+		spin_unlock(lock);			\
+		debug_spin_unlock(&debug_mutex_lock);	\
+	} while (0)
+
+#define DEBUG_OFF()					\
+do {							\
+	if (debug_mutex_on) {				\
+		debug_mutex_on = 0;			\
+		console_verbose();			\
+		if (spin_is_locked(&debug_mutex_lock))	\
+			spin_unlock(&debug_mutex_lock);	\
+	}						\
+} while (0)
+
+#define DEBUG_BUG()					\
+do {							\
+	if (debug_mutex_on) {				\
+		DEBUG_OFF();				\
+		BUG();					\
+	}						\
+} while (0)
+
+#define DEBUG_WARN_ON(c)				\
+do {							\
+	if (unlikely(c && debug_mutex_on)) {		\
+		DEBUG_OFF();				\
+		WARN_ON(1);				\
+	}						\
+} while (0)
+
+# define DEBUG_BUG_ON(c)				\
+do {							\
+	if (unlikely(c))				\
+		DEBUG_BUG();				\
+} while (0)
+
+#ifdef CONFIG_SMP
+# define SMP_DEBUG_WARN_ON(c)			DEBUG_WARN_ON(c)
+# define SMP_DEBUG_BUG_ON(c)			DEBUG_BUG_ON(c)
+#else
+# define SMP_DEBUG_WARN_ON(c)			do { } while (0)
+# define SMP_DEBUG_BUG_ON(c)			do { } while (0)
+#endif
+
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c48260fb8fd9..1fcd856edec1 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -95,6 +95,14 @@ config DEBUG_PREEMPT
 	  if kernel code uses it in a preemption-unsafe way. Also, the kernel
 	  will detect preemption count underflows.
 
+config DEBUG_MUTEXES
+	bool "Mutex debugging, deadlock detection"
+	default y
+	depends on DEBUG_KERNEL
+	help
+	 This allows mutex semantics violations and mutex related deadlocks
+	 (lockups) to be detected and reported automatically.
+
 config DEBUG_SPINLOCK
 	bool "Spinlock debugging"
 	depends on DEBUG_KERNEL
-- 
cgit v1.2.3-71-gd317


From de5097c2e73f826302cd8957c225b3725e0c7553 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 9 Jan 2006 15:59:21 -0800
Subject: [PATCH] mutex subsystem, more debugging code

more mutex debugging: check for held locks during memory freeing,
task exit, enable sysrq printouts, etc.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
---
 arch/i386/mm/pageattr.c |  4 ++++
 drivers/char/sysrq.c    | 19 +++++++++++++++++++
 include/linux/mm.h      |  4 ++++
 kernel/exit.c           |  5 +++++
 kernel/sched.c          |  1 +
 mm/page_alloc.c         |  3 +++
 mm/slab.c               |  1 +
 7 files changed, 37 insertions(+)

(limited to 'include/linux')

diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index c30a16df6440..e8a53552b13d 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -222,6 +222,10 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 {
 	if (PageHighMem(page))
 		return;
+	if (!enable)
+		mutex_debug_check_no_locks_freed(page_address(page),
+						 page_address(page+numpages));
+
 	/* the return value is ignored - the calls cannot fail,
 	 * large pages are disabled at boot time.
 	 */
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 145275ebdd7e..5765f672e853 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -153,6 +153,21 @@ static struct sysrq_key_op sysrq_mountro_op = {
 
 /* END SYNC SYSRQ HANDLERS BLOCK */
 
+#ifdef CONFIG_DEBUG_MUTEXES
+
+static void
+sysrq_handle_showlocks(int key, struct pt_regs *pt_regs, struct tty_struct *tty)
+{
+	mutex_debug_show_all_locks();
+}
+
+static struct sysrq_key_op sysrq_showlocks_op = {
+	.handler	= sysrq_handle_showlocks,
+	.help_msg	= "show-all-locks(D)",
+	.action_msg	= "Show Locks Held",
+};
+
+#endif
 
 /* SHOW SYSRQ HANDLERS BLOCK */
 
@@ -294,7 +309,11 @@ static struct sysrq_key_op *sysrq_key_table[SYSRQ_KEY_TABLE_LENGTH] = {
 #else
 /* c */	NULL,
 #endif
+#ifdef CONFIG_DEBUG_MUTEXES
+/* d */ &sysrq_showlocks_op,
+#else
 /* d */ NULL,
+#endif
 /* e */	&sysrq_term_op,
 /* f */	&sysrq_moom_op,
 /* g */	NULL,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index df80e63903b5..3f1fafc0245e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -13,6 +13,7 @@
 #include <linux/rbtree.h>
 #include <linux/prio_tree.h>
 #include <linux/fs.h>
+#include <linux/mutex.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -1024,6 +1025,9 @@ static inline void vm_stat_account(struct mm_struct *mm,
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable)
 {
+	if (!PageHighMem(page) && !enable)
+		mutex_debug_check_no_locks_freed(page_address(page),
+						 page_address(page + numpages));
 }
 #endif
 
diff --git a/kernel/exit.c b/kernel/exit.c
index caceabf3f230..309a46fa16f8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -29,6 +29,7 @@
 #include <linux/syscalls.h>
 #include <linux/signal.h>
 #include <linux/cn_proc.h>
+#include <linux/mutex.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -869,6 +870,10 @@ fastcall NORET_TYPE void do_exit(long code)
 	mpol_free(tsk->mempolicy);
 	tsk->mempolicy = NULL;
 #endif
+	/*
+	 * If DEBUG_MUTEXES is on, make sure we are holding no locks:
+	 */
+	mutex_debug_check_no_locks_held(tsk);
 
 	/* PF_DEAD causes final put_task_struct after we schedule. */
 	preempt_disable();
diff --git a/kernel/sched.c b/kernel/sched.c
index 92733091154c..34a945bcc022 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4386,6 +4386,7 @@ void show_state(void)
 	} while_each_thread(g, p);
 
 	read_unlock(&tasklist_lock);
+	mutex_debug_show_all_locks();
 }
 
 /**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e0e84924171b..a5e6891f7bb6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -415,6 +415,9 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	int reserved = 0;
 
 	arch_free_page(page, order);
+	if (!PageHighMem(page))
+		mutex_debug_check_no_locks_freed(page_address(page),
+			page_address(page+(1<<order)));
 
 #ifndef CONFIG_MMU
 	for (i = 1 ; i < (1 << order) ; ++i)
diff --git a/mm/slab.c b/mm/slab.c
index 1c46c6383552..33aab345cd4a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3071,6 +3071,7 @@ void kfree(const void *objp)
 	local_irq_save(flags);
 	kfree_debugcheck(objp);
 	c = page_get_cache(virt_to_page(objp));
+	mutex_debug_check_no_locks_freed(objp, objp+obj_reallen(c));
 	__cache_free(c, (void *)objp);
 	local_irq_restore(flags);
 }
-- 
cgit v1.2.3-71-gd317


From 1b1dcc1b57a49136f118a0f16367256ff9994a69 Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Mon, 9 Jan 2006 15:59:24 -0800
Subject: [PATCH] mutex subsystem, semaphore to mutex: VFS, ->i_sem

This patch converts the inode semaphore to a mutex. I have tested it on
XFS and compiled as much as one can consider on an ia64. Anyway your
luck with it might be different.

Modified-by: Ingo Molnar <mingo@elte.hu>

(finished the conversion)

Signed-off-by: Jes Sorensen <jes@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/platforms/cell/spufs/inode.c | 12 ++---
 drivers/block/loop.c                      |  4 +-
 drivers/char/mem.c                        |  4 +-
 drivers/isdn/capi/capifs.c                |  6 +--
 drivers/md/dm.c                           |  4 +-
 drivers/md/md.c                           |  8 +--
 drivers/pci/proc.c                        |  4 +-
 drivers/usb/core/inode.c                  | 28 +++++------
 drivers/usb/gadget/file_storage.c         |  4 +-
 drivers/usb/gadget/inode.c                |  4 +-
 fs/affs/inode.c                           |  4 +-
 fs/autofs/root.c                          |  4 +-
 fs/autofs4/root.c                         |  4 +-
 fs/binfmt_misc.c                          | 12 ++---
 fs/block_dev.c                            |  4 +-
 fs/buffer.c                               |  6 +--
 fs/cifs/cifsfs.c                          |  6 +--
 fs/cifs/inode.c                           |  8 +--
 fs/coda/dir.c                             |  4 +-
 fs/coda/file.c                            |  8 +--
 fs/configfs/dir.c                         | 54 ++++++++++----------
 fs/configfs/file.c                        |  4 +-
 fs/configfs/inode.c                       |  6 +--
 fs/debugfs/inode.c                        |  8 +--
 fs/devfs/base.c                           | 22 ++++-----
 fs/devpts/inode.c                         |  8 +--
 fs/direct-io.c                            | 30 +++++------
 fs/dquot.c                                | 16 +++---
 fs/exportfs/expfs.c                       | 12 ++---
 fs/ext2/acl.c                             | 10 ++--
 fs/ext2/ext2.h                            |  2 +-
 fs/ext2/super.c                           |  4 +-
 fs/ext2/xattr.c                           |  2 +-
 fs/ext3/acl.c                             | 10 ++--
 fs/ext3/super.c                           |  4 +-
 fs/ext3/xattr.c                           |  2 +-
 fs/fat/dir.c                              |  4 +-
 fs/fat/file.c                             |  4 +-
 fs/fifo.c                                 |  6 +--
 fs/fuse/file.c                            |  4 +-
 fs/hfs/inode.c                            |  4 +-
 fs/hfsplus/bitmap.c                       |  8 +--
 fs/hfsplus/inode.c                        |  4 +-
 fs/hpfs/dir.c                             |  6 +--
 fs/hppfs/hppfs_kern.c                     |  6 +--
 fs/hugetlbfs/inode.c                      |  4 +-
 fs/inode.c                                |  2 +-
 fs/jffs/inode-v23.c                       |  2 +-
 fs/jfs/jfs_incore.h                       |  4 +-
 fs/libfs.c                                |  8 +--
 fs/namei.c                                | 82 +++++++++++++++----------------
 fs/namespace.c                            | 12 ++---
 fs/nfs/dir.c                              | 10 ++--
 fs/nfsd/nfs4recover.c                     | 20 ++++----
 fs/nfsd/vfs.c                             | 12 ++---
 fs/ntfs/attrib.c                          |  4 +-
 fs/ntfs/dir.c                             |  8 +--
 fs/ntfs/file.c                            | 18 +++----
 fs/ntfs/index.c                           |  6 +--
 fs/ntfs/inode.c                           |  8 +--
 fs/ntfs/namei.c                           |  6 +--
 fs/ntfs/quota.c                           |  6 +--
 fs/ntfs/super.c                           | 16 +++---
 fs/ocfs2/alloc.c                          | 24 ++++-----
 fs/ocfs2/cluster/nodemanager.c            |  2 +-
 fs/ocfs2/dir.c                            |  4 +-
 fs/ocfs2/file.c                           |  8 +--
 fs/ocfs2/inode.c                          | 12 ++---
 fs/ocfs2/journal.c                        | 14 +++---
 fs/ocfs2/localalloc.c                     |  6 +--
 fs/open.c                                 | 24 ++++-----
 fs/pipe.c                                 | 44 ++++++++---------
 fs/quota.c                                |  6 +--
 fs/read_write.c                           |  4 +-
 fs/readdir.c                              |  4 +-
 fs/reiserfs/file.c                        | 10 ++--
 fs/reiserfs/inode.c                       | 14 +++---
 fs/reiserfs/ioctl.c                       |  4 +-
 fs/reiserfs/super.c                       |  4 +-
 fs/reiserfs/tail_conversion.c             |  2 +-
 fs/reiserfs/xattr.c                       | 34 ++++++-------
 fs/reiserfs/xattr_acl.c                   |  6 +--
 fs/relayfs/inode.c                        | 12 ++---
 fs/sysfs/dir.c                            | 31 ++++++------
 fs/sysfs/file.c                           | 17 +++----
 fs/sysfs/inode.c                          |  8 ++-
 fs/sysfs/symlink.c                        |  5 +-
 fs/ufs/super.c                            |  6 +--
 fs/xattr.c                                |  8 +--
 fs/xfs/linux-2.6/xfs_iops.c               |  2 +-
 fs/xfs/linux-2.6/xfs_lrw.c                | 18 +++----
 fs/xfs/xfs_dmapi.h                        | 14 +++---
 include/linux/ext3_fs_i.h                 |  2 +-
 include/linux/fs.h                        |  7 +--
 include/linux/jffs2_fs_i.h                |  4 +-
 include/linux/nfsd/nfsfh.h                |  6 +--
 include/linux/pipe_fs_i.h                 |  2 +-
 include/linux/reiserfs_fs.h               |  2 +-
 ipc/mqueue.c                              |  8 +--
 kernel/cpuset.c                           | 10 ++--
 mm/filemap.c                              | 30 +++++------
 mm/filemap_xip.c                          |  6 +--
 mm/memory.c                               |  4 +-
 mm/msync.c                                |  2 +-
 mm/rmap.c                                 |  8 +--
 mm/shmem.c                                |  6 +--
 mm/swapfile.c                             |  8 +--
 mm/truncate.c                             |  2 +-
 net/sunrpc/rpc_pipe.c                     | 58 +++++++++++-----------
 net/unix/af_unix.c                        |  4 +-
 security/inode.c                          |  8 +--
 sound/core/oss/pcm_oss.c                  |  2 -
 sound/core/seq/seq_memory.c               |  4 --
 113 files changed, 563 insertions(+), 573 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 1f3507c75e90..d2ba358c6e38 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -137,7 +137,7 @@ spufs_delete_inode(struct inode *inode)
 static void spufs_prune_dir(struct dentry *dir)
 {
 	struct dentry *dentry, *tmp;
-	down(&dir->d_inode->i_sem);
+	mutex_lock(&dir->d_inode->i_mutex);
 	list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_child) {
 		spin_lock(&dcache_lock);
 		spin_lock(&dentry->d_lock);
@@ -154,7 +154,7 @@ static void spufs_prune_dir(struct dentry *dir)
 		}
 	}
 	shrink_dcache_parent(dir);
-	up(&dir->d_inode->i_sem);
+	mutex_unlock(&dir->d_inode->i_mutex);
 }
 
 static int spufs_rmdir(struct inode *root, struct dentry *dir_dentry)
@@ -162,15 +162,15 @@ static int spufs_rmdir(struct inode *root, struct dentry *dir_dentry)
 	struct spu_context *ctx;
 
 	/* remove all entries */
-	down(&root->i_sem);
+	mutex_lock(&root->i_mutex);
 	spufs_prune_dir(dir_dentry);
-	up(&root->i_sem);
+	mutex_unlock(&root->i_mutex);
 
 	/* We have to give up the mm_struct */
 	ctx = SPUFS_I(dir_dentry->d_inode)->i_ctx;
 	spu_forget(ctx);
 
-	/* XXX Do we need to hold i_sem here ? */
+	/* XXX Do we need to hold i_mutex here ? */
 	return simple_rmdir(root, dir_dentry);
 }
 
@@ -330,7 +330,7 @@ long spufs_create_thread(struct nameidata *nd,
 out_dput:
 	dput(dentry);
 out_dir:
-	up(&nd->dentry->d_inode->i_sem);
+	mutex_unlock(&nd->dentry->d_inode->i_mutex);
 out:
 	return ret;
 }
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index a452b13620a2..bed9ad76c04c 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -215,7 +215,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 	unsigned offset, bv_offs;
 	int len, ret;
 
-	down(&mapping->host->i_sem);
+	mutex_lock(&mapping->host->i_mutex);
 	index = pos >> PAGE_CACHE_SHIFT;
 	offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1);
 	bv_offs = bvec->bv_offset;
@@ -278,7 +278,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 	}
 	ret = 0;
 out:
-	up(&mapping->host->i_sem);
+	mutex_unlock(&mapping->host->i_mutex);
 	return ret;
 unlock:
 	unlock_page(page);
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 5b2d18035073..704c3c07f0ab 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -741,7 +741,7 @@ static loff_t memory_lseek(struct file * file, loff_t offset, int orig)
 {
 	loff_t ret;
 
-	down(&file->f_dentry->d_inode->i_sem);
+	mutex_lock(&file->f_dentry->d_inode->i_mutex);
 	switch (orig) {
 		case 0:
 			file->f_pos = offset;
@@ -756,7 +756,7 @@ static loff_t memory_lseek(struct file * file, loff_t offset, int orig)
 		default:
 			ret = -EINVAL;
 	}
-	up(&file->f_dentry->d_inode->i_sem);
+	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
 	return ret;
 }
 
diff --git a/drivers/isdn/capi/capifs.c b/drivers/isdn/capi/capifs.c
index 207cae366256..0a37aded4b54 100644
--- a/drivers/isdn/capi/capifs.c
+++ b/drivers/isdn/capi/capifs.c
@@ -138,7 +138,7 @@ static struct dentry *get_node(int num)
 {
 	char s[10];
 	struct dentry *root = capifs_root;
-	down(&root->d_inode->i_sem);
+	mutex_lock(&root->d_inode->i_mutex);
 	return lookup_one_len(s, root, sprintf(s, "%d", num));
 }
 
@@ -159,7 +159,7 @@ void capifs_new_ncci(unsigned int number, dev_t device)
 	dentry = get_node(number);
 	if (!IS_ERR(dentry) && !dentry->d_inode)
 		d_instantiate(dentry, inode);
-	up(&capifs_root->d_inode->i_sem);
+	mutex_unlock(&capifs_root->d_inode->i_mutex);
 }
 
 void capifs_free_ncci(unsigned int number)
@@ -175,7 +175,7 @@ void capifs_free_ncci(unsigned int number)
 		}
 		dput(dentry);
 	}
-	up(&capifs_root->d_inode->i_sem);
+	mutex_unlock(&capifs_root->d_inode->i_mutex);
 }
 
 static int __init capifs_init(void)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0e481512f918..5c210b0a4cb0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -837,9 +837,9 @@ static void __set_size(struct mapped_device *md, sector_t size)
 {
 	set_capacity(md->disk, size);
 
-	down(&md->suspended_bdev->bd_inode->i_sem);
+	mutex_lock(&md->suspended_bdev->bd_inode->i_mutex);
 	i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
-	up(&md->suspended_bdev->bd_inode->i_sem);
+	mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex);
 }
 
 static int __bind(struct mapped_device *md, struct dm_table *t)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index e423a16ba3c9..0302723fa21f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3460,9 +3460,9 @@ static int update_size(mddev_t *mddev, unsigned long size)
 
 		bdev = bdget_disk(mddev->gendisk, 0);
 		if (bdev) {
-			down(&bdev->bd_inode->i_sem);
+			mutex_lock(&bdev->bd_inode->i_mutex);
 			i_size_write(bdev->bd_inode, mddev->array_size << 10);
-			up(&bdev->bd_inode->i_sem);
+			mutex_unlock(&bdev->bd_inode->i_mutex);
 			bdput(bdev);
 		}
 	}
@@ -3486,9 +3486,9 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks)
 
 		bdev = bdget_disk(mddev->gendisk, 0);
 		if (bdev) {
-			down(&bdev->bd_inode->i_sem);
+			mutex_lock(&bdev->bd_inode->i_mutex);
 			i_size_write(bdev->bd_inode, mddev->array_size << 10);
-			up(&bdev->bd_inode->i_sem);
+			mutex_unlock(&bdev->bd_inode->i_mutex);
 			bdput(bdev);
 		}
 	}
diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c
index 9eb465727fce..9cb6dd0834be 100644
--- a/drivers/pci/proc.c
+++ b/drivers/pci/proc.c
@@ -25,7 +25,7 @@ proc_bus_pci_lseek(struct file *file, loff_t off, int whence)
 	loff_t new = -1;
 	struct inode *inode = file->f_dentry->d_inode;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	switch (whence) {
 	case 0:
 		new = off;
@@ -41,7 +41,7 @@ proc_bus_pci_lseek(struct file *file, loff_t off, int whence)
 		new = -EINVAL;
 	else
 		file->f_pos = new;
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	return new;
 }
 
diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index 4ddc453023a2..3cf945cc5b9a 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -184,13 +184,13 @@ static void update_bus(struct dentry *bus)
 	bus->d_inode->i_gid = busgid;
 	bus->d_inode->i_mode = S_IFDIR | busmode;
 
-	down(&bus->d_inode->i_sem);
+	mutex_lock(&bus->d_inode->i_mutex);
 
 	list_for_each_entry(dev, &bus->d_subdirs, d_u.d_child)
 		if (dev->d_inode)
 			update_dev(dev);
 
-	up(&bus->d_inode->i_sem);
+	mutex_unlock(&bus->d_inode->i_mutex);
 }
 
 static void update_sb(struct super_block *sb)
@@ -201,7 +201,7 @@ static void update_sb(struct super_block *sb)
 	if (!root)
 		return;
 
-	down(&root->d_inode->i_sem);
+	mutex_lock(&root->d_inode->i_mutex);
 
 	list_for_each_entry(bus, &root->d_subdirs, d_u.d_child) {
 		if (bus->d_inode) {
@@ -219,7 +219,7 @@ static void update_sb(struct super_block *sb)
 		}
 	}
 
-	up(&root->d_inode->i_sem);
+	mutex_unlock(&root->d_inode->i_mutex);
 }
 
 static int remount(struct super_block *sb, int *flags, char *data)
@@ -333,10 +333,10 @@ static int usbfs_empty (struct dentry *dentry)
 static int usbfs_unlink (struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	dentry->d_inode->i_nlink--;
 	dput(dentry);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	d_delete(dentry);
 	return 0;
 }
@@ -346,7 +346,7 @@ static int usbfs_rmdir(struct inode *dir, struct dentry *dentry)
 	int error = -ENOTEMPTY;
 	struct inode * inode = dentry->d_inode;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	dentry_unhash(dentry);
 	if (usbfs_empty(dentry)) {
 		dentry->d_inode->i_nlink -= 2;
@@ -355,7 +355,7 @@ static int usbfs_rmdir(struct inode *dir, struct dentry *dentry)
 		dir->i_nlink--;
 		error = 0;
 	}
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	if (!error)
 		d_delete(dentry);
 	dput(dentry);
@@ -380,7 +380,7 @@ static loff_t default_file_lseek (struct file *file, loff_t offset, int orig)
 {
 	loff_t retval = -EINVAL;
 
-	down(&file->f_dentry->d_inode->i_sem);
+	mutex_lock(&file->f_dentry->d_inode->i_mutex);
 	switch(orig) {
 	case 0:
 		if (offset > 0) {
@@ -397,7 +397,7 @@ static loff_t default_file_lseek (struct file *file, loff_t offset, int orig)
 	default:
 		break;
 	}
-	up(&file->f_dentry->d_inode->i_sem);
+	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
 	return retval;
 }
 
@@ -480,7 +480,7 @@ static int fs_create_by_name (const char *name, mode_t mode,
 	}
 
 	*dentry = NULL;
-	down(&parent->d_inode->i_sem);
+	mutex_lock(&parent->d_inode->i_mutex);
 	*dentry = lookup_one_len(name, parent, strlen(name));
 	if (!IS_ERR(dentry)) {
 		if ((mode & S_IFMT) == S_IFDIR)
@@ -489,7 +489,7 @@ static int fs_create_by_name (const char *name, mode_t mode,
 			error = usbfs_create (parent->d_inode, *dentry, mode);
 	} else
 		error = PTR_ERR(dentry);
-	up(&parent->d_inode->i_sem);
+	mutex_unlock(&parent->d_inode->i_mutex);
 
 	return error;
 }
@@ -528,7 +528,7 @@ static void fs_remove_file (struct dentry *dentry)
 	if (!parent || !parent->d_inode)
 		return;
 
-	down(&parent->d_inode->i_sem);
+	mutex_lock(&parent->d_inode->i_mutex);
 	if (usbfs_positive(dentry)) {
 		if (dentry->d_inode) {
 			if (S_ISDIR(dentry->d_inode->i_mode))
@@ -538,7 +538,7 @@ static void fs_remove_file (struct dentry *dentry)
 		dput(dentry);
 		}
 	}
-	up(&parent->d_inode->i_sem);
+	mutex_unlock(&parent->d_inode->i_mutex);
 }
 
 /* --------------------------------------------------------------------- */
diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c
index 0cea9782d7d4..de59c58896d6 100644
--- a/drivers/usb/gadget/file_storage.c
+++ b/drivers/usb/gadget/file_storage.c
@@ -1891,7 +1891,7 @@ static int fsync_sub(struct lun *curlun)
 		return -EINVAL;
 
 	inode = filp->f_dentry->d_inode;
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	current->flags |= PF_SYNCWRITE;
 	rc = filemap_fdatawrite(inode->i_mapping);
 	err = filp->f_op->fsync(filp, filp->f_dentry, 1);
@@ -1901,7 +1901,7 @@ static int fsync_sub(struct lun *curlun)
 	if (!rc)
 		rc = err;
 	current->flags &= ~PF_SYNCWRITE;
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	VLDBG(curlun, "fdatasync -> %d\n", rc);
 	return rc;
 }
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index 5c40980a5bd9..c6c279de832e 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -1562,10 +1562,10 @@ restart:
 		spin_unlock_irq (&dev->lock);
 
 		/* break link to dcache */
-		down (&parent->i_sem);
+		mutex_lock (&parent->i_mutex);
 		d_delete (dentry);
 		dput (dentry);
-		up (&parent->i_sem);
+		mutex_unlock (&parent->i_mutex);
 
 		/* fds may still be open */
 		goto restart;
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 9ebe881c6786..44d439cb69f4 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -244,10 +244,10 @@ affs_put_inode(struct inode *inode)
 	pr_debug("AFFS: put_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
 	affs_free_prealloc(inode);
 	if (atomic_read(&inode->i_count) == 1) {
-		down(&inode->i_sem);
+		mutex_lock(&inode->i_mutex);
 		if (inode->i_size != AFFS_I(inode)->mmu_private)
 			affs_truncate(inode);
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 	}
 }
 
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index a1ab1c0ed215..808134a5a2fa 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -229,9 +229,9 @@ static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentr
 	dentry->d_flags |= DCACHE_AUTOFS_PENDING;
 	d_add(dentry, NULL);
 
-	up(&dir->i_sem);
+	mutex_unlock(&dir->i_mutex);
 	autofs_revalidate(dentry, nd);
-	down(&dir->i_sem);
+	mutex_lock(&dir->i_mutex);
 
 	/*
 	 * If we are still pending, check if we had to handle
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 2241405ffc41..541b19e6fec9 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -489,9 +489,9 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	d_add(dentry, NULL);
 
 	if (dentry->d_op && dentry->d_op->d_revalidate) {
-		up(&dir->i_sem);
+		mutex_unlock(&dir->i_mutex);
 		(dentry->d_op->d_revalidate)(dentry, nd);
-		down(&dir->i_sem);
+		mutex_lock(&dir->i_mutex);
 	}
 
 	/*
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 2568eb41cb3a..9ccc7d8275b8 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -588,11 +588,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 		case 2: set_bit(Enabled, &e->flags);
 			break;
 		case 3: root = dget(file->f_vfsmnt->mnt_sb->s_root);
-			down(&root->d_inode->i_sem);
+			mutex_lock(&root->d_inode->i_mutex);
 
 			kill_node(e);
 
-			up(&root->d_inode->i_sem);
+			mutex_unlock(&root->d_inode->i_mutex);
 			dput(root);
 			break;
 		default: return res;
@@ -622,7 +622,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 		return PTR_ERR(e);
 
 	root = dget(sb->s_root);
-	down(&root->d_inode->i_sem);
+	mutex_lock(&root->d_inode->i_mutex);
 	dentry = lookup_one_len(e->name, root, strlen(e->name));
 	err = PTR_ERR(dentry);
 	if (IS_ERR(dentry))
@@ -658,7 +658,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 out2:
 	dput(dentry);
 out:
-	up(&root->d_inode->i_sem);
+	mutex_unlock(&root->d_inode->i_mutex);
 	dput(root);
 
 	if (err) {
@@ -703,12 +703,12 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
 		case 1: enabled = 0; break;
 		case 2: enabled = 1; break;
 		case 3: root = dget(file->f_vfsmnt->mnt_sb->s_root);
-			down(&root->d_inode->i_sem);
+			mutex_lock(&root->d_inode->i_mutex);
 
 			while (!list_empty(&entries))
 				kill_node(list_entry(entries.next, Node, list));
 
-			up(&root->d_inode->i_sem);
+			mutex_unlock(&root->d_inode->i_mutex);
 			dput(root);
 		default: return res;
 	}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index e0df94c37b7e..6e50346fb1ee 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -202,7 +202,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
 	loff_t size;
 	loff_t retval;
 
-	down(&bd_inode->i_sem);
+	mutex_lock(&bd_inode->i_mutex);
 	size = i_size_read(bd_inode);
 
 	switch (origin) {
@@ -219,7 +219,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
 		}
 		retval = offset;
 	}
-	up(&bd_inode->i_sem);
+	mutex_unlock(&bd_inode->i_mutex);
 	return retval;
 }
 	
diff --git a/fs/buffer.c b/fs/buffer.c
index 55f0975a9b15..6466bc8a3dc7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -352,11 +352,11 @@ static long do_fsync(unsigned int fd, int datasync)
 	 * We need to protect against concurrent writers,
 	 * which could cause livelocks in fsync_buffers_list
 	 */
-	down(&mapping->host->i_sem);
+	mutex_lock(&mapping->host->i_mutex);
 	err = file->f_op->fsync(file, file->f_dentry, datasync);
 	if (!ret)
 		ret = err;
-	up(&mapping->host->i_sem);
+	mutex_unlock(&mapping->host->i_mutex);
 	err = filemap_fdatawait(mapping);
 	if (!ret)
 		ret = err;
@@ -2338,7 +2338,7 @@ int generic_commit_write(struct file *file, struct page *page,
 	__block_commit_write(inode,page,from,to);
 	/*
 	 * No need to use i_size_read() here, the i_size
-	 * cannot change under us because we hold i_sem.
+	 * cannot change under us because we hold i_mutex.
 	 */
 	if (pos > inode->i_size) {
 		i_size_write(inode, pos);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 2a13a2bac8f1..e10213b7541e 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -860,9 +860,9 @@ static int cifs_oplock_thread(void * dummyarg)
 				DeleteOplockQEntry(oplock_item);
 				/* can not grab inode sem here since it would
 				deadlock when oplock received on delete 
-				since vfs_unlink holds the i_sem across
+				since vfs_unlink holds the i_mutex across
 				the call */
-				/* down(&inode->i_sem);*/
+				/* mutex_lock(&inode->i_mutex);*/
 				if (S_ISREG(inode->i_mode)) {
 					rc = filemap_fdatawrite(inode->i_mapping);
 					if(CIFS_I(inode)->clientCanCacheRead == 0) {
@@ -871,7 +871,7 @@ static int cifs_oplock_thread(void * dummyarg)
 					}
 				} else
 					rc = 0;
-				/* up(&inode->i_sem);*/
+				/* mutex_unlock(&inode->i_mutex);*/
 				if (rc)
 					CIFS_I(inode)->write_behind_rc = rc;
 				cFYI(1,("Oplock flush inode %p rc %d",inode,rc));
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9558f51bca55..3ebce9430f4a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1040,9 +1040,9 @@ int cifs_revalidate(struct dentry *direntry)
 	}
 
 	/* can not grab this sem since kernel filesys locking documentation
-	   indicates i_sem may be taken by the kernel on lookup and rename
-	   which could deadlock if we grab the i_sem here as well */
-/*	down(&direntry->d_inode->i_sem);*/
+	   indicates i_mutex may be taken by the kernel on lookup and rename
+	   which could deadlock if we grab the i_mutex here as well */
+/*	mutex_lock(&direntry->d_inode->i_mutex);*/
 	/* need to write out dirty pages here  */
 	if (direntry->d_inode->i_mapping) {
 		/* do we need to lock inode until after invalidate completes
@@ -1066,7 +1066,7 @@ int cifs_revalidate(struct dentry *direntry)
 			}
 		}
 	}
-/*	up(&direntry->d_inode->i_sem); */
+/*	mutex_unlock(&direntry->d_inode->i_mutex); */
 	
 	kfree(full_path);
 	FreeXid(xid);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 2391766e9c7c..8f1a517f8b4e 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -453,7 +453,7 @@ int coda_readdir(struct file *coda_file, void *dirent, filldir_t filldir)
 	coda_vfs_stat.readdir++;
 
 	host_inode = host_file->f_dentry->d_inode;
-	down(&host_inode->i_sem);
+	mutex_lock(&host_inode->i_mutex);
 	host_file->f_pos = coda_file->f_pos;
 
 	if (!host_file->f_op->readdir) {
@@ -475,7 +475,7 @@ int coda_readdir(struct file *coda_file, void *dirent, filldir_t filldir)
 	}
 out:
 	coda_file->f_pos = host_file->f_pos;
-	up(&host_inode->i_sem);
+	mutex_unlock(&host_inode->i_mutex);
 
 	return ret;
 }
diff --git a/fs/coda/file.c b/fs/coda/file.c
index e6bc022568f3..30b4630bd735 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -77,14 +77,14 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo
 		return -EINVAL;
 
 	host_inode = host_file->f_dentry->d_inode;
-	down(&coda_inode->i_sem);
+	mutex_lock(&coda_inode->i_mutex);
 
 	ret = host_file->f_op->write(host_file, buf, count, ppos);
 
 	coda_inode->i_size = host_inode->i_size;
 	coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
 	coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC;
-	up(&coda_inode->i_sem);
+	mutex_unlock(&coda_inode->i_mutex);
 
 	return ret;
 }
@@ -272,9 +272,9 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
 	if (host_file->f_op && host_file->f_op->fsync) {
 		host_dentry = host_file->f_dentry;
 		host_inode = host_dentry->d_inode;
-		down(&host_inode->i_sem);
+		mutex_lock(&host_inode->i_mutex);
 		err = host_file->f_op->fsync(host_file, host_dentry, datasync);
-		up(&host_inode->i_sem);
+		mutex_unlock(&host_inode->i_mutex);
 	}
 
 	if ( !err && !datasync ) {
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index e48b539243a1..b668ec61527e 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -288,10 +288,10 @@ static struct dentry * configfs_lookup(struct inode *dir,
 
 /*
  * Only subdirectories count here.  Files (CONFIGFS_NOT_PINNED) are
- * attributes and are removed by rmdir().  We recurse, taking i_sem
+ * attributes and are removed by rmdir().  We recurse, taking i_mutex
  * on all children that are candidates for default detach.  If the
  * result is clean, then configfs_detach_group() will handle dropping
- * i_sem.  If there is an error, the caller will clean up the i_sem
+ * i_mutex.  If there is an error, the caller will clean up the i_mutex
  * holders via configfs_detach_rollback().
  */
 static int configfs_detach_prep(struct dentry *dentry)
@@ -309,8 +309,8 @@ static int configfs_detach_prep(struct dentry *dentry)
 		if (sd->s_type & CONFIGFS_NOT_PINNED)
 			continue;
 		if (sd->s_type & CONFIGFS_USET_DEFAULT) {
-			down(&sd->s_dentry->d_inode->i_sem);
-			/* Mark that we've taken i_sem */
+			mutex_lock(&sd->s_dentry->d_inode->i_mutex);
+			/* Mark that we've taken i_mutex */
 			sd->s_type |= CONFIGFS_USET_DROPPING;
 
 			ret = configfs_detach_prep(sd->s_dentry);
@@ -327,7 +327,7 @@ out:
 }
 
 /*
- * Walk the tree, dropping i_sem wherever CONFIGFS_USET_DROPPING is
+ * Walk the tree, dropping i_mutex wherever CONFIGFS_USET_DROPPING is
  * set.
  */
 static void configfs_detach_rollback(struct dentry *dentry)
@@ -341,7 +341,7 @@ static void configfs_detach_rollback(struct dentry *dentry)
 
 			if (sd->s_type & CONFIGFS_USET_DROPPING) {
 				sd->s_type &= ~CONFIGFS_USET_DROPPING;
-				up(&sd->s_dentry->d_inode->i_sem);
+				mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
 			}
 		}
 	}
@@ -424,11 +424,11 @@ static void detach_groups(struct config_group *group)
 
 		/*
 		 * From rmdir/unregister, a configfs_detach_prep() pass
-		 * has taken our i_sem for us.  Drop it.
+		 * has taken our i_mutex for us.  Drop it.
 		 * From mkdir/register cleanup, there is no sem held.
 		 */
 		if (sd->s_type & CONFIGFS_USET_DROPPING)
-			up(&child->d_inode->i_sem);
+			mutex_unlock(&child->d_inode->i_mutex);
 
 		d_delete(child);
 		dput(child);
@@ -493,11 +493,11 @@ static int populate_groups(struct config_group *group)
 		/* FYI, we're faking mkdir here
 		 * I'm not sure we need this semaphore, as we're called
 		 * from our parent's mkdir.  That holds our parent's
-		 * i_sem, so afaik lookup cannot continue through our
+		 * i_mutex, so afaik lookup cannot continue through our
 		 * parent to find us, let alone mess with our tree.
-		 * That said, taking our i_sem is closer to mkdir
+		 * That said, taking our i_mutex is closer to mkdir
 		 * emulation, and shouldn't hurt. */
-		down(&dentry->d_inode->i_sem);
+		mutex_lock(&dentry->d_inode->i_mutex);
 
 		for (i = 0; group->default_groups[i]; i++) {
 			new_group = group->default_groups[i];
@@ -507,7 +507,7 @@ static int populate_groups(struct config_group *group)
 				break;
 		}
 
-		up(&dentry->d_inode->i_sem);
+		mutex_unlock(&dentry->d_inode->i_mutex);
 	}
 
 	if (ret)
@@ -856,7 +856,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name)
 	down_write(&configfs_rename_sem);
 	parent = item->parent->dentry;
 
-	down(&parent->d_inode->i_sem);
+	mutex_lock(&parent->d_inode->i_mutex);
 
 	new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
 	if (!IS_ERR(new_dentry)) {
@@ -872,7 +872,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name)
 			error = -EEXIST;
 		dput(new_dentry);
 	}
-	up(&parent->d_inode->i_sem);
+	mutex_unlock(&parent->d_inode->i_mutex);
 	up_write(&configfs_rename_sem);
 
 	return error;
@@ -884,9 +884,9 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
 	struct dentry * dentry = file->f_dentry;
 	struct configfs_dirent * parent_sd = dentry->d_fsdata;
 
-	down(&dentry->d_inode->i_sem);
+	mutex_lock(&dentry->d_inode->i_mutex);
 	file->private_data = configfs_new_dirent(parent_sd, NULL);
-	up(&dentry->d_inode->i_sem);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 
 	return file->private_data ? 0 : -ENOMEM;
 
@@ -897,9 +897,9 @@ static int configfs_dir_close(struct inode *inode, struct file *file)
 	struct dentry * dentry = file->f_dentry;
 	struct configfs_dirent * cursor = file->private_data;
 
-	down(&dentry->d_inode->i_sem);
+	mutex_lock(&dentry->d_inode->i_mutex);
 	list_del_init(&cursor->s_sibling);
-	up(&dentry->d_inode->i_sem);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 
 	release_configfs_dirent(cursor);
 
@@ -975,7 +975,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
 {
 	struct dentry * dentry = file->f_dentry;
 
-	down(&dentry->d_inode->i_sem);
+	mutex_lock(&dentry->d_inode->i_mutex);
 	switch (origin) {
 		case 1:
 			offset += file->f_pos;
@@ -983,7 +983,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
 			if (offset >= 0)
 				break;
 		default:
-			up(&file->f_dentry->d_inode->i_sem);
+			mutex_unlock(&file->f_dentry->d_inode->i_mutex);
 			return -EINVAL;
 	}
 	if (offset != file->f_pos) {
@@ -1007,7 +1007,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
 			list_add_tail(&cursor->s_sibling, p);
 		}
 	}
-	up(&dentry->d_inode->i_sem);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 	return offset;
 }
 
@@ -1037,7 +1037,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
 	sd = configfs_sb->s_root->d_fsdata;
 	link_group(to_config_group(sd->s_element), group);
 
-	down(&configfs_sb->s_root->d_inode->i_sem);
+	mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
 
 	name.name = group->cg_item.ci_name;
 	name.len = strlen(name.name);
@@ -1057,7 +1057,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
 	else
 		d_delete(dentry);
 
-	up(&configfs_sb->s_root->d_inode->i_sem);
+	mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
 
 	if (dentry) {
 	    dput(dentry);
@@ -1079,18 +1079,18 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
 		return;
 	}
 
-	down(&configfs_sb->s_root->d_inode->i_sem);
-	down(&dentry->d_inode->i_sem);
+	mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
+	mutex_lock(&dentry->d_inode->i_mutex);
 	if (configfs_detach_prep(dentry)) {
 		printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
 	}
 	configfs_detach_group(&group->cg_item);
 	dentry->d_inode->i_flags |= S_DEAD;
-	up(&dentry->d_inode->i_sem);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 
 	d_delete(dentry);
 
-	up(&configfs_sb->s_root->d_inode->i_sem);
+	mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
 
 	dput(dentry);
 
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index af1ffc9a15c0..c26cd61f13af 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -336,9 +336,9 @@ int configfs_add_file(struct dentry * dir, const struct configfs_attribute * att
 	umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
 	int error = 0;
 
-	down(&dir->d_inode->i_sem);
+	mutex_lock(&dir->d_inode->i_mutex);
 	error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
-	up(&dir->d_inode->i_sem);
+	mutex_unlock(&dir->d_inode->i_mutex);
 
 	return error;
 }
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 6b274c6d428f..6577c588de9d 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -122,7 +122,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd)
 
 /*
  * Unhashes the dentry corresponding to given configfs_dirent
- * Called with parent inode's i_sem held.
+ * Called with parent inode's i_mutex held.
  */
 void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
 {
@@ -145,7 +145,7 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
 	struct configfs_dirent * sd;
 	struct configfs_dirent * parent_sd = dir->d_fsdata;
 
-	down(&dir->d_inode->i_sem);
+	mutex_lock(&dir->d_inode->i_mutex);
 	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
 		if (!sd->s_element)
 			continue;
@@ -156,7 +156,7 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
 			break;
 		}
 	}
-	up(&dir->d_inode->i_sem);
+	mutex_unlock(&dir->d_inode->i_mutex);
 }
 
 
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index a86ac4aeaedb..d4f1a2cddd47 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -146,7 +146,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
 	}
 
 	*dentry = NULL;
-	down(&parent->d_inode->i_sem);
+	mutex_lock(&parent->d_inode->i_mutex);
 	*dentry = lookup_one_len(name, parent, strlen(name));
 	if (!IS_ERR(dentry)) {
 		if ((mode & S_IFMT) == S_IFDIR)
@@ -155,7 +155,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
 			error = debugfs_create(parent->d_inode, *dentry, mode);
 	} else
 		error = PTR_ERR(dentry);
-	up(&parent->d_inode->i_sem);
+	mutex_unlock(&parent->d_inode->i_mutex);
 
 	return error;
 }
@@ -273,7 +273,7 @@ void debugfs_remove(struct dentry *dentry)
 	if (!parent || !parent->d_inode)
 		return;
 
-	down(&parent->d_inode->i_sem);
+	mutex_lock(&parent->d_inode->i_mutex);
 	if (debugfs_positive(dentry)) {
 		if (dentry->d_inode) {
 			if (S_ISDIR(dentry->d_inode->i_mode))
@@ -283,7 +283,7 @@ void debugfs_remove(struct dentry *dentry)
 		dput(dentry);
 		}
 	}
-	up(&parent->d_inode->i_sem);
+	mutex_unlock(&parent->d_inode->i_mutex);
 	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 }
 EXPORT_SYMBOL_GPL(debugfs_remove);
diff --git a/fs/devfs/base.c b/fs/devfs/base.c
index 1274422a5384..b621521e09d4 100644
--- a/fs/devfs/base.c
+++ b/fs/devfs/base.c
@@ -2162,27 +2162,27 @@ static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *nd)
 	 *
 	 * make sure that
 	 *   d_instantiate always runs under lock
-	 *   we release i_sem lock before going to sleep
+	 *   we release i_mutex lock before going to sleep
 	 *
 	 * unfortunately sometimes d_revalidate is called with
-	 * and sometimes without i_sem lock held. The following checks
+	 * and sometimes without i_mutex lock held. The following checks
 	 * attempt to deduce when we need to add (and drop resp.) lock
 	 * here. This relies on current (2.6.2) calling coventions:
 	 *
-	 *   lookup_hash is always run under i_sem and is passing NULL
+	 *   lookup_hash is always run under i_mutex and is passing NULL
 	 *   as nd
 	 *
-	 *   open(...,O_CREATE,...) calls _lookup_hash under i_sem
+	 *   open(...,O_CREATE,...) calls _lookup_hash under i_mutex
 	 *   and sets flags to LOOKUP_OPEN|LOOKUP_CREATE
 	 *
 	 *   all other invocations of ->d_revalidate seem to happen
-	 *   outside of i_sem
+	 *   outside of i_mutex
 	 */
 	need_lock = nd &&
 	    (!(nd->flags & LOOKUP_CREATE) || (nd->flags & LOOKUP_PARENT));
 
 	if (need_lock)
-		down(&dir->i_sem);
+		mutex_lock(&dir->i_mutex);
 
 	if (is_devfsd_or_child(fs_info)) {
 		devfs_handle_t de = lookup_info->de;
@@ -2221,9 +2221,9 @@ static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *nd)
 		add_wait_queue(&lookup_info->wait_queue, &wait);
 		read_unlock(&parent->u.dir.lock);
 		/* at this point it is always (hopefully) locked */
-		up(&dir->i_sem);
+		mutex_unlock(&dir->i_mutex);
 		schedule();
-		down(&dir->i_sem);
+		mutex_lock(&dir->i_mutex);
 		/*
 		 * This does not need nor should remove wait from wait_queue.
 		 * Wait queue head is never reused - nothing is ever added to it
@@ -2238,7 +2238,7 @@ static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *nd)
 
       out:
 	if (need_lock)
-		up(&dir->i_sem);
+		mutex_unlock(&dir->i_mutex);
 	return 1;
 }				/*  End Function devfs_d_revalidate_wait  */
 
@@ -2284,9 +2284,9 @@ static struct dentry *devfs_lookup(struct inode *dir, struct dentry *dentry,
 	/*  Unlock directory semaphore, which will release any waiters. They
 	   will get the hashed dentry, and may be forced to wait for
 	   revalidation  */
-	up(&dir->i_sem);
+	mutex_unlock(&dir->i_mutex);
 	wait_for_devfsd_finished(fs_info);	/*  If I'm not devfsd, must wait  */
-	down(&dir->i_sem);	/*  Grab it again because them's the rules  */
+	mutex_lock(&dir->i_mutex);	/*  Grab it again because them's the rules  */
 	de = lookup_info.de;
 	/*  If someone else has been so kind as to make the inode, we go home
 	   early  */
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index f2be44d4491f..bfb8a230bac9 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -130,7 +130,7 @@ static struct dentry *get_node(int num)
 {
 	char s[12];
 	struct dentry *root = devpts_root;
-	down(&root->d_inode->i_sem);
+	mutex_lock(&root->d_inode->i_mutex);
 	return lookup_one_len(s, root, sprintf(s, "%d", num));
 }
 
@@ -161,7 +161,7 @@ int devpts_pty_new(struct tty_struct *tty)
 	if (!IS_ERR(dentry) && !dentry->d_inode)
 		d_instantiate(dentry, inode);
 
-	up(&devpts_root->d_inode->i_sem);
+	mutex_unlock(&devpts_root->d_inode->i_mutex);
 
 	return 0;
 }
@@ -178,7 +178,7 @@ struct tty_struct *devpts_get_tty(int number)
 		dput(dentry);
 	}
 
-	up(&devpts_root->d_inode->i_sem);
+	mutex_unlock(&devpts_root->d_inode->i_mutex);
 
 	return tty;
 }
@@ -196,7 +196,7 @@ void devpts_pty_kill(int number)
 		}
 		dput(dentry);
 	}
-	up(&devpts_root->d_inode->i_sem);
+	mutex_unlock(&devpts_root->d_inode->i_mutex);
 }
 
 static int __init init_devpts_fs(void)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 3931e7f1e6bf..30dbbd1df511 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -56,7 +56,7 @@
  * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
  * This determines whether we need to do the fancy locking which prevents
  * direct-IO from being able to read uninitialised disk blocks.  If its zero
- * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_sem is
+ * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
  * not held for the entire direct write (taken briefly, initially, during a
  * direct read though, but its never held for the duration of a direct-IO).
  */
@@ -930,7 +930,7 @@ out:
 }
 
 /*
- * Releases both i_sem and i_alloc_sem
+ * Releases both i_mutex and i_alloc_sem
  */
 static ssize_t
 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
@@ -1062,11 +1062,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 
 	/*
 	 * All block lookups have been performed. For READ requests
-	 * we can let i_sem go now that its achieved its purpose
+	 * we can let i_mutex go now that its achieved its purpose
 	 * of protecting us from looking up uninitialized blocks.
 	 */
 	if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
-		up(&dio->inode->i_sem);
+		mutex_unlock(&dio->inode->i_mutex);
 
 	/*
 	 * OK, all BIOs are submitted, so we can decrement bio_count to truly
@@ -1145,18 +1145,18 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
  * The locking rules are governed by the dio_lock_type parameter.
  *
  * DIO_NO_LOCKING (no locking, for raw block device access)
- * For writes, i_sem is not held on entry; it is never taken.
+ * For writes, i_mutex is not held on entry; it is never taken.
  *
  * DIO_LOCKING (simple locking for regular files)
- * For writes we are called under i_sem and return with i_sem held, even though
+ * For writes we are called under i_mutex and return with i_mutex held, even though
  * it is internally dropped.
- * For reads, i_sem is not held on entry, but it is taken and dropped before
+ * For reads, i_mutex is not held on entry, but it is taken and dropped before
  * returning.
  *
  * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
  *	uninitialised data, allowing parallel direct readers and writers)
- * For writes we are called without i_sem, return without it, never touch it.
- * For reads, i_sem is held on entry and will be released before returning.
+ * For writes we are called without i_mutex, return without it, never touch it.
+ * For reads, i_mutex is held on entry and will be released before returning.
  *
  * Additional i_alloc_sem locking requirements described inline below.
  */
@@ -1214,11 +1214,11 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	 * For block device access DIO_NO_LOCKING is used,
 	 *	neither readers nor writers do any locking at all
 	 * For regular files using DIO_LOCKING,
-	 *	readers need to grab i_sem and i_alloc_sem
-	 *	writers need to grab i_alloc_sem only (i_sem is already held)
+	 *	readers need to grab i_mutex and i_alloc_sem
+	 *	writers need to grab i_alloc_sem only (i_mutex is already held)
 	 * For regular files using DIO_OWN_LOCKING,
 	 *	neither readers nor writers take any locks here
-	 *	(i_sem is already held and release for writers here)
+	 *	(i_mutex is already held and release for writers here)
 	 */
 	dio->lock_type = dio_lock_type;
 	if (dio_lock_type != DIO_NO_LOCKING) {
@@ -1228,7 +1228,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 
 			mapping = iocb->ki_filp->f_mapping;
 			if (dio_lock_type != DIO_OWN_LOCKING) {
-				down(&inode->i_sem);
+				mutex_lock(&inode->i_mutex);
 				reader_with_isem = 1;
 			}
 
@@ -1240,7 +1240,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 			}
 
 			if (dio_lock_type == DIO_OWN_LOCKING) {
-				up(&inode->i_sem);
+				mutex_unlock(&inode->i_mutex);
 				reader_with_isem = 0;
 			}
 		}
@@ -1266,7 +1266,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 
 out:
 	if (reader_with_isem)
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 	if (rw & WRITE)
 		current->flags &= ~PF_SYNCWRITE;
 	return retval;
diff --git a/fs/dquot.c b/fs/dquot.c
index 2a62b3dc20ec..cb6d5bfbdfd5 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -100,7 +100,7 @@
  * operation is just reading pointers from inode (or not using them at all) the
  * read lock is enough. If pointers are altered function must hold write lock
  * (these locking rules also apply for S_NOQUOTA flag in the inode - note that
- * for altering the flag i_sem is also needed).  If operation is holding
+ * for altering the flag i_mutex is also needed).  If operation is holding
  * reference to dquot in other way (e.g. quotactl ops) it must be guarded by
  * dqonoff_sem.
  * This locking assures that:
@@ -117,9 +117,9 @@
  * spinlock to internal buffers before writing.
  *
  * Lock ordering (including related VFS locks) is the following:
- *   i_sem > dqonoff_sem > iprune_sem > journal_lock > dqptr_sem >
+ *   i_mutex > dqonoff_sem > iprune_sem > journal_lock > dqptr_sem >
  *   > dquot->dq_lock > dqio_sem
- * i_sem on quota files is special (it's below dqio_sem)
+ * i_mutex on quota files is special (it's below dqio_sem)
  */
 
 static DEFINE_SPINLOCK(dq_list_lock);
@@ -1369,11 +1369,11 @@ int vfs_quota_off(struct super_block *sb, int type)
 			/* If quota was reenabled in the meantime, we have
 			 * nothing to do */
 			if (!sb_has_quota_enabled(sb, cnt)) {
-				down(&toputinode[cnt]->i_sem);
+				mutex_lock(&toputinode[cnt]->i_mutex);
 				toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
 				  S_NOATIME | S_NOQUOTA);
 				truncate_inode_pages(&toputinode[cnt]->i_data, 0);
-				up(&toputinode[cnt]->i_sem);
+				mutex_unlock(&toputinode[cnt]->i_mutex);
 				mark_inode_dirty(toputinode[cnt]);
 				iput(toputinode[cnt]);
 			}
@@ -1417,7 +1417,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
 	write_inode_now(inode, 1);
 	/* And now flush the block cache so that kernel sees the changes */
 	invalidate_bdev(sb->s_bdev, 0);
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	down(&dqopt->dqonoff_sem);
 	if (sb_has_quota_enabled(sb, type)) {
 		error = -EBUSY;
@@ -1449,7 +1449,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
 		goto out_file_init;
 	}
 	up(&dqopt->dqio_sem);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	set_enable_flags(dqopt, type);
 
 	add_dquot_ref(sb, type);
@@ -1470,7 +1470,7 @@ out_lock:
 		inode->i_flags |= oldflags;
 		up_write(&dqopt->dqptr_sem);
 	}
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 out_fmt:
 	put_quota_format(fmt);
 
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index c49d6254379a..5bfe40085fbc 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -177,9 +177,9 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
 			struct dentry *ppd;
 			struct dentry *npd;
 
-			down(&pd->d_inode->i_sem);
+			mutex_lock(&pd->d_inode->i_mutex);
 			ppd = CALL(nops,get_parent)(pd);
-			up(&pd->d_inode->i_sem);
+			mutex_unlock(&pd->d_inode->i_mutex);
 
 			if (IS_ERR(ppd)) {
 				err = PTR_ERR(ppd);
@@ -201,9 +201,9 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
 				break;
 			}
 			dprintk("find_exported_dentry: found name: %s\n", nbuf);
-			down(&ppd->d_inode->i_sem);
+			mutex_lock(&ppd->d_inode->i_mutex);
 			npd = lookup_one_len(nbuf, ppd, strlen(nbuf));
-			up(&ppd->d_inode->i_sem);
+			mutex_unlock(&ppd->d_inode->i_mutex);
 			if (IS_ERR(npd)) {
 				err = PTR_ERR(npd);
 				dprintk("find_exported_dentry: lookup failed: %d\n", err);
@@ -242,9 +242,9 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
 		struct dentry *nresult;
 		err = CALL(nops,get_name)(target_dir, nbuf, result);
 		if (!err) {
-			down(&target_dir->d_inode->i_sem);
+			mutex_lock(&target_dir->d_inode->i_mutex);
 			nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf));
-			up(&target_dir->d_inode->i_sem);
+			mutex_unlock(&target_dir->d_inode->i_mutex);
 			if (!IS_ERR(nresult)) {
 				if (nresult->d_inode) {
 					dput(result);
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 6af2f4130290..239133d01d91 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -149,7 +149,7 @@ ext2_iset_acl(struct inode *inode, struct posix_acl **i_acl,
 }
 
 /*
- * inode->i_sem: don't care
+ * inode->i_mutex: don't care
  */
 static struct posix_acl *
 ext2_get_acl(struct inode *inode, int type)
@@ -211,7 +211,7 @@ ext2_get_acl(struct inode *inode, int type)
 }
 
 /*
- * inode->i_sem: down
+ * inode->i_mutex: down
  */
 static int
 ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
@@ -301,8 +301,8 @@ ext2_permission(struct inode *inode, int mask, struct nameidata *nd)
 /*
  * Initialize the ACLs of a new inode. Called from ext2_new_inode.
  *
- * dir->i_sem: down
- * inode->i_sem: up (access to inode is still exclusive)
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
  */
 int
 ext2_init_acl(struct inode *inode, struct inode *dir)
@@ -361,7 +361,7 @@ cleanup:
  * for directories) are added. There are no more bits available in the
  * file mode.
  *
- * inode->i_sem: down
+ * inode->i_mutex: down
  */
 int
 ext2_acl_chmod(struct inode *inode)
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index e977f8566d14..00de0a7312a2 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -53,7 +53,7 @@ struct ext2_inode_info {
 #ifdef CONFIG_EXT2_FS_XATTR
 	/*
 	 * Extended attributes can be read independently of the main file
-	 * data. Taking i_sem even when reading would cause contention
+	 * data. Taking i_mutex even when reading would cause contention
 	 * between readers of EAs and writers of regular file data, so
 	 * instead we synchronize on xattr_sem when reading or changing
 	 * EAs.
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 522fa70dd8ea..8d6819846fc9 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1152,7 +1152,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
 	struct buffer_head tmp_bh;
 	struct buffer_head *bh;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	while (towrite > 0) {
 		tocopy = sb->s_blocksize - offset < towrite ?
 				sb->s_blocksize - offset : towrite;
@@ -1189,7 +1189,7 @@ out:
 	inode->i_version++;
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	mark_inode_dirty(inode);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	return len - towrite;
 }
 
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 0099462d4271..f7a3b5fee274 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -325,7 +325,7 @@ cleanup:
 /*
  * Inode operation listxattr()
  *
- * dentry->d_inode->i_sem: don't care
+ * dentry->d_inode->i_mutex: don't care
  */
 ssize_t
 ext2_listxattr(struct dentry *dentry, char *buffer, size_t size)
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 3ac38266fc9e..9ed132c96034 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -152,7 +152,7 @@ ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl,
 /*
  * Inode operation get_posix_acl().
  *
- * inode->i_sem: don't care
+ * inode->i_mutex: don't care
  */
 static struct posix_acl *
 ext3_get_acl(struct inode *inode, int type)
@@ -216,7 +216,7 @@ ext3_get_acl(struct inode *inode, int type)
 /*
  * Set the access or default ACL of an inode.
  *
- * inode->i_sem: down unless called from ext3_new_inode
+ * inode->i_mutex: down unless called from ext3_new_inode
  */
 static int
 ext3_set_acl(handle_t *handle, struct inode *inode, int type,
@@ -306,8 +306,8 @@ ext3_permission(struct inode *inode, int mask, struct nameidata *nd)
 /*
  * Initialize the ACLs of a new inode. Called from ext3_new_inode.
  *
- * dir->i_sem: down
- * inode->i_sem: up (access to inode is still exclusive)
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
  */
 int
 ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
@@ -368,7 +368,7 @@ cleanup:
  * for directories) are added. There are no more bits available in the
  * file mode.
  *
- * inode->i_sem: down
+ * inode->i_mutex: down
  */
 int
 ext3_acl_chmod(struct inode *inode)
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 7c45acf94589..c3dbebdb9897 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2601,7 +2601,7 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
 	struct buffer_head *bh;
 	handle_t *handle = journal_current_handle();
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	while (towrite > 0) {
 		tocopy = sb->s_blocksize - offset < towrite ?
 				sb->s_blocksize - offset : towrite;
@@ -2644,7 +2644,7 @@ out:
 	inode->i_version++;
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	ext3_mark_inode_dirty(handle, inode);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	return len - towrite;
 }
 
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 430de9f63be3..238199d82ce5 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -140,7 +140,7 @@ ext3_xattr_handler(int name_index)
 /*
  * Inode operation listxattr()
  *
- * dentry->d_inode->i_sem: don't care
+ * dentry->d_inode->i_mutex: don't care
  */
 ssize_t
 ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index eef1b81aa294..db0de5c621c7 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -729,13 +729,13 @@ static int fat_dir_ioctl(struct inode * inode, struct file * filp,
 
 	buf.dirent = d1;
 	buf.result = 0;
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	ret = -ENOENT;
 	if (!IS_DEADDIR(inode)) {
 		ret = __fat_readdir(inode, filp, &buf, fat_ioctl_filldir,
 				    short_only, both);
 	}
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	if (ret >= 0)
 		ret = buf.result;
 	return ret;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 9b07c328a6fc..d30876cf35f5 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -41,7 +41,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 		if (err)
 			return err;
 
-		down(&inode->i_sem);
+		mutex_lock(&inode->i_mutex);
 
 		if (IS_RDONLY(inode)) {
 			err = -EROFS;
@@ -103,7 +103,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 		MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED;
 		mark_inode_dirty(inode);
 	up:
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 		return err;
 	}
 	default:
diff --git a/fs/fifo.c b/fs/fifo.c
index 5455916241f0..923371b753ab 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -35,7 +35,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 	int ret;
 
 	ret = -ERESTARTSYS;
-	if (down_interruptible(PIPE_SEM(*inode)))
+	if (mutex_lock_interruptible(PIPE_MUTEX(*inode)))
 		goto err_nolock_nocleanup;
 
 	if (!inode->i_pipe) {
@@ -119,7 +119,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 	}
 
 	/* Ok! */
-	up(PIPE_SEM(*inode));
+	mutex_unlock(PIPE_MUTEX(*inode));
 	return 0;
 
 err_rd:
@@ -139,7 +139,7 @@ err:
 		free_pipe_info(inode);
 
 err_nocleanup:
-	up(PIPE_SEM(*inode));
+	mutex_unlock(PIPE_MUTEX(*inode));
 
 err_nolock_nocleanup:
 	return ret;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 05dedddf4289..63d2980df5c9 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -560,9 +560,9 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
 	struct inode *inode = file->f_dentry->d_inode;
 	ssize_t res;
 	/* Don't allow parallel writes to the same file */
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	res = fuse_direct_io(file, buf, count, ppos, 1);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	return res;
 }
 
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index d499393a8ae7..050a49276499 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -547,13 +547,13 @@ static int hfs_file_release(struct inode *inode, struct file *file)
 	if (atomic_read(&file->f_count) != 0)
 		return 0;
 	if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) {
-		down(&inode->i_sem);
+		mutex_lock(&inode->i_mutex);
 		hfs_file_truncate(inode);
 		//if (inode->i_flags & S_DEAD) {
 		//	hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
 		//	hfs_delete_inode(inode);
 		//}
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 	}
 	return 0;
 }
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index c7d316455fa0..9fb51632303c 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -29,7 +29,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 		return size;
 
 	dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
-	down(&HFSPLUS_SB(sb).alloc_file->i_sem);
+	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
 	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
 	page = read_cache_page(mapping, offset / PAGE_CACHE_BITS,
 			       (filler_t *)mapping->a_ops->readpage, NULL);
@@ -143,7 +143,7 @@ done:
 	sb->s_dirt = 1;
 	dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
 out:
-	up(&HFSPLUS_SB(sb).alloc_file->i_sem);
+	mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
 	return start;
 }
 
@@ -164,7 +164,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 	if ((offset + count) > HFSPLUS_SB(sb).total_blocks)
 		return -2;
 
-	down(&HFSPLUS_SB(sb).alloc_file->i_sem);
+	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
 	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
 	pnr = offset / PAGE_CACHE_BITS;
 	page = read_cache_page(mapping, pnr, (filler_t *)mapping->a_ops->readpage, NULL);
@@ -215,7 +215,7 @@ out:
 	kunmap(page);
 	HFSPLUS_SB(sb).free_blocks += len;
 	sb->s_dirt = 1;
-	up(&HFSPLUS_SB(sb).alloc_file->i_sem);
+	mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
 
 	return 0;
 }
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index fc98583cf045..983bcd02ac1c 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -276,13 +276,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
 	if (atomic_read(&file->f_count) != 0)
 		return 0;
 	if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) {
-		down(&inode->i_sem);
+		mutex_lock(&inode->i_mutex);
 		hfsplus_file_truncate(inode);
 		if (inode->i_flags & S_DEAD) {
 			hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
 			hfsplus_delete_inode(inode);
 		}
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 	}
 	return 0;
 }
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 0217c3a04441..5591f9623aa2 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -32,19 +32,19 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
 
 	/*printk("dir lseek\n");*/
 	if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok;
-	down(&i->i_sem);
+	mutex_lock(&i->i_mutex);
 	pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1;
 	while (pos != new_off) {
 		if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh);
 		else goto fail;
 		if (pos == 12) goto fail;
 	}
-	up(&i->i_sem);
+	mutex_unlock(&i->i_mutex);
 ok:
 	unlock_kernel();
 	return filp->f_pos = new_off;
 fail:
-	up(&i->i_sem);
+	mutex_unlock(&i->i_mutex);
 	/*printk("illegal lseek: %016llx\n", new_off);*/
 	unlock_kernel();
 	return -ESPIPE;
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index 52930915bad8..a44dc5897399 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -171,12 +171,12 @@ static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry,
 
 	err = -ENOMEM;
 	parent = HPPFS_I(ino)->proc_dentry;
-	down(&parent->d_inode->i_sem);
+	mutex_lock(&parent->d_inode->i_mutex);
 	proc_dentry = d_lookup(parent, &dentry->d_name);
 	if(proc_dentry == NULL){
 		proc_dentry = d_alloc(parent, &dentry->d_name);
 		if(proc_dentry == NULL){
-			up(&parent->d_inode->i_sem);
+			mutex_unlock(&parent->d_inode->i_mutex);
 			goto out;
 		}
 		new = (*parent->d_inode->i_op->lookup)(parent->d_inode,
@@ -186,7 +186,7 @@ static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry,
 			proc_dentry = new;
 		}
 	}
-	up(&parent->d_inode->i_sem);
+	mutex_unlock(&parent->d_inode->i_mutex);
 
 	if(IS_ERR(proc_dentry))
 		return(proc_dentry);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8c41315a6e42..ff1b7d108bd0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -118,7 +118,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	file_accessed(file);
 	vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
 	vma->vm_ops = &hugetlb_vm_ops;
@@ -133,7 +133,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (inode->i_size < len)
 		inode->i_size = len;
 out:
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 
 	return ret;
 }
diff --git a/fs/inode.c b/fs/inode.c
index fd568caf7f74..e08767fd57b0 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -192,7 +192,7 @@ void inode_init_once(struct inode *inode)
 	INIT_HLIST_NODE(&inode->i_hash);
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
-	sema_init(&inode->i_sem, 1);
+	mutex_init(&inode->i_mutex);
 	init_rwsem(&inode->i_alloc_sem);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	rwlock_init(&inode->i_data.tree_lock);
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 2559ee10beda..fc3855a1aef3 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -1415,7 +1415,7 @@ jffs_file_write(struct file *filp, const char *buf, size_t count,
 	 * This will never trigger with sane page sizes.  leave it in
 	 * anyway, since I'm thinking about how to merge larger writes
 	 * (the current idea is to poke a thread that does the actual
-	 * I/O and starts by doing a down(&inode->i_sem).  then we
+	 * I/O and starts by doing a mutex_lock(&inode->i_mutex).  then we
 	 * would need to get the page cache pages and have a list of
 	 * I/O requests and do write-merging here.
 	 * -- prumpf
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index c0fd7b3eadc6..dc21a5bd54d4 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -58,7 +58,7 @@ struct jfs_inode_info {
 	/*
 	 * rdwrlock serializes xtree between reads & writes and synchronizes
 	 * changes to special inodes.  It's use would be redundant on
-	 * directories since the i_sem taken in the VFS is sufficient.
+	 * directories since the i_mutex taken in the VFS is sufficient.
 	 */
 	struct rw_semaphore rdwrlock;
 	/*
@@ -68,7 +68,7 @@ struct jfs_inode_info {
 	 * inode is blocked in txBegin or TxBeginAnon
 	 */
 	struct semaphore commit_sem;
-	/* xattr_sem allows us to access the xattrs without taking i_sem */
+	/* xattr_sem allows us to access the xattrs without taking i_mutex */
 	struct rw_semaphore xattr_sem;
 	lid_t	xtlid;		/* lid of xtree lock on directory */
 #ifdef CONFIG_JFS_POSIX_ACL
diff --git a/fs/libfs.c b/fs/libfs.c
index 9c50523382e7..63c020e6589e 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -74,7 +74,7 @@ int dcache_dir_close(struct inode *inode, struct file *file)
 
 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
 {
-	down(&file->f_dentry->d_inode->i_sem);
+	mutex_lock(&file->f_dentry->d_inode->i_mutex);
 	switch (origin) {
 		case 1:
 			offset += file->f_pos;
@@ -82,7 +82,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
 			if (offset >= 0)
 				break;
 		default:
-			up(&file->f_dentry->d_inode->i_sem);
+			mutex_unlock(&file->f_dentry->d_inode->i_mutex);
 			return -EINVAL;
 	}
 	if (offset != file->f_pos) {
@@ -106,7 +106,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
 			spin_unlock(&dcache_lock);
 		}
 	}
-	up(&file->f_dentry->d_inode->i_sem);
+	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
 	return offset;
 }
 
@@ -356,7 +356,7 @@ int simple_commit_write(struct file *file, struct page *page,
 
 	/*
 	 * No need to use i_size_read() here, the i_size
-	 * cannot change under us because we hold the i_sem.
+	 * cannot change under us because we hold the i_mutex.
 	 */
 	if (pos > inode->i_size)
 		i_size_write(inode, pos);
diff --git a/fs/namei.c b/fs/namei.c
index 300eae088d5f..0a8f073435af 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -438,7 +438,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
 	struct dentry * result;
 	struct inode *dir = parent->d_inode;
 
-	down(&dir->i_sem);
+	mutex_lock(&dir->i_mutex);
 	/*
 	 * First re-do the cached lookup just in case it was created
 	 * while we waited for the directory semaphore..
@@ -464,7 +464,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
 			else
 				result = dentry;
 		}
-		up(&dir->i_sem);
+		mutex_unlock(&dir->i_mutex);
 		return result;
 	}
 
@@ -472,7 +472,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
 	 * Uhhuh! Nasty case: the cache was re-populated while
 	 * we waited on the semaphore. Need to revalidate.
 	 */
-	up(&dir->i_sem);
+	mutex_unlock(&dir->i_mutex);
 	if (result->d_op && result->d_op->d_revalidate) {
 		if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) {
 			dput(result);
@@ -1366,7 +1366,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 	struct dentry *p;
 
 	if (p1 == p2) {
-		down(&p1->d_inode->i_sem);
+		mutex_lock(&p1->d_inode->i_mutex);
 		return NULL;
 	}
 
@@ -1374,30 +1374,30 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 
 	for (p = p1; p->d_parent != p; p = p->d_parent) {
 		if (p->d_parent == p2) {
-			down(&p2->d_inode->i_sem);
-			down(&p1->d_inode->i_sem);
+			mutex_lock(&p2->d_inode->i_mutex);
+			mutex_lock(&p1->d_inode->i_mutex);
 			return p;
 		}
 	}
 
 	for (p = p2; p->d_parent != p; p = p->d_parent) {
 		if (p->d_parent == p1) {
-			down(&p1->d_inode->i_sem);
-			down(&p2->d_inode->i_sem);
+			mutex_lock(&p1->d_inode->i_mutex);
+			mutex_lock(&p2->d_inode->i_mutex);
 			return p;
 		}
 	}
 
-	down(&p1->d_inode->i_sem);
-	down(&p2->d_inode->i_sem);
+	mutex_lock(&p1->d_inode->i_mutex);
+	mutex_lock(&p2->d_inode->i_mutex);
 	return NULL;
 }
 
 void unlock_rename(struct dentry *p1, struct dentry *p2)
 {
-	up(&p1->d_inode->i_sem);
+	mutex_unlock(&p1->d_inode->i_mutex);
 	if (p1 != p2) {
-		up(&p2->d_inode->i_sem);
+		mutex_unlock(&p2->d_inode->i_mutex);
 		up(&p1->d_inode->i_sb->s_vfs_rename_sem);
 	}
 }
@@ -1563,14 +1563,14 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
 
 	dir = nd->dentry;
 	nd->flags &= ~LOOKUP_PARENT;
-	down(&dir->d_inode->i_sem);
+	mutex_lock(&dir->d_inode->i_mutex);
 	path.dentry = lookup_hash(nd);
 	path.mnt = nd->mnt;
 
 do_last:
 	error = PTR_ERR(path.dentry);
 	if (IS_ERR(path.dentry)) {
-		up(&dir->d_inode->i_sem);
+		mutex_unlock(&dir->d_inode->i_mutex);
 		goto exit;
 	}
 
@@ -1579,7 +1579,7 @@ do_last:
 		if (!IS_POSIXACL(dir->d_inode))
 			mode &= ~current->fs->umask;
 		error = vfs_create(dir->d_inode, path.dentry, mode, nd);
-		up(&dir->d_inode->i_sem);
+		mutex_unlock(&dir->d_inode->i_mutex);
 		dput(nd->dentry);
 		nd->dentry = path.dentry;
 		if (error)
@@ -1593,7 +1593,7 @@ do_last:
 	/*
 	 * It already exists.
 	 */
-	up(&dir->d_inode->i_sem);
+	mutex_unlock(&dir->d_inode->i_mutex);
 
 	error = -EEXIST;
 	if (flag & O_EXCL)
@@ -1665,7 +1665,7 @@ do_link:
 		goto exit;
 	}
 	dir = nd->dentry;
-	down(&dir->d_inode->i_sem);
+	mutex_lock(&dir->d_inode->i_mutex);
 	path.dentry = lookup_hash(nd);
 	path.mnt = nd->mnt;
 	__putname(nd->last.name);
@@ -1680,13 +1680,13 @@ do_link:
  * Simple function to lookup and return a dentry and create it
  * if it doesn't exist.  Is SMP-safe.
  *
- * Returns with nd->dentry->d_inode->i_sem locked.
+ * Returns with nd->dentry->d_inode->i_mutex locked.
  */
 struct dentry *lookup_create(struct nameidata *nd, int is_dir)
 {
 	struct dentry *dentry = ERR_PTR(-EEXIST);
 
-	down(&nd->dentry->d_inode->i_sem);
+	mutex_lock(&nd->dentry->d_inode->i_mutex);
 	/*
 	 * Yucky last component or no last component at all?
 	 * (foo/., foo/.., /////)
@@ -1784,7 +1784,7 @@ asmlinkage long sys_mknod(const char __user * filename, int mode, unsigned dev)
 		}
 		dput(dentry);
 	}
-	up(&nd.dentry->d_inode->i_sem);
+	mutex_unlock(&nd.dentry->d_inode->i_mutex);
 	path_release(&nd);
 out:
 	putname(tmp);
@@ -1836,7 +1836,7 @@ asmlinkage long sys_mkdir(const char __user * pathname, int mode)
 			error = vfs_mkdir(nd.dentry->d_inode, dentry, mode);
 			dput(dentry);
 		}
-		up(&nd.dentry->d_inode->i_sem);
+		mutex_unlock(&nd.dentry->d_inode->i_mutex);
 		path_release(&nd);
 out:
 		putname(tmp);
@@ -1885,7 +1885,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 
 	DQUOT_INIT(dir);
 
-	down(&dentry->d_inode->i_sem);
+	mutex_lock(&dentry->d_inode->i_mutex);
 	dentry_unhash(dentry);
 	if (d_mountpoint(dentry))
 		error = -EBUSY;
@@ -1897,7 +1897,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 				dentry->d_inode->i_flags |= S_DEAD;
 		}
 	}
-	up(&dentry->d_inode->i_sem);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 	if (!error) {
 		d_delete(dentry);
 	}
@@ -1932,14 +1932,14 @@ asmlinkage long sys_rmdir(const char __user * pathname)
 			error = -EBUSY;
 			goto exit1;
 	}
-	down(&nd.dentry->d_inode->i_sem);
+	mutex_lock(&nd.dentry->d_inode->i_mutex);
 	dentry = lookup_hash(&nd);
 	error = PTR_ERR(dentry);
 	if (!IS_ERR(dentry)) {
 		error = vfs_rmdir(nd.dentry->d_inode, dentry);
 		dput(dentry);
 	}
-	up(&nd.dentry->d_inode->i_sem);
+	mutex_unlock(&nd.dentry->d_inode->i_mutex);
 exit1:
 	path_release(&nd);
 exit:
@@ -1959,7 +1959,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 
 	DQUOT_INIT(dir);
 
-	down(&dentry->d_inode->i_sem);
+	mutex_lock(&dentry->d_inode->i_mutex);
 	if (d_mountpoint(dentry))
 		error = -EBUSY;
 	else {
@@ -1967,7 +1967,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 		if (!error)
 			error = dir->i_op->unlink(dir, dentry);
 	}
-	up(&dentry->d_inode->i_sem);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 
 	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
 	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
@@ -1979,7 +1979,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 
 /*
  * Make sure that the actual truncation of the file will occur outside its
- * directory's i_sem.  Truncate can take a long time if there is a lot of
+ * directory's i_mutex.  Truncate can take a long time if there is a lot of
  * writeout happening, and we don't want to prevent access to the directory
  * while waiting on the I/O.
  */
@@ -2001,7 +2001,7 @@ asmlinkage long sys_unlink(const char __user * pathname)
 	error = -EISDIR;
 	if (nd.last_type != LAST_NORM)
 		goto exit1;
-	down(&nd.dentry->d_inode->i_sem);
+	mutex_lock(&nd.dentry->d_inode->i_mutex);
 	dentry = lookup_hash(&nd);
 	error = PTR_ERR(dentry);
 	if (!IS_ERR(dentry)) {
@@ -2015,7 +2015,7 @@ asmlinkage long sys_unlink(const char __user * pathname)
 	exit2:
 		dput(dentry);
 	}
-	up(&nd.dentry->d_inode->i_sem);
+	mutex_unlock(&nd.dentry->d_inode->i_mutex);
 	if (inode)
 		iput(inode);	/* truncate the inode here */
 exit1:
@@ -2075,7 +2075,7 @@ asmlinkage long sys_symlink(const char __user * oldname, const char __user * new
 			error = vfs_symlink(nd.dentry->d_inode, dentry, from, S_IALLUGO);
 			dput(dentry);
 		}
-		up(&nd.dentry->d_inode->i_sem);
+		mutex_unlock(&nd.dentry->d_inode->i_mutex);
 		path_release(&nd);
 out:
 		putname(to);
@@ -2113,10 +2113,10 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	if (error)
 		return error;
 
-	down(&old_dentry->d_inode->i_sem);
+	mutex_lock(&old_dentry->d_inode->i_mutex);
 	DQUOT_INIT(dir);
 	error = dir->i_op->link(old_dentry, dir, new_dentry);
-	up(&old_dentry->d_inode->i_sem);
+	mutex_unlock(&old_dentry->d_inode->i_mutex);
 	if (!error)
 		fsnotify_create(dir, new_dentry->d_name.name);
 	return error;
@@ -2157,7 +2157,7 @@ asmlinkage long sys_link(const char __user * oldname, const char __user * newnam
 		error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
 		dput(new_dentry);
 	}
-	up(&nd.dentry->d_inode->i_sem);
+	mutex_unlock(&nd.dentry->d_inode->i_mutex);
 out_release:
 	path_release(&nd);
 out:
@@ -2178,7 +2178,7 @@ exit:
  *	   sb->s_vfs_rename_sem. We might be more accurate, but that's another
  *	   story.
  *	c) we have to lock _three_ objects - parents and victim (if it exists).
- *	   And that - after we got ->i_sem on parents (until then we don't know
+ *	   And that - after we got ->i_mutex on parents (until then we don't know
  *	   whether the target exists).  Solution: try to be smart with locking
  *	   order for inodes.  We rely on the fact that tree topology may change
  *	   only under ->s_vfs_rename_sem _and_ that parent of the object we
@@ -2195,9 +2195,9 @@ exit:
  *	   stuff into VFS), but the former is not going away. Solution: the same
  *	   trick as in rmdir().
  *	e) conversion from fhandle to dentry may come in the wrong moment - when
- *	   we are removing the target. Solution: we will have to grab ->i_sem
+ *	   we are removing the target. Solution: we will have to grab ->i_mutex
  *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
- *	   ->i_sem on parents, which works but leads to some truely excessive
+ *	   ->i_mutex on parents, which works but leads to some truely excessive
  *	   locking].
  */
 static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
@@ -2222,7 +2222,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 
 	target = new_dentry->d_inode;
 	if (target) {
-		down(&target->i_sem);
+		mutex_lock(&target->i_mutex);
 		dentry_unhash(new_dentry);
 	}
 	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
@@ -2232,7 +2232,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 	if (target) {
 		if (!error)
 			target->i_flags |= S_DEAD;
-		up(&target->i_sem);
+		mutex_unlock(&target->i_mutex);
 		if (d_unhashed(new_dentry))
 			d_rehash(new_dentry);
 		dput(new_dentry);
@@ -2255,7 +2255,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
 	dget(new_dentry);
 	target = new_dentry->d_inode;
 	if (target)
-		down(&target->i_sem);
+		mutex_lock(&target->i_mutex);
 	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
 		error = -EBUSY;
 	else
@@ -2266,7 +2266,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
 			d_move(old_dentry, new_dentry);
 	}
 	if (target)
-		up(&target->i_sem);
+		mutex_unlock(&target->i_mutex);
 	dput(new_dentry);
 	return error;
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 3e8fb61ad597..f0e353f5bc30 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -814,7 +814,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
 		return -ENOTDIR;
 
 	err = -ENOENT;
-	down(&nd->dentry->d_inode->i_sem);
+	mutex_lock(&nd->dentry->d_inode->i_mutex);
 	if (IS_DEADDIR(nd->dentry->d_inode))
 		goto out_unlock;
 
@@ -826,7 +826,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
 	if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry))
 		err = attach_recursive_mnt(mnt, nd, NULL);
 out_unlock:
-	up(&nd->dentry->d_inode->i_sem);
+	mutex_unlock(&nd->dentry->d_inode->i_mutex);
 	if (!err)
 		security_sb_post_addmount(mnt, nd);
 	return err;
@@ -962,7 +962,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
 		goto out;
 
 	err = -ENOENT;
-	down(&nd->dentry->d_inode->i_sem);
+	mutex_lock(&nd->dentry->d_inode->i_mutex);
 	if (IS_DEADDIR(nd->dentry->d_inode))
 		goto out1;
 
@@ -1004,7 +1004,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
 	list_del_init(&old_nd.mnt->mnt_expire);
 	spin_unlock(&vfsmount_lock);
 out1:
-	up(&nd->dentry->d_inode->i_sem);
+	mutex_unlock(&nd->dentry->d_inode->i_mutex);
 out:
 	up_write(&namespace_sem);
 	if (!err)
@@ -1573,7 +1573,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
 	user_nd.dentry = dget(current->fs->root);
 	read_unlock(&current->fs->lock);
 	down_write(&namespace_sem);
-	down(&old_nd.dentry->d_inode->i_sem);
+	mutex_lock(&old_nd.dentry->d_inode->i_mutex);
 	error = -EINVAL;
 	if (IS_MNT_SHARED(old_nd.mnt) ||
 		IS_MNT_SHARED(new_nd.mnt->mnt_parent) ||
@@ -1626,7 +1626,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
 	path_release(&root_parent);
 	path_release(&parent_nd);
 out2:
-	up(&old_nd.dentry->d_inode->i_sem);
+	mutex_unlock(&old_nd.dentry->d_inode->i_mutex);
 	up_write(&namespace_sem);
 	path_release(&user_nd);
 	path_release(&old_nd);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e9255198f767..a1554bead692 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -194,7 +194,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 	spin_unlock(&inode->i_lock);
 	/* Ensure consistent page alignment of the data.
 	 * Note: assumes we have exclusive access to this mapping either
-	 *	 through inode->i_sem or some other mechanism.
+	 *	 through inode->i_mutex or some other mechanism.
 	 */
 	if (page->index == 0)
 		invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1);
@@ -573,7 +573,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
 {
-	down(&filp->f_dentry->d_inode->i_sem);
+	mutex_lock(&filp->f_dentry->d_inode->i_mutex);
 	switch (origin) {
 		case 1:
 			offset += filp->f_pos;
@@ -589,7 +589,7 @@ loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
 		((struct nfs_open_context *)filp->private_data)->dir_cookie = 0;
 	}
 out:
-	up(&filp->f_dentry->d_inode->i_sem);
+	mutex_unlock(&filp->f_dentry->d_inode->i_mutex);
 	return offset;
 }
 
@@ -1001,7 +1001,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	openflags &= ~(O_CREAT|O_TRUNC);
 
 	/*
-	 * Note: we're not holding inode->i_sem and so may be racing with
+	 * Note: we're not holding inode->i_mutex and so may be racing with
 	 * operations that change the directory. We therefore save the
 	 * change attribute *before* we do the RPC call.
 	 */
@@ -1051,7 +1051,7 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
 		return dentry;
 	if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR))
 		return NULL;
-	/* Note: caller is already holding the dir->i_sem! */
+	/* Note: caller is already holding the dir->i_mutex! */
 	dentry = d_alloc(parent, &name);
 	if (dentry == NULL)
 		return NULL;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 954cf893d50c..be963a133aaa 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -121,9 +121,9 @@ out:
 static void
 nfsd4_sync_rec_dir(void)
 {
-	down(&rec_dir.dentry->d_inode->i_sem);
+	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
 	nfsd_sync_dir(rec_dir.dentry);
-	up(&rec_dir.dentry->d_inode->i_sem);
+	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
 }
 
 int
@@ -143,7 +143,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 	nfs4_save_user(&uid, &gid);
 
 	/* lock the parent */
-	down(&rec_dir.dentry->d_inode->i_sem);
+	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
 
 	dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1);
 	if (IS_ERR(dentry)) {
@@ -159,7 +159,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 out_put:
 	dput(dentry);
 out_unlock:
-	up(&rec_dir.dentry->d_inode->i_sem);
+	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
 	if (status == 0) {
 		clp->cl_firststate = 1;
 		nfsd4_sync_rec_dir();
@@ -259,9 +259,9 @@ nfsd4_remove_clid_file(struct dentry *dir, struct dentry *dentry)
 		printk("nfsd4: non-file found in client recovery directory\n");
 		return -EINVAL;
 	}
-	down(&dir->d_inode->i_sem);
+	mutex_lock(&dir->d_inode->i_mutex);
 	status = vfs_unlink(dir->d_inode, dentry);
-	up(&dir->d_inode->i_sem);
+	mutex_unlock(&dir->d_inode->i_mutex);
 	return status;
 }
 
@@ -274,9 +274,9 @@ nfsd4_clear_clid_dir(struct dentry *dir, struct dentry *dentry)
 	 * any regular files anyway, just in case the directory was created by
 	 * a kernel from the future.... */
 	nfsd4_list_rec_dir(dentry, nfsd4_remove_clid_file);
-	down(&dir->d_inode->i_sem);
+	mutex_lock(&dir->d_inode->i_mutex);
 	status = vfs_rmdir(dir->d_inode, dentry);
-	up(&dir->d_inode->i_sem);
+	mutex_unlock(&dir->d_inode->i_mutex);
 	return status;
 }
 
@@ -288,9 +288,9 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
 
 	dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
 
-	down(&rec_dir.dentry->d_inode->i_sem);
+	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
 	dentry = lookup_one_len(name, rec_dir.dentry, namlen);
-	up(&rec_dir.dentry->d_inode->i_sem);
+	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
 		return status;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index df4019f04560..bb36b4304491 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -390,12 +390,12 @@ set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key)
 
 	error = -EOPNOTSUPP;
 	if (inode->i_op && inode->i_op->setxattr) {
-		down(&inode->i_sem);
+		mutex_lock(&inode->i_mutex);
 		security_inode_setxattr(dentry, key, buf, len, 0);
 		error = inode->i_op->setxattr(dentry, key, buf, len, 0);
 		if (!error)
 			security_inode_post_setxattr(dentry, key, buf, len, 0);
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 	}
 out:
 	kfree(buf);
@@ -739,9 +739,9 @@ nfsd_sync(struct file *filp)
         int err;
 	struct inode *inode = filp->f_dentry->d_inode;
 	dprintk("nfsd: sync file %s\n", filp->f_dentry->d_name.name);
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	err=nfsd_dosync(filp, filp->f_dentry, filp->f_op);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 
 	return err;
 }
@@ -885,9 +885,9 @@ static void kill_suid(struct dentry *dentry)
 	struct iattr	ia;
 	ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID;
 
-	down(&dentry->d_inode->i_sem);
+	mutex_lock(&dentry->d_inode->i_mutex);
 	notify_change(dentry, &ia);
-	up(&dentry->d_inode->i_sem);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 }
 
 static inline int
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index eda056bac256..9480a0526cd3 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1532,7 +1532,7 @@ int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
  * NOTE to self: No changes in the attribute list are required to move from
  *		 a resident to a non-resident attribute.
  *
- * Locking: - The caller must hold i_sem on the inode.
+ * Locking: - The caller must hold i_mutex on the inode.
  */
 int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
 {
@@ -1728,7 +1728,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
 	/*
 	 * This needs to be last since the address space operations ->readpage
 	 * and ->writepage can run concurrently with us as they are not
-	 * serialized on i_sem.  Note, we are not allowed to fail once we flip
+	 * serialized on i_mutex.  Note, we are not allowed to fail once we flip
 	 * this switch, which is another reason to do this last.
 	 */
 	NInoSetNonResident(ni);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 795c3d1930f5..b0690d4c8906 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -69,7 +69,7 @@ ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'),
  * work but we don't care for how quickly one can access them. This also fixes
  * the dcache aliasing issues.
  *
- * Locking:  - Caller must hold i_sem on the directory.
+ * Locking:  - Caller must hold i_mutex on the directory.
  *	     - Each page cache page in the index allocation mapping must be
  *	       locked whilst being accessed otherwise we may find a corrupt
  *	       page due to it being under ->writepage at the moment which
@@ -1085,11 +1085,11 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
  * While this will return the names in random order this doesn't matter for
  * ->readdir but OTOH results in a faster ->readdir.
  *
- * VFS calls ->readdir without BKL but with i_sem held. This protects the VFS
+ * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS
  * parts (e.g. ->f_pos and ->i_size, and it also protects against directory
  * modifications).
  *
- * Locking:  - Caller must hold i_sem on the directory.
+ * Locking:  - Caller must hold i_mutex on the directory.
  *	     - Each page cache page in the index allocation mapping must be
  *	       locked whilst being accessed otherwise we may find a corrupt
  *	       page due to it being under ->writepage at the moment which
@@ -1520,7 +1520,7 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
  * Note: In the past @filp could be NULL so we ignore it as we don't need it
  * anyway.
  *
- * Locking: Caller must hold i_sem on the inode.
+ * Locking: Caller must hold i_mutex on the inode.
  *
  * TODO: We should probably also write all attribute/index inodes associated
  * with this inode but since we have no simple way of getting to them we ignore
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 727533891813..30f71acdc1cb 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -106,7 +106,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
  * this is the case, the necessary zeroing will also have happened and that all
  * metadata is self-consistent.
  *
- * Locking: i_sem on the vfs inode corrseponsind to the ntfs inode @ni must be
+ * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be
  *	    held by the caller.
  */
 static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size,
@@ -473,7 +473,7 @@ static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
  * @bytes:	number of bytes to be written
  *
  * This is called for non-resident attributes from ntfs_file_buffered_write()
- * with i_sem held on the inode (@pages[0]->mapping->host).  There are
+ * with i_mutex held on the inode (@pages[0]->mapping->host).  There are
  * @nr_pages pages in @pages which are locked but not kmap()ped.  The source
  * data has not yet been copied into the @pages.
  * 
@@ -1637,7 +1637,7 @@ err_out:
  * @pos:	byte position in file at which the write begins
  * @bytes:	number of bytes to be written
  *
- * This is called from ntfs_file_buffered_write() with i_sem held on the inode
+ * This is called from ntfs_file_buffered_write() with i_mutex held on the inode
  * (@pages[0]->mapping->host).  There are @nr_pages pages in @pages which are
  * locked but not kmap()ped.  The source data has already been copied into the
  * @page.  ntfs_prepare_pages_for_non_resident_write() has been called before
@@ -1814,7 +1814,7 @@ err_out:
 /**
  * ntfs_file_buffered_write -
  *
- * Locking: The vfs is holding ->i_sem on the inode.
+ * Locking: The vfs is holding ->i_mutex on the inode.
  */
 static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
 		const struct iovec *iov, unsigned long nr_segs,
@@ -2196,9 +2196,9 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const char __user *buf,
 
 	BUG_ON(iocb->ki_pos != pos);
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	ret = ntfs_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err = sync_page_range(inode, mapping, pos, ret);
 		if (err < 0)
@@ -2221,12 +2221,12 @@ static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
 	struct kiocb kiocb;
 	ssize_t ret;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	init_sync_kiocb(&kiocb, file);
 	ret = ntfs_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
 	if (ret == -EIOCBQUEUED)
 		ret = wait_on_sync_kiocb(&kiocb);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err = sync_page_range(inode, mapping, *ppos - ret, ret);
 		if (err < 0)
@@ -2269,7 +2269,7 @@ static ssize_t ntfs_file_write(struct file *file, const char __user *buf,
  * Note: In the past @filp could be NULL so we ignore it as we don't need it
  * anyway.
  *
- * Locking: Caller must hold i_sem on the inode.
+ * Locking: Caller must hold i_mutex on the inode.
  *
  * TODO: We should probably also write all attribute/index inodes associated
  * with this inode but since we have no simple way of getting to them we ignore
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 8f2d5727546f..9f5427c2d105 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -32,7 +32,7 @@
  * Allocate a new index context, initialize it with @idx_ni and return it.
  * Return NULL if allocation failed.
  *
- * Locking:  Caller must hold i_sem on the index inode.
+ * Locking:  Caller must hold i_mutex on the index inode.
  */
 ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni)
 {
@@ -50,7 +50,7 @@ ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni)
  *
  * Release the index context @ictx, releasing all associated resources.
  *
- * Locking:  Caller must hold i_sem on the index inode.
+ * Locking:  Caller must hold i_mutex on the index inode.
  */
 void ntfs_index_ctx_put(ntfs_index_context *ictx)
 {
@@ -106,7 +106,7 @@ void ntfs_index_ctx_put(ntfs_index_context *ictx)
  * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
  * ensure that the changes are written to disk.
  *
- * Locking:  - Caller must hold i_sem on the index inode.
+ * Locking:  - Caller must hold i_mutex on the index inode.
  *	     - Each page cache page in the index allocation mapping must be
  *	       locked whilst being accessed otherwise we may find a corrupt
  *	       page due to it being under ->writepage at the moment which
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index b24f4c4b2c5c..bda7a08911a5 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2125,13 +2125,13 @@ void ntfs_put_inode(struct inode *vi)
 		ntfs_inode *ni = NTFS_I(vi);
 		if (NInoIndexAllocPresent(ni)) {
 			struct inode *bvi = NULL;
-			down(&vi->i_sem);
+			mutex_lock(&vi->i_mutex);
 			if (atomic_read(&vi->i_count) == 2) {
 				bvi = ni->itype.index.bmp_ino;
 				if (bvi)
 					ni->itype.index.bmp_ino = NULL;
 			}
-			up(&vi->i_sem);
+			mutex_unlock(&vi->i_mutex);
 			if (bvi)
 				iput(bvi);
 		}
@@ -2311,7 +2311,7 @@ static const char *es = "  Leaving inconsistent metadata.  Unmount and run "
  *
  * Returns 0 on success or -errno on error.
  *
- * Called with ->i_sem held.  In all but one case ->i_alloc_sem is held for
+ * Called with ->i_mutex held.  In all but one case ->i_alloc_sem is held for
  * writing.  The only case in the kernel where ->i_alloc_sem is not held is
  * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called
  * with the current i_size as the offset.  The analogous place in NTFS is in
@@ -2831,7 +2831,7 @@ void ntfs_truncate_vfs(struct inode *vi) {
  * We also abort all changes of user, group, and mode as we do not implement
  * the NTFS ACLs yet.
  *
- * Called with ->i_sem held.  For the ATTR_SIZE (i.e. ->truncate) case, also
+ * Called with ->i_mutex held.  For the ATTR_SIZE (i.e. ->truncate) case, also
  * called with ->i_alloc_sem held for writing.
  *
  * Basically this is a copy of generic notify_change() and inode_setattr()
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 351dbc3b6e40..5ea9eb93af62 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -96,7 +96,7 @@
  *    name. We then convert the name to the current NLS code page, and proceed
  *    searching for a dentry with this name, etc, as in case 2), above.
  *
- * Locking: Caller must hold i_sem on the directory.
+ * Locking: Caller must hold i_mutex on the directory.
  */
 static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
 		struct nameidata *nd)
@@ -254,7 +254,7 @@ handle_name:
 	nls_name.hash = full_name_hash(nls_name.name, nls_name.len);
 
 	/*
-	 * Note: No need for dent->d_lock lock as i_sem is held on the
+	 * Note: No need for dent->d_lock lock as i_mutex is held on the
 	 * parent inode.
 	 */
 
@@ -374,7 +374,7 @@ struct inode_operations ntfs_dir_inode_ops = {
  * The code is based on the ext3 ->get_parent() implementation found in
  * fs/ext3/namei.c::ext3_get_parent().
  *
- * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_sem down.
+ * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_mutex down.
  *
  * Return the dentry of the parent directory on success or the error code on
  * error (IS_ERR() is true).
diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c
index 833df2a4e9fb..d0ef4182147b 100644
--- a/fs/ntfs/quota.c
+++ b/fs/ntfs/quota.c
@@ -48,7 +48,7 @@ BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
 		ntfs_error(vol->sb, "Quota inodes are not open.");
 		return FALSE;
 	}
-	down(&vol->quota_q_ino->i_sem);
+	mutex_lock(&vol->quota_q_ino->i_mutex);
 	ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino));
 	if (!ictx) {
 		ntfs_error(vol->sb, "Failed to get index context.");
@@ -98,7 +98,7 @@ BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
 	ntfs_index_entry_mark_dirty(ictx);
 set_done:
 	ntfs_index_ctx_put(ictx);
-	up(&vol->quota_q_ino->i_sem);
+	mutex_unlock(&vol->quota_q_ino->i_mutex);
 	/*
 	 * We set the flag so we do not try to mark the quotas out of date
 	 * again on remount.
@@ -110,7 +110,7 @@ done:
 err_out:
 	if (ictx)
 		ntfs_index_ctx_put(ictx);
-	up(&vol->quota_q_ino->i_sem);
+	mutex_unlock(&vol->quota_q_ino->i_mutex);
 	return FALSE;
 }
 
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 6c16db9e1a8a..280e383fc84e 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1213,10 +1213,10 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
 	 * Find the inode number for the hibernation file by looking up the
 	 * filename hiberfil.sys in the root directory.
 	 */
-	down(&vol->root_ino->i_sem);
+	mutex_lock(&vol->root_ino->i_mutex);
 	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12,
 			&name);
-	up(&vol->root_ino->i_sem);
+	mutex_unlock(&vol->root_ino->i_mutex);
 	if (IS_ERR_MREF(mref)) {
 		ret = MREF_ERR(mref);
 		/* If the file does not exist, Windows is not hibernated. */
@@ -1307,10 +1307,10 @@ static BOOL load_and_init_quota(ntfs_volume *vol)
 	 * Find the inode number for the quota file by looking up the filename
 	 * $Quota in the extended system files directory $Extend.
 	 */
-	down(&vol->extend_ino->i_sem);
+	mutex_lock(&vol->extend_ino->i_mutex);
 	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6,
 			&name);
-	up(&vol->extend_ino->i_sem);
+	mutex_unlock(&vol->extend_ino->i_mutex);
 	if (IS_ERR_MREF(mref)) {
 		/*
 		 * If the file does not exist, quotas are disabled and have
@@ -1390,10 +1390,10 @@ static BOOL load_and_init_usnjrnl(ntfs_volume *vol)
 	 * Find the inode number for the transaction log file by looking up the
 	 * filename $UsnJrnl in the extended system files directory $Extend.
 	 */
-	down(&vol->extend_ino->i_sem);
+	mutex_lock(&vol->extend_ino->i_mutex);
 	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8,
 			&name);
-	up(&vol->extend_ino->i_sem);
+	mutex_unlock(&vol->extend_ino->i_mutex);
 	if (IS_ERR_MREF(mref)) {
 		/*
 		 * If the file does not exist, transaction logging is disabled,
@@ -2312,9 +2312,9 @@ static void ntfs_put_super(struct super_block *sb)
 	if (!list_empty(&sb->s_dirty)) {
 		const char *s1, *s2;
 
-		down(&vol->mft_ino->i_sem);
+		mutex_lock(&vol->mft_ino->i_mutex);
 		truncate_inode_pages(vol->mft_ino->i_mapping, 0);
-		up(&vol->mft_ino->i_sem);
+		mutex_unlock(&vol->mft_ino->i_mutex);
 		write_inode_now(vol->mft_ino, 1);
 		if (!list_empty(&sb->s_dirty)) {
 			static const char *_s1 = "inodes";
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 465f797451ee..6b9812db3779 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -966,7 +966,7 @@ static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
 	mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk,
 		   num_clusters);
 
-	BUG_ON(!down_trylock(&tl_inode->i_sem));
+	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
 
 	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
 
@@ -1108,7 +1108,7 @@ bail:
 	return status;
 }
 
-/* Expects you to already be holding tl_inode->i_sem */
+/* Expects you to already be holding tl_inode->i_mutex */
 static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 {
 	int status;
@@ -1123,7 +1123,7 @@ static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
-	BUG_ON(!down_trylock(&tl_inode->i_sem));
+	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
 
 	di = (struct ocfs2_dinode *) tl_bh->b_data;
 	tl = &di->id2.i_dealloc;
@@ -1198,9 +1198,9 @@ int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 	int status;
 	struct inode *tl_inode = osb->osb_tl_inode;
 
-	down(&tl_inode->i_sem);
+	mutex_lock(&tl_inode->i_mutex);
 	status = __ocfs2_flush_truncate_log(osb);
-	up(&tl_inode->i_sem);
+	mutex_unlock(&tl_inode->i_mutex);
 
 	return status;
 }
@@ -1363,7 +1363,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
 	mlog(0, "cleanup %u records from %"MLFu64"\n", num_recs,
 	     tl_copy->i_blkno);
 
-	down(&tl_inode->i_sem);
+	mutex_lock(&tl_inode->i_mutex);
 	for(i = 0; i < num_recs; i++) {
 		if (ocfs2_truncate_log_needs_flush(osb)) {
 			status = __ocfs2_flush_truncate_log(osb);
@@ -1395,7 +1395,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
 	}
 
 bail_up:
-	up(&tl_inode->i_sem);
+	mutex_unlock(&tl_inode->i_mutex);
 
 	mlog_exit(status);
 	return status;
@@ -1840,7 +1840,7 @@ start:
 
 	mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
 
-	down(&tl_inode->i_sem);
+	mutex_lock(&tl_inode->i_mutex);
 	tl_sem = 1;
 	/* ocfs2_truncate_log_needs_flush guarantees us at least one
 	 * record is free for use. If there isn't any, we flush to get
@@ -1875,7 +1875,7 @@ start:
 		goto bail;
 	}
 
-	up(&tl_inode->i_sem);
+	mutex_unlock(&tl_inode->i_mutex);
 	tl_sem = 0;
 
 	ocfs2_commit_trans(handle);
@@ -1890,7 +1890,7 @@ bail:
 	ocfs2_schedule_truncate_log_flush(osb, 1);
 
 	if (tl_sem)
-		up(&tl_inode->i_sem);
+		mutex_unlock(&tl_inode->i_mutex);
 
 	if (handle)
 		ocfs2_commit_trans(handle);
@@ -1994,7 +1994,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 			goto bail;
 		}
 
-		down(&ext_alloc_inode->i_sem);
+		mutex_lock(&ext_alloc_inode->i_mutex);
 		(*tc)->tc_ext_alloc_inode = ext_alloc_inode;
 
 		status = ocfs2_meta_lock(ext_alloc_inode,
@@ -2026,7 +2026,7 @@ static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
 		if (tc->tc_ext_alloc_locked)
 			ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
 
-		up(&tc->tc_ext_alloc_inode->i_sem);
+		mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
 		iput(tc->tc_ext_alloc_inode);
 	}
 
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 5fd60c105913..cf7828f23361 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -653,7 +653,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
 	struct config_group *o2hb_group = NULL, *ret = NULL;
 	void *defs = NULL;
 
-	/* this runs under the parent dir's i_sem; there can be only
+	/* this runs under the parent dir's i_mutex; there can be only
 	 * one caller in here at a time */
 	if (o2nm_single_cluster)
 		goto out; /* ENOSPC */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 856e20ae8263..57158fa75d91 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -202,7 +202,7 @@ bail:
 }
 
 /*
- * NOTE: this should always be called with parent dir i_sem taken.
+ * NOTE: this should always be called with parent dir i_mutex taken.
  */
 int ocfs2_find_files_on_disk(const char *name,
 			     int namelen,
@@ -245,7 +245,7 @@ leave:
  * Return 0 if the name does not exist
  * Return -EEXIST if the directory contains the name
  *
- * Callers should have i_sem + a cluster lock on dir
+ * Callers should have i_mutex + a cluster lock on dir
  */
 int ocfs2_check_dir_for_entry(struct inode *dir,
 			      const char *name,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 72ae9e3306f4..ca5f9f90d794 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -492,7 +492,7 @@ restart_all:
 	}
 
 	/* blocks peope in read/write from reading our allocation
-	 * until we're done changing it. We depend on i_sem to block
+	 * until we're done changing it. We depend on i_mutex to block
 	 * other extend/truncate calls while we're here. Ordering wrt
 	 * start_trans is important here -- always do it before! */
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
@@ -958,8 +958,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 		filp->f_flags &= ~O_DIRECT;
 #endif
 
-	down(&inode->i_sem);
-	/* to match setattr's i_sem -> i_alloc_sem -> rw_lock ordering */
+	mutex_lock(&inode->i_mutex);
+	/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
 	if (filp->f_flags & O_DIRECT) {
 		have_alloc_sem = 1;
 		down_read(&inode->i_alloc_sem);
@@ -1123,7 +1123,7 @@ out:
 		up_read(&inode->i_alloc_sem);
 	if (rw_level != -1) 
 		ocfs2_rw_unlock(inode, rw_level);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 
 	mlog_exit(ret);
 	return ret;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index a91ba4dec936..d4ecc0627716 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -485,10 +485,10 @@ static int ocfs2_remove_inode(struct inode *inode,
 		goto bail;
 	}
 
-	down(&inode_alloc_inode->i_sem);
+	mutex_lock(&inode_alloc_inode->i_mutex);
 	status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1);
 	if (status < 0) {
-		up(&inode_alloc_inode->i_sem);
+		mutex_unlock(&inode_alloc_inode->i_mutex);
 
 		mlog_errno(status);
 		goto bail;
@@ -536,7 +536,7 @@ bail_commit:
 	ocfs2_commit_trans(handle);
 bail_unlock:
 	ocfs2_meta_unlock(inode_alloc_inode, 1);
-	up(&inode_alloc_inode->i_sem);
+	mutex_unlock(&inode_alloc_inode->i_mutex);
 	brelse(inode_alloc_bh);
 bail:
 	iput(inode_alloc_inode);
@@ -567,10 +567,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
 	/* Lock the orphan dir. The lock will be held for the entire
 	 * delete_inode operation. We do this now to avoid races with
 	 * recovery completion on other nodes. */
-	down(&orphan_dir_inode->i_sem);
+	mutex_lock(&orphan_dir_inode->i_mutex);
 	status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1);
 	if (status < 0) {
-		up(&orphan_dir_inode->i_sem);
+		mutex_unlock(&orphan_dir_inode->i_mutex);
 
 		mlog_errno(status);
 		goto bail;
@@ -593,7 +593,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
 
 bail_unlock_dir:
 	ocfs2_meta_unlock(orphan_dir_inode, 1);
-	up(&orphan_dir_inode->i_sem);
+	mutex_unlock(&orphan_dir_inode->i_mutex);
 	brelse(orphan_dir_bh);
 bail:
 	iput(orphan_dir_inode);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 04428042e5e5..303c8d96457f 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -216,7 +216,7 @@ void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
 	atomic_inc(&inode->i_count);
 
 	/* we're obviously changing it... */
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 
 	/* sanity check */
 	BUG_ON(OCFS2_I(inode)->ip_handle);
@@ -241,7 +241,7 @@ static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle)
 		OCFS2_I(inode)->ip_handle = NULL;
 		list_del_init(&OCFS2_I(inode)->ip_handle_list);
 
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 		iput(inode);
 	}
 }
@@ -1433,10 +1433,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 		goto out;
 	}
 
-	down(&orphan_dir_inode->i_sem);
+	mutex_lock(&orphan_dir_inode->i_mutex);
 	status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0);
 	if (status < 0) {
-		up(&orphan_dir_inode->i_sem);
+		mutex_unlock(&orphan_dir_inode->i_mutex);
 		mlog_errno(status);
 		goto out;
 	}
@@ -1451,7 +1451,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 		if (!bh)
 			status = -EINVAL;
 		if (status < 0) {
-			up(&orphan_dir_inode->i_sem);
+			mutex_unlock(&orphan_dir_inode->i_mutex);
 			if (bh)
 				brelse(bh);
 			mlog_errno(status);
@@ -1465,7 +1465,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 
 			if (!ocfs2_check_dir_entry(orphan_dir_inode,
 						  de, bh, local)) {
-				up(&orphan_dir_inode->i_sem);
+				mutex_unlock(&orphan_dir_inode->i_mutex);
 				status = -EINVAL;
 				mlog_errno(status);
 				brelse(bh);
@@ -1509,7 +1509,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 		}
 		brelse(bh);
 	}
-	up(&orphan_dir_inode->i_sem);
+	mutex_unlock(&orphan_dir_inode->i_mutex);
 
 	ocfs2_meta_unlock(orphan_dir_inode, 0);
 	have_disk_lock = 0;
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index fe373a2101d9..149b35181666 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -334,7 +334,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 
 	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
 				  &alloc_bh, 0, inode);
@@ -367,7 +367,7 @@ bail:
 		brelse(alloc_bh);
 
 	if (inode) {
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 		iput(inode);
 	}
 
@@ -446,7 +446,7 @@ bail:
 
 /*
  * make sure we've got at least bitswanted contiguous bits in the
- * local alloc. You lose them when you drop i_sem.
+ * local alloc. You lose them when you drop i_mutex.
  *
  * We will add ourselves to the transaction passed in, but may start
  * our own in order to shift windows.
diff --git a/fs/open.c b/fs/open.c
index 75f3329e8a67..a3b3a9b5c2ff 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -211,9 +211,9 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 		newattrs.ia_valid |= ATTR_FILE;
 	}
 
-	down(&dentry->d_inode->i_sem);
+	mutex_lock(&dentry->d_inode->i_mutex);
 	err = notify_change(dentry, &newattrs);
-	up(&dentry->d_inode->i_sem);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 	return err;
 }
 
@@ -398,9 +398,9 @@ asmlinkage long sys_utime(char __user * filename, struct utimbuf __user * times)
 		    (error = vfs_permission(&nd, MAY_WRITE)) != 0)
 			goto dput_and_out;
 	}
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	error = notify_change(nd.dentry, &newattrs);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 dput_and_out:
 	path_release(&nd);
 out:
@@ -451,9 +451,9 @@ long do_utimes(char __user * filename, struct timeval * times)
 		    (error = vfs_permission(&nd, MAY_WRITE)) != 0)
 			goto dput_and_out;
 	}
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	error = notify_change(nd.dentry, &newattrs);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 dput_and_out:
 	path_release(&nd);
 out:
@@ -620,13 +620,13 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
 	err = -EPERM;
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
 		goto out_putf;
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	if (mode == (mode_t) -1)
 		mode = inode->i_mode;
 	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
 	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
 	err = notify_change(dentry, &newattrs);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 
 out_putf:
 	fput(file);
@@ -654,13 +654,13 @@ asmlinkage long sys_chmod(const char __user * filename, mode_t mode)
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
 		goto dput_and_out;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	if (mode == (mode_t) -1)
 		mode = inode->i_mode;
 	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
 	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
 	error = notify_change(nd.dentry, &newattrs);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 
 dput_and_out:
 	path_release(&nd);
@@ -696,9 +696,9 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
 	}
 	if (!S_ISDIR(inode->i_mode))
 		newattrs.ia_valid |= ATTR_KILL_SUID|ATTR_KILL_SGID;
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	error = notify_change(dentry, &newattrs);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 out:
 	return error;
 }
diff --git a/fs/pipe.c b/fs/pipe.c
index 66aa0b938d6a..acb030b61fb0 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -44,10 +44,10 @@ void pipe_wait(struct inode * inode)
 	 * is considered a noninteractive wait:
 	 */
 	prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE);
-	up(PIPE_SEM(*inode));
+	mutex_unlock(PIPE_MUTEX(*inode));
 	schedule();
 	finish_wait(PIPE_WAIT(*inode), &wait);
-	down(PIPE_SEM(*inode));
+	mutex_lock(PIPE_MUTEX(*inode));
 }
 
 static inline int
@@ -136,7 +136,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
 
 	do_wakeup = 0;
 	ret = 0;
-	down(PIPE_SEM(*inode));
+	mutex_lock(PIPE_MUTEX(*inode));
 	info = inode->i_pipe;
 	for (;;) {
 		int bufs = info->nrbufs;
@@ -200,7 +200,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
 		}
 		pipe_wait(inode);
 	}
-	up(PIPE_SEM(*inode));
+	mutex_unlock(PIPE_MUTEX(*inode));
 	/* Signal writers asynchronously that there is more room.  */
 	if (do_wakeup) {
 		wake_up_interruptible(PIPE_WAIT(*inode));
@@ -237,7 +237,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
 
 	do_wakeup = 0;
 	ret = 0;
-	down(PIPE_SEM(*inode));
+	mutex_lock(PIPE_MUTEX(*inode));
 	info = inode->i_pipe;
 
 	if (!PIPE_READERS(*inode)) {
@@ -341,7 +341,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
 		PIPE_WAITING_WRITERS(*inode)--;
 	}
 out:
-	up(PIPE_SEM(*inode));
+	mutex_unlock(PIPE_MUTEX(*inode));
 	if (do_wakeup) {
 		wake_up_interruptible(PIPE_WAIT(*inode));
 		kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
@@ -381,7 +381,7 @@ pipe_ioctl(struct inode *pino, struct file *filp,
 
 	switch (cmd) {
 		case FIONREAD:
-			down(PIPE_SEM(*inode));
+			mutex_lock(PIPE_MUTEX(*inode));
 			info =  inode->i_pipe;
 			count = 0;
 			buf = info->curbuf;
@@ -390,7 +390,7 @@ pipe_ioctl(struct inode *pino, struct file *filp,
 				count += info->bufs[buf].len;
 				buf = (buf+1) & (PIPE_BUFFERS-1);
 			}
-			up(PIPE_SEM(*inode));
+			mutex_unlock(PIPE_MUTEX(*inode));
 			return put_user(count, (int __user *)arg);
 		default:
 			return -EINVAL;
@@ -433,7 +433,7 @@ pipe_poll(struct file *filp, poll_table *wait)
 static int
 pipe_release(struct inode *inode, int decr, int decw)
 {
-	down(PIPE_SEM(*inode));
+	mutex_lock(PIPE_MUTEX(*inode));
 	PIPE_READERS(*inode) -= decr;
 	PIPE_WRITERS(*inode) -= decw;
 	if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) {
@@ -443,7 +443,7 @@ pipe_release(struct inode *inode, int decr, int decw)
 		kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
 		kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
 	}
-	up(PIPE_SEM(*inode));
+	mutex_unlock(PIPE_MUTEX(*inode));
 
 	return 0;
 }
@@ -454,9 +454,9 @@ pipe_read_fasync(int fd, struct file *filp, int on)
 	struct inode *inode = filp->f_dentry->d_inode;
 	int retval;
 
-	down(PIPE_SEM(*inode));
+	mutex_lock(PIPE_MUTEX(*inode));
 	retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode));
-	up(PIPE_SEM(*inode));
+	mutex_unlock(PIPE_MUTEX(*inode));
 
 	if (retval < 0)
 		return retval;
@@ -471,9 +471,9 @@ pipe_write_fasync(int fd, struct file *filp, int on)
 	struct inode *inode = filp->f_dentry->d_inode;
 	int retval;
 
-	down(PIPE_SEM(*inode));
+	mutex_lock(PIPE_MUTEX(*inode));
 	retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode));
-	up(PIPE_SEM(*inode));
+	mutex_unlock(PIPE_MUTEX(*inode));
 
 	if (retval < 0)
 		return retval;
@@ -488,14 +488,14 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on)
 	struct inode *inode = filp->f_dentry->d_inode;
 	int retval;
 
-	down(PIPE_SEM(*inode));
+	mutex_lock(PIPE_MUTEX(*inode));
 
 	retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode));
 
 	if (retval >= 0)
 		retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode));
 
-	up(PIPE_SEM(*inode));
+	mutex_unlock(PIPE_MUTEX(*inode));
 
 	if (retval < 0)
 		return retval;
@@ -534,9 +534,9 @@ pipe_read_open(struct inode *inode, struct file *filp)
 {
 	/* We could have perhaps used atomic_t, but this and friends
 	   below are the only places.  So it doesn't seem worthwhile.  */
-	down(PIPE_SEM(*inode));
+	mutex_lock(PIPE_MUTEX(*inode));
 	PIPE_READERS(*inode)++;
-	up(PIPE_SEM(*inode));
+	mutex_unlock(PIPE_MUTEX(*inode));
 
 	return 0;
 }
@@ -544,9 +544,9 @@ pipe_read_open(struct inode *inode, struct file *filp)
 static int
 pipe_write_open(struct inode *inode, struct file *filp)
 {
-	down(PIPE_SEM(*inode));
+	mutex_lock(PIPE_MUTEX(*inode));
 	PIPE_WRITERS(*inode)++;
-	up(PIPE_SEM(*inode));
+	mutex_unlock(PIPE_MUTEX(*inode));
 
 	return 0;
 }
@@ -554,12 +554,12 @@ pipe_write_open(struct inode *inode, struct file *filp)
 static int
 pipe_rdwr_open(struct inode *inode, struct file *filp)
 {
-	down(PIPE_SEM(*inode));
+	mutex_lock(PIPE_MUTEX(*inode));
 	if (filp->f_mode & FMODE_READ)
 		PIPE_READERS(*inode)++;
 	if (filp->f_mode & FMODE_WRITE)
 		PIPE_WRITERS(*inode)++;
-	up(PIPE_SEM(*inode));
+	mutex_unlock(PIPE_MUTEX(*inode));
 
 	return 0;
 }
diff --git a/fs/quota.c b/fs/quota.c
index 612e04db4b93..d14d872646d4 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -168,7 +168,7 @@ static void quota_sync_sb(struct super_block *sb, int type)
 	sync_blockdev(sb->s_bdev);
 
 	/* Now when everything is written we can discard the pagecache so
-	 * that userspace sees the changes. We need i_sem and so we could
+	 * that userspace sees the changes. We need i_mutex and so we could
 	 * not do it inside dqonoff_sem. Moreover we need to be carefull
 	 * about races with quotaoff() (that is the reason why we have own
 	 * reference to inode). */
@@ -184,9 +184,9 @@ static void quota_sync_sb(struct super_block *sb, int type)
 	up(&sb_dqopt(sb)->dqonoff_sem);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (discard[cnt]) {
-			down(&discard[cnt]->i_sem);
+			mutex_lock(&discard[cnt]->i_mutex);
 			truncate_inode_pages(&discard[cnt]->i_data, 0);
-			up(&discard[cnt]->i_sem);
+			mutex_unlock(&discard[cnt]->i_mutex);
 			iput(discard[cnt]);
 		}
 	}
diff --git a/fs/read_write.c b/fs/read_write.c
index df3468a22fea..3f7a1a62165f 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -33,7 +33,7 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
 	long long retval;
 	struct inode *inode = file->f_mapping->host;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	switch (origin) {
 		case 2:
 			offset += inode->i_size;
@@ -49,7 +49,7 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
 		}
 		retval = offset;
 	}
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	return retval;
 }
 
diff --git a/fs/readdir.c b/fs/readdir.c
index b03579bc0210..b6109329b607 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -30,13 +30,13 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf)
 	if (res)
 		goto out;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	res = -ENOENT;
 	if (!IS_DEADDIR(inode)) {
 		res = file->f_op->readdir(file, buf, filler);
 		file_accessed(file);
 	}
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 out:
 	return res;
 }
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 7892a865b58a..127e7d2cabdd 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -49,7 +49,7 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
 	}
 
 	reiserfs_write_lock(inode->i_sb);
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	/* freeing preallocation only involves relogging blocks that
 	 * are already in the current transaction.  preallocation gets
 	 * freed at the end of each transaction, so it is impossible for
@@ -100,7 +100,7 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
 		err = reiserfs_truncate_file(inode, 0);
 	}
       out:
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	reiserfs_write_unlock(inode->i_sb);
 	return err;
 }
@@ -1342,7 +1342,7 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
 	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
 		return -EFAULT;
 
-	down(&inode->i_sem);	// locks the entire file for just us
+	mutex_lock(&inode->i_mutex);	// locks the entire file for just us
 
 	pos = *ppos;
 
@@ -1532,12 +1532,12 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
 		    generic_osync_inode(inode, file->f_mapping,
 					OSYNC_METADATA | OSYNC_DATA);
 
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	reiserfs_async_progress_wait(inode->i_sb);
 	return (already_written != 0) ? already_written : res;
 
       out:
-	up(&inode->i_sem);	// unlock the file on exit.
+	mutex_unlock(&inode->i_mutex);	// unlock the file on exit.
 	return res;
 }
 
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a5e3a0ddbe53..ffa34b861bdb 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -40,12 +40,12 @@ void reiserfs_delete_inode(struct inode *inode)
 
 	/* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
 	if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {	/* also handles bad_inode case */
-		down(&inode->i_sem);
+		mutex_lock(&inode->i_mutex);
 
 		reiserfs_delete_xattrs(inode);
 
 		if (journal_begin(&th, inode->i_sb, jbegin_count)) {
-			up(&inode->i_sem);
+			mutex_unlock(&inode->i_mutex);
 			goto out;
 		}
 		reiserfs_update_inode_transaction(inode);
@@ -59,11 +59,11 @@ void reiserfs_delete_inode(struct inode *inode)
 			DQUOT_FREE_INODE(inode);
 
 		if (journal_end(&th, inode->i_sb, jbegin_count)) {
-			up(&inode->i_sem);
+			mutex_unlock(&inode->i_mutex);
 			goto out;
 		}
 
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 
 		/* check return value from reiserfs_delete_object after
 		 * ending the transaction
@@ -551,7 +551,7 @@ static int convert_tail_for_hole(struct inode *inode,
 
 	/* we don't have to make sure the conversion did not happen while
 	 ** we were locking the page because anyone that could convert
-	 ** must first take i_sem.
+	 ** must first take i_mutex.
 	 **
 	 ** We must fix the tail page for writing because it might have buffers
 	 ** that are mapped, but have a block number of 0.  This indicates tail
@@ -586,7 +586,7 @@ static inline int _allocate_block(struct reiserfs_transaction_handle *th,
 	BUG_ON(!th->t_trans_id);
 
 #ifdef REISERFS_PREALLOCATE
-	if (!(flags & GET_BLOCK_NO_ISEM)) {
+	if (!(flags & GET_BLOCK_NO_IMUX)) {
 		return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr,
 						  path, block);
 	}
@@ -2318,7 +2318,7 @@ static int map_block_for_writepage(struct inode *inode,
 	/* this is where we fill in holes in the file. */
 	if (use_get_block) {
 		retval = reiserfs_get_block(inode, block, bh_result,
-					    GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM
+					    GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX
 					    | GET_BLOCK_NO_DANGLE);
 		if (!retval) {
 			if (!buffer_mapped(bh_result)
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 81fc00285f60..ba8bf8df6dc7 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -120,7 +120,7 @@ static int reiserfs_unpack(struct inode *inode, struct file *filp)
 	/* we need to make sure nobody is changing the file size beneath
 	 ** us
 	 */
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 
 	write_from = inode->i_size & (blocksize - 1);
 	/* if we are on a block boundary, we are already unpacked.  */
@@ -156,7 +156,7 @@ static int reiserfs_unpack(struct inode *inode, struct file *filp)
 	page_cache_release(page);
 
       out:
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	reiserfs_write_unlock(inode->i_sb);
 	return retval;
 }
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 42afb5bef111..397d9590c8f2 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2211,7 +2211,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
 	size_t towrite = len;
 	struct buffer_head tmp_bh, *bh;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	while (towrite > 0) {
 		tocopy = sb->s_blocksize - offset < towrite ?
 		    sb->s_blocksize - offset : towrite;
@@ -2250,7 +2250,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
 	inode->i_version++;
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	mark_inode_dirty(inode);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	return len - towrite;
 }
 
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
index c92e124f628e..196e971c03c9 100644
--- a/fs/reiserfs/tail_conversion.c
+++ b/fs/reiserfs/tail_conversion.c
@@ -205,7 +205,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in
 					 1) * p_s_sb->s_blocksize;
 	pos1 = pos;
 
-	// we are protected by i_sem. The tail can not disapper, not
+	// we are protected by i_mutex. The tail can not disapper, not
 	// append can be done either
 	// we are in truncate or packing tail in file_release
 
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 02091eaac0b4..f1895f0a278e 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -67,11 +67,11 @@ static struct dentry *create_xa_root(struct super_block *sb)
 		goto out;
 	} else if (!xaroot->d_inode) {
 		int err;
-		down(&privroot->d_inode->i_sem);
+		mutex_lock(&privroot->d_inode->i_mutex);
 		err =
 		    privroot->d_inode->i_op->mkdir(privroot->d_inode, xaroot,
 						   0700);
-		up(&privroot->d_inode->i_sem);
+		mutex_unlock(&privroot->d_inode->i_mutex);
 
 		if (err) {
 			dput(xaroot);
@@ -219,7 +219,7 @@ static struct dentry *get_xa_file_dentry(const struct inode *inode,
 	} else if (flags & XATTR_REPLACE || flags & FL_READONLY) {
 		goto out;
 	} else {
-		/* inode->i_sem is down, so nothing else can try to create
+		/* inode->i_mutex is down, so nothing else can try to create
 		 * the same xattr */
 		err = xadir->d_inode->i_op->create(xadir->d_inode, xafile,
 						   0700 | S_IFREG, NULL);
@@ -268,7 +268,7 @@ static struct file *open_xa_file(const struct inode *inode, const char *name,
  * and don't mess with f->f_pos, but the idea is the same.  Do some
  * action on each and every entry in the directory.
  *
- * we're called with i_sem held, so there are no worries about the directory
+ * we're called with i_mutex held, so there are no worries about the directory
  * changing underneath us.
  */
 static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir)
@@ -426,7 +426,7 @@ int xattr_readdir(struct file *file, filldir_t filler, void *buf)
 	int res = -ENOTDIR;
 	if (!file->f_op || !file->f_op->readdir)
 		goto out;
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 //        down(&inode->i_zombie);
 	res = -ENOENT;
 	if (!IS_DEADDIR(inode)) {
@@ -435,7 +435,7 @@ int xattr_readdir(struct file *file, filldir_t filler, void *buf)
 		unlock_kernel();
 	}
 //        up(&inode->i_zombie);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
       out:
 	return res;
 }
@@ -480,7 +480,7 @@ static inline __u32 xattr_hash(const char *msg, int len)
 /* Generic extended attribute operations that can be used by xa plugins */
 
 /*
- * inode->i_sem: down
+ * inode->i_mutex: down
  */
 int
 reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
@@ -535,7 +535,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
 	/* Resize it so we're ok to write there */
 	newattrs.ia_size = buffer_size;
 	newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
-	down(&xinode->i_sem);
+	mutex_lock(&xinode->i_mutex);
 	err = notify_change(fp->f_dentry, &newattrs);
 	if (err)
 		goto out_filp;
@@ -598,7 +598,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
 	}
 
       out_filp:
-	up(&xinode->i_sem);
+	mutex_unlock(&xinode->i_mutex);
 	fput(fp);
 
       out:
@@ -606,7 +606,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
 }
 
 /*
- * inode->i_sem: down
+ * inode->i_mutex: down
  */
 int
 reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer,
@@ -793,7 +793,7 @@ reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen,
 
 }
 
-/* This is called w/ inode->i_sem downed */
+/* This is called w/ inode->i_mutex downed */
 int reiserfs_delete_xattrs(struct inode *inode)
 {
 	struct file *fp;
@@ -946,7 +946,7 @@ int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
 
 /*
  * Inode operation getxattr()
- * Preliminary locking: we down dentry->d_inode->i_sem
+ * Preliminary locking: we down dentry->d_inode->i_mutex
  */
 ssize_t
 reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
@@ -970,7 +970,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
 /*
  * Inode operation setxattr()
  *
- * dentry->d_inode->i_sem down
+ * dentry->d_inode->i_mutex down
  */
 int
 reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
@@ -1008,7 +1008,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 /*
  * Inode operation removexattr()
  *
- * dentry->d_inode->i_sem down
+ * dentry->d_inode->i_mutex down
  */
 int reiserfs_removexattr(struct dentry *dentry, const char *name)
 {
@@ -1091,7 +1091,7 @@ reiserfs_listxattr_filler(void *buf, const char *name, int namelen,
 /*
  * Inode operation listxattr()
  *
- * Preliminary locking: we down dentry->d_inode->i_sem
+ * Preliminary locking: we down dentry->d_inode->i_mutex
  */
 ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
 {
@@ -1289,9 +1289,9 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
 		if (!IS_ERR(dentry)) {
 			if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) {
 				struct inode *inode = dentry->d_parent->d_inode;
-				down(&inode->i_sem);
+				mutex_lock(&inode->i_mutex);
 				err = inode->i_op->mkdir(inode, dentry, 0700);
-				up(&inode->i_sem);
+				mutex_unlock(&inode->i_mutex);
 				if (err) {
 					dput(dentry);
 					dentry = NULL;
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index a47ac9aac8b2..2dc953504cc0 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -174,7 +174,7 @@ static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
 /*
  * Inode operation get_posix_acl().
  *
- * inode->i_sem: down
+ * inode->i_mutex: down
  * BKL held [before 2.5.x]
  */
 struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
@@ -237,7 +237,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
 /*
  * Inode operation set_posix_acl().
  *
- * inode->i_sem: down
+ * inode->i_mutex: down
  * BKL held [before 2.5.x]
  */
 static int
@@ -312,7 +312,7 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	return error;
 }
 
-/* dir->i_sem: down,
+/* dir->i_mutex: locked,
  * inode is new and not released into the wild yet */
 int
 reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry,
diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
index 7b7f2cb5f0e1..383523011aad 100644
--- a/fs/relayfs/inode.c
+++ b/fs/relayfs/inode.c
@@ -109,7 +109,7 @@ static struct dentry *relayfs_create_entry(const char *name,
 	}
 
 	parent = dget(parent);
-	down(&parent->d_inode->i_sem);
+	mutex_lock(&parent->d_inode->i_mutex);
 	d = lookup_one_len(name, parent, strlen(name));
 	if (IS_ERR(d)) {
 		d = NULL;
@@ -139,7 +139,7 @@ release_mount:
 	simple_release_fs(&relayfs_mount, &relayfs_mount_count);
 
 exit:
-	up(&parent->d_inode->i_sem);
+	mutex_unlock(&parent->d_inode->i_mutex);
 	dput(parent);
 	return d;
 }
@@ -204,7 +204,7 @@ int relayfs_remove(struct dentry *dentry)
 		return -EINVAL;
 
 	parent = dget(parent);
-	down(&parent->d_inode->i_sem);
+	mutex_lock(&parent->d_inode->i_mutex);
 	if (dentry->d_inode) {
 		if (S_ISDIR(dentry->d_inode->i_mode))
 			error = simple_rmdir(parent->d_inode, dentry);
@@ -215,7 +215,7 @@ int relayfs_remove(struct dentry *dentry)
 	}
 	if (!error)
 		dput(dentry);
-	up(&parent->d_inode->i_sem);
+	mutex_unlock(&parent->d_inode->i_mutex);
 	dput(parent);
 
 	if (!error)
@@ -476,7 +476,7 @@ static ssize_t relay_file_read(struct file *filp,
 	ssize_t ret = 0;
 	void *from;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	if(!relay_file_read_avail(buf, *ppos))
 		goto out;
 
@@ -494,7 +494,7 @@ static ssize_t relay_file_read(struct file *filp,
 	relay_file_read_consume(buf, read_start, count);
 	*ppos = relay_file_read_end_pos(buf, read_start, count);
 out:
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
 
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index d36780382176..49bd219275db 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -99,7 +99,7 @@ static int create_dir(struct kobject * k, struct dentry * p,
 	int error;
 	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
 
-	down(&p->d_inode->i_sem);
+	mutex_lock(&p->d_inode->i_mutex);
 	*d = lookup_one_len(n, p, strlen(n));
 	if (!IS_ERR(*d)) {
 		error = sysfs_make_dirent(p->d_fsdata, *d, k, mode, SYSFS_DIR);
@@ -122,7 +122,7 @@ static int create_dir(struct kobject * k, struct dentry * p,
 		dput(*d);
 	} else
 		error = PTR_ERR(*d);
-	up(&p->d_inode->i_sem);
+	mutex_unlock(&p->d_inode->i_mutex);
 	return error;
 }
 
@@ -246,7 +246,7 @@ static void remove_dir(struct dentry * d)
 	struct dentry * parent = dget(d->d_parent);
 	struct sysfs_dirent * sd;
 
-	down(&parent->d_inode->i_sem);
+	mutex_lock(&parent->d_inode->i_mutex);
 	d_delete(d);
 	sd = d->d_fsdata;
  	list_del_init(&sd->s_sibling);
@@ -257,7 +257,7 @@ static void remove_dir(struct dentry * d)
 	pr_debug(" o %s removing done (%d)\n",d->d_name.name,
 		 atomic_read(&d->d_count));
 
-	up(&parent->d_inode->i_sem);
+	mutex_unlock(&parent->d_inode->i_mutex);
 	dput(parent);
 }
 
@@ -286,7 +286,7 @@ void sysfs_remove_dir(struct kobject * kobj)
 		return;
 
 	pr_debug("sysfs %s: removing dir\n",dentry->d_name.name);
-	down(&dentry->d_inode->i_sem);
+	mutex_lock(&dentry->d_inode->i_mutex);
 	parent_sd = dentry->d_fsdata;
 	list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
 		if (!sd->s_element || !(sd->s_type & SYSFS_NOT_PINNED))
@@ -295,7 +295,7 @@ void sysfs_remove_dir(struct kobject * kobj)
 		sysfs_drop_dentry(sd, dentry);
 		sysfs_put(sd);
 	}
-	up(&dentry->d_inode->i_sem);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 
 	remove_dir(dentry);
 	/**
@@ -318,7 +318,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
 	down_write(&sysfs_rename_sem);
 	parent = kobj->parent->dentry;
 
-	down(&parent->d_inode->i_sem);
+	mutex_lock(&parent->d_inode->i_mutex);
 
 	new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
 	if (!IS_ERR(new_dentry)) {
@@ -334,7 +334,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
 			error = -EEXIST;
 		dput(new_dentry);
 	}
-	up(&parent->d_inode->i_sem);	
+	mutex_unlock(&parent->d_inode->i_mutex);
 	up_write(&sysfs_rename_sem);
 
 	return error;
@@ -345,9 +345,9 @@ static int sysfs_dir_open(struct inode *inode, struct file *file)
 	struct dentry * dentry = file->f_dentry;
 	struct sysfs_dirent * parent_sd = dentry->d_fsdata;
 
-	down(&dentry->d_inode->i_sem);
+	mutex_lock(&dentry->d_inode->i_mutex);
 	file->private_data = sysfs_new_dirent(parent_sd, NULL);
-	up(&dentry->d_inode->i_sem);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 
 	return file->private_data ? 0 : -ENOMEM;
 
@@ -358,9 +358,9 @@ static int sysfs_dir_close(struct inode *inode, struct file *file)
 	struct dentry * dentry = file->f_dentry;
 	struct sysfs_dirent * cursor = file->private_data;
 
-	down(&dentry->d_inode->i_sem);
+	mutex_lock(&dentry->d_inode->i_mutex);
 	list_del_init(&cursor->s_sibling);
-	up(&dentry->d_inode->i_sem);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 
 	release_sysfs_dirent(cursor);
 
@@ -436,7 +436,7 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin)
 {
 	struct dentry * dentry = file->f_dentry;
 
-	down(&dentry->d_inode->i_sem);
+	mutex_lock(&dentry->d_inode->i_mutex);
 	switch (origin) {
 		case 1:
 			offset += file->f_pos;
@@ -444,7 +444,7 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin)
 			if (offset >= 0)
 				break;
 		default:
-			up(&file->f_dentry->d_inode->i_sem);
+			mutex_unlock(&file->f_dentry->d_inode->i_mutex);
 			return -EINVAL;
 	}
 	if (offset != file->f_pos) {
@@ -468,7 +468,7 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin)
 			list_add_tail(&cursor->s_sibling, p);
 		}
 	}
-	up(&dentry->d_inode->i_sem);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 	return offset;
 }
 
@@ -483,4 +483,3 @@ struct file_operations sysfs_dir_operations = {
 EXPORT_SYMBOL_GPL(sysfs_create_dir);
 EXPORT_SYMBOL_GPL(sysfs_remove_dir);
 EXPORT_SYMBOL_GPL(sysfs_rename_dir);
-
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 4013d7905e84..d0e3d8495165 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -364,9 +364,9 @@ int sysfs_add_file(struct dentry * dir, const struct attribute * attr, int type)
 	umode_t mode = (attr->mode & S_IALLUGO) | S_IFREG;
 	int error = 0;
 
-	down(&dir->d_inode->i_sem);
+	mutex_lock(&dir->d_inode->i_mutex);
 	error = sysfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
-	up(&dir->d_inode->i_sem);
+	mutex_unlock(&dir->d_inode->i_mutex);
 
 	return error;
 }
@@ -398,7 +398,7 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
 	struct dentry * victim;
 	int res = -ENOENT;
 
-	down(&dir->d_inode->i_sem);
+	mutex_lock(&dir->d_inode->i_mutex);
 	victim = lookup_one_len(attr->name, dir, strlen(attr->name));
 	if (!IS_ERR(victim)) {
 		/* make sure dentry is really there */
@@ -420,7 +420,7 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
 		 */
 		dput(victim);
 	}
-	up(&dir->d_inode->i_sem);
+	mutex_unlock(&dir->d_inode->i_mutex);
 
 	return res;
 }
@@ -441,22 +441,22 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
 	struct iattr newattrs;
 	int res = -ENOENT;
 
-	down(&dir->d_inode->i_sem);
+	mutex_lock(&dir->d_inode->i_mutex);
 	victim = lookup_one_len(attr->name, dir, strlen(attr->name));
 	if (!IS_ERR(victim)) {
 		if (victim->d_inode &&
 		    (victim->d_parent->d_inode == dir->d_inode)) {
 			inode = victim->d_inode;
-			down(&inode->i_sem);
+			mutex_lock(&inode->i_mutex);
 			newattrs.ia_mode = (mode & S_IALLUGO) |
 						(inode->i_mode & ~S_IALLUGO);
 			newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
 			res = notify_change(victim, &newattrs);
-			up(&inode->i_sem);
+			mutex_unlock(&inode->i_mutex);
 		}
 		dput(victim);
 	}
-	up(&dir->d_inode->i_sem);
+	mutex_unlock(&dir->d_inode->i_mutex);
 
 	return res;
 }
@@ -480,4 +480,3 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
 EXPORT_SYMBOL_GPL(sysfs_create_file);
 EXPORT_SYMBOL_GPL(sysfs_remove_file);
 EXPORT_SYMBOL_GPL(sysfs_update_file);
-
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 970a33f03299..c3133219941c 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -201,7 +201,7 @@ const unsigned char * sysfs_get_name(struct sysfs_dirent *sd)
 
 /*
  * Unhashes the dentry corresponding to given sysfs_dirent
- * Called with parent inode's i_sem held.
+ * Called with parent inode's i_mutex held.
  */
 void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent)
 {
@@ -232,7 +232,7 @@ void sysfs_hash_and_remove(struct dentry * dir, const char * name)
 		/* no inode means this hasn't been made visible yet */
 		return;
 
-	down(&dir->d_inode->i_sem);
+	mutex_lock(&dir->d_inode->i_mutex);
 	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
 		if (!sd->s_element)
 			continue;
@@ -243,7 +243,5 @@ void sysfs_hash_and_remove(struct dentry * dir, const char * name)
 			break;
 		}
 	}
-	up(&dir->d_inode->i_sem);
+	mutex_unlock(&dir->d_inode->i_mutex);
 }
-
-
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index de402fa915f2..e38d6338a20d 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -86,9 +86,9 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
 
 	BUG_ON(!kobj || !kobj->dentry || !name);
 
-	down(&dentry->d_inode->i_sem);
+	mutex_lock(&dentry->d_inode->i_mutex);
 	error = sysfs_add_link(dentry, name, target);
-	up(&dentry->d_inode->i_sem);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 	return error;
 }
 
@@ -177,4 +177,3 @@ struct inode_operations sysfs_symlink_inode_operations = {
 
 EXPORT_SYMBOL_GPL(sysfs_create_link);
 EXPORT_SYMBOL_GPL(sysfs_remove_link);
-
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 2ba11a9aa995..e9a42c711a9e 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1275,7 +1275,7 @@ static ssize_t ufs_quota_write(struct super_block *sb, int type,
 	size_t towrite = len;
 	struct buffer_head *bh;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	while (towrite > 0) {
 		tocopy = sb->s_blocksize - offset < towrite ?
 				sb->s_blocksize - offset : towrite;
@@ -1297,7 +1297,7 @@ static ssize_t ufs_quota_write(struct super_block *sb, int type,
 	}
 out:
 	if (len == towrite) {
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 		return err;
 	}
 	if (inode->i_size < off+len-towrite)
@@ -1305,7 +1305,7 @@ out:
 	inode->i_version++;
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	return len - towrite;
 }
 
diff --git a/fs/xattr.c b/fs/xattr.c
index bcc2156d4d28..386a532ee5a9 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -51,7 +51,7 @@ setxattr(struct dentry *d, char __user *name, void __user *value,
 		}
 	}
 
-	down(&d->d_inode->i_sem);
+	mutex_lock(&d->d_inode->i_mutex);
 	error = security_inode_setxattr(d, kname, kvalue, size, flags);
 	if (error)
 		goto out;
@@ -73,7 +73,7 @@ setxattr(struct dentry *d, char __user *name, void __user *value,
 			fsnotify_xattr(d);
 	}
 out:
-	up(&d->d_inode->i_sem);
+	mutex_unlock(&d->d_inode->i_mutex);
 	kfree(kvalue);
 	return error;
 }
@@ -323,9 +323,9 @@ removexattr(struct dentry *d, char __user *name)
 		error = security_inode_removexattr(d, kname);
 		if (error)
 			goto out;
-		down(&d->d_inode->i_sem);
+		mutex_lock(&d->d_inode->i_mutex);
 		error = d->d_inode->i_op->removexattr(d, kname);
-		up(&d->d_inode->i_sem);
+		mutex_unlock(&d->d_inode->i_mutex);
 		if (!error)
 			fsnotify_xattr(d);
 	}
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 14215a7db59f..41c478bb1ffc 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -203,7 +203,7 @@ validate_fields(
 		ip->i_nlink = va.va_nlink;
 		ip->i_blocks = va.va_nblocks;
 
-		/* we're under i_sem so i_size can't change under us */
+		/* we're under i_mutex so i_size can't change under us */
 		if (i_size_read(ip) != va.va_size)
 			i_size_write(ip, va.va_size);
 	}
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 279e9bc92aba..5675117ef227 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -254,7 +254,7 @@ xfs_read(
 	}
 
 	if (unlikely(ioflags & IO_ISDIRECT))
-		down(&inode->i_sem);
+		mutex_lock(&inode->i_mutex);
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 
 	if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
@@ -286,7 +286,7 @@ xfs_read(
 
 unlock_isem:
 	if (unlikely(ioflags & IO_ISDIRECT))
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 	return ret;
 }
 
@@ -655,7 +655,7 @@ relock:
 		iolock = XFS_IOLOCK_EXCL;
 		locktype = VRWLOCK_WRITE;
 
-		down(&inode->i_sem);
+		mutex_lock(&inode->i_mutex);
 	} else {
 		iolock = XFS_IOLOCK_SHARED;
 		locktype = VRWLOCK_WRITE_DIRECT;
@@ -686,7 +686,7 @@ start:
 		int		dmflags = FILP_DELAY_FLAG(file);
 
 		if (need_isem)
-			dmflags |= DM_FLAGS_ISEM;
+			dmflags |= DM_FLAGS_IMUX;
 
 		xfs_iunlock(xip, XFS_ILOCK_EXCL);
 		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
@@ -772,7 +772,7 @@ retry:
 		if (need_isem) {
 			/* demote the lock now the cached pages are gone */
 			XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
-			up(&inode->i_sem);
+			mutex_unlock(&inode->i_mutex);
 
 			iolock = XFS_IOLOCK_SHARED;
 			locktype = VRWLOCK_WRITE_DIRECT;
@@ -817,14 +817,14 @@ retry:
 
 		xfs_rwunlock(bdp, locktype);
 		if (need_isem)
-			up(&inode->i_sem);
+			mutex_unlock(&inode->i_mutex);
 		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
 				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
 				0, 0, 0); /* Delay flag intentionally  unused */
 		if (error)
 			goto out_nounlocks;
 		if (need_isem)
-			down(&inode->i_sem);
+			mutex_lock(&inode->i_mutex);
 		xfs_rwlock(bdp, locktype);
 		pos = xip->i_d.di_size;
 		ret = 0;
@@ -926,7 +926,7 @@ retry:
 	
 		xfs_rwunlock(bdp, locktype);
 		if (need_isem)
-			up(&inode->i_sem);
+			mutex_unlock(&inode->i_mutex);
 
 		error = sync_page_range(inode, mapping, pos, ret);
 		if (!error)
@@ -938,7 +938,7 @@ retry:
 	xfs_rwunlock(bdp, locktype);
  out_unlock_isem:
 	if (need_isem)
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
  out_nounlocks:
 	return -error;
 }
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
index 864bf6955689..b4c7f2bc55a0 100644
--- a/fs/xfs/xfs_dmapi.h
+++ b/fs/xfs/xfs_dmapi.h
@@ -152,7 +152,7 @@ typedef enum {
 
 #define DM_FLAGS_NDELAY		0x001	/* return EAGAIN after dm_pending() */
 #define DM_FLAGS_UNWANTED	0x002	/* event not in fsys dm_eventset_t */
-#define DM_FLAGS_ISEM		0x004	/* thread holds i_sem */
+#define DM_FLAGS_IMUX		0x004	/* thread holds i_mutex */
 #define DM_FLAGS_IALLOCSEM_RD	0x010	/* thread holds i_alloc_sem rd */
 #define DM_FLAGS_IALLOCSEM_WR	0x020	/* thread holds i_alloc_sem wr */
 
@@ -161,21 +161,21 @@ typedef enum {
  */
 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0)
 #define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
-			      DM_FLAGS_ISEM : 0)
-#define DM_SEM_FLAG_WR	(DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_ISEM)
+			      DM_FLAGS_IMUX : 0)
+#define DM_SEM_FLAG_WR	(DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX)
 #endif
 
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) && \
     (LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,22))
 #define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
-			      DM_FLAGS_IALLOCSEM_RD : DM_FLAGS_ISEM)
-#define DM_SEM_FLAG_WR	(DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_ISEM)
+			      DM_FLAGS_IALLOCSEM_RD : DM_FLAGS_IMUX)
+#define DM_SEM_FLAG_WR	(DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX)
 #endif
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,21)
 #define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
-			      0 : DM_FLAGS_ISEM)
-#define DM_SEM_FLAG_WR	(DM_FLAGS_ISEM)
+			      0 : DM_FLAGS_IMUX)
+#define DM_SEM_FLAG_WR	(DM_FLAGS_IMUX)
 #endif
 
 
diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h
index 2914f7b07156..e71dd98dbcae 100644
--- a/include/linux/ext3_fs_i.h
+++ b/include/linux/ext3_fs_i.h
@@ -87,7 +87,7 @@ struct ext3_inode_info {
 #ifdef CONFIG_EXT3_FS_XATTR
 	/*
 	 * Extended attributes can be read independently of the main file
-	 * data. Taking i_sem even when reading would cause contention
+	 * data. Taking i_mutex even when reading would cause contention
 	 * between readers of EAs and writers of regular file data, so
 	 * instead we synchronize on xattr_sem when reading or changing
 	 * EAs.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4c82219b0fae..01654b218e42 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -219,6 +219,7 @@ extern int dir_notify_enable;
 #include <linux/prio_tree.h>
 #include <linux/init.h>
 #include <linux/sched.h>
+#include <linux/mutex.h>
 
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
@@ -484,7 +485,7 @@ struct inode {
 	unsigned long		i_blocks;
 	unsigned short          i_bytes;
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
-	struct semaphore	i_sem;
+	struct mutex		i_mutex;
 	struct rw_semaphore	i_alloc_sem;
 	struct inode_operations	*i_op;
 	struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
@@ -1191,7 +1192,7 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc);
  *    directory.  The name should be stored in the @name (with the
  *    understanding that it is already pointing to a a %NAME_MAX+1 sized
  *    buffer.   get_name() should return %0 on success, a negative error code
- *    or error.  @get_name will be called without @parent->i_sem held.
+ *    or error.  @get_name will be called without @parent->i_mutex held.
  *
  * get_parent:
  *    @get_parent should find the parent directory for the given @child which
@@ -1213,7 +1214,7 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc);
  *    nfsd_find_fh_dentry() in either the @obj or @parent parameters.
  *
  * Locking rules:
- *    get_parent is called with child->d_inode->i_sem down
+ *    get_parent is called with child->d_inode->i_mutex down
  *    get_name is not (which is possibly inconsistent)
  */
 
diff --git a/include/linux/jffs2_fs_i.h b/include/linux/jffs2_fs_i.h
index ef85ab56302b..ad565bf9dcc1 100644
--- a/include/linux/jffs2_fs_i.h
+++ b/include/linux/jffs2_fs_i.h
@@ -8,11 +8,11 @@
 #include <asm/semaphore.h>
 
 struct jffs2_inode_info {
-	/* We need an internal semaphore similar to inode->i_sem.
+	/* We need an internal mutex similar to inode->i_mutex.
 	   Unfortunately, we can't used the existing one, because
 	   either the GC would deadlock, or we'd have to release it
 	   before letting GC proceed. Or we'd have to put ugliness
-	   into the GC code so it didn't attempt to obtain the i_sem
+	   into the GC code so it didn't attempt to obtain the i_mutex
 	   for the inode(s) which are already locked */
 	struct semaphore sem;
 
diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h
index bb842ea41033..0798b7781a6e 100644
--- a/include/linux/nfsd/nfsfh.h
+++ b/include/linux/nfsd/nfsfh.h
@@ -294,7 +294,7 @@ fill_post_wcc(struct svc_fh *fhp)
 /*
  * Lock a file handle/inode
  * NOTE: both fh_lock and fh_unlock are done "by hand" in
- * vfs.c:nfsd_rename as it needs to grab 2 i_sem's at once
+ * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once
  * so, any changes here should be reflected there.
  */
 static inline void
@@ -317,7 +317,7 @@ fh_lock(struct svc_fh *fhp)
 	}
 
 	inode = dentry->d_inode;
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	fill_pre_wcc(fhp);
 	fhp->fh_locked = 1;
 }
@@ -333,7 +333,7 @@ fh_unlock(struct svc_fh *fhp)
 
 	if (fhp->fh_locked) {
 		fill_post_wcc(fhp);
-		up(&fhp->fh_dentry->d_inode->i_sem);
+		mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex);
 		fhp->fh_locked = 0;
 	}
 }
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 1767073df26f..b12e59c75752 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -37,7 +37,7 @@ struct pipe_inode_info {
    memory allocation, whereas PIPE_BUF makes atomicity guarantees.  */
 #define PIPE_SIZE		PAGE_SIZE
 
-#define PIPE_SEM(inode)		(&(inode).i_sem)
+#define PIPE_MUTEX(inode)	(&(inode).i_mutex)
 #define PIPE_WAIT(inode)	(&(inode).i_pipe->wait)
 #define PIPE_READERS(inode)	((inode).i_pipe->readers)
 #define PIPE_WRITERS(inode)	((inode).i_pipe->writers)
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 001ab82df051..e276c5ba2bb7 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -1857,7 +1857,7 @@ void padd_item(char *item, int total_length, int length);
 #define GET_BLOCK_CREATE 1	/* add anything you need to find block */
 #define GET_BLOCK_NO_HOLE 2	/* return -ENOENT for file holes */
 #define GET_BLOCK_READ_DIRECT 4	/* read the tail if indirect item not found */
-#define GET_BLOCK_NO_ISEM     8	/* i_sem is not held, don't preallocate */
+#define GET_BLOCK_NO_IMUX     8	/* i_mutex is not held, don't preallocate */
 #define GET_BLOCK_NO_DANGLE   16	/* don't leave any transactions running */
 
 int restart_transaction(struct reiserfs_transaction_handle *th,
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index c8943b53d8e6..a8aa6152eea6 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -660,7 +660,7 @@ asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode,
 	if (fd < 0)
 		goto out_putname;
 
-	down(&mqueue_mnt->mnt_root->d_inode->i_sem);
+	mutex_lock(&mqueue_mnt->mnt_root->d_inode->i_mutex);
 	dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name));
 	if (IS_ERR(dentry)) {
 		error = PTR_ERR(dentry);
@@ -697,7 +697,7 @@ out_putfd:
 out_err:
 	fd = error;
 out_upsem:
-	up(&mqueue_mnt->mnt_root->d_inode->i_sem);
+	mutex_unlock(&mqueue_mnt->mnt_root->d_inode->i_mutex);
 out_putname:
 	putname(name);
 	return fd;
@@ -714,7 +714,7 @@ asmlinkage long sys_mq_unlink(const char __user *u_name)
 	if (IS_ERR(name))
 		return PTR_ERR(name);
 
-	down(&mqueue_mnt->mnt_root->d_inode->i_sem);
+	mutex_lock(&mqueue_mnt->mnt_root->d_inode->i_mutex);
 	dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name));
 	if (IS_ERR(dentry)) {
 		err = PTR_ERR(dentry);
@@ -735,7 +735,7 @@ out_err:
 	dput(dentry);
 
 out_unlock:
-	up(&mqueue_mnt->mnt_root->d_inode->i_sem);
+	mutex_unlock(&mqueue_mnt->mnt_root->d_inode->i_mutex);
 	putname(name);
 	if (inode)
 		iput(inode);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index eab64e23bcae..2a75e44e1a41 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1513,7 +1513,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
 	struct dentry *dentry;
 	int error;
 
-	down(&dir->d_inode->i_sem);
+	mutex_lock(&dir->d_inode->i_mutex);
 	dentry = cpuset_get_dentry(dir, cft->name);
 	if (!IS_ERR(dentry)) {
 		error = cpuset_create_file(dentry, 0644 | S_IFREG);
@@ -1522,7 +1522,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
 		dput(dentry);
 	} else
 		error = PTR_ERR(dentry);
-	up(&dir->d_inode->i_sem);
+	mutex_unlock(&dir->d_inode->i_mutex);
 	return error;
 }
 
@@ -1793,7 +1793,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 
 	/*
 	 * Release manage_sem before cpuset_populate_dir() because it
-	 * will down() this new directory's i_sem and if we race with
+	 * will down() this new directory's i_mutex and if we race with
 	 * another mkdir, we might deadlock.
 	 */
 	up(&manage_sem);
@@ -1812,7 +1812,7 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	struct cpuset *c_parent = dentry->d_parent->d_fsdata;
 
-	/* the vfs holds inode->i_sem already */
+	/* the vfs holds inode->i_mutex already */
 	return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
 }
 
@@ -1823,7 +1823,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct cpuset *parent;
 	char *pathbuf = NULL;
 
-	/* the vfs holds both inode->i_sem already */
+	/* the vfs holds both inode->i_mutex already */
 
 	down(&manage_sem);
 	cpuset_update_task_memory_state();
diff --git a/mm/filemap.c b/mm/filemap.c
index 478f4c74cc31..5fca2737c971 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -61,7 +61,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *      ->swap_lock		(exclusive_swap_page, others)
  *        ->mapping->tree_lock
  *
- *  ->i_sem
+ *  ->i_mutex
  *    ->i_mmap_lock		(truncate->unmap_mapping_range)
  *
  *  ->mmap_sem
@@ -73,9 +73,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *    ->lock_page		(access_process_vm)
  *
  *  ->mmap_sem
- *    ->i_sem			(msync)
+ *    ->i_mutex			(msync)
  *
- *  ->i_sem
+ *  ->i_mutex
  *    ->i_alloc_sem             (various)
  *
  *  ->inode_lock
@@ -276,7 +276,7 @@ static int wait_on_page_writeback_range(struct address_space *mapping,
  * integrity" operation.  It waits upon in-flight writeout before starting and
  * waiting upon new writeout.  If there was an IO error, return it.
  *
- * We need to re-take i_sem during the generic_osync_inode list walk because
+ * We need to re-take i_mutex during the generic_osync_inode list walk because
  * it is otherwise livelockable.
  */
 int sync_page_range(struct inode *inode, struct address_space *mapping,
@@ -290,9 +290,9 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
 		return 0;
 	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
 	if (ret == 0) {
-		down(&inode->i_sem);
+		mutex_lock(&inode->i_mutex);
 		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 	}
 	if (ret == 0)
 		ret = wait_on_page_writeback_range(mapping, start, end);
@@ -301,7 +301,7 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
 EXPORT_SYMBOL(sync_page_range);
 
 /*
- * Note: Holding i_sem across sync_page_range_nolock is not a good idea
+ * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
  * as it forces O_SYNC writers to different parts of the same file
  * to be serialised right until io completion.
  */
@@ -1892,7 +1892,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	/*
 	 * Sync the fs metadata but not the minor inode changes and
 	 * of course not the data as we did direct DMA for the IO.
-	 * i_sem is held, which protects generic_osync_inode() from
+	 * i_mutex is held, which protects generic_osync_inode() from
 	 * livelocking.
 	 */
 	if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -2195,10 +2195,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
 
 	BUG_ON(iocb->ki_pos != pos);
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
 						&iocb->ki_pos);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		ssize_t err;
@@ -2220,9 +2220,9 @@ ssize_t generic_file_write(struct file *file, const char __user *buf,
 	struct iovec local_iov = { .iov_base = (void __user *)buf,
 					.iov_len = count };
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		ssize_t err;
@@ -2256,9 +2256,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
 	struct inode *inode = mapping->host;
 	ssize_t ret;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err;
@@ -2272,7 +2272,7 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
 EXPORT_SYMBOL(generic_file_writev);
 
 /*
- * Called under i_sem for writes to S_ISREG files.   Returns -EIO if something
+ * Called under i_mutex for writes to S_ISREG files.   Returns -EIO if something
  * went wrong during pagecache shootdown.
  */
 static ssize_t
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 9cf687e4a29a..e2b34e95913e 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -338,7 +338,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
 	*ppos = pos;
 	/*
 	 * No need to use i_size_read() here, the i_size
-	 * cannot change under us because we hold i_sem.
+	 * cannot change under us because we hold i_mutex.
 	 */
 	if (pos > inode->i_size) {
 		i_size_write(inode, pos);
@@ -358,7 +358,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
 	loff_t pos;
 	ssize_t ret;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 
 	if (!access_ok(VERIFY_READ, buf, len)) {
 		ret=-EFAULT;
@@ -390,7 +390,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
  out_backing:
 	current->backing_dev_info = NULL;
  out_up:
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(xip_file_write);
diff --git a/mm/memory.c b/mm/memory.c
index 3944fec38012..7a11ddd5060f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1784,13 +1784,13 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 	if (!inode->i_op || !inode->i_op->truncate_range)
 		return -ENOSYS;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	down_write(&inode->i_alloc_sem);
 	unmap_mapping_range(mapping, offset, (end - offset), 1);
 	truncate_inode_pages_range(mapping, offset, end);
 	inode->i_op->truncate_range(inode, offset, end);
 	up_write(&inode->i_alloc_sem);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 
 	return 0;
 }
diff --git a/mm/msync.c b/mm/msync.c
index 1b5b6f662dcf..3563a56e1a51 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -137,7 +137,7 @@ static int msync_interval(struct vm_area_struct *vma,
 			ret = filemap_fdatawrite(mapping);
 			if (file->f_op && file->f_op->fsync) {
 				/*
-				 * We don't take i_sem here because mmap_sem
+				 * We don't take i_mutex here because mmap_sem
 				 * is already held.
 				 */
 				err = file->f_op->fsync(file,file->f_dentry,1);
diff --git a/mm/rmap.c b/mm/rmap.c
index 66ec43053a4d..dfbb89f99a15 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -20,13 +20,13 @@
 /*
  * Lock ordering in mm:
  *
- * inode->i_sem	(while writing or truncating, not reading or faulting)
+ * inode->i_mutex	(while writing or truncating, not reading or faulting)
  *   inode->i_alloc_sem
  *
  * When a page fault occurs in writing from user to file, down_read
- * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within
- * down_read of mmap_sem; i_sem and down_write of mmap_sem are never
- * taken together; in truncation, i_sem is taken outermost.
+ * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within
+ * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never
+ * taken together; in truncation, i_mutex is taken outermost.
  *
  * mm->mmap_sem
  *   page->flags PG_locked (lock_page)
diff --git a/mm/shmem.c b/mm/shmem.c
index a1f2f02af724..343b3c0937e5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1370,7 +1370,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
 	if (!access_ok(VERIFY_READ, buf, count))
 		return -EFAULT;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 
 	pos = *ppos;
 	written = 0;
@@ -1455,7 +1455,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
 	if (written)
 		err = written;
 out:
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	return err;
 }
 
@@ -1491,7 +1491,7 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
 
 		/*
 		 * We must evaluate after, since reads (unlike writes)
-		 * are called without i_sem protection against truncate
+		 * are called without i_mutex protection against truncate
 		 */
 		nr = PAGE_CACHE_SIZE;
 		i_size = i_size_read(inode);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 80f948a2028b..6544565a7c0f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1187,9 +1187,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 		set_blocksize(bdev, p->old_block_size);
 		bd_release(bdev);
 	} else {
-		down(&inode->i_sem);
+		mutex_lock(&inode->i_mutex);
 		inode->i_flags &= ~S_SWAPFILE;
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 	}
 	filp_close(swap_file, NULL);
 	err = 0;
@@ -1406,7 +1406,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		p->bdev = bdev;
 	} else if (S_ISREG(inode->i_mode)) {
 		p->bdev = inode->i_sb->s_bdev;
-		down(&inode->i_sem);
+		mutex_lock(&inode->i_mutex);
 		did_down = 1;
 		if (IS_SWAPFILE(inode)) {
 			error = -EBUSY;
@@ -1596,7 +1596,7 @@ out:
 	if (did_down) {
 		if (!error)
 			inode->i_flags |= S_SWAPFILE;
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 	}
 	return error;
 }
diff --git a/mm/truncate.c b/mm/truncate.c
index b1a463d0fe71..6cb3fff25f67 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -196,7 +196,7 @@ EXPORT_SYMBOL(truncate_inode_pages_range);
  * @mapping: mapping to truncate
  * @lstart: offset from which to truncate
  *
- * Called under (and serialised by) inode->i_sem.
+ * Called under (and serialised by) inode->i_mutex.
  */
 void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
 {
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index e14c1cae7460..9764c80ab0b2 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -69,13 +69,13 @@ rpc_timeout_upcall_queue(void *data)
 	struct rpc_inode *rpci = (struct rpc_inode *)data;
 	struct inode *inode = &rpci->vfs_inode;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	if (rpci->ops == NULL)
 		goto out;
 	if (rpci->nreaders == 0 && !list_empty(&rpci->pipe))
 		__rpc_purge_upcall(inode, -ETIMEDOUT);
 out:
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 }
 
 int
@@ -84,7 +84,7 @@ rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg)
 	struct rpc_inode *rpci = RPC_I(inode);
 	int res = -EPIPE;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	if (rpci->ops == NULL)
 		goto out;
 	if (rpci->nreaders) {
@@ -100,7 +100,7 @@ rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg)
 		res = 0;
 	}
 out:
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	wake_up(&rpci->waitq);
 	return res;
 }
@@ -116,7 +116,7 @@ rpc_close_pipes(struct inode *inode)
 {
 	struct rpc_inode *rpci = RPC_I(inode);
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	if (rpci->ops != NULL) {
 		rpci->nreaders = 0;
 		__rpc_purge_list(rpci, &rpci->in_upcall, -EPIPE);
@@ -127,7 +127,7 @@ rpc_close_pipes(struct inode *inode)
 		rpci->ops = NULL;
 	}
 	rpc_inode_setowner(inode, NULL);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	cancel_delayed_work(&rpci->queue_timeout);
 	flush_scheduled_work();
 }
@@ -154,7 +154,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp)
 	struct rpc_inode *rpci = RPC_I(inode);
 	int res = -ENXIO;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	if (rpci->ops != NULL) {
 		if (filp->f_mode & FMODE_READ)
 			rpci->nreaders ++;
@@ -162,7 +162,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp)
 			rpci->nwriters ++;
 		res = 0;
 	}
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	return res;
 }
 
@@ -172,7 +172,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
 	struct rpc_inode *rpci = RPC_I(inode);
 	struct rpc_pipe_msg *msg;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	if (rpci->ops == NULL)
 		goto out;
 	msg = (struct rpc_pipe_msg *)filp->private_data;
@@ -190,7 +190,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
 	if (rpci->ops->release_pipe)
 		rpci->ops->release_pipe(inode);
 out:
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	return 0;
 }
 
@@ -202,7 +202,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
 	struct rpc_pipe_msg *msg;
 	int res = 0;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	if (rpci->ops == NULL) {
 		res = -EPIPE;
 		goto out_unlock;
@@ -229,7 +229,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
 		rpci->ops->destroy_msg(msg);
 	}
 out_unlock:
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	return res;
 }
 
@@ -240,11 +240,11 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of
 	struct rpc_inode *rpci = RPC_I(inode);
 	int res;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	res = -EPIPE;
 	if (rpci->ops != NULL)
 		res = rpci->ops->downcall(filp, buf, len);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	return res;
 }
 
@@ -322,7 +322,7 @@ rpc_info_open(struct inode *inode, struct file *file)
 
 	if (!ret) {
 		struct seq_file *m = file->private_data;
-		down(&inode->i_sem);
+		mutex_lock(&inode->i_mutex);
 		clnt = RPC_I(inode)->private;
 		if (clnt) {
 			atomic_inc(&clnt->cl_users);
@@ -331,7 +331,7 @@ rpc_info_open(struct inode *inode, struct file *file)
 			single_release(inode, file);
 			ret = -EINVAL;
 		}
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 	}
 	return ret;
 }
@@ -491,7 +491,7 @@ rpc_depopulate(struct dentry *parent)
 	struct dentry *dentry, *dvec[10];
 	int n = 0;
 
-	down(&dir->i_sem);
+	mutex_lock(&dir->i_mutex);
 repeat:
 	spin_lock(&dcache_lock);
 	list_for_each_safe(pos, next, &parent->d_subdirs) {
@@ -519,7 +519,7 @@ repeat:
 		} while (n);
 		goto repeat;
 	}
-	up(&dir->i_sem);
+	mutex_unlock(&dir->i_mutex);
 }
 
 static int
@@ -532,7 +532,7 @@ rpc_populate(struct dentry *parent,
 	struct dentry *dentry;
 	int mode, i;
 
-	down(&dir->i_sem);
+	mutex_lock(&dir->i_mutex);
 	for (i = start; i < eof; i++) {
 		dentry = d_alloc_name(parent, files[i].name);
 		if (!dentry)
@@ -552,10 +552,10 @@ rpc_populate(struct dentry *parent,
 			dir->i_nlink++;
 		d_add(dentry, inode);
 	}
-	up(&dir->i_sem);
+	mutex_unlock(&dir->i_mutex);
 	return 0;
 out_bad:
-	up(&dir->i_sem);
+	mutex_unlock(&dir->i_mutex);
 	printk(KERN_WARNING "%s: %s failed to populate directory %s\n",
 			__FILE__, __FUNCTION__, parent->d_name.name);
 	return -ENOMEM;
@@ -609,7 +609,7 @@ rpc_lookup_negative(char *path, struct nameidata *nd)
 	if ((error = rpc_lookup_parent(path, nd)) != 0)
 		return ERR_PTR(error);
 	dir = nd->dentry->d_inode;
-	down(&dir->i_sem);
+	mutex_lock(&dir->i_mutex);
 	dentry = lookup_hash(nd);
 	if (IS_ERR(dentry))
 		goto out_err;
@@ -620,7 +620,7 @@ rpc_lookup_negative(char *path, struct nameidata *nd)
 	}
 	return dentry;
 out_err:
-	up(&dir->i_sem);
+	mutex_unlock(&dir->i_mutex);
 	rpc_release_path(nd);
 	return dentry;
 }
@@ -646,7 +646,7 @@ rpc_mkdir(char *path, struct rpc_clnt *rpc_client)
 	if (error)
 		goto err_depopulate;
 out:
-	up(&dir->i_sem);
+	mutex_unlock(&dir->i_mutex);
 	rpc_release_path(&nd);
 	return dentry;
 err_depopulate:
@@ -671,7 +671,7 @@ rpc_rmdir(char *path)
 	if ((error = rpc_lookup_parent(path, &nd)) != 0)
 		return error;
 	dir = nd.dentry->d_inode;
-	down(&dir->i_sem);
+	mutex_lock(&dir->i_mutex);
 	dentry = lookup_hash(&nd);
 	if (IS_ERR(dentry)) {
 		error = PTR_ERR(dentry);
@@ -681,7 +681,7 @@ rpc_rmdir(char *path)
 	error = __rpc_rmdir(dir, dentry);
 	dput(dentry);
 out_release:
-	up(&dir->i_sem);
+	mutex_unlock(&dir->i_mutex);
 	rpc_release_path(&nd);
 	return error;
 }
@@ -710,7 +710,7 @@ rpc_mkpipe(char *path, void *private, struct rpc_pipe_ops *ops, int flags)
 	rpci->ops = ops;
 	inode_dir_notify(dir, DN_CREATE);
 out:
-	up(&dir->i_sem);
+	mutex_unlock(&dir->i_mutex);
 	rpc_release_path(&nd);
 	return dentry;
 err_dput:
@@ -732,7 +732,7 @@ rpc_unlink(char *path)
 	if ((error = rpc_lookup_parent(path, &nd)) != 0)
 		return error;
 	dir = nd.dentry->d_inode;
-	down(&dir->i_sem);
+	mutex_lock(&dir->i_mutex);
 	dentry = lookup_hash(&nd);
 	if (IS_ERR(dentry)) {
 		error = PTR_ERR(dentry);
@@ -746,7 +746,7 @@ rpc_unlink(char *path)
 	dput(dentry);
 	inode_dir_notify(dir, DN_DELETE);
 out_release:
-	up(&dir->i_sem);
+	mutex_unlock(&dir->i_mutex);
 	rpc_release_path(&nd);
 	return error;
 }
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 5f6ae79b8b16..1b5989b1b670 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -784,7 +784,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
 		if (err)
 			goto out_mknod_dput;
-		up(&nd.dentry->d_inode->i_sem);
+		mutex_unlock(&nd.dentry->d_inode->i_mutex);
 		dput(nd.dentry);
 		nd.dentry = dentry;
 
@@ -823,7 +823,7 @@ out:
 out_mknod_dput:
 	dput(dentry);
 out_mknod_unlock:
-	up(&nd.dentry->d_inode->i_sem);
+	mutex_unlock(&nd.dentry->d_inode->i_mutex);
 	path_release(&nd);
 out_mknod_parent:
 	if (err==-EEXIST)
diff --git a/security/inode.c b/security/inode.c
index a5964502ae30..0f77b0223662 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -172,7 +172,7 @@ static int create_by_name(const char *name, mode_t mode,
 		return -EFAULT;
 	}
 
-	down(&parent->d_inode->i_sem);
+	mutex_lock(&parent->d_inode->i_mutex);
 	*dentry = lookup_one_len(name, parent, strlen(name));
 	if (!IS_ERR(dentry)) {
 		if ((mode & S_IFMT) == S_IFDIR)
@@ -181,7 +181,7 @@ static int create_by_name(const char *name, mode_t mode,
 			error = create(parent->d_inode, *dentry, mode);
 	} else
 		error = PTR_ERR(dentry);
-	up(&parent->d_inode->i_sem);
+	mutex_unlock(&parent->d_inode->i_mutex);
 
 	return error;
 }
@@ -302,7 +302,7 @@ void securityfs_remove(struct dentry *dentry)
 	if (!parent || !parent->d_inode)
 		return;
 
-	down(&parent->d_inode->i_sem);
+	mutex_lock(&parent->d_inode->i_mutex);
 	if (positive(dentry)) {
 		if (dentry->d_inode) {
 			if (S_ISDIR(dentry->d_inode->i_mode))
@@ -312,7 +312,7 @@ void securityfs_remove(struct dentry *dentry)
 			dput(dentry);
 		}
 	}
-	up(&parent->d_inode->i_sem);
+	mutex_unlock(&parent->d_inode->i_mutex);
 	simple_release_fs(&mount, &mount_count);
 }
 EXPORT_SYMBOL_GPL(securityfs_remove);
diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c
index 16df1246a131..7fd072392c7e 100644
--- a/sound/core/oss/pcm_oss.c
+++ b/sound/core/oss/pcm_oss.c
@@ -2135,9 +2135,7 @@ static ssize_t snd_pcm_oss_write(struct file *file, const char __user *buf, size
 	substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK];
 	if (substream == NULL)
 		return -ENXIO;
-	up(&file->f_dentry->d_inode->i_sem);
 	result = snd_pcm_oss_write1(substream, buf, count);
-	down(&file->f_dentry->d_inode->i_sem);
 #ifdef OSS_DEBUG
 	printk("pcm_oss: write %li bytes (wrote %li bytes)\n", (long)count, (long)result);
 #endif
diff --git a/sound/core/seq/seq_memory.c b/sound/core/seq/seq_memory.c
index 9ee6c177db0c..40b4f679c80e 100644
--- a/sound/core/seq/seq_memory.c
+++ b/sound/core/seq/seq_memory.c
@@ -32,10 +32,6 @@
 #include "seq_info.h"
 #include "seq_lock.h"
 
-/* semaphore in struct file record */
-#define semaphore_of(fp)	((fp)->f_dentry->d_inode->i_sem)
-
-
 static inline int snd_seq_pool_available(struct snd_seq_pool *pool)
 {
 	return pool->total_elements - atomic_read(&pool->counter);
-- 
cgit v1.2.3-71-gd317


From 7892f2f48d165a34b0b8130c8a195dfd807b8cb6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 9 Jan 2006 15:59:25 -0800
Subject: [PATCH] mutex subsystem, semaphore to mutex: VFS, sb->s_lock

This patch converts the superblock-lock semaphore to a mutex, affecting
lock_super()/unlock_super(). Tested on ext3 and XFS.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/ext3/super.c    | 2 +-
 fs/ocfs2/super.c   | 2 +-
 fs/super.c         | 2 +-
 include/linux/fs.h | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index c3dbebdb9897..56bf76586019 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2150,7 +2150,7 @@ int ext3_force_commit(struct super_block *sb)
 
 static void ext3_write_super (struct super_block * sb)
 {
-	if (down_trylock(&sb->s_lock) == 0)
+	if (mutex_trylock(&sb->s_lock) != 0)
 		BUG();
 	sb->s_dirt = 0;
 }
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 48bf7f0ce544..364d64bd5f10 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -169,7 +169,7 @@ static match_table_t tokens = {
  */
 static void ocfs2_write_super(struct super_block *sb)
 {
-	if (down_trylock(&sb->s_lock) == 0)
+	if (mutex_trylock(&sb->s_lock) != 0)
 		BUG();
 	sb->s_dirt = 0;
 }
diff --git a/fs/super.c b/fs/super.c
index 0a30e51692cf..c177b92419c5 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -72,7 +72,7 @@ static struct super_block *alloc_super(void)
 		INIT_HLIST_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
 		init_rwsem(&s->s_umount);
-		sema_init(&s->s_lock, 1);
+		mutex_init(&s->s_lock);
 		down_write(&s->s_umount);
 		s->s_count = S_BIAS;
 		atomic_set(&s->s_active, 1);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 01654b218e42..92ae3e2067b0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -821,7 +821,7 @@ struct super_block {
 	unsigned long		s_magic;
 	struct dentry		*s_root;
 	struct rw_semaphore	s_umount;
-	struct semaphore	s_lock;
+	struct mutex		s_lock;
 	int			s_count;
 	int			s_syncing;
 	int			s_need_sync_fs;
@@ -893,13 +893,13 @@ static inline int has_fs_excl(void)
 static inline void lock_super(struct super_block * sb)
 {
 	get_fs_excl();
-	down(&sb->s_lock);
+	mutex_lock(&sb->s_lock);
 }
 
 static inline void unlock_super(struct super_block * sb)
 {
 	put_fs_excl();
-	up(&sb->s_lock);
+	mutex_unlock(&sb->s_lock);
 }
 
 /*
-- 
cgit v1.2.3-71-gd317


From f36d4024caa3790606e43228a574157c45b73b22 Mon Sep 17 00:00:00 2001
From: Aleksey Makarov <amakarov@ru.mvista.com>
Date: Mon, 9 Jan 2006 15:59:27 -0800
Subject: [PATCH] mutex subsystem, semaphore to completion: IDE
 ->gendev_rel_sem

The patch changes semaphores that are initialized as
locked to complete().

Source: MontaVista Software, Inc.

Modified-by: Steven Rostedt <rostedt@goodmis.org>

The following patch is from Montavista.  I modified it slightly.
Semaphores are currently being used where it makes more sense for
completions.  This patch corrects that.

Signed-off-by: Aleksey Makarov <amakarov@ru.mvista.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/ide/ide-probe.c | 4 ++--
 drivers/ide/ide.c       | 8 ++++----
 include/linux/ide.h     | 5 +++--
 3 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 1ddaa71a8f45..7cb2d86601db 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -655,7 +655,7 @@ static void hwif_release_dev (struct device *dev)
 {
 	ide_hwif_t *hwif = container_of(dev, ide_hwif_t, gendev);
 
-	up(&hwif->gendev_rel_sem);
+	complete(&hwif->gendev_rel_comp);
 }
 
 static void hwif_register (ide_hwif_t *hwif)
@@ -1327,7 +1327,7 @@ static void drive_release_dev (struct device *dev)
 	drive->queue = NULL;
 	spin_unlock_irq(&ide_lock);
 
-	up(&drive->gendev_rel_sem);
+	complete(&drive->gendev_rel_comp);
 }
 
 /*
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index b069b13b75a7..ec5a4cb173b0 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -222,7 +222,7 @@ static void init_hwif_data(ide_hwif_t *hwif, unsigned int index)
 	hwif->mwdma_mask = 0x80;	/* disable all mwdma */
 	hwif->swdma_mask = 0x80;	/* disable all swdma */
 
-	sema_init(&hwif->gendev_rel_sem, 0);
+	init_completion(&hwif->gendev_rel_comp);
 
 	default_hwif_iops(hwif);
 	default_hwif_transport(hwif);
@@ -245,7 +245,7 @@ static void init_hwif_data(ide_hwif_t *hwif, unsigned int index)
 		drive->is_flash			= 0;
 		drive->vdma			= 0;
 		INIT_LIST_HEAD(&drive->list);
-		sema_init(&drive->gendev_rel_sem, 0);
+		init_completion(&drive->gendev_rel_comp);
 	}
 }
 
@@ -602,7 +602,7 @@ void ide_unregister(unsigned int index)
 		}
 		spin_unlock_irq(&ide_lock);
 		device_unregister(&drive->gendev);
-		down(&drive->gendev_rel_sem);
+		wait_for_completion(&drive->gendev_rel_comp);
 		spin_lock_irq(&ide_lock);
 	}
 	hwif->present = 0;
@@ -662,7 +662,7 @@ void ide_unregister(unsigned int index)
 	/* More messed up locking ... */
 	spin_unlock_irq(&ide_lock);
 	device_unregister(&hwif->gendev);
-	down(&hwif->gendev_rel_sem);
+	wait_for_completion(&hwif->gendev_rel_comp);
 
 	/*
 	 * Remove us from the kernel's knowledge
diff --git a/include/linux/ide.h b/include/linux/ide.h
index ef8d0cbb832f..9a8c05dbe4f3 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -18,6 +18,7 @@
 #include <linux/bio.h>
 #include <linux/device.h>
 #include <linux/pci.h>
+#include <linux/completion.h>
 #include <asm/byteorder.h>
 #include <asm/system.h>
 #include <asm/io.h>
@@ -638,7 +639,7 @@ typedef struct ide_drive_s {
 	int		crc_count;	/* crc counter to reduce drive speed */
 	struct list_head list;
 	struct device	gendev;
-	struct semaphore gendev_rel_sem;	/* to deal with device release() */
+	struct completion gendev_rel_comp;	/* to deal with device release() */
 } ide_drive_t;
 
 #define to_ide_device(dev)container_of(dev, ide_drive_t, gendev)
@@ -794,7 +795,7 @@ typedef struct hwif_s {
 	unsigned	sg_mapped  : 1;	/* sg_table and sg_nents are ready */
 
 	struct device	gendev;
-	struct semaphore gendev_rel_sem; /* To deal with device release() */
+	struct completion gendev_rel_comp; /* To deal with device release() */
 
 	void		*hwif_data;	/* extra hwif data */
 
-- 
cgit v1.2.3-71-gd317


From 11b751ae8c8ca3fa24c85bd5a3e51dd9f95cda17 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 9 Jan 2006 15:59:27 -0800
Subject: [PATCH] mutex subsystem, semaphore to completion:
 drivers/block/loop.c

convert the block loop device from semaphores to completions.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/block/loop.c | 27 ++++++++++++---------------
 include/linux/loop.h |  4 ++--
 2 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index bed9ad76c04c..864729046e22 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -527,12 +527,12 @@ static int loop_make_request(request_queue_t *q, struct bio *old_bio)
 	lo->lo_pending++;
 	loop_add_bio(lo, old_bio);
 	spin_unlock_irq(&lo->lo_lock);
-	up(&lo->lo_bh_mutex);
+	complete(&lo->lo_bh_done);
 	return 0;
 
 out:
 	if (lo->lo_pending == 0)
-		up(&lo->lo_bh_mutex);
+		complete(&lo->lo_bh_done);
 	spin_unlock_irq(&lo->lo_lock);
 	bio_io_error(old_bio, old_bio->bi_size);
 	return 0;
@@ -593,23 +593,20 @@ static int loop_thread(void *data)
 	lo->lo_pending = 1;
 
 	/*
-	 * up sem, we are running
+	 * complete it, we are running
 	 */
-	up(&lo->lo_sem);
+	complete(&lo->lo_done);
 
 	for (;;) {
 		int pending;
 
-		/*
-		 * interruptible just to not contribute to load avg
-		 */
-		if (down_interruptible(&lo->lo_bh_mutex))
+		if (wait_for_completion_interruptible(&lo->lo_bh_done))
 			continue;
 
 		spin_lock_irq(&lo->lo_lock);
 
 		/*
-		 * could be upped because of tear-down, not pending work
+		 * could be completed because of tear-down, not pending work
 		 */
 		if (unlikely(!lo->lo_pending)) {
 			spin_unlock_irq(&lo->lo_lock);
@@ -632,7 +629,7 @@ static int loop_thread(void *data)
 			break;
 	}
 
-	up(&lo->lo_sem);
+	complete(&lo->lo_done);
 	return 0;
 }
 
@@ -843,7 +840,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 	set_blocksize(bdev, lo_blocksize);
 
 	kernel_thread(loop_thread, lo, CLONE_KERNEL);
-	down(&lo->lo_sem);
+	wait_for_completion(&lo->lo_done);
 	return 0;
 
  out_putf:
@@ -909,10 +906,10 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 	lo->lo_state = Lo_rundown;
 	lo->lo_pending--;
 	if (!lo->lo_pending)
-		up(&lo->lo_bh_mutex);
+		complete(&lo->lo_bh_done);
 	spin_unlock_irq(&lo->lo_lock);
 
-	down(&lo->lo_sem);
+	wait_for_completion(&lo->lo_done);
 
 	lo->lo_backing_file = NULL;
 
@@ -1289,8 +1286,8 @@ static int __init loop_init(void)
 		if (!lo->lo_queue)
 			goto out_mem4;
 		init_MUTEX(&lo->lo_ctl_mutex);
-		init_MUTEX_LOCKED(&lo->lo_sem);
-		init_MUTEX_LOCKED(&lo->lo_bh_mutex);
+		init_completion(&lo->lo_done);
+		init_completion(&lo->lo_bh_done);
 		lo->lo_number = i;
 		spin_lock_init(&lo->lo_lock);
 		disk->major = LOOP_MAJOR;
diff --git a/include/linux/loop.h b/include/linux/loop.h
index 40f63c9879d2..f96506782ebe 100644
--- a/include/linux/loop.h
+++ b/include/linux/loop.h
@@ -58,9 +58,9 @@ struct loop_device {
 	struct bio 		*lo_bio;
 	struct bio		*lo_biotail;
 	int			lo_state;
-	struct semaphore	lo_sem;
+	struct completion	lo_done;
+	struct completion	lo_bh_done;
 	struct semaphore	lo_ctl_mutex;
-	struct semaphore	lo_bh_mutex;
 	int			lo_pending;
 
 	request_queue_t		*lo_queue;
-- 
cgit v1.2.3-71-gd317


From 8b4ad5e3ff94409973e824716c65568f0d97364c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Mon, 9 Jan 2006 21:38:23 -0800
Subject: [MUTEX]: linux/mutex-debug.h needs linux/linkage.h

For FASTCALL() define.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mutex-debug.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mutex-debug.h b/include/linux/mutex-debug.h
index 0ccd8f983b50..8138d9eb58ec 100644
--- a/include/linux/mutex-debug.h
+++ b/include/linux/mutex-debug.h
@@ -1,6 +1,8 @@
 #ifndef __LINUX_MUTEX_DEBUG_H
 #define __LINUX_MUTEX_DEBUG_H
 
+#include <linux/linkage.h>
+
 /*
  * Mutexes - debugging helpers:
  */
-- 
cgit v1.2.3-71-gd317


From e329113ca437e44ec399b7ffe114ed36e84ccf5e Mon Sep 17 00:00:00 2001
From: Ben Gardner <gardner.ben@gmail.com>
Date: Mon, 9 Jan 2006 20:51:29 -0800
Subject: [PATCH] i386: GPIO driver for AMD CS5535/CS5536

A simple driver for the CS5535 and CS5536 that allows a user-space program
to manipulate GPIO pins.  The CS5535/CS5536 chips are Geode processor
companion devices.

Signed-off-by: Ben Gardner <bgardner@wabtec.com>
Signed-off-by: Richard Knutsson <ricknu-0@student.ltu.se>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/Kconfig                |   9 ++
 drivers/char/Makefile               |   1 +
 drivers/char/cs5535_gpio.c          | 250 ++++++++++++++++++++++++++++++++++++
 include/linux/pci_ids.h             |   9 ++
 sound/pci/cs5535audio/cs5535audio.c |   2 +-
 5 files changed, 270 insertions(+), 1 deletion(-)
 create mode 100644 drivers/char/cs5535_gpio.c

(limited to 'include/linux')

diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index dd7e6901c575..77286eb5826d 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -936,6 +936,15 @@ config SCx200_GPIO
 
 	  If compiled as a module, it will be called scx200_gpio.
 
+config CS5535_GPIO
+	tristate "AMD CS5535/CS5536 GPIO (Geode Companion Device)"
+	depends on X86_32
+	help
+	  Give userspace access to the GPIO pins on the AMD CS5535 and
+	  CS5536 Geode companion devices.
+
+	  If compiled as a module, it will be called cs5535_gpio.
+
 config GPIO_VR41XX
 	tristate "NEC VR4100 series General-purpose I/O Unit support"
 	depends on CPU_VR41XX
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index d973d14d8f7f..503dd901d406 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_PPDEV) += ppdev.o
 obj-$(CONFIG_NWBUTTON) += nwbutton.o
 obj-$(CONFIG_NWFLASH) += nwflash.o
 obj-$(CONFIG_SCx200_GPIO) += scx200_gpio.o
+obj-$(CONFIG_CS5535_GPIO) += cs5535_gpio.o
 obj-$(CONFIG_GPIO_VR41XX) += vr41xx_giu.o
 obj-$(CONFIG_TANBAC_TB0219) += tb0219.o
 obj-$(CONFIG_TELCLOCK) += tlclk.o
diff --git a/drivers/char/cs5535_gpio.c b/drivers/char/cs5535_gpio.c
new file mode 100644
index 000000000000..5d72f50de1ac
--- /dev/null
+++ b/drivers/char/cs5535_gpio.c
@@ -0,0 +1,250 @@
+/*
+ * AMD CS5535/CS5536 GPIO driver.
+ * Allows a user space process to play with the GPIO pins.
+ *
+ * Copyright (c) 2005 Ben Gardner <bgardner@wabtec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the smems of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/cdev.h>
+#include <linux/ioport.h>
+#include <linux/pci.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+
+#define NAME			"cs5535_gpio"
+
+MODULE_AUTHOR("Ben Gardner <bgardner@wabtec.com>");
+MODULE_DESCRIPTION("AMD CS5535/CS5536 GPIO Pin Driver");
+MODULE_LICENSE("GPL");
+
+static int major;
+module_param(major, int, 0);
+MODULE_PARM_DESC(major, "Major device number");
+
+static ulong mask;
+module_param(mask, ulong, 0);
+MODULE_PARM_DESC(mask, "GPIO channel mask");
+
+#define MSR_LBAR_GPIO		0x5140000C
+
+static u32 gpio_base;
+
+static struct pci_device_id divil_pci[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_NS,  PCI_DEVICE_ID_NS_CS5535_ISA) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA) },
+	{ } /* NULL entry */
+};
+
+static struct cdev cs5535_gpio_cdev;
+
+/* reserve 32 entries even though some aren't usable */
+#define CS5535_GPIO_COUNT	32
+
+/* IO block size */
+#define CS5535_GPIO_SIZE	256
+
+struct gpio_regmap {
+	u32	rd_offset;
+	u32	wr_offset;
+	char	on;
+	char	off;
+};
+static struct gpio_regmap rm[] =
+{
+	{ 0x30, 0x00, '1', '0' },	/* GPIOx_READ_BACK / GPIOx_OUT_VAL */
+	{ 0x20, 0x20, 'I', 'i' },	/* GPIOx_IN_EN */
+	{ 0x04, 0x04, 'O', 'o' },	/* GPIOx_OUT_EN */
+	{ 0x08, 0x08, 't', 'T' },	/* GPIOx_OUT_OD_EN */
+	{ 0x18, 0x18, 'P', 'p' },	/* GPIOx_OUT_PU_EN */
+	{ 0x1c, 0x1c, 'D', 'd' },	/* GPIOx_OUT_PD_EN */
+};
+
+
+/**
+ * Gets the register offset for the GPIO bank.
+ * Low (0-15) starts at 0x00, high (16-31) starts at 0x80
+ */
+static inline u32 cs5535_lowhigh_base(int reg)
+{
+	return (reg & 0x10) << 3;
+}
+
+static ssize_t cs5535_gpio_write(struct file *file, const char __user *data,
+				 size_t len, loff_t *ppos)
+{
+	u32	m = iminor(file->f_dentry->d_inode);
+	int	i, j;
+	u32	base = gpio_base + cs5535_lowhigh_base(m);
+	u32	m0, m1;
+	char	c;
+
+	/**
+	 * Creates the mask for atomic bit programming.
+	 * The high 16 bits and the low 16 bits are used to set the mask.
+	 * For example, GPIO 15 maps to 31,15: 0,1 => On; 1,0=> Off
+	 */
+	m1 = 1 << (m & 0x0F);
+	m0 = m1 << 16;
+
+	for (i = 0; i < len; ++i) {
+		if (get_user(c, data+i))
+			return -EFAULT;
+
+		for (j = 0; j < ARRAY_SIZE(rm); j++) {
+			if (c == rm[j].on) {
+				outl(m1, base + rm[j].wr_offset);
+				break;
+			} else if (c == rm[j].off) {
+				outl(m0, base + rm[j].wr_offset);
+				break;
+			}
+		}
+	}
+	*ppos = 0;
+	return len;
+}
+
+static ssize_t cs5535_gpio_read(struct file *file, char __user *buf,
+				size_t len, loff_t *ppos)
+{
+	u32	m = iminor(file->f_dentry->d_inode);
+	u32	base = gpio_base + cs5535_lowhigh_base(m);
+	int	rd_bit = 1 << (m & 0x0f);
+	int	i;
+	char	ch;
+	ssize_t	count = 0;
+
+	if (*ppos >= ARRAY_SIZE(rm))
+		return 0;
+
+	for (i = *ppos; (i < (*ppos + len)) && (i < ARRAY_SIZE(rm)); i++) {
+		ch = (inl(base + rm[i].rd_offset) & rd_bit) ?
+		     rm[i].on : rm[i].off;
+
+		if (put_user(ch, buf+count))
+			return -EFAULT;
+
+		count++;
+	}
+
+	/* add a line-feed if there is room */
+	if ((i == ARRAY_SIZE(rm)) && (count < len)) {
+		put_user('\n', buf + count);
+		count++;
+	}
+
+	*ppos += count;
+	return count;
+}
+
+static int cs5535_gpio_open(struct inode *inode, struct file *file)
+{
+	u32 m = iminor(inode);
+
+	/* the mask says which pins are usable by this driver */
+	if ((mask & (1 << m)) == 0)
+		return -EINVAL;
+
+	return nonseekable_open(inode, file);
+}
+
+static struct file_operations cs5535_gpio_fops = {
+	.owner	= THIS_MODULE,
+	.write	= cs5535_gpio_write,
+	.read	= cs5535_gpio_read,
+	.open	= cs5535_gpio_open
+};
+
+static int __init cs5535_gpio_init(void)
+{
+	dev_t	dev_id;
+	u32	low, hi;
+	int	retval;
+
+	if (pci_dev_present(divil_pci) == 0) {
+		printk(KERN_WARNING NAME ": DIVIL not found\n");
+		return -ENODEV;
+	}
+
+	/* Grab the GPIO I/O range */
+	rdmsr(MSR_LBAR_GPIO, low, hi);
+
+	/* Check the mask and whether GPIO is enabled (sanity check) */
+	if (hi != 0x0000f001) {
+		printk(KERN_WARNING NAME ": GPIO not enabled\n");
+		return -ENODEV;
+	}
+
+	/* Mask off the IO base address */
+	gpio_base = low & 0x0000ff00;
+
+	/**
+	 * Some GPIO pins
+	 *  31-29,23 : reserved (always mask out)
+	 *  28       : Power Button
+	 *  26       : PME#
+	 *  22-16    : LPC
+	 *  14,15    : SMBus
+	 *  9,8      : UART1
+	 *  7        : PCI INTB
+	 *  3,4      : UART2/DDC
+	 *  2        : IDE_IRQ0
+	 *  0        : PCI INTA
+	 *
+	 * If a mask was not specified, be conservative and only allow:
+	 *  1,2,5,6,10-13,24,25,27
+	 */
+	if (mask != 0)
+		mask &= 0x1f7fffff;
+	else
+		mask = 0x0b003c66;
+
+	if (request_region(gpio_base, CS5535_GPIO_SIZE, NAME) == 0) {
+		printk(KERN_ERR NAME ": can't allocate I/O for GPIO\n");
+		return -ENODEV;
+	}
+
+	if (major) {
+		dev_id = MKDEV(major, 0);
+		retval = register_chrdev_region(dev_id, CS5535_GPIO_COUNT,
+						NAME);
+	} else {
+		retval = alloc_chrdev_region(&dev_id, 0, CS5535_GPIO_COUNT,
+					     NAME);
+		major = MAJOR(dev_id);
+	}
+
+	if (retval) {
+		release_region(gpio_base, CS5535_GPIO_SIZE);
+		return -1;
+	}
+
+	printk(KERN_DEBUG NAME ": base=%#x mask=%#lx major=%d\n",
+	       gpio_base, mask, major);
+
+	cdev_init(&cs5535_gpio_cdev, &cs5535_gpio_fops);
+	cdev_add(&cs5535_gpio_cdev, dev_id, CS5535_GPIO_COUNT);
+
+	return 0;
+}
+
+static void __exit cs5535_gpio_cleanup(void)
+{
+	dev_t dev_id = MKDEV(major, 0);
+	unregister_chrdev_region(dev_id, CS5535_GPIO_COUNT);
+	if (gpio_base != 0)
+		release_region(gpio_base, CS5535_GPIO_SIZE);
+}
+
+module_init(cs5535_gpio_init);
+module_exit(cs5535_gpio_cleanup);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index c3caa93efb10..100eba0f4771 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -377,6 +377,7 @@
 #define PCI_DEVICE_ID_NS_87560_USB	0x0012
 #define PCI_DEVICE_ID_NS_83815		0x0020
 #define PCI_DEVICE_ID_NS_83820		0x0022
+#define PCI_DEVICE_ID_NS_CS5535_ISA	0x002b
 #define PCI_DEVICE_ID_NS_CS5535_IDE	0x002d
 #define PCI_DEVICE_ID_NS_CS5535_AUDIO	0x002e
 #define PCI_DEVICE_ID_NS_CS5535_USB	0x002f
@@ -500,6 +501,14 @@
 #define PCI_DEVICE_ID_AMD_8111_AUDIO	0x746d
 #define PCI_DEVICE_ID_AMD_8151_0	0x7454
 #define PCI_DEVICE_ID_AMD_8131_APIC     0x7450
+#define PCI_DEVICE_ID_AMD_CS5536_ISA    0x2090
+#define PCI_DEVICE_ID_AMD_CS5536_FLASH  0x2091
+#define PCI_DEVICE_ID_AMD_CS5536_AUDIO  0x2093
+#define PCI_DEVICE_ID_AMD_CS5536_OHC    0x2094
+#define PCI_DEVICE_ID_AMD_CS5536_EHC    0x2095
+#define PCI_DEVICE_ID_AMD_CS5536_UDC    0x2096
+#define PCI_DEVICE_ID_AMD_CS5536_UOC    0x2097
+#define PCI_DEVICE_ID_AMD_CS5536_IDE    0x209A
 
 #define PCI_DEVICE_ID_AMD_CS5536_IDE	0x209A
 
diff --git a/sound/pci/cs5535audio/cs5535audio.c b/sound/pci/cs5535audio/cs5535audio.c
index 202c7cf3e328..f36ede827479 100644
--- a/sound/pci/cs5535audio/cs5535audio.c
+++ b/sound/pci/cs5535audio/cs5535audio.c
@@ -385,7 +385,7 @@ static struct pci_driver driver = {
 
 static int __init alsa_card_cs5535audio_init(void)
 {
-	return pci_module_init(&driver);
+	return pci_register_driver(&driver);
 }
 
 static void __exit alsa_card_cs5535audio_exit(void)
-- 
cgit v1.2.3-71-gd317


From 0ad42352c01788e41a33336577fdd270d8de55bb Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 9 Jan 2006 20:51:31 -0800
Subject: [PATCH] Add list_for_each_entry_safe_reverse()

Add list_for_each_entry_safe_reverse() to linux/list.h

This is needed by unmerged cachefs and be an as-yet-unreviewed
device_shutdown() fix.

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Patrick Mochel <mochel@digitalimplant.org>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/list.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 8e3388284530..945daa1f13dd 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -435,6 +435,20 @@ static inline void list_splice_init(struct list_head *list,
 	     &pos->member != (head);						\
 	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
 
+/**
+ * list_for_each_entry_safe_reverse - iterate backwards over list of given type safe against
+ *				      removal of list entry
+ * @pos:	the type * to use as a loop counter.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe_reverse(pos, n, head, member)		\
+	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
+		n = list_entry(pos->member.prev, typeof(*pos), member);	\
+	     &pos->member != (head); 					\
+	     pos = n, n = list_entry(n->member.prev, typeof(*n), member))
+
 /**
  * list_for_each_rcu	-	iterate over an rcu-protected list
  * @pos:	the &struct list_head to use as a loop counter.
-- 
cgit v1.2.3-71-gd317


From df2e71fb9115a8d4f721fb1464db09adc8332bc5 Mon Sep 17 00:00:00 2001
From: "akpm@osdl.org" <akpm@osdl.org>
Date: Mon, 9 Jan 2006 20:51:37 -0800
Subject: [PATCH] dump_thread() cleanup

)

From: Adrian Bunk <bunk@stusta.de>

- create one common dump_thread() prototype in kernel.h

- dump_thread() is only used in fs/binfmt_aout.c and can therefore be
  removed on all architectures where CONFIG_BINFMT_AOUT is not
  available

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/alpha_ksyms.c     |  1 -
 arch/alpha/mm/init.c                |  1 +
 arch/arm26/kernel/armksyms.c        |  1 -
 arch/cris/kernel/crisksyms.c        |  2 --
 arch/cris/kernel/process.c          | 28 ----------------------
 arch/frv/kernel/frv_ksyms.c         |  2 --
 arch/frv/kernel/process.c           | 22 ------------------
 arch/h8300/kernel/h8300_ksyms.c     |  3 ---
 arch/h8300/kernel/process.c         | 28 ----------------------
 arch/m32r/kernel/m32r_ksyms.c       |  3 ---
 arch/m32r/kernel/process.c          |  8 -------
 arch/m68k/kernel/m68k_ksyms.c       |  2 --
 arch/m68knommu/kernel/m68k_ksyms.c  |  2 --
 arch/m68knommu/kernel/process.c     | 46 -------------------------------------
 arch/s390/kernel/process.c          | 21 -----------------
 arch/sh/kernel/process.c            | 20 ----------------
 arch/sh/kernel/sh_ksyms.c           |  2 --
 arch/sh64/kernel/process.c          | 20 ----------------
 arch/sh64/kernel/sh_ksyms.c         |  2 --
 arch/sparc/kernel/sparc_ksyms.c     |  2 --
 arch/sparc64/kernel/binfmt_aout32.c |  2 --
 arch/sparc64/kernel/sparc64_ksyms.c |  2 --
 arch/v850/kernel/process.c          | 24 -------------------
 arch/v850/kernel/v850_ksyms.c       |  2 --
 fs/binfmt_aout.c                    |  2 --
 fs/binfmt_flat.c                    |  2 --
 include/asm-um/processor-generic.h  |  1 -
 include/linux/kernel.h              |  4 ++++
 28 files changed, 5 insertions(+), 250 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/alpha_ksyms.c b/arch/alpha/kernel/alpha_ksyms.c
index f3e98f837784..1898ea79d0e2 100644
--- a/arch/alpha/kernel/alpha_ksyms.c
+++ b/arch/alpha/kernel/alpha_ksyms.c
@@ -40,7 +40,6 @@
 #include <asm/unistd.h>
 
 extern struct hwrpb_struct *hwrpb;
-extern void dump_thread(struct pt_regs *, struct user *);
 extern spinlock_t rtc_lock;
 
 /* these are C runtime functions with special calling conventions: */
diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 90752f6d8867..486d7945583d 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -7,6 +7,7 @@
 /* 2.3.x zone allocator, 1999 Andrea Arcangeli <andrea@suse.de> */
 
 #include <linux/config.h>
+#include <linux/pagemap.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
diff --git a/arch/arm26/kernel/armksyms.c b/arch/arm26/kernel/armksyms.c
index 35514b398e2e..811a6376c624 100644
--- a/arch/arm26/kernel/armksyms.c
+++ b/arch/arm26/kernel/armksyms.c
@@ -35,7 +35,6 @@
 #include <asm/checksum.h>
 #include <asm/mach-types.h>
 
-extern void dump_thread(struct pt_regs *, struct user *);
 extern int dump_fpu(struct pt_regs *, struct user_fp_struct *);
 extern void inswb(unsigned int port, void *to, int len);
 extern void outswb(unsigned int port, const void *to, int len);
diff --git a/arch/cris/kernel/crisksyms.c b/arch/cris/kernel/crisksyms.c
index 85833d704ebb..de39725da920 100644
--- a/arch/cris/kernel/crisksyms.c
+++ b/arch/cris/kernel/crisksyms.c
@@ -21,7 +21,6 @@
 #include <asm/pgtable.h>
 #include <asm/fasttimer.h>
 
-extern void dump_thread(struct pt_regs *, struct user *);
 extern unsigned long get_cmos_time(void);
 extern void __Udiv(void);
 extern void __Umod(void);
@@ -33,7 +32,6 @@ extern void __lshrdi3(void);
 extern void iounmap(volatile void * __iomem);
 
 /* Platform dependent support */
-EXPORT_SYMBOL(dump_thread);
 EXPORT_SYMBOL(kernel_thread);
 EXPORT_SYMBOL(get_cmos_time);
 EXPORT_SYMBOL(loops_per_usec);
diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c
index 7c80afb10460..4ab3e87115b6 100644
--- a/arch/cris/kernel/process.c
+++ b/arch/cris/kernel/process.c
@@ -257,34 +257,6 @@ void flush_thread(void)
 {
 }
 
-/*
- * fill in the user structure for a core dump..
- */
-void dump_thread(struct pt_regs * regs, struct user * dump)
-{
-#if 0
-	int i;
-
-	/* changed the size calculations - should hopefully work better. lbt */
-	dump->magic = CMAGIC;
-	dump->start_code = 0;
-	dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
-	dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
-	dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
-	dump->u_dsize -= dump->u_tsize;
-	dump->u_ssize = 0;
-	for (i = 0; i < 8; i++)
-		dump->u_debugreg[i] = current->debugreg[i];  
-
-	if (dump->start_stack < TASK_SIZE)
-		dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
-
-	dump->regs = *regs;
-
-	dump->u_fpvalid = dump_fpu (regs, &dump->i387);
-#endif 
-}
-
 /* Fill in the fpu structure for a core dump. */
 int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu)
 {
diff --git a/arch/frv/kernel/frv_ksyms.c b/arch/frv/kernel/frv_ksyms.c
index 5f118c89d091..0f1c6cbc4f50 100644
--- a/arch/frv/kernel/frv_ksyms.c
+++ b/arch/frv/kernel/frv_ksyms.c
@@ -18,7 +18,6 @@
 #include <asm/hardirq.h>
 #include <asm/cacheflush.h>
 
-extern void dump_thread(struct pt_regs *, struct user *);
 extern long __memcpy_user(void *dst, const void *src, size_t count);
 extern long __memset_user(void *dst, const void *src, size_t count);
 
@@ -27,7 +26,6 @@ extern long __memset_user(void *dst, const void *src, size_t count);
 EXPORT_SYMBOL(__ioremap);
 EXPORT_SYMBOL(iounmap);
 
-EXPORT_SYMBOL(dump_thread);
 EXPORT_SYMBOL(strnlen);
 EXPORT_SYMBOL(strrchr);
 EXPORT_SYMBOL(strstr);
diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c
index 54a452136f00..c4488379ac3b 100644
--- a/arch/frv/kernel/process.c
+++ b/arch/frv/kernel/process.c
@@ -243,28 +243,6 @@ int copy_thread(int nr, unsigned long clone_flags,
 	return 0;
 } /* end copy_thread() */
 
-/*
- * fill in the user structure for a core dump..
- */
-void dump_thread(struct pt_regs *regs, struct user *dump)
-{
-#if 0
-	/* changed the size calculations - should hopefully work better. lbt */
-	dump->magic = CMAGIC;
-	dump->start_code = 0;
-	dump->start_stack = user_stack(regs) & ~(PAGE_SIZE - 1);
-	dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
-	dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
-	dump->u_dsize -= dump->u_tsize;
-	dump->u_ssize = 0;
-
-	if (dump->start_stack < TASK_SIZE)
-		dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
-
-	dump->regs = *(struct user_context *) regs;
-#endif
-}
-
 /*
  * sys_execve() executes a new program.
  */
diff --git a/arch/h8300/kernel/h8300_ksyms.c b/arch/h8300/kernel/h8300_ksyms.c
index 5a630233112f..3e0d80ea4464 100644
--- a/arch/h8300/kernel/h8300_ksyms.c
+++ b/arch/h8300/kernel/h8300_ksyms.c
@@ -22,11 +22,8 @@
 //asmlinkage long long __lshrdi3 (long long, int);
 extern char h8300_debug_device[];
 
-extern void dump_thread(struct pt_regs *, struct user *);
-
 /* platform dependent support */
 
-EXPORT_SYMBOL(dump_thread);
 EXPORT_SYMBOL(strnlen);
 EXPORT_SYMBOL(strrchr);
 EXPORT_SYMBOL(strstr);
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index fe21adf3e75e..585ed5efd0f7 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -207,34 +207,6 @@ int copy_thread(int nr, unsigned long clone_flags,
 	return 0;
 }
 
-/*
- * fill in the user structure for a core dump..
- */
-void dump_thread(struct pt_regs * regs, struct user * dump)
-{
-/* changed the size calculations - should hopefully work better. lbt */
-	dump->magic = CMAGIC;
-	dump->start_code = 0;
-	dump->start_stack = rdusp() & ~(PAGE_SIZE - 1);
-	dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
-	dump->u_dsize = ((unsigned long) (current->mm->brk +
-					  (PAGE_SIZE-1))) >> PAGE_SHIFT;
-	dump->u_dsize -= dump->u_tsize;
-	dump->u_ssize = 0;
-
-	dump->u_ar0 = (struct user_regs_struct *)(((int)(&dump->regs)) -((int)(dump)));
-	dump->regs.er0 = regs->er0;
-	dump->regs.er1 = regs->er1;
-	dump->regs.er2 = regs->er2;
-	dump->regs.er3 = regs->er3;
-	dump->regs.er4 = regs->er4;
-	dump->regs.er5 = regs->er5;
-	dump->regs.er6 = regs->er6;
-	dump->regs.orig_er0 = regs->orig_er0;
-	dump->regs.ccr = regs->ccr;
-	dump->regs.pc  = regs->pc;
-}
-
 /*
  * sys_execve() executes a new program.
  */
diff --git a/arch/m32r/kernel/m32r_ksyms.c b/arch/m32r/kernel/m32r_ksyms.c
index e5ec134d81d9..dbc8a392105f 100644
--- a/arch/m32r/kernel/m32r_ksyms.c
+++ b/arch/m32r/kernel/m32r_ksyms.c
@@ -18,8 +18,6 @@
 #include <asm/irq.h>
 #include <asm/tlbflush.h>
 
-extern void dump_thread(struct pt_regs *, struct user *);
-
 #if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
 extern struct drive_info_struct drive_info;
 EXPORT_SYMBOL(drive_info);
@@ -27,7 +25,6 @@ EXPORT_SYMBOL(drive_info);
 
 /* platform dependent support */
 EXPORT_SYMBOL(boot_cpu_data);
-EXPORT_SYMBOL(dump_thread);
 EXPORT_SYMBOL(dump_fpu);
 EXPORT_SYMBOL(__ioremap);
 EXPORT_SYMBOL(iounmap);
diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c
index 3bf55d92933f..2a1f250349b7 100644
--- a/arch/m32r/kernel/process.c
+++ b/arch/m32r/kernel/process.c
@@ -260,14 +260,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long spu,
 	return 0;
 }
 
-/*
- * fill in the user structure for a core dump..
- */
-void dump_thread(struct pt_regs * regs, struct user * dump)
-{
-	/* M32R_FIXME */
-}
-
 /*
  * Capture the user space registers if the task is not running (in user space)
  */
diff --git a/arch/m68k/kernel/m68k_ksyms.c b/arch/m68k/kernel/m68k_ksyms.c
index 73e2f5e168dd..3d7f2000b714 100644
--- a/arch/m68k/kernel/m68k_ksyms.c
+++ b/arch/m68k/kernel/m68k_ksyms.c
@@ -23,8 +23,6 @@ asmlinkage long long __lshrdi3 (long long, int);
 asmlinkage long long __muldi3 (long long, long long);
 extern char m68k_debug_device[];
 
-extern void dump_thread(struct pt_regs *, struct user *);
-
 /* platform dependent support */
 
 EXPORT_SYMBOL(m68k_machtype);
diff --git a/arch/m68knommu/kernel/m68k_ksyms.c b/arch/m68knommu/kernel/m68k_ksyms.c
index b2c62eeb3bab..eddb8d3e130a 100644
--- a/arch/m68knommu/kernel/m68k_ksyms.c
+++ b/arch/m68knommu/kernel/m68k_ksyms.c
@@ -18,7 +18,6 @@
 #include <asm/checksum.h>
 #include <asm/current.h>
 
-extern void dump_thread(struct pt_regs *, struct user *);
 extern int dump_fpu(struct pt_regs *, elf_fpregset_t *);
 
 /* platform dependent support */
@@ -26,7 +25,6 @@ extern int dump_fpu(struct pt_regs *, elf_fpregset_t *);
 EXPORT_SYMBOL(__ioremap);
 EXPORT_SYMBOL(iounmap);
 EXPORT_SYMBOL(dump_fpu);
-EXPORT_SYMBOL(dump_thread);
 EXPORT_SYMBOL(strnlen);
 EXPORT_SYMBOL(strrchr);
 EXPORT_SYMBOL(strstr);
diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c
index 82e7ec888806..8b3cf57ba706 100644
--- a/arch/m68knommu/kernel/process.c
+++ b/arch/m68knommu/kernel/process.c
@@ -275,52 +275,6 @@ int dump_fpu(struct pt_regs *regs, struct user_m68kfp_struct *fpu)
 	return 1;
 }
 
-/*
- * fill in the user structure for a core dump..
- */
-void dump_thread(struct pt_regs * regs, struct user * dump)
-{
-	struct switch_stack *sw;
-
-	/* changed the size calculations - should hopefully work better. lbt */
-	dump->magic = CMAGIC;
-	dump->start_code = 0;
-	dump->start_stack = rdusp() & ~(PAGE_SIZE - 1);
-	dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
-	dump->u_dsize = ((unsigned long) (current->mm->brk +
-					  (PAGE_SIZE-1))) >> PAGE_SHIFT;
-	dump->u_dsize -= dump->u_tsize;
-	dump->u_ssize = 0;
-
-	if (dump->start_stack < TASK_SIZE)
-		dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
-
-	dump->u_ar0 = (struct user_regs_struct *)((int)&dump->regs - (int)dump);
-	sw = ((struct switch_stack *)regs) - 1;
-	dump->regs.d1 = regs->d1;
-	dump->regs.d2 = regs->d2;
-	dump->regs.d3 = regs->d3;
-	dump->regs.d4 = regs->d4;
-	dump->regs.d5 = regs->d5;
-	dump->regs.d6 = sw->d6;
-	dump->regs.d7 = sw->d7;
-	dump->regs.a0 = regs->a0;
-	dump->regs.a1 = regs->a1;
-	dump->regs.a2 = regs->a2;
-	dump->regs.a3 = sw->a3;
-	dump->regs.a4 = sw->a4;
-	dump->regs.a5 = sw->a5;
-	dump->regs.a6 = sw->a6;
-	dump->regs.d0 = regs->d0;
-	dump->regs.orig_d0 = regs->orig_d0;
-	dump->regs.stkadj = regs->stkadj;
-	dump->regs.sr = regs->sr;
-	dump->regs.pc = regs->pc;
-	dump->regs.fmtvec = (regs->format << 12) | regs->vector;
-	/* dump floating point stuff */
-	dump->u_fpvalid = dump_fpu (regs, &dump->m68kfp);
-}
-
 /*
  *	Generic dumping code. Used for panic and debug.
  */
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index a942bf2d58e9..7dd58f8ac6b5 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -352,27 +352,6 @@ int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs)
 	return 1;
 }
 
-/*
- * fill in the user structure for a core dump..
- */
-void dump_thread(struct pt_regs * regs, struct user * dump)
-{
-
-/* changed the size calculations - should hopefully work better. lbt */
-	dump->magic = CMAGIC;
-	dump->start_code = 0;
-	dump->start_stack = regs->gprs[15] & ~(PAGE_SIZE - 1);
-	dump->u_tsize = current->mm->end_code >> PAGE_SHIFT;
-	dump->u_dsize = (current->mm->brk + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	dump->u_dsize -= dump->u_tsize;
-	dump->u_ssize = 0;
-	if (dump->start_stack < TASK_SIZE)
-		dump->u_ssize = (TASK_SIZE - dump->start_stack) >> PAGE_SHIFT;
-	memcpy(&dump->regs, regs, sizeof(s390_regs));
-	dump_fpu (regs, &dump->regs.fp_regs);
-	dump->regs.per_info = current->thread.per_info;
-}
-
 unsigned long get_wchan(struct task_struct *p)
 {
 	struct stack_frame *sf, *low, *high;
diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c
index fd4f240b833d..8a2bea34ddd2 100644
--- a/arch/sh/kernel/process.c
+++ b/arch/sh/kernel/process.c
@@ -305,26 +305,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
 	return 0;
 }
 
-/*
- * fill in the user structure for a core dump..
- */
-void dump_thread(struct pt_regs * regs, struct user * dump)
-{
-	dump->magic = CMAGIC;
-	dump->start_code = current->mm->start_code;
-	dump->start_data  = current->mm->start_data;
-	dump->start_stack = regs->regs[15] & ~(PAGE_SIZE - 1);
-	dump->u_tsize = (current->mm->end_code - dump->start_code) >> PAGE_SHIFT;
-	dump->u_dsize = (current->mm->brk + (PAGE_SIZE-1) - dump->start_data) >> PAGE_SHIFT;
-	dump->u_ssize = (current->mm->start_stack - dump->start_stack +
-			 PAGE_SIZE - 1) >> PAGE_SHIFT;
-	/* Debug registers will come here. */
-
-	dump->regs = *regs;
-
-	dump->u_fpvalid = dump_fpu(regs, &dump->fpu);
-}
-
 /* Tracing by user break controller.  */
 static void
 ubc_set_tracing(int asid, unsigned long pc)
diff --git a/arch/sh/kernel/sh_ksyms.c b/arch/sh/kernel/sh_ksyms.c
index 6954fd62470a..1cf94a618be3 100644
--- a/arch/sh/kernel/sh_ksyms.c
+++ b/arch/sh/kernel/sh_ksyms.c
@@ -21,14 +21,12 @@
 #include <asm/cacheflush.h>
 #include <asm/checksum.h>
 
-extern void dump_thread(struct pt_regs *, struct user *);
 extern int dump_fpu(struct pt_regs *, elf_fpregset_t *);
 extern struct hw_interrupt_type no_irq_type;
 
 EXPORT_SYMBOL(sh_mv);
 
 /* platform dependent support */
-EXPORT_SYMBOL(dump_thread);
 EXPORT_SYMBOL(dump_fpu);
 EXPORT_SYMBOL(iounmap);
 EXPORT_SYMBOL(enable_irq);
diff --git a/arch/sh64/kernel/process.c b/arch/sh64/kernel/process.c
index b95d04141855..419b5a710441 100644
--- a/arch/sh64/kernel/process.c
+++ b/arch/sh64/kernel/process.c
@@ -775,26 +775,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
 	return 0;
 }
 
-/*
- * fill in the user structure for a core dump..
- */
-void dump_thread(struct pt_regs * regs, struct user * dump)
-{
-	dump->magic = CMAGIC;
-	dump->start_code = current->mm->start_code;
-	dump->start_data  = current->mm->start_data;
-	dump->start_stack = regs->regs[15] & ~(PAGE_SIZE - 1);
-	dump->u_tsize = (current->mm->end_code - dump->start_code) >> PAGE_SHIFT;
-	dump->u_dsize = (current->mm->brk + (PAGE_SIZE-1) - dump->start_data) >> PAGE_SHIFT;
-	dump->u_ssize = (current->mm->start_stack - dump->start_stack +
-			 PAGE_SIZE - 1) >> PAGE_SHIFT;
-	/* Debug registers will come here. */
-
-	dump->regs = *regs;
-
-	dump->u_fpvalid = dump_fpu(regs, &dump->fpu);
-}
-
 asmlinkage int sys_fork(unsigned long r2, unsigned long r3,
 			unsigned long r4, unsigned long r5,
 			unsigned long r6, unsigned long r7,
diff --git a/arch/sh64/kernel/sh_ksyms.c b/arch/sh64/kernel/sh_ksyms.c
index 0b5497d70bd3..472b450e61be 100644
--- a/arch/sh64/kernel/sh_ksyms.c
+++ b/arch/sh64/kernel/sh_ksyms.c
@@ -29,7 +29,6 @@
 #include <asm/delay.h>
 #include <asm/irq.h>
 
-extern void dump_thread(struct pt_regs *, struct user *);
 extern int dump_fpu(struct pt_regs *, elf_fpregset_t *);
 
 #if 0
@@ -41,7 +40,6 @@ EXPORT_SYMBOL(drive_info);
 #endif
 
 /* platform dependent support */
-EXPORT_SYMBOL(dump_thread);
 EXPORT_SYMBOL(dump_fpu);
 EXPORT_SYMBOL(iounmap);
 EXPORT_SYMBOL(enable_irq);
diff --git a/arch/sparc/kernel/sparc_ksyms.c b/arch/sparc/kernel/sparc_ksyms.c
index 1c8fd0fd9305..0b0d492c953b 100644
--- a/arch/sparc/kernel/sparc_ksyms.c
+++ b/arch/sparc/kernel/sparc_ksyms.c
@@ -82,8 +82,6 @@ extern int __lshrdi3(int, int);
 extern int __muldi3(int, int);
 extern int __divdi3(int, int);
 
-extern void dump_thread(struct pt_regs *, struct user *);
-
 /* Private functions with odd calling conventions. */
 extern void ___atomic24_add(void);
 extern void ___atomic24_sub(void);
diff --git a/arch/sparc64/kernel/binfmt_aout32.c b/arch/sparc64/kernel/binfmt_aout32.c
index edf52d06b280..202a80c24b6f 100644
--- a/arch/sparc64/kernel/binfmt_aout32.c
+++ b/arch/sparc64/kernel/binfmt_aout32.c
@@ -36,8 +36,6 @@ static int load_aout32_binary(struct linux_binprm *, struct pt_regs * regs);
 static int load_aout32_library(struct file*);
 static int aout32_core_dump(long signr, struct pt_regs * regs, struct file *file);
 
-extern void dump_thread(struct pt_regs *, struct user *);
-
 static struct linux_binfmt aout32_format = {
 	NULL, THIS_MODULE, load_aout32_binary, load_aout32_library, aout32_core_dump,
 	PAGE_SIZE
diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c
index fb7a5370dbfc..d177d7e5c9d3 100644
--- a/arch/sparc64/kernel/sparc64_ksyms.c
+++ b/arch/sparc64/kernel/sparc64_ksyms.c
@@ -94,7 +94,6 @@ extern void (*prom_palette)(int);
 
 extern int __ashrdi3(int, int);
 
-extern void dump_thread(struct pt_regs *, struct user *);
 extern int dump_fpu (struct pt_regs * regs, elf_fpregset_t * fpregs);
 
 extern unsigned long phys_base;
@@ -241,7 +240,6 @@ EXPORT_SYMBOL(io_remap_pfn_range);
 EXPORT_SYMBOL(_sigpause_common);
 EXPORT_SYMBOL(verify_compat_iovec);
 
-EXPORT_SYMBOL(dump_thread);
 EXPORT_SYMBOL(dump_fpu);
 EXPORT_SYMBOL(pte_alloc_one_kernel);
 #ifndef CONFIG_SMP
diff --git a/arch/v850/kernel/process.c b/arch/v850/kernel/process.c
index 39cf247cdae4..062ffa0a9998 100644
--- a/arch/v850/kernel/process.c
+++ b/arch/v850/kernel/process.c
@@ -163,30 +163,6 @@ int copy_thread (int nr, unsigned long clone_flags,
 	return 0;
 }
 
-/*
- * fill in the user structure for a core dump..
- */
-void dump_thread (struct pt_regs *regs, struct user *dump)
-{
-#if 0  /* Later.  XXX */
-	dump->magic = CMAGIC;
-	dump->start_code = 0;
-	dump->start_stack = regs->gpr[GPR_SP];
-	dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
-	dump->u_dsize = ((unsigned long) (current->mm->brk +
-					  (PAGE_SIZE-1))) >> PAGE_SHIFT;
-	dump->u_dsize -= dump->u_tsize;
-	dump->u_ssize = 0;
-
-	if (dump->start_stack < TASK_SIZE)
-		dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
-
-	dump->u_ar0 = (struct user_regs_struct *)((int)&dump->regs - (int)dump);
-	dump->regs = *regs;
-	dump->u_fpvalid = 0;
-#endif
-}
-
 /*
  * sys_execve() executes a new program.
  */
diff --git a/arch/v850/kernel/v850_ksyms.c b/arch/v850/kernel/v850_ksyms.c
index 0ca64900dd91..8ffc29c1c89d 100644
--- a/arch/v850/kernel/v850_ksyms.c
+++ b/arch/v850/kernel/v850_ksyms.c
@@ -21,8 +21,6 @@ extern void *trap_table;
 EXPORT_SYMBOL (trap_table);
 
 /* platform dependent support */
-extern void dump_thread (struct pt_regs *, struct user *);
-EXPORT_SYMBOL (dump_thread);
 EXPORT_SYMBOL (kernel_thread);
 EXPORT_SYMBOL (enable_irq);
 EXPORT_SYMBOL (disable_irq);
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 72011826f0cb..f312103434d4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -33,8 +33,6 @@ static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
 static int load_aout_library(struct file*);
 static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file);
 
-extern void dump_thread(struct pt_regs *, struct user *);
-
 static struct linux_binfmt aout_format = {
 	.module		= THIS_MODULE,
 	.load_binary	= load_aout_binary,
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 9d6625829b99..b72dc31a0970 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -77,8 +77,6 @@ static int load_flat_shared_library(int id, struct lib_info *p);
 static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs);
 static int flat_core_dump(long signr, struct pt_regs * regs, struct file *file);
 
-extern void dump_thread(struct pt_regs *, struct user *);
-
 static struct linux_binfmt flat_format = {
 	.module		= THIS_MODULE,
 	.load_binary	= load_flat_binary,
diff --git a/include/asm-um/processor-generic.h b/include/asm-um/processor-generic.h
index 075771c371f6..da07a69ce82a 100644
--- a/include/asm-um/processor-generic.h
+++ b/include/asm-um/processor-generic.h
@@ -89,7 +89,6 @@ extern struct task_struct *alloc_task_struct(void);
 
 extern void release_thread(struct task_struct *);
 extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
-extern void dump_thread(struct pt_regs *regs, struct user *u);
 
 static inline void prepare_to_copy(struct task_struct *tsk)
 {
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d0e6ca3b00ef..e6ee2d95da7a 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -47,6 +47,8 @@ extern int console_printk[];
 #define default_console_loglevel (console_printk[3])
 
 struct completion;
+struct pt_regs;
+struct user;
 
 /**
  * might_sleep - annotation for functions that can sleep
@@ -123,6 +125,8 @@ extern int __kernel_text_address(unsigned long addr);
 extern int kernel_text_address(unsigned long addr);
 extern int session_of_pgrp(int pgrp);
 
+extern void dump_thread(struct pt_regs *regs, struct user *dump);
+
 #ifdef CONFIG_PRINTK
 asmlinkage int vprintk(const char *fmt, va_list args)
 	__attribute__ ((format (printf, 1, 0)));
-- 
cgit v1.2.3-71-gd317


From cc57165874e938ef684d71ba7d36e7088b551489 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Mon, 9 Jan 2006 20:51:41 -0800
Subject: [PATCH] kdump: dynamic per cpu allocation of memory for saving cpu
 registers

- In case of system crash, current state of cpu registers is saved in memory
  in elf note format.  So far memory for storing elf notes was being allocated
  statically for NR_CPUS.

- This patch introduces dynamic allocation of memory for storing elf notes.
  It uses alloc_percpu() interface.  This should lead to better memory usage.

- Introduced based on Andi Kleen's and Eric W. Biederman's suggestions.

- This patch also moves memory allocation for elf notes from architecture
  dependent portion to architecture independent portion.  Now crash_notes is
  architecture independent.  The whole idea is that size of memory to be
  allocated per cpu (MAX_NOTE_BYTES) can be architecture dependent and
  allocation of this memory can be architecture independent.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/crash.c        |  5 +++--
 arch/ppc/kernel/machine_kexec.c |  6 ------
 arch/s390/kernel/crash.c        |  2 --
 arch/x86_64/kernel/crash.c      |  2 --
 include/asm-i386/kexec.h        |  3 ---
 include/asm-powerpc/kexec.h     |  3 ---
 include/asm-s390/kexec.h        |  3 ---
 include/asm-x86_64/kexec.h      |  3 ---
 include/linux/kexec.h           |  2 ++
 kernel/kexec.c                  | 16 ++++++++++++++++
 10 files changed, 21 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 10fe6569751e..f1e65c2ead6e 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -25,7 +25,6 @@
 #include <mach_ipi.h>
 
 
-note_buf_t crash_notes[NR_CPUS];
 /* This keeps a track of which one is crashing cpu. */
 static int crashing_cpu;
 
@@ -72,7 +71,9 @@ static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
 	 * squirrelled away.  ELF notes happen to provide
 	 * all of that that no need to invent something new.
 	 */
-	buf = &crash_notes[cpu][0];
+	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+	if (!buf)
+		return;
 	memset(&prstatus, 0, sizeof(prstatus));
 	prstatus.pr_pid = current->pid;
 	elf_core_copy_regs(&prstatus.pr_reg, regs);
diff --git a/arch/ppc/kernel/machine_kexec.c b/arch/ppc/kernel/machine_kexec.c
index a882b0dbe8de..84d65a87191e 100644
--- a/arch/ppc/kernel/machine_kexec.c
+++ b/arch/ppc/kernel/machine_kexec.c
@@ -28,12 +28,6 @@ typedef NORET_TYPE void (*relocate_new_kernel_t)(
 const extern unsigned char relocate_new_kernel[];
 const extern unsigned int relocate_new_kernel_size;
 
-/*
- * Provide a dummy crash_notes definition while crash dump arrives to ppc.
- * This prevents breakage of crash_notes attribute in kernel/ksysfs.c.
- */
-note_buf_t crash_notes[NR_CPUS];
-
 void machine_shutdown(void)
 {
 	if (ppc_md.machine_shutdown)
diff --git a/arch/s390/kernel/crash.c b/arch/s390/kernel/crash.c
index 7bd169c58b0c..926cceeae0fa 100644
--- a/arch/s390/kernel/crash.c
+++ b/arch/s390/kernel/crash.c
@@ -10,8 +10,6 @@
 #include <linux/threads.h>
 #include <linux/kexec.h>
 
-note_buf_t crash_notes[NR_CPUS];
-
 void machine_crash_shutdown(struct pt_regs *regs)
 {
 }
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
index 535e04466079..efe450760bbc 100644
--- a/arch/x86_64/kernel/crash.c
+++ b/arch/x86_64/kernel/crash.c
@@ -19,8 +19,6 @@
 #include <asm/nmi.h>
 #include <asm/hw_irq.h>
 
-note_buf_t crash_notes[NR_CPUS];
-
 void machine_crash_shutdown(struct pt_regs *regs)
 {
 	/* This function is only called after the system
diff --git a/include/asm-i386/kexec.h b/include/asm-i386/kexec.h
index 6ed2a03e37b3..d80d446498fb 100644
--- a/include/asm-i386/kexec.h
+++ b/include/asm-i386/kexec.h
@@ -26,8 +26,5 @@
 #define KEXEC_ARCH KEXEC_ARCH_386
 
 #define MAX_NOTE_BYTES 1024
-typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
-
-extern note_buf_t crash_notes[];
 
 #endif /* _I386_KEXEC_H */
diff --git a/include/asm-powerpc/kexec.h b/include/asm-powerpc/kexec.h
index 4263af3cadfd..e363752276ef 100644
--- a/include/asm-powerpc/kexec.h
+++ b/include/asm-powerpc/kexec.h
@@ -38,9 +38,6 @@
 #ifdef CONFIG_KEXEC
 
 #define MAX_NOTE_BYTES 1024
-typedef u32 note_buf_t[MAX_NOTE_BYTES / sizeof(u32)];
-
-extern note_buf_t crash_notes[];
 
 #ifdef __powerpc64__
 extern void kexec_smp_wait(void);	/* get and clear naca physid, wait for
diff --git a/include/asm-s390/kexec.h b/include/asm-s390/kexec.h
index 54cf7d9f251c..b4809d98fe69 100644
--- a/include/asm-s390/kexec.h
+++ b/include/asm-s390/kexec.h
@@ -35,8 +35,5 @@
 #define KEXEC_ARCH KEXEC_ARCH_S390
 
 #define MAX_NOTE_BYTES 1024
-typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
-
-extern note_buf_t crash_notes[];
 
 #endif /*_S390_KEXEC_H */
diff --git a/include/asm-x86_64/kexec.h b/include/asm-x86_64/kexec.h
index 42d2ff15c592..cea78543a574 100644
--- a/include/asm-x86_64/kexec.h
+++ b/include/asm-x86_64/kexec.h
@@ -26,8 +26,5 @@
 #define KEXEC_ARCH KEXEC_ARCH_X86_64
 
 #define MAX_NOTE_BYTES 1024
-typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
-
-extern note_buf_t crash_notes[];
 
 #endif /* _X86_64_KEXEC_H */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index c8468472aec0..c1cd9b31159e 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -125,6 +125,8 @@ extern struct kimage *kexec_image;
 /* Location of a reserved region to hold the crash kernel.
  */
 extern struct resource crashk_res;
+typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
+extern note_buf_t *crash_notes;
 
 #else /* !CONFIG_KEXEC */
 struct pt_regs;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2c95848fbce8..1197de8b2a94 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -26,6 +26,9 @@
 #include <asm/system.h>
 #include <asm/semaphore.h>
 
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t* crash_notes;
+
 /* Location of the reserved area for the crash kernel */
 struct resource crashk_res = {
 	.name  = "Crash kernel",
@@ -1060,3 +1063,16 @@ void crash_kexec(struct pt_regs *regs)
 		xchg(&kexec_lock, 0);
 	}
 }
+
+static int __init crash_notes_memory_init(void)
+{
+	/* Allocate memory for saving cpu registers. */
+	crash_notes = alloc_percpu(note_buf_t);
+	if (!crash_notes) {
+		printk("Kexec: Memory allocation for saving cpu register"
+		" states failed\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+module_init(crash_notes_memory_init)
-- 
cgit v1.2.3-71-gd317


From 720e1a9f1c3bfa9f72cded56962e7f092fefaaed Mon Sep 17 00:00:00 2001
From: "akpm@osdl.org" <akpm@osdl.org>
Date: Mon, 9 Jan 2006 20:51:51 -0800
Subject: [PATCH] kexec: increase max segment limit

)

From: Vivek Goyal <vgoyal@in.ibm.com>

- In some cases, the number of segments, on a kexec load, exceeds the
  existing cap of 8.  This patch increases the KEXEC_SEGMENT_MAX limit from 8
  to 16.

Signed-off-by: Rachita Kothiyal <rachita@in.ibm.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kexec.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index c1cd9b31159e..94abc07cb164 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -41,7 +41,7 @@ typedef unsigned long kimage_entry_t;
 #define IND_DONE         0x4
 #define IND_SOURCE       0x8
 
-#define KEXEC_SEGMENT_MAX 8
+#define KEXEC_SEGMENT_MAX 16
 struct kexec_segment {
 	void __user *buf;
 	size_t bufsz;
-- 
cgit v1.2.3-71-gd317


From 5be196e5f925dab2309530fabce69c2e562b9791 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 9 Jan 2006 20:51:55 -0800
Subject: [PATCH] add vfs_* helpers for xattr operations

Add vfs_getxattr, vfs_setxattr and vfs_removexattr helpers for common checks
around invocation of the xattr methods.  NFSD already was missing some of the
checks and there will be more soon.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: James Morris <jmorris@namei.org>

(James, I haven't touched selinux yet because it's doing various odd things
and I'm not sure how it would interact with the security attribute fallbacks
you added.  Could you investigate whether it could use vfs_getxattr or if not
add a __vfs_getxattr helper to share the bits it is fine with?)

For NFSv4: instead of just converting it add an nfsd_getxattr helper for the
code shared by NFSv2/3 and NFSv4 ACLs.  In fact that code isn't even
NFS-specific, but I'll wait for more users to pop up first before moving it to
common code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/vfs.c         | 125 +++++++++++++++++-------------------------
 fs/xattr.c            | 146 ++++++++++++++++++++++++++++++++------------------
 include/linux/xattr.h |   4 ++
 3 files changed, 145 insertions(+), 130 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index bb36b4304491..eef0576a7785 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -48,8 +48,8 @@
 #include <linux/fsnotify.h>
 #include <linux/posix_acl.h>
 #include <linux/posix_acl_xattr.h>
-#ifdef CONFIG_NFSD_V4
 #include <linux/xattr.h>
+#ifdef CONFIG_NFSD_V4
 #include <linux/nfs4.h>
 #include <linux/nfs4_acl.h>
 #include <linux/nfsd_idmap.h>
@@ -365,8 +365,30 @@ out_nfserr:
 	goto out;
 }
 
-#if defined(CONFIG_NFSD_V4)
+#if defined(CONFIG_NFSD_V2_ACL) || \
+    defined(CONFIG_NFSD_V3_ACL) || \
+    defined(CONFIG_NFSD_V4)
+static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf)
+{
+	ssize_t buflen;
+	int error;
+
+	buflen = vfs_getxattr(dentry, key, NULL, 0);
+	if (buflen <= 0)
+		return buflen;
 
+	*buf = kmalloc(buflen, GFP_KERNEL);
+	if (!*buf)
+		return -ENOMEM;
+
+	error = vfs_getxattr(dentry, key, *buf, buflen);
+	if (error < 0)
+		return error;
+	return buflen;
+}
+#endif
+
+#if defined(CONFIG_NFSD_V4)
 static int
 set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key)
 {
@@ -374,7 +396,6 @@ set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key)
 	size_t buflen;
 	char *buf = NULL;
 	int error = 0;
-	struct inode *inode = dentry->d_inode;
 
 	buflen = posix_acl_xattr_size(pacl->a_count);
 	buf = kmalloc(buflen, GFP_KERNEL);
@@ -388,15 +409,7 @@ set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key)
 		goto out;
 	}
 
-	error = -EOPNOTSUPP;
-	if (inode->i_op && inode->i_op->setxattr) {
-		mutex_lock(&inode->i_mutex);
-		security_inode_setxattr(dentry, key, buf, len, 0);
-		error = inode->i_op->setxattr(dentry, key, buf, len, 0);
-		if (!error)
-			security_inode_post_setxattr(dentry, key, buf, len, 0);
-		mutex_unlock(&inode->i_mutex);
-	}
+	error = vfs_setxattr(dentry, key, buf, len, 0);
 out:
 	kfree(buf);
 	return error;
@@ -455,44 +468,19 @@ out_nfserr:
 static struct posix_acl *
 _get_posix_acl(struct dentry *dentry, char *key)
 {
-	struct inode *inode = dentry->d_inode;
-	char *buf = NULL;
-	int buflen, error = 0;
+	void *buf = NULL;
 	struct posix_acl *pacl = NULL;
+	int buflen;
 
-	error = -EOPNOTSUPP;
-	if (inode->i_op == NULL)
-		goto out_err;
-	if (inode->i_op->getxattr == NULL)
-		goto out_err;
-
-	error = security_inode_getxattr(dentry, key);
-	if (error)
-		goto out_err;
-
-	buflen = inode->i_op->getxattr(dentry, key, NULL, 0);
-	if (buflen <= 0) {
-		error = buflen < 0 ? buflen : -ENODATA;
-		goto out_err;
-	}
-
-	buf = kmalloc(buflen, GFP_KERNEL);
-	if (buf == NULL) {
-		error = -ENOMEM;
-		goto out_err;
-	}
-
-	error = inode->i_op->getxattr(dentry, key, buf, buflen);
-	if (error < 0)
-		goto out_err;
+	buflen = nfsd_getxattr(dentry, key, &buf);
+	if (!buflen)
+		buflen = -ENODATA;
+	if (buflen <= 0)
+		return ERR_PTR(buflen);
 
 	pacl = posix_acl_from_xattr(buf, buflen);
- out:
 	kfree(buf);
 	return pacl;
- out_err:
-	pacl = ERR_PTR(error);
-	goto out;
 }
 
 int
@@ -1884,39 +1872,25 @@ nfsd_get_posix_acl(struct svc_fh *fhp, int type)
 	ssize_t size;
 	struct posix_acl *acl;
 
-	if (!IS_POSIXACL(inode) || !inode->i_op || !inode->i_op->getxattr)
+	if (!IS_POSIXACL(inode))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
 		return ERR_PTR(-EOPNOTSUPP);
-	switch(type) {
-		case ACL_TYPE_ACCESS:
-			name = POSIX_ACL_XATTR_ACCESS;
-			break;
-		case ACL_TYPE_DEFAULT:
-			name = POSIX_ACL_XATTR_DEFAULT;
-			break;
-		default:
-			return ERR_PTR(-EOPNOTSUPP);
 	}
 
-	size = inode->i_op->getxattr(fhp->fh_dentry, name, NULL, 0);
+	size = nfsd_getxattr(fhp->fh_dentry, name, &value);
+	if (size < 0)
+		return ERR_PTR(size);
 
-	if (size < 0) {
-		acl = ERR_PTR(size);
-		goto getout;
-	} else if (size > 0) {
-		value = kmalloc(size, GFP_KERNEL);
-		if (!value) {
-			acl = ERR_PTR(-ENOMEM);
-			goto getout;
-		}
-		size = inode->i_op->getxattr(fhp->fh_dentry, name, value, size);
-		if (size < 0) {
-			acl = ERR_PTR(size);
-			goto getout;
-		}
-	}
 	acl = posix_acl_from_xattr(value, size);
-
-getout:
 	kfree(value);
 	return acl;
 }
@@ -1957,16 +1931,13 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
 	} else
 		size = 0;
 
-	if (!fhp->fh_locked)
-		fh_lock(fhp);  /* unlocking is done automatically */
 	if (size)
-		error = inode->i_op->setxattr(fhp->fh_dentry, name,
-					      value, size, 0);
+		error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0);
 	else {
 		if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT)
 			error = 0;
 		else {
-			error = inode->i_op->removexattr(fhp->fh_dentry, name);
+			error = vfs_removexattr(fhp->fh_dentry, name);
 			if (error == -ENODATA)
 				error = 0;
 		}
diff --git a/fs/xattr.c b/fs/xattr.c
index 386a532ee5a9..fee804e69a9a 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -19,6 +19,96 @@
 #include <linux/fsnotify.h>
 #include <asm/uaccess.h>
 
+
+int
+vfs_setxattr(struct dentry *dentry, char *name, void *value,
+		size_t size, int flags)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	mutex_lock(&inode->i_mutex);
+	error = security_inode_setxattr(dentry, name, value, size, flags);
+	if (error)
+		goto out;
+	error = -EOPNOTSUPP;
+	if (inode->i_op->setxattr) {
+		error = inode->i_op->setxattr(dentry, name, value, size, flags);
+		if (!error) {
+			fsnotify_xattr(dentry);
+			security_inode_post_setxattr(dentry, name, value,
+						     size, flags);
+		}
+	} else if (!strncmp(name, XATTR_SECURITY_PREFIX,
+				sizeof XATTR_SECURITY_PREFIX - 1)) {
+		const char *suffix = name + sizeof XATTR_SECURITY_PREFIX - 1;
+		error = security_inode_setsecurity(inode, suffix, value,
+						   size, flags);
+		if (!error)
+			fsnotify_xattr(dentry);
+	}
+out:
+	mutex_unlock(&inode->i_mutex);
+	return error;
+}
+EXPORT_SYMBOL_GPL(vfs_setxattr);
+
+ssize_t
+vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = security_inode_getxattr(dentry, name);
+	if (error)
+		return error;
+
+	if (inode->i_op->getxattr)
+		error = inode->i_op->getxattr(dentry, name, value, size);
+	else
+		error = -EOPNOTSUPP;
+
+	if (!strncmp(name, XATTR_SECURITY_PREFIX,
+				sizeof XATTR_SECURITY_PREFIX - 1)) {
+		const char *suffix = name + sizeof XATTR_SECURITY_PREFIX - 1;
+		int ret = security_inode_getsecurity(inode, suffix, value,
+						     size, error);
+		/*
+		 * Only overwrite the return value if a security module
+		 * is actually active.
+		 */
+		if (ret != -EOPNOTSUPP)
+			error = ret;
+	}
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(vfs_getxattr);
+
+int
+vfs_removexattr(struct dentry *dentry, char *name)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	if (!inode->i_op->removexattr)
+		return -EOPNOTSUPP;
+
+	error = security_inode_removexattr(dentry, name);
+	if (error)
+		return error;
+
+	mutex_lock(&inode->i_mutex);
+	error = inode->i_op->removexattr(dentry, name);
+	mutex_unlock(&inode->i_mutex);
+
+	if (!error)
+		fsnotify_xattr(dentry);
+	return error;
+}
+EXPORT_SYMBOL_GPL(vfs_removexattr);
+
+
 /*
  * Extended attribute SET operations
  */
@@ -51,29 +141,7 @@ setxattr(struct dentry *d, char __user *name, void __user *value,
 		}
 	}
 
-	mutex_lock(&d->d_inode->i_mutex);
-	error = security_inode_setxattr(d, kname, kvalue, size, flags);
-	if (error)
-		goto out;
-	error = -EOPNOTSUPP;
-	if (d->d_inode->i_op && d->d_inode->i_op->setxattr) {
-		error = d->d_inode->i_op->setxattr(d, kname, kvalue,
-						   size, flags);
-		if (!error) {
-			fsnotify_xattr(d);
-			security_inode_post_setxattr(d, kname, kvalue,
-						     size, flags);
-		}
-	} else if (!strncmp(kname, XATTR_SECURITY_PREFIX,
-			    sizeof XATTR_SECURITY_PREFIX - 1)) {
-		const char *suffix = kname + sizeof XATTR_SECURITY_PREFIX - 1;
-		error = security_inode_setsecurity(d->d_inode, suffix, kvalue,
-						   size, flags);
-		if (!error)
-			fsnotify_xattr(d);
-	}
-out:
-	mutex_unlock(&d->d_inode->i_mutex);
+	error = vfs_setxattr(d, kname, kvalue, size, flags);
 	kfree(kvalue);
 	return error;
 }
@@ -147,22 +215,7 @@ getxattr(struct dentry *d, char __user *name, void __user *value, size_t size)
 			return -ENOMEM;
 	}
 
-	error = security_inode_getxattr(d, kname);
-	if (error)
-		goto out;
-	error = -EOPNOTSUPP;
-	if (d->d_inode->i_op && d->d_inode->i_op->getxattr)
-		error = d->d_inode->i_op->getxattr(d, kname, kvalue, size);
-
-	if (!strncmp(kname, XATTR_SECURITY_PREFIX,
-		     sizeof XATTR_SECURITY_PREFIX - 1)) {
-		const char *suffix = kname + sizeof XATTR_SECURITY_PREFIX - 1;
-		int rv = security_inode_getsecurity(d->d_inode, suffix, kvalue,
-						    size, error);
-		/* Security module active: overwrite error value */
-		if (rv != -EOPNOTSUPP)
-			error = rv;
-	}
+	error = vfs_getxattr(d, kname, kvalue, size);
 	if (error > 0) {
 		if (size && copy_to_user(value, kvalue, error))
 			error = -EFAULT;
@@ -171,7 +224,6 @@ getxattr(struct dentry *d, char __user *name, void __user *value, size_t size)
 		   than XATTR_SIZE_MAX bytes. Not possible. */
 		error = -E2BIG;
 	}
-out:
 	kfree(kvalue);
 	return error;
 }
@@ -318,19 +370,7 @@ removexattr(struct dentry *d, char __user *name)
 	if (error < 0)
 		return error;
 
-	error = -EOPNOTSUPP;
-	if (d->d_inode->i_op && d->d_inode->i_op->removexattr) {
-		error = security_inode_removexattr(d, kname);
-		if (error)
-			goto out;
-		mutex_lock(&d->d_inode->i_mutex);
-		error = d->d_inode->i_op->removexattr(d, kname);
-		mutex_unlock(&d->d_inode->i_mutex);
-		if (!error)
-			fsnotify_xattr(d);
-	}
-out:
-	return error;
+	return vfs_removexattr(d, kname);
 }
 
 asmlinkage long
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 23f9c61d9546..366f0ab4219f 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -25,6 +25,10 @@ struct xattr_handler {
 		   size_t size, int flags);
 };
 
+ssize_t vfs_getxattr(struct dentry *, char *, void *, size_t);
+int vfs_setxattr(struct dentry *, char *, void *, size_t, int);
+int vfs_removexattr(struct dentry *, char *);
+
 ssize_t generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size);
 ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size);
 int generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags);
-- 
cgit v1.2.3-71-gd317


From e0ad7b073eb7317e5afe0385b02dcb1d52a1eedf Mon Sep 17 00:00:00 2001
From: "akpm@osdl.org" <akpm@osdl.org>
Date: Mon, 9 Jan 2006 20:51:56 -0800
Subject: [PATCH] move xattr permission checks into the VFS

)

From: Christoph Hellwig <hch@lst.de>

The xattr code has rather complex permission checks because the rules are very
different for different attribute namespaces.  This patch moves as much as we
can into the generic code.  Currently all the major disk based filesystems
duplicate these checks, while many minor filesystems or network filesystems
lack some or all of them.

To do this we need defines for the extended attribute names in common code, I
moved them up from JFS which had the nicest defintions.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jfs/xattr.c        | 15 -------------
 fs/xattr.c            | 61 +++++++++++++++++++++++++++++++++++++++++++++++----
 include/linux/xattr.h | 15 +++++++++++++
 3 files changed, 72 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 23aa5066b5a4..9dde36a1eb5d 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -83,21 +83,6 @@ struct ea_buffer {
 #define EA_NEW		0x0004
 #define EA_MALLOC	0x0008
 
-/* Namespaces */
-#define XATTR_SYSTEM_PREFIX "system."
-#define XATTR_SYSTEM_PREFIX_LEN (sizeof (XATTR_SYSTEM_PREFIX) - 1)
-
-#define XATTR_USER_PREFIX "user."
-#define XATTR_USER_PREFIX_LEN (sizeof (XATTR_USER_PREFIX) - 1)
-
-#define XATTR_OS2_PREFIX "os2."
-#define XATTR_OS2_PREFIX_LEN (sizeof (XATTR_OS2_PREFIX) - 1)
-
-/* XATTR_SECURITY_PREFIX is defined in include/linux/xattr.h */
-#define XATTR_SECURITY_PREFIX_LEN (sizeof (XATTR_SECURITY_PREFIX) - 1)
-
-#define XATTR_TRUSTED_PREFIX "trusted."
-#define XATTR_TRUSTED_PREFIX_LEN (sizeof (XATTR_TRUSTED_PREFIX) - 1)
 
 /*
  * These three routines are used to recognize on-disk extended attributes
diff --git a/fs/xattr.c b/fs/xattr.c
index fee804e69a9a..80eca7d3d69f 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -20,6 +20,47 @@
 #include <asm/uaccess.h>
 
 
+/*
+ * Check permissions for extended attribute access.  This is a bit complicated
+ * because different namespaces have very different rules.
+ */
+static int
+xattr_permission(struct inode *inode, const char *name, int mask)
+{
+	/*
+	 * We can never set or remove an extended attribute on a read-only
+	 * filesystem  or on an immutable / append-only inode.
+	 */
+	if (mask & MAY_WRITE) {
+		if (IS_RDONLY(inode))
+			return -EROFS;
+		if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+			return -EPERM;
+	}
+
+	/*
+	 * No restriction for security.* and system.* from the VFS.  Decision
+	 * on these is left to the underlying filesystem / security module.
+	 */
+	if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
+	    !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return 0;
+
+	/*
+	 * The trusted.* namespace can only accessed by a privilegued user.
+	 */
+	if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
+		return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM);
+
+	if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
+		if (!S_ISREG(inode->i_mode) &&
+		    (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
+			return -EPERM;
+	}
+
+	return permission(inode, mask, NULL);
+}
+
 int
 vfs_setxattr(struct dentry *dentry, char *name, void *value,
 		size_t size, int flags)
@@ -27,6 +68,10 @@ vfs_setxattr(struct dentry *dentry, char *name, void *value,
 	struct inode *inode = dentry->d_inode;
 	int error;
 
+	error = xattr_permission(inode, name, MAY_WRITE);
+	if (error)
+		return error;
+
 	mutex_lock(&inode->i_mutex);
 	error = security_inode_setxattr(dentry, name, value, size, flags);
 	if (error)
@@ -40,8 +85,8 @@ vfs_setxattr(struct dentry *dentry, char *name, void *value,
 						     size, flags);
 		}
 	} else if (!strncmp(name, XATTR_SECURITY_PREFIX,
-				sizeof XATTR_SECURITY_PREFIX - 1)) {
-		const char *suffix = name + sizeof XATTR_SECURITY_PREFIX - 1;
+				XATTR_SECURITY_PREFIX_LEN)) {
+		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
 		error = security_inode_setsecurity(inode, suffix, value,
 						   size, flags);
 		if (!error)
@@ -59,6 +104,10 @@ vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size)
 	struct inode *inode = dentry->d_inode;
 	int error;
 
+	error = xattr_permission(inode, name, MAY_READ);
+	if (error)
+		return error;
+
 	error = security_inode_getxattr(dentry, name);
 	if (error)
 		return error;
@@ -69,8 +118,8 @@ vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size)
 		error = -EOPNOTSUPP;
 
 	if (!strncmp(name, XATTR_SECURITY_PREFIX,
-				sizeof XATTR_SECURITY_PREFIX - 1)) {
-		const char *suffix = name + sizeof XATTR_SECURITY_PREFIX - 1;
+				XATTR_SECURITY_PREFIX_LEN)) {
+		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
 		int ret = security_inode_getsecurity(inode, suffix, value,
 						     size, error);
 		/*
@@ -94,6 +143,10 @@ vfs_removexattr(struct dentry *dentry, char *name)
 	if (!inode->i_op->removexattr)
 		return -EOPNOTSUPP;
 
+	error = xattr_permission(inode, name, MAY_WRITE);
+	if (error)
+		return error;
+
 	error = security_inode_removexattr(dentry, name);
 	if (error)
 		return error;
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 366f0ab4219f..cda8a96e2fa0 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -13,7 +13,22 @@
 #define XATTR_CREATE	0x1	/* set value, fail if attr already exists */
 #define XATTR_REPLACE	0x2	/* set value, fail if attr does not exist */
 
+/* Namespaces */
+#define XATTR_OS2_PREFIX "os2."
+#define XATTR_OS2_PREFIX_LEN (sizeof (XATTR_OS2_PREFIX) - 1)
+
 #define XATTR_SECURITY_PREFIX	"security."
+#define XATTR_SECURITY_PREFIX_LEN (sizeof (XATTR_SECURITY_PREFIX) - 1)
+
+#define XATTR_SYSTEM_PREFIX "system."
+#define XATTR_SYSTEM_PREFIX_LEN (sizeof (XATTR_SYSTEM_PREFIX) - 1)
+
+#define XATTR_TRUSTED_PREFIX "trusted."
+#define XATTR_TRUSTED_PREFIX_LEN (sizeof (XATTR_TRUSTED_PREFIX) - 1)
+
+#define XATTR_USER_PREFIX "user."
+#define XATTR_USER_PREFIX_LEN (sizeof (XATTR_USER_PREFIX) - 1)
+
 
 struct xattr_handler {
 	char *prefix;
-- 
cgit v1.2.3-71-gd317


From 870f481793b585323fbda3e87c54efc116f46351 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 9 Jan 2006 20:52:01 -0800
Subject: [PATCH] replace inode_update_time with file_update_time

To allow various options to work per-mount instead of per-sb we need a
struct vfsmount when updating ctime and mtime.  This preparation patch
replaces the inode_update_time routine with a file_update_atime routine so
we can easily get at the vfsmount.  (and the file makes more sense in this
context anyway).  Also get rid of the unused second argument - we always
want to update the ctime when calling this routine.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Anton Altaparmakov <aia21@cantab.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inode.c                 | 27 +++++++++++++++------------
 fs/ncpfs/file.c            |  2 +-
 fs/ntfs/file.c             |  2 +-
 fs/ntfs/inode.c            | 20 +++++++++++++++++++-
 fs/ocfs2/mmap.c            |  8 ++------
 fs/pipe.c                  |  2 +-
 fs/reiserfs/file.c         |  2 +-
 fs/xfs/linux-2.6/xfs_lrw.c |  2 +-
 include/linux/fs.h         |  2 +-
 mm/filemap.c               |  2 +-
 mm/filemap_xip.c           |  2 +-
 11 files changed, 44 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index e08767fd57b0..e177769f3b41 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1204,16 +1204,20 @@ void update_atime(struct inode *inode)
 EXPORT_SYMBOL(update_atime);
 
 /**
- *	inode_update_time	-	update mtime and ctime time
- *	@inode: inode accessed
- *	@ctime_too: update ctime too
+ *	file_update_time	-	update mtime and ctime time
+ *	@file: file accessed
  *
- *	Update the mtime time on an inode and mark it for writeback.
- *	When ctime_too is specified update the ctime too.
+ *	Update the mtime and ctime members of an inode and mark the inode
+ *	for writeback.  Note that this function is meant exclusively for
+ *	usage in the file write path of filesystems, and filesystems may
+ *	choose to explicitly ignore update via this function with the
+ *	S_NOCTIME inode flag, e.g. for network filesystem where these
+ *	timestamps are handled by the server.
  */
 
-void inode_update_time(struct inode *inode, int ctime_too)
+void file_update_time(struct file *file)
 {
+	struct inode *inode = file->f_dentry->d_inode;
 	struct timespec now;
 	int sync_it = 0;
 
@@ -1227,16 +1231,15 @@ void inode_update_time(struct inode *inode, int ctime_too)
 		sync_it = 1;
 	inode->i_mtime = now;
 
-	if (ctime_too) {
-		if (!timespec_equal(&inode->i_ctime, &now))
-			sync_it = 1;
-		inode->i_ctime = now;
-	}
+	if (!timespec_equal(&inode->i_ctime, &now))
+		sync_it = 1;
+	inode->i_ctime = now;
+
 	if (sync_it)
 		mark_inode_dirty_sync(inode);
 }
 
-EXPORT_SYMBOL(inode_update_time);
+EXPORT_SYMBOL(file_update_time);
 
 int inode_needs_sync(struct inode *inode)
 {
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 4947d9b11fc1..973b444d6914 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -262,7 +262,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 	}
 	vfree(bouncebuffer);
 
-	inode_update_time(inode, 1);
+	file_update_time(file);
 
 	*ppos = pos;
 
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 30f71acdc1cb..fb413d3d8618 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2173,7 +2173,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
 	err = remove_suid(file->f_dentry);
 	if (err)
 		goto out;
-	inode_update_time(inode, 1);
+	file_update_time(file);
 	written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
 			count);
 out:
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index bda7a08911a5..ea1bd3feea1b 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2767,7 +2767,25 @@ unm_done:
 	up_write(&ni->runlist.lock);
 done:
 	/* Update the mtime and ctime on the base inode. */
-	inode_update_time(VFS_I(base_ni), 1);
+	/* normally ->truncate shouldn't update ctime or mtime,
+	 * but ntfs did before so it got a copy & paste version
+	 * of file_update_time.  one day someone should fix this
+	 * for real.
+	 */
+	if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) {
+		struct timespec now = current_fs_time(VFS_I(base_ni)->i_sb);
+		int sync_it = 0;
+
+		if (!timespec_equal(&VFS_I(base_ni)->i_mtime, &now) ||
+		    !timespec_equal(&VFS_I(base_ni)->i_ctime, &now))
+			sync_it = 1;
+		VFS_I(base_ni)->i_mtime = now;
+		VFS_I(base_ni)->i_ctime = now;
+
+		if (sync_it)
+			mark_inode_dirty_sync(VFS_I(base_ni));
+	}
+
 	if (likely(!err)) {
 		NInoClearTruncateFailed(ni);
 		ntfs_debug("Done.");
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index afdeec4b0eef..843cf9ddefe8 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -80,12 +80,8 @@ static struct vm_operations_struct ocfs2_file_vm_ops = {
 	.nopage = ocfs2_nopage,
 };
 
-int ocfs2_mmap(struct file *file,
-	       struct vm_area_struct *vma)
+int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
-	struct inode *inode = mapping->host;
-
 	/* We don't want to support shared writable mappings yet. */
 	if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE))
 	    && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
@@ -95,7 +91,7 @@ int ocfs2_mmap(struct file *file,
 		return -EINVAL;
 	}
 
-	update_atime(inode);
+	file_accessed(file);
 	vma->vm_ops = &ocfs2_file_vm_ops;
 	return 0;
 }
diff --git a/fs/pipe.c b/fs/pipe.c
index acb030b61fb0..eef0f29e86ef 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -347,7 +347,7 @@ out:
 		kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
 	}
 	if (ret > 0)
-		inode_update_time(inode, 1);	/* mtime and ctime */
+		file_update_time(filp);
 	return ret;
 }
 
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 127e7d2cabdd..ad6fa964b0e7 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1360,7 +1360,7 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
 	if (res)
 		goto out;
 
-	inode_update_time(inode, 1);	/* Both mtime and ctime */
+	file_update_time(file);
 
 	// Ok, we are done with all the checks.
 
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 5675117ef227..885dfafeabee 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -713,7 +713,7 @@ start:
 	}
 
 	if (likely(!(ioflags & IO_INVIS))) {
-		inode_update_time(inode, 1);
+		file_update_time(file);
 		xfs_ichgtime_fast(xip, inode,
 				  XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 92ae3e2067b0..1feee2e7e47b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1716,7 +1716,7 @@ extern ssize_t simple_read_from_buffer(void __user *, size_t, loff_t *, const vo
 extern int inode_change_ok(struct inode *, struct iattr *);
 extern int __must_check inode_setattr(struct inode *, struct iattr *);
 
-extern void inode_update_time(struct inode *inode, int ctime_too);
+extern void file_update_time(struct file *file);
 
 static inline ino_t parent_ino(struct dentry *dentry)
 {
diff --git a/mm/filemap.c b/mm/filemap.c
index 5fca2737c971..96de772be487 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2108,7 +2108,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 	if (err)
 		goto out;
 
-	inode_update_time(inode, 1);
+	file_update_time(file);
 
 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
 	if (unlikely(file->f_flags & O_DIRECT)) {
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index e2b34e95913e..b960ac8e5918 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -383,7 +383,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
 	if (ret)
 		goto out_backing;
 
-	inode_update_time(inode, 1);
+	file_update_time(filp);
 
 	ret = __xip_file_write (filp, buf, count, pos, ppos);
 
-- 
cgit v1.2.3-71-gd317


From 869243a0f6143f76e7c847e707eee6ece9cbf821 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 9 Jan 2006 20:52:03 -0800
Subject: [PATCH] remove update_atime

All callers use touch_atime now which takes a vfsmount and allows us to
implement per-mount noatime.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inode.c         |  9 ++++++---
 include/linux/fs.h | 10 +---------
 2 files changed, 7 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index e177769f3b41..76980a9c92e7 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1176,17 +1176,20 @@ sector_t bmap(struct inode * inode, sector_t block)
 EXPORT_SYMBOL(bmap);
 
 /**
- *	update_atime	-	update the access time
+ *	touch_atime	-	update the access time
+ *	@mnt: mount the inode is accessed on
  *	@inode: inode accessed
  *
  *	Update the accessed time on an inode and mark it for writeback.
  *	This function automatically handles read only file systems and media,
  *	as well as the "noatime" flag and inode specific "noatime" markers.
  */
-void update_atime(struct inode *inode)
+void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
 {
+	struct inode *inode = dentry->d_inode;
 	struct timespec now;
 
+	/* per-mountpoint checks will go here */
 	if (IS_NOATIME(inode))
 		return;
 	if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode))
@@ -1201,7 +1204,7 @@ void update_atime(struct inode *inode)
 	}
 }
 
-EXPORT_SYMBOL(update_atime);
+EXPORT_SYMBOL(touch_atime);
 
 /**
  *	file_update_time	-	update mtime and ctime time
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1feee2e7e47b..85c5656756b6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -235,9 +235,6 @@ struct kstatfs;
 struct vm_area_struct;
 struct vfsmount;
 
-/* Used to be a macro which just called the function, now just a function */
-extern void update_atime (struct inode *);
-
 extern void __init inode_init(unsigned long);
 extern void __init inode_init_early(void);
 extern void __init mnt_init(unsigned long);
@@ -1118,12 +1115,7 @@ static inline void mark_inode_dirty_sync(struct inode *inode)
 	__mark_inode_dirty(inode, I_DIRTY_SYNC);
 }
 
-static inline void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
-{
-	/* per-mountpoint checks will go here */
-	update_atime(dentry->d_inode);
-}
-
+extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry);
 static inline void file_accessed(struct file *file)
 {
 	if (!(file->f_flags & O_NOATIME))
-- 
cgit v1.2.3-71-gd317


From bdff071dbf911bf5d1dcaedfaafebb549d2fd969 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 9 Jan 2006 20:52:03 -0800
Subject: [PATCH] __deprecated_for_modules the lookup_hash() prototype

This patch __deprecated_for_modules the lookup_hash() prototype.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/namei.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/namei.h b/include/linux/namei.h
index 455660eafba9..b699e427c00c 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -74,7 +74,7 @@ extern struct file *nameidata_to_filp(struct nameidata *nd, int flags);
 extern void release_open_intent(struct nameidata *);
 
 extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
-extern struct dentry * lookup_hash(struct nameidata *);
+extern __deprecated_for_modules struct dentry * lookup_hash(struct nameidata *);
 
 extern int follow_down(struct vfsmount **, struct dentry **);
 extern int follow_up(struct vfsmount **, struct dentry **);
-- 
cgit v1.2.3-71-gd317


From e6a6d2efcb7e7c87c5fe0395803da1453b29cbef Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 9 Jan 2006 20:52:14 -0800
Subject: [PATCH] sanitize building of fs/compat_ioctl.c

Now that all these entries in the arch ioctl32.c files are gone [1], we can
build fs/compat_ioctl.c as a normal object and kill tons of cruft.  We need a
special do_ioctl32_pointer handler for s390 so the compat_ptr call is done.
This is not needed but harmless on all other architectures.  Also remove some
superflous includes in fs/compat_ioctl.c

Tested on ppc64.

[1] parisc still had it's PPP handler left, which is not fully correct
    for ppp and besides that ppp uses the generic SIOCPRIV ioctl so it'd
    kick in for all netdevice users.  We can introduce a proper handler
    in one of the next patch series by adding a compat_ioctl method to
    struct net_device but for now let's just kill it - parisc doesn't
    compile in mainline anyway and I don't want this to block this
    patchset.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Matthew Wilcox <willy@debian.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/ia32/Makefile         |  4 +---
 arch/ia64/ia32/ia32_ioctl.c     | 45 -------------------------------------
 arch/mips/kernel/Makefile       |  3 +--
 arch/mips/kernel/ioctl32.c      | 50 -----------------------------------------
 arch/parisc/kernel/Makefile     |  3 +--
 arch/parisc/kernel/ioctl32.c    | 41 ---------------------------------
 arch/powerpc/kernel/Makefile    |  3 +--
 arch/powerpc/kernel/ioctl32.c   | 45 -------------------------------------
 arch/s390/kernel/Makefile       |  3 +--
 arch/s390/kernel/compat_ioctl.c | 47 --------------------------------------
 arch/sparc64/kernel/Makefile    |  4 +---
 arch/sparc64/kernel/ioctl32.c   | 39 --------------------------------
 arch/x86_64/ia32/Makefile       |  4 +---
 arch/x86_64/ia32/ia32_ioctl.c   | 32 --------------------------
 fs/Makefile                     |  2 +-
 fs/compat_ioctl.c               | 37 +++++++++++++++++-------------
 include/linux/compat_ioctl.h    |  8 -------
 17 files changed, 29 insertions(+), 341 deletions(-)
 delete mode 100644 arch/ia64/ia32/ia32_ioctl.c
 delete mode 100644 arch/mips/kernel/ioctl32.c
 delete mode 100644 arch/parisc/kernel/ioctl32.c
 delete mode 100644 arch/powerpc/kernel/ioctl32.c
 delete mode 100644 arch/s390/kernel/compat_ioctl.c
 delete mode 100644 arch/sparc64/kernel/ioctl32.c
 delete mode 100644 arch/x86_64/ia32/ia32_ioctl.c

(limited to 'include/linux')

diff --git a/arch/ia64/ia32/Makefile b/arch/ia64/ia32/Makefile
index 2ed90da81166..61cb60affd95 100644
--- a/arch/ia64/ia32/Makefile
+++ b/arch/ia64/ia32/Makefile
@@ -2,11 +2,9 @@
 # Makefile for the ia32 kernel emulation subsystem.
 #
 
-obj-y := ia32_entry.o sys_ia32.o ia32_ioctl.o ia32_signal.o \
+obj-y := ia32_entry.o sys_ia32.o ia32_signal.o \
 	 ia32_support.o ia32_traps.o binfmt_elf32.o ia32_ldt.o
 
-CFLAGS_ia32_ioctl.o += -Ifs/
-
 # Don't let GCC uses f16-f31 so that save_ia32_fpstate_live() and
 # restore_ia32_fpstate_live() can be sure the live register contain user-level state.
 CFLAGS_ia32_signal.o += -mfixed-range=f16-f31
diff --git a/arch/ia64/ia32/ia32_ioctl.c b/arch/ia64/ia32/ia32_ioctl.c
deleted file mode 100644
index 88739394f6df..000000000000
--- a/arch/ia64/ia32/ia32_ioctl.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * IA32 Architecture-specific ioctl shim code
- *
- * Copyright (C) 2000 VA Linux Co
- * Copyright (C) 2000 Don Dugger <n0ano@valinux.com>
- * Copyright (C) 2001-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/signal.h>	/* argh, msdos_fs.h isn't self-contained... */
-#include <linux/syscalls.h>
-#include "ia32priv.h"
-  
-#define	INCLUDES
-#include "compat_ioctl.c"
-
-#define IOCTL_NR(a)	((a) & ~(_IOC_SIZEMASK << _IOC_SIZESHIFT))
-
-#define DO_IOCTL(fd, cmd, arg) ({			\
-	int _ret;					\
-	mm_segment_t _old_fs = get_fs();		\
-							\
-	set_fs(KERNEL_DS);				\
-	_ret = sys_ioctl(fd, cmd, (unsigned long)arg);	\
-	set_fs(_old_fs);				\
-	_ret;						\
-})
-
-#define CODE
-#include "compat_ioctl.c"
-
-#define COMPATIBLE_IOCTL(cmd)		HANDLE_IOCTL((cmd),sys_ioctl)
-#define HANDLE_IOCTL(cmd,handler)	{ (cmd), (ioctl_trans_handler_t)(handler), NULL },
-#define IOCTL_TABLE_START \
-	struct ioctl_trans ioctl_start[] = {
-#define IOCTL_TABLE_END \
-	};
-
-IOCTL_TABLE_START
-#define DECLARES
-#include "compat_ioctl.c"
-#include <linux/compat_ioctl.h>
-IOCTL_TABLE_END
-
-int ioctl_table_size = ARRAY_SIZE(ioctl_start);
diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile
index 72f2126ad19d..f36c4f20ee8a 100644
--- a/arch/mips/kernel/Makefile
+++ b/arch/mips/kernel/Makefile
@@ -50,7 +50,7 @@ obj-$(CONFIG_MIPS_BOARDS_GEN)	+= irq-msc01.o
 obj-$(CONFIG_32BIT)		+= scall32-o32.o
 obj-$(CONFIG_64BIT)		+= scall64-64.o
 obj-$(CONFIG_BINFMT_IRIX)	+= binfmt_irix.o
-obj-$(CONFIG_MIPS32_COMPAT)	+= ioctl32.o linux32.o signal32.o
+obj-$(CONFIG_MIPS32_COMPAT)	+= linux32.o signal32.o
 obj-$(CONFIG_MIPS32_N32)	+= binfmt_elfn32.o scall64-n32.o signal_n32.o
 obj-$(CONFIG_MIPS32_O32)	+= binfmt_elfo32.o scall64-o32.o ptrace32.o
 
@@ -60,6 +60,5 @@ obj-$(CONFIG_PROC_FS)		+= proc.o
 obj-$(CONFIG_64BIT)		+= cpu-bugs64.o
 
 CFLAGS_cpu-bugs64.o	= $(shell if $(CC) $(CFLAGS) -Wa,-mdaddi -c -o /dev/null -xc /dev/null >/dev/null 2>&1; then echo "-DHAVE_AS_SET_DADDI"; fi)
-CFLAGS_ioctl32.o	+= -Ifs/
 
 EXTRA_AFLAGS := $(CFLAGS)
diff --git a/arch/mips/kernel/ioctl32.c b/arch/mips/kernel/ioctl32.c
deleted file mode 100644
index 9ea1fc748864..000000000000
--- a/arch/mips/kernel/ioctl32.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * ioctl32.c: Conversion between 32bit and 64bit native ioctls.
- *
- * Copyright (C) 2000 Silicon Graphics, Inc.
- * Written by Ulf Carlsson (ulfc@engr.sgi.com)
- * Copyright (C) 2000, 2004 Ralf Baechle
- * Copyright (C) 2002, 2003  Maciej W. Rozycki
- */
-#define INCLUDES
-#include "compat_ioctl.c"
-
-#include <linux/config.h>
-#include <linux/types.h>
-#include <linux/compat.h>
-#include <linux/ioctl32.h>
-#include <linux/syscalls.h>
-
-#ifdef CONFIG_SIBYTE_TBPROF
-#include <asm/sibyte/trace_prof.h>
-#endif
-
-#define A(__x) ((unsigned long)(__x))
-
-long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg);
-
-#define CODE
-#include "compat_ioctl.c"
-
-#define COMPATIBLE_IOCTL(cmd)		HANDLE_IOCTL((cmd),sys_ioctl)
-#define HANDLE_IOCTL(cmd,handler)	{ (cmd), (ioctl_trans_handler_t)(handler), NULL },
-#define IOCTL_TABLE_START \
-	struct ioctl_trans ioctl_start[] = {
-#define IOCTL_TABLE_END \
-	};
-
-IOCTL_TABLE_START
-
-#include <linux/compat_ioctl.h>
-#define DECLARES
-#include "compat_ioctl.c"
-
-/*HANDLE_IOCTL(RTC_IRQP_READ, w_long)
-COMPATIBLE_IOCTL(RTC_IRQP_SET)
-HANDLE_IOCTL(RTC_EPOCH_READ, w_long)
-COMPATIBLE_IOCTL(RTC_EPOCH_SET)
-*/
-
-IOCTL_TABLE_END
-
-int ioctl_table_size = ARRAY_SIZE(ioctl_start);
diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile
index 171f9c239f60..27827bc3717e 100644
--- a/arch/parisc/kernel/Makefile
+++ b/arch/parisc/kernel/Makefile
@@ -6,7 +6,6 @@ extra-y			:= init_task.o head.o vmlinux.lds
 
 AFLAGS_entry.o	:= -traditional
 AFLAGS_pacache.o := -traditional
-CFLAGS_ioctl32.o := -Ifs/
 
 obj-y	     	:= cache.o pacache.o setup.o traps.o time.o irq.o \
 		   pa7300lc.o syscall.o entry.o sys_parisc.o firmware.o \
@@ -19,6 +18,6 @@ obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_PA11)	+= pci-dma.o
 obj-$(CONFIG_PCI)	+= pci.o
 obj-$(CONFIG_MODULES)	+= module.o
-obj-$(CONFIG_64BIT)	+= binfmt_elf32.o sys_parisc32.o ioctl32.o signal32.o
+obj-$(CONFIG_64BIT)	+= binfmt_elf32.o sys_parisc32.o signal32.o
 # only supported for PCX-W/U in 64-bit mode at the moment
 obj-$(CONFIG_64BIT)	+= perf.o perf_asm.o
diff --git a/arch/parisc/kernel/ioctl32.c b/arch/parisc/kernel/ioctl32.c
deleted file mode 100644
index 805f31486cf9..000000000000
--- a/arch/parisc/kernel/ioctl32.c
+++ /dev/null
@@ -1,41 +0,0 @@
-/* $Id: ioctl32.c,v 1.5 2002/10/18 00:21:43 varenet Exp $
- * ioctl32.c: Conversion between 32bit and 64bit native ioctls.
- *
- * Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
- * Copyright (C) 1998  Eddie C. Dost  (ecd@skynet.be)
- *
- * These routines maintain argument size conversion between 32bit and 64bit
- * ioctls.
- */
-
-#include <linux/syscalls.h>
-
-#define INCLUDES
-#include "compat_ioctl.c"
-
-#include <asm/perf.h>
-#include <asm/ioctls.h>
-
-#define CODE
-#include "compat_ioctl.c"
-
-#define HANDLE_IOCTL(cmd, handler) { cmd, (ioctl_trans_handler_t)handler, NULL },
-#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL(cmd, sys_ioctl) 
-
-#define IOCTL_TABLE_START  struct ioctl_trans ioctl_start[] = {
-#define IOCTL_TABLE_END    };
-
-IOCTL_TABLE_START
-#include <linux/compat_ioctl.h>
-
-#define DECLARES
-#include "compat_ioctl.c"
-
-/* And these ioctls need translation */
-HANDLE_IOCTL(SIOCGPPPSTATS, dev_ifsioc)
-HANDLE_IOCTL(SIOCGPPPCSTATS, dev_ifsioc)
-HANDLE_IOCTL(SIOCGPPPVER, dev_ifsioc)
-
-IOCTL_TABLE_END
-
-int ioctl_table_size = ARRAY_SIZE(ioctl_start);
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 6e03b595b6c8..17ed5018288b 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -4,7 +4,6 @@
 
 ifeq ($(CONFIG_PPC64),y)
 EXTRA_CFLAGS	+= -mno-minimal-toc
-CFLAGS_ioctl32.o += -Ifs/
 endif
 ifeq ($(CONFIG_PPC32),y)
 CFLAGS_prom_init.o      += -fPIC
@@ -16,7 +15,7 @@ obj-y				:= semaphore.o cputable.o ptrace.o syscalls.o \
 obj-y				+= vdso32/
 obj-$(CONFIG_PPC64)		+= setup_64.o binfmt_elf32.o sys_ppc32.o \
 				   signal_64.o ptrace32.o systbl.o \
-				   paca.o ioctl32.o cpu_setup_power4.o \
+				   paca.o cpu_setup_power4.o \
 				   firmware.o sysfs.o idle_64.o
 obj-$(CONFIG_PPC64)		+= vdso64/
 obj-$(CONFIG_ALTIVEC)		+= vecemu.o vector.o
diff --git a/arch/powerpc/kernel/ioctl32.c b/arch/powerpc/kernel/ioctl32.c
deleted file mode 100644
index 0fa3d27fef01..000000000000
--- a/arch/powerpc/kernel/ioctl32.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * ioctl32.c: Conversion between 32bit and 64bit native ioctls.
- *
- * Based on sparc64 ioctl32.c by:
- *
- * Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
- * Copyright (C) 1998  Eddie C. Dost  (ecd@skynet.be)
- *
- * ppc64 changes:
- *
- * Copyright (C) 2000  Ken Aaker (kdaaker@rchland.vnet.ibm.com)
- * Copyright (C) 2001  Anton Blanchard (antonb@au.ibm.com)
- *
- * These routines maintain argument size conversion between 32bit and 64bit
- * ioctls.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#define INCLUDES
-#include "compat_ioctl.c"
-#include <linux/syscalls.h>
-
-#define CODE
-#include "compat_ioctl.c"
-
-#define HANDLE_IOCTL(cmd,handler) { cmd, (ioctl_trans_handler_t)handler, NULL },
-#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL(cmd,sys_ioctl)
-
-#define IOCTL_TABLE_START \
-	struct ioctl_trans ioctl_start[] = {
-#define IOCTL_TABLE_END \
-	};
-
-IOCTL_TABLE_START
-#include <linux/compat_ioctl.h>
-#define DECLARES
-#include "compat_ioctl.c"
-
-IOCTL_TABLE_END
-
-int ioctl_table_size = ARRAY_SIZE(ioctl_start);
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 4865e4b49464..9269b5788fac 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -17,8 +17,7 @@ obj-$(CONFIG_MODULES)		+= s390_ksyms.o module.o
 obj-$(CONFIG_SMP)		+= smp.o
 
 obj-$(CONFIG_COMPAT)		+= compat_linux.o compat_signal.o \
-					compat_ioctl.o compat_wrapper.o \
-					compat_exec_domain.o
+					compat_wrapper.o compat_exec_domain.o
 obj-$(CONFIG_BINFMT_ELF32)	+= binfmt_elf32.o
 
 obj-$(CONFIG_VIRT_TIMER)	+= vtime.o
diff --git a/arch/s390/kernel/compat_ioctl.c b/arch/s390/kernel/compat_ioctl.c
deleted file mode 100644
index d716b1768c99..000000000000
--- a/arch/s390/kernel/compat_ioctl.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * ioctl32.c: Conversion between 32bit and 64bit native ioctls.
- *
- *  S390 version
- *    Copyright (C) 2000-2003 IBM Deutschland Entwicklung GmbH, IBM Corporation
- *    Author(s): Gerhard Tonn (ton@de.ibm.com)
- *               Arnd Bergmann (arndb@de.ibm.com)
- *
- * Original implementation from 32-bit Sparc compat code which is
- * Copyright (C) 2000 Silicon Graphics, Inc.
- * Written by Ulf Carlsson (ulfc@engr.sgi.com) 
- */
-
-#include "compat_linux.h"
-#define INCLUDES
-#define CODE
-#include "../../../fs/compat_ioctl.c"
-#include <asm/dasd.h>
-#include <asm/cmb.h>
-#include <asm/tape390.h>
-#include <asm/ccwdev.h>
-#include "../../../drivers/s390/char/raw3270.h"
-
-static int do_ioctl32_pointer(unsigned int fd, unsigned int cmd,
-				unsigned long arg, struct file *f)
-{
-	return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg));
-}
-
-static int do_ioctl32_ulong(unsigned int fd, unsigned int cmd,
-				unsigned long arg, struct file *f)
-{
-	return sys_ioctl(fd, cmd, arg);
-}
-
-#define COMPATIBLE_IOCTL(cmd)		HANDLE_IOCTL((cmd),(ioctl_trans_handler_t)do_ioctl32_pointer)
-#define ULONG_IOCTL(cmd)		HANDLE_IOCTL((cmd),(ioctl_trans_handler_t)do_ioctl32_ulong)
-#define HANDLE_IOCTL(cmd,handler)	{ (cmd), (ioctl_trans_handler_t)(handler), NULL },
-
-struct ioctl_trans ioctl_start[] = {
-/* architecture independent ioctls */
-#include <linux/compat_ioctl.h>
-#define DECLARES
-#include "../../../fs/compat_ioctl.c"
-};
-
-int ioctl_table_size = ARRAY_SIZE(ioctl_start);
diff --git a/arch/sparc64/kernel/Makefile b/arch/sparc64/kernel/Makefile
index 6f00ab8b9d23..83d67eb18895 100644
--- a/arch/sparc64/kernel/Makefile
+++ b/arch/sparc64/kernel/Makefile
@@ -16,7 +16,7 @@ obj-y		:= process.o setup.o cpu.o idprom.o \
 obj-$(CONFIG_PCI)	 += ebus.o isa.o pci_common.o pci_iommu.o \
 			    pci_psycho.o pci_sabre.o pci_schizo.o
 obj-$(CONFIG_SMP)	 += smp.o trampoline.o
-obj-$(CONFIG_SPARC32_COMPAT) += sys32.o sys_sparc32.o signal32.o ioctl32.o
+obj-$(CONFIG_SPARC32_COMPAT) += sys32.o sys_sparc32.o signal32.o
 obj-$(CONFIG_BINFMT_ELF32) += binfmt_elf32.o
 obj-$(CONFIG_BINFMT_AOUT32) += binfmt_aout32.o
 obj-$(CONFIG_MODULES) += module.o
@@ -40,5 +40,3 @@ endif
 
 head.o: head.S ttable.S itlb_base.S dtlb_base.S dtlb_backend.S dtlb_prot.S \
 	etrap.S rtrap.S winfixup.S entry.S
-
-CFLAGS_ioctl32.o += -Ifs/
diff --git a/arch/sparc64/kernel/ioctl32.c b/arch/sparc64/kernel/ioctl32.c
deleted file mode 100644
index 196b208665a2..000000000000
--- a/arch/sparc64/kernel/ioctl32.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/* $Id: ioctl32.c,v 1.136 2002/01/14 09:49:52 davem Exp $
- * ioctl32.c: Conversion between 32bit and 64bit native ioctls.
- *
- * Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
- * Copyright (C) 1998  Eddie C. Dost  (ecd@skynet.be)
- * Copyright (C) 2003  Pavel Machek (pavel@suse.cz)
- *
- * These routines maintain argument size conversion between 32bit and 64bit
- * ioctls.
- */
-
-#define INCLUDES
-#include "compat_ioctl.c"
-#include <linux/syscalls.h>
-
-#define CODE
-#include "compat_ioctl.c"
-
-#define COMPATIBLE_IOCTL(cmd)		HANDLE_IOCTL((cmd),sys_ioctl)
-#define HANDLE_IOCTL(cmd,handler)	{ (cmd), (ioctl_trans_handler_t)(handler), NULL },
-#define IOCTL_TABLE_START \
-	struct ioctl_trans ioctl_start[] = {
-#define IOCTL_TABLE_END \
-	};
-
-IOCTL_TABLE_START
-#include <linux/compat_ioctl.h>
-#define DECLARES
-#include "compat_ioctl.c"
-#if 0
-HANDLE_IOCTL(RTC32_IRQP_READ, do_rtc_ioctl)
-HANDLE_IOCTL(RTC32_IRQP_SET, do_rtc_ioctl)
-HANDLE_IOCTL(RTC32_EPOCH_READ, do_rtc_ioctl)
-HANDLE_IOCTL(RTC32_EPOCH_SET, do_rtc_ioctl)
-#endif
-/* take care of sizeof(sizeof()) breakage */
-IOCTL_TABLE_END
-
-int ioctl_table_size = ARRAY_SIZE(ioctl_start);
diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile
index f76217d8f579..051608d55920 100644
--- a/arch/x86_64/ia32/Makefile
+++ b/arch/x86_64/ia32/Makefile
@@ -2,8 +2,7 @@
 # Makefile for the ia32 kernel emulation subsystem.
 #
 
-obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_ioctl.o \
-	ia32_signal.o tls32.o \
+obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o tls32.o \
 	ia32_binfmt.o fpu32.o ptrace32.o syscall32.o syscall32_syscall.o
 
 sysv-$(CONFIG_SYSVIPC) := ipc32.o
@@ -29,4 +28,3 @@ $(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
 
 AFLAGS_vsyscall-sysenter.o = -m32
 AFLAGS_vsyscall-syscall.o = -m32
-CFLAGS_ia32_ioctl.o += -Ifs/
diff --git a/arch/x86_64/ia32/ia32_ioctl.c b/arch/x86_64/ia32/ia32_ioctl.c
deleted file mode 100644
index e11cc5699352..000000000000
--- a/arch/x86_64/ia32/ia32_ioctl.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/* $Id: ia32_ioctl.c,v 1.25 2002/10/11 07:17:06 ak Exp $
- * ioctl32.c: Conversion between 32bit and 64bit native ioctls.
- *
- * Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
- * Copyright (C) 1998  Eddie C. Dost  (ecd@skynet.be)
- * Copyright (C) 2001,2002  Andi Kleen, SuSE Labs 
- *
- * These routines maintain argument size conversion between 32bit and 64bit
- * ioctls.
- */
-
-#define INCLUDES
-#include <linux/syscalls.h>
-#include "compat_ioctl.c"
-#include <asm/ia32.h>
-
-#define CODE
-#include "compat_ioctl.c"
-
-
-#define HANDLE_IOCTL(cmd,handler) { (cmd), (ioctl_trans_handler_t)(handler) }, 
-#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL(cmd,sys_ioctl)
-
-struct ioctl_trans ioctl_start[] = { 
-#include <linux/compat_ioctl.h>
-#define DECLARES
-#include "compat_ioctl.c"
-/* take care of sizeof(sizeof()) breakage */
-}; 
-
-int ioctl_table_size = ARRAY_SIZE(ioctl_start);
-
diff --git a/fs/Makefile b/fs/Makefile
index 35e9aec608e4..1db711319c80 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -14,7 +14,7 @@ obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
 
 obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
-obj-$(CONFIG_COMPAT)		+= compat.o
+obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 
 nfsd-$(CONFIG_NFSD)		:= nfsctl.o
 obj-y				+= $(nfsd-y) $(nfsd-m)
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index b9aeacc11c8f..890bc30fbe20 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -10,7 +10,6 @@
  * ioctls.
  */
 
-#ifdef INCLUDES
 #include <linux/config.h>
 #include <linux/types.h>
 #include <linux/compat.h>
@@ -81,13 +80,9 @@
 #include <linux/capi.h>
 
 #include <scsi/scsi.h>
-/* Ugly hack. */
-#undef __KERNEL__
 #include <scsi/scsi_ioctl.h>
-#define __KERNEL__
 #include <scsi/sg.h>
 
-#include <asm/types.h>
 #include <asm/uaccess.h>
 #include <linux/ethtool.h>
 #include <linux/mii.h>
@@ -95,7 +90,6 @@
 #include <linux/watchdog.h>
 #include <linux/dm-ioctl.h>
 
-#include <asm/module.h>
 #include <linux/soundcard.h>
 #include <linux/lp.h>
 #include <linux/ppdev.h>
@@ -128,11 +122,6 @@
 #include <linux/dvb/frontend.h>
 #include <linux/dvb/video.h>
 
-#undef INCLUDES
-#endif
-
-#ifdef CODE
-
 /* Aiee. Someone does not find a difference between int and long */
 #define EXT2_IOC32_GETFLAGS               _IOR('f', 1, int)
 #define EXT2_IOC32_SETFLAGS               _IOW('f', 2, int)
@@ -148,6 +137,12 @@
 #define EXT2_IOC32_GETVERSION             _IOR('v', 1, int)
 #define EXT2_IOC32_SETVERSION             _IOW('v', 2, int)
 
+static int do_ioctl32_pointer(unsigned int fd, unsigned int cmd,
+			      unsigned long arg, struct file *f)
+{
+	return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg));
+}
+
 static int w_long(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
 	mm_segment_t old_fs = get_fs();
@@ -2705,10 +2700,20 @@ static int do_ncp_setprivatedata(unsigned int fd, unsigned int cmd, unsigned lon
 }
 #endif
 
-#undef CODE
-#endif
+#define HANDLE_IOCTL(cmd,handler) \
+	{ (cmd), (ioctl_trans_handler_t)(handler) },
+
+/* pointer to compatible structure or no argument */
+#define COMPATIBLE_IOCTL(cmd) \
+	{ (cmd), do_ioctl32_pointer },
+
+/* argument is an unsigned long integer, not a pointer */
+#define ULONG_IOCTL(cmd) \
+	{ (cmd), (ioctl_trans_handler_t)sys_ioctl },
 
-#ifdef DECLARES
+
+struct ioctl_trans ioctl_start[] = {
+#include <linux/compat_ioctl.h>
 HANDLE_IOCTL(MEMREADOOB32, mtd_rw_oob)
 HANDLE_IOCTL(MEMWRITEOOB32, mtd_rw_oob)
 #ifdef CONFIG_NET
@@ -2921,6 +2926,6 @@ HANDLE_IOCTL(DMX_GET_EVENT, do_dmx_get_event)
 HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event)
 HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture)
 HANDLE_IOCTL(VIDEO_SET_SPU_PALETTE, do_video_set_spu_palette)
+};
 
-#undef DECLARES
-#endif
+int ioctl_table_size = ARRAY_SIZE(ioctl_start);
diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h
index 339878952f12..8fad50f8e389 100644
--- a/include/linux/compat_ioctl.h
+++ b/include/linux/compat_ioctl.h
@@ -2,14 +2,6 @@
  * compatible types passed or none at all... Please include
  * only stuff that is compatible on *all architectures*.
  */
-#ifndef COMPATIBLE_IOCTL /* pointer to compatible structure or no argument */
-#define COMPATIBLE_IOCTL(cmd)  HANDLE_IOCTL((cmd),(ioctl_trans_handler_t)sys_ioctl)
-#endif
-
-#ifndef ULONG_IOCTL /* argument is an unsigned long integer, not a pointer */
-#define ULONG_IOCTL(cmd)  HANDLE_IOCTL((cmd),(ioctl_trans_handler_t)sys_ioctl)
-#endif
-
 
 COMPATIBLE_IOCTL(0x4B50)   /* KDGHWCLK - not in the kernel, but don't complain */
 COMPATIBLE_IOCTL(0x4B51)   /* KDSHWCLK - not in the kernel, but don't complain */
-- 
cgit v1.2.3-71-gd317


From fc33a7bb9c6dd8f6e4a014976200f8fdabb3a45c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 9 Jan 2006 20:52:17 -0800
Subject: [PATCH] per-mountpoint noatime/nodiratime

Turn noatime and nodiratime into per-mount instead of per-sb flags.

After all the preparations this is a rather trivial patch.  The mount code
needs to treat the two options as per-mount instead of per-superblock, and
touch_atime needs to be changed to check the new MNT_ flags in addition to
the MS_ flags that are kept for filesystems that are always
noatime/nodiratime but not user settable anymore.  Besides that core code
only nfs needed an update because it's leaving atime updates to the server
and thus sets the S_NOATIME flag on every inode, but needs to know whether
it's a real noatime mount for an getattr optimization.

While we're at it I've killed the IS_NOATIME/IS_NODIRATIME macros that were
only used by touch_atime.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inode.c                  | 17 +++++++++++++----
 fs/namespace.c              | 12 +++++++++---
 fs/nfs/inode.c              | 17 +++++++++++++----
 fs/xfs/linux-2.6/xfs_iops.c |  3 +++
 include/linux/fs.h          |  5 +----
 include/linux/mount.h       |  8 +++++---
 6 files changed, 44 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 76980a9c92e7..108138d4e909 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -22,6 +22,7 @@
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
+#include <linux/mount.h>
 
 /*
  * This is needed for the following functions:
@@ -1189,12 +1190,20 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	struct timespec now;
 
-	/* per-mountpoint checks will go here */
-	if (IS_NOATIME(inode))
+	if (IS_RDONLY(inode))
 		return;
-	if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode))
+
+	if ((inode->i_flags & S_NOATIME) ||
+	    (inode->i_sb->s_flags & MS_NOATIME) ||
+	    ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
 		return;
-	if (IS_RDONLY(inode))
+
+	/*
+	 * We may have a NULL vfsmount when coming from NFSD
+	 */
+	if (mnt &&
+	    ((mnt->mnt_flags & MNT_NOATIME) ||
+	     ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))))
 		return;
 
 	now = current_fs_time(inode->i_sb);
diff --git a/fs/namespace.c b/fs/namespace.c
index f0e353f5bc30..2ca6145f43d6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -355,14 +355,14 @@ static int show_vfsmnt(struct seq_file *m, void *v)
 		{ MS_SYNCHRONOUS, ",sync" },
 		{ MS_DIRSYNC, ",dirsync" },
 		{ MS_MANDLOCK, ",mand" },
-		{ MS_NOATIME, ",noatime" },
-		{ MS_NODIRATIME, ",nodiratime" },
 		{ 0, NULL }
 	};
 	static struct proc_fs_info mnt_info[] = {
 		{ MNT_NOSUID, ",nosuid" },
 		{ MNT_NODEV, ",nodev" },
 		{ MNT_NOEXEC, ",noexec" },
+		{ MNT_NOATIME, ",noatime" },
+		{ MNT_NODIRATIME, ",nodiratime" },
 		{ 0, NULL }
 	};
 	struct proc_fs_info *fs_infop;
@@ -1286,7 +1286,13 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 		mnt_flags |= MNT_NODEV;
 	if (flags & MS_NOEXEC)
 		mnt_flags |= MNT_NOEXEC;
-	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE);
+	if (flags & MS_NOATIME)
+		mnt_flags |= MNT_NOATIME;
+	if (flags & MS_NODIRATIME)
+		mnt_flags |= MNT_NODIRATIME;
+
+	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
+		   MS_NOATIME | MS_NODIRATIME);
 
 	/* ... and get the mountpoint */
 	retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 3e4ba9cb7f80..a77ee95b7efb 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -950,11 +950,20 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 
 	/* Flush out writes to the server in order to update c/mtime */
 	nfs_sync_inode(inode, 0, 0, FLUSH_WAIT|FLUSH_NOCOMMIT);
-	if (__IS_FLG(inode, MS_NOATIME))
-		need_atime = 0;
-	else if (__IS_FLG(inode, MS_NODIRATIME) && S_ISDIR(inode->i_mode))
+
+	/*
+	 * We may force a getattr if the user cares about atime.
+	 *
+	 * Note that we only have to check the vfsmount flags here:
+	 *  - NFS always sets S_NOATIME by so checking it would give a
+	 *    bogus result
+	 *  - NFS never sets MS_NOATIME or MS_NODIRATIME so there is
+	 *    no point in checking those.
+	 */
+ 	if ((mnt->mnt_flags & MNT_NOATIME) ||
+ 	    ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
 		need_atime = 0;
-	/* We may force a getattr if the user cares about atime */
+
 	if (need_atime)
 		err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	else
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 41c478bb1ffc..97fb1470cf28 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -54,6 +54,9 @@
 #include <linux/xattr.h>
 #include <linux/namei.h>
 
+#define IS_NOATIME(inode) ((inode->i_sb->s_flags & MS_NOATIME) ||	\
+	(S_ISDIR(inode->i_mode) && inode->i_sb->s_flags & MS_NODIRATIME))
+
 /*
  * Change the requested timestamp in the given inode.
  * We don't lock across timestamp updates, and we don't log them but
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 85c5656756b6..d1e370d25f7b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -114,8 +114,7 @@ extern int dir_notify_enable;
 /*
  * Superblock flags that can be altered by MS_REMOUNT
  */
-#define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_NOATIME|\
-			 MS_NODIRATIME)
+#define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK)
 
 /*
  * Old magic mount flag and mask
@@ -161,8 +160,6 @@ extern int dir_notify_enable;
 #define IS_NOQUOTA(inode)	((inode)->i_flags & S_NOQUOTA)
 #define IS_APPEND(inode)	((inode)->i_flags & S_APPEND)
 #define IS_IMMUTABLE(inode)	((inode)->i_flags & S_IMMUTABLE)
-#define IS_NOATIME(inode)	(__IS_FLG(inode, MS_NOATIME) || ((inode)->i_flags & S_NOATIME))
-#define IS_NODIRATIME(inode)	__IS_FLG(inode, MS_NODIRATIME)
 #define IS_POSIXACL(inode)	__IS_FLG(inode, MS_POSIXACL)
 
 #define IS_DEADDIR(inode)	((inode)->i_flags & S_DEAD)
diff --git a/include/linux/mount.h b/include/linux/mount.h
index b98a709f1794..b7472ae91fa4 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -20,10 +20,12 @@
 #define MNT_NOSUID	0x01
 #define MNT_NODEV	0x02
 #define MNT_NOEXEC	0x04
-#define MNT_SHARED	0x10	/* if the vfsmount is a shared mount */
-#define MNT_UNBINDABLE	0x20	/* if the vfsmount is a unbindable mount */
+#define MNT_NOATIME	0x08
+#define MNT_NODIRATIME	0x10
 
-#define MNT_PNODE_MASK	(MNT_SHARED | MNT_UNBINDABLE)
+#define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
+#define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
+#define MNT_PNODE_MASK	0x3000	/* propogation flag mask */
 
 struct vfsmount {
 	struct list_head mnt_hash;
-- 
cgit v1.2.3-71-gd317


From 5cca7619a562c9d98a3a0123dc878d79bf3c8fb3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jan 2006 20:52:20 -0800
Subject: [PATCH] hrtimer: move div_long_long_rem out of jiffies.h

move div_long_long_rem() from jiffies.h into a new calc64.h include file, as
it is a general math function useful for other things than the jiffy code.
Convert it to an inline function

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/calc64.h  | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/jiffies.h | 18 ++++++------------
 2 files changed, 55 insertions(+), 12 deletions(-)
 create mode 100644 include/linux/calc64.h

(limited to 'include/linux')

diff --git a/include/linux/calc64.h b/include/linux/calc64.h
new file mode 100644
index 000000000000..ebf4b8f38d88
--- /dev/null
+++ b/include/linux/calc64.h
@@ -0,0 +1,49 @@
+#ifndef _LINUX_CALC64_H
+#define _LINUX_CALC64_H
+
+#include <linux/types.h>
+#include <asm/div64.h>
+
+/*
+ * This is a generic macro which is used when the architecture
+ * specific div64.h does not provide a optimized one.
+ *
+ * The 64bit dividend is divided by the divisor (data type long), the
+ * result is returned and the remainder stored in the variable
+ * referenced by remainder (data type long *). In contrast to the
+ * do_div macro the dividend is kept intact.
+ */
+#ifndef div_long_long_rem
+#define div_long_long_rem(dividend, divisor, remainder)	\
+	do_div_llr((dividend), divisor, remainder)
+
+static inline unsigned long do_div_llr(const long long dividend,
+				       const long divisor, long *remainder)
+{
+	u64 result = dividend;
+
+	*(remainder) = do_div(result, divisor);
+	return (unsigned long) result;
+}
+#endif
+
+/*
+ * Sign aware variation of the above. On some architectures a
+ * negative dividend leads to an divide overflow exception, which
+ * is avoided by the sign check.
+ */
+static inline long div_long_long_rem_signed(const long long dividend,
+					    const long divisor, long *remainder)
+{
+	long res;
+
+	if (unlikely(dividend < 0)) {
+		res = -div_long_long_rem(-dividend, divisor, remainder);
+		*remainder = -(*remainder);
+	} else
+		res = div_long_long_rem(dividend, divisor, remainder);
+
+	return res;
+}
+
+#endif
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 6acfdbba734b..99905e180532 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -1,21 +1,12 @@
 #ifndef _LINUX_JIFFIES_H
 #define _LINUX_JIFFIES_H
 
+#include <linux/calc64.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/time.h>
 #include <linux/timex.h>
 #include <asm/param.h>			/* for HZ */
-#include <asm/div64.h>
-
-#ifndef div_long_long_rem
-#define div_long_long_rem(dividend,divisor,remainder) \
-({							\
-	u64 result = dividend;				\
-	*remainder = do_div(result,divisor);		\
-	result;						\
-})
-#endif
 
 /*
  * The following defines establish the engineering parameters of the PLL
@@ -373,8 +364,11 @@ jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
 	 * one divide.
 	 */
 	u64 nsec = (u64)jiffies * TICK_NSEC;
-	value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_usec);
-	value->tv_usec /= NSEC_PER_USEC;
+	long tv_usec;
+
+	value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec);
+	tv_usec /= NSEC_PER_USEC;
+	value->tv_usec = tv_usec;
 }
 
 /*
-- 
cgit v1.2.3-71-gd317


From 753be6222728996974e9e12c185108fcabbb7c6e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jan 2006 20:52:22 -0800
Subject: [PATCH] hrtimer: deinline mktime and set_normalized_timespec

mktime() and set_normalized_timespec() are large inline functions used in many
places: deinline them.

From: George Anzinger, off-by-1 bugfix

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/time.h | 52 +++++---------------------------------------
 kernel/time.c        | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index 797ccd813bb0..9c444d9c4aa0 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -38,38 +38,9 @@ static __inline__ int timespec_equal(struct timespec *a, struct timespec *b)
 	return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec);
 } 
 
-/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
- * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
- * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
- *
- * [For the Julian calendar (which was used in Russia before 1917,
- * Britain & colonies before 1752, anywhere else before 1582,
- * and is still in use by some communities) leave out the
- * -year/100+year/400 terms, and add 10.]
- *
- * This algorithm was first published by Gauss (I think).
- *
- * WARNING: this function will overflow on 2106-02-07 06:28:16 on
- * machines were long is 32-bit! (However, as time_t is signed, we
- * will already get problems at other places on 2038-01-19 03:14:08)
- */
-static inline unsigned long
-mktime (unsigned int year, unsigned int mon,
-	unsigned int day, unsigned int hour,
-	unsigned int min, unsigned int sec)
-{
-	if (0 >= (int) (mon -= 2)) {	/* 1..12 -> 11,12,1..10 */
-		mon += 12;		/* Puts Feb last since it has leap day */
-		year -= 1;
-	}
-
-	return (((
-		(unsigned long) (year/4 - year/100 + year/400 + 367*mon/12 + day) +
-			year*365 - 719499
-	    )*24 + hour /* now have hours */
-	  )*60 + min /* now have minutes */
-	)*60 + sec; /* finally seconds */
-}
+extern unsigned long mktime (unsigned int year, unsigned int mon,
+			     unsigned int day, unsigned int hour,
+			     unsigned int min, unsigned int sec);
 
 extern struct timespec xtime;
 extern struct timespec wall_to_monotonic;
@@ -80,6 +51,8 @@ static inline unsigned long get_seconds(void)
 	return xtime.tv_sec;
 }
 
+extern void set_normalized_timespec (struct timespec *ts, time_t sec, long nsec);
+
 struct timespec current_kernel_time(void);
 
 #define CURRENT_TIME (current_kernel_time())
@@ -99,21 +72,6 @@ extern void getnstimestamp(struct timespec *ts);
 
 extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
 
-static inline void
-set_normalized_timespec (struct timespec *ts, time_t sec, long nsec)
-{
-	while (nsec >= NSEC_PER_SEC) {
-		nsec -= NSEC_PER_SEC;
-		++sec;
-	}
-	while (nsec < 0) {
-		nsec += NSEC_PER_SEC;
-		--sec;
-	}
-	ts->tv_sec = sec;
-	ts->tv_nsec = nsec;
-}
-
 #endif /* __KERNEL__ */
 
 #define NFDBITS			__NFDBITS
diff --git a/kernel/time.c b/kernel/time.c
index b94bfa8c03e0..fa569885e22b 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -583,6 +583,67 @@ void getnstimestamp(struct timespec *ts)
 }
 EXPORT_SYMBOL_GPL(getnstimestamp);
 
+/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
+ * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
+ * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
+ *
+ * [For the Julian calendar (which was used in Russia before 1917,
+ * Britain & colonies before 1752, anywhere else before 1582,
+ * and is still in use by some communities) leave out the
+ * -year/100+year/400 terms, and add 10.]
+ *
+ * This algorithm was first published by Gauss (I think).
+ *
+ * WARNING: this function will overflow on 2106-02-07 06:28:16 on
+ * machines were long is 32-bit! (However, as time_t is signed, we
+ * will already get problems at other places on 2038-01-19 03:14:08)
+ */
+unsigned long
+mktime (unsigned int year, unsigned int mon,
+	unsigned int day, unsigned int hour,
+	unsigned int min, unsigned int sec)
+{
+	if (0 >= (int) (mon -= 2)) {	/* 1..12 -> 11,12,1..10 */
+		mon += 12;		/* Puts Feb last since it has leap day */
+		year -= 1;
+	}
+
+	return ((((unsigned long)
+		  (year/4 - year/100 + year/400 + 367*mon/12 + day) +
+		  year*365 - 719499
+	    )*24 + hour /* now have hours */
+	  )*60 + min /* now have minutes */
+	)*60 + sec; /* finally seconds */
+}
+
+/**
+ * set_normalized_timespec - set timespec sec and nsec parts and normalize
+ *
+ * @ts:		pointer to timespec variable to be set
+ * @sec:	seconds to set
+ * @nsec:	nanoseconds to set
+ *
+ * Set seconds and nanoseconds field of a timespec variable and
+ * normalize to the timespec storage format
+ *
+ * Note: The tv_nsec part is always in the range of
+ * 	0 <= tv_nsec < NSEC_PER_SEC
+ * For negative values only the tv_sec field is negative !
+ */
+void set_normalized_timespec (struct timespec *ts, time_t sec, long nsec)
+{
+	while (nsec >= NSEC_PER_SEC) {
+		nsec -= NSEC_PER_SEC;
+		++sec;
+	}
+	while (nsec < 0) {
+		nsec += NSEC_PER_SEC;
+		--sec;
+	}
+	ts->tv_sec = sec;
+	ts->tv_nsec = nsec;
+}
+
 #if (BITS_PER_LONG < 64)
 u64 get_jiffies_64(void)
 {
-- 
cgit v1.2.3-71-gd317


From f4818900fa3ee1c56e96f6dede7cc4c05ed383d1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 9 Jan 2006 20:52:23 -0800
Subject: [PATCH] hrtimer: clean up mktime and make arguments const

add 'const' to mktime arguments, and clean it up a bit

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/time.h | 10 +++++-----
 kernel/time.c        | 15 +++++++++------
 2 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index 9c444d9c4aa0..773b83ddd8ef 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -38,9 +38,11 @@ static __inline__ int timespec_equal(struct timespec *a, struct timespec *b)
 	return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec);
 } 
 
-extern unsigned long mktime (unsigned int year, unsigned int mon,
-			     unsigned int day, unsigned int hour,
-			     unsigned int min, unsigned int sec);
+extern unsigned long mktime(const unsigned int year, const unsigned int mon,
+			    const unsigned int day, const unsigned int hour,
+			    const unsigned int min, const unsigned int sec);
+
+extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec);
 
 extern struct timespec xtime;
 extern struct timespec wall_to_monotonic;
@@ -51,8 +53,6 @@ static inline unsigned long get_seconds(void)
 	return xtime.tv_sec;
 }
 
-extern void set_normalized_timespec (struct timespec *ts, time_t sec, long nsec);
-
 struct timespec current_kernel_time(void);
 
 #define CURRENT_TIME (current_kernel_time())
diff --git a/kernel/time.c b/kernel/time.c
index fa569885e22b..a0502aef43ce 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -599,12 +599,15 @@ EXPORT_SYMBOL_GPL(getnstimestamp);
  * will already get problems at other places on 2038-01-19 03:14:08)
  */
 unsigned long
-mktime (unsigned int year, unsigned int mon,
-	unsigned int day, unsigned int hour,
-	unsigned int min, unsigned int sec)
+mktime(const unsigned int year0, const unsigned int mon0,
+       const unsigned int day, const unsigned int hour,
+       const unsigned int min, const unsigned int sec)
 {
-	if (0 >= (int) (mon -= 2)) {	/* 1..12 -> 11,12,1..10 */
-		mon += 12;		/* Puts Feb last since it has leap day */
+	unsigned int mon = mon0, year = year0;
+
+	/* 1..12 -> 11,12,1..10 */
+	if (0 >= (int) (mon -= 2)) {
+		mon += 12;	/* Puts Feb last since it has leap day */
 		year -= 1;
 	}
 
@@ -630,7 +633,7 @@ mktime (unsigned int year, unsigned int mon,
  * 	0 <= tv_nsec < NSEC_PER_SEC
  * For negative values only the tv_sec field is negative !
  */
-void set_normalized_timespec (struct timespec *ts, time_t sec, long nsec)
+void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
 {
 	while (nsec >= NSEC_PER_SEC) {
 		nsec -= NSEC_PER_SEC;
-- 
cgit v1.2.3-71-gd317


From 0c4f6eeca98a805fd0c2536b55039383eb56d2ba Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jan 2006 20:52:25 -0800
Subject: [PATCH] hrtimer: remove unused clock constants

remove unused CLOCK_ constants from time.h

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/time.h | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index 773b83ddd8ef..ea64cde7c450 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -104,12 +104,10 @@ struct	itimerval {
 /*
  * The IDs of the various system clocks (for POSIX.1b interval timers).
  */
-#define CLOCK_REALTIME		  0
-#define CLOCK_MONOTONIC	  1
+#define CLOCK_REALTIME		 0
+#define CLOCK_MONOTONIC	  	 1
 #define CLOCK_PROCESS_CPUTIME_ID 2
 #define CLOCK_THREAD_CPUTIME_ID	 3
-#define CLOCK_REALTIME_HR	 4
-#define CLOCK_MONOTONIC_HR	  5
 
 /*
  * The IDs of various hardware clocks
@@ -118,9 +116,8 @@ struct	itimerval {
 
 #define CLOCK_SGI_CYCLE 10
 #define MAX_CLOCKS 16
-#define CLOCKS_MASK  (CLOCK_REALTIME | CLOCK_MONOTONIC | \
-                     CLOCK_REALTIME_HR | CLOCK_MONOTONIC_HR)
-#define CLOCKS_MONO (CLOCK_MONOTONIC & CLOCK_MONOTONIC_HR)
+#define CLOCKS_MASK  (CLOCK_REALTIME | CLOCK_MONOTONIC)
+#define CLOCKS_MONO (CLOCK_MONOTONIC)
 
 /*
  * The various flags for setting POSIX.1b interval timers.
-- 
cgit v1.2.3-71-gd317


From 1ad106ca185e66dc312518e18e2ffaedf376a160 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 9 Jan 2006 20:52:25 -0800
Subject: [PATCH] hrtimer: coding style clean up of clock constants

clean up the CLOCK_ portions of time.h

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/time.h | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index ea64cde7c450..aded44c48d42 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -100,30 +100,25 @@ struct	itimerval {
 	struct	timeval it_value;	/* current value */
 };
 
-
 /*
  * The IDs of the various system clocks (for POSIX.1b interval timers).
  */
-#define CLOCK_REALTIME		 0
-#define CLOCK_MONOTONIC	  	 1
-#define CLOCK_PROCESS_CPUTIME_ID 2
-#define CLOCK_THREAD_CPUTIME_ID	 3
+#define CLOCK_REALTIME			0
+#define CLOCK_MONOTONIC			1
+#define CLOCK_PROCESS_CPUTIME_ID	2
+#define CLOCK_THREAD_CPUTIME_ID		3
 
 /*
  * The IDs of various hardware clocks
  */
-
-
-#define CLOCK_SGI_CYCLE 10
-#define MAX_CLOCKS 16
-#define CLOCKS_MASK  (CLOCK_REALTIME | CLOCK_MONOTONIC)
-#define CLOCKS_MONO (CLOCK_MONOTONIC)
+#define CLOCK_SGI_CYCLE			10
+#define MAX_CLOCKS			16
+#define CLOCKS_MASK			(CLOCK_REALTIME | CLOCK_MONOTONIC)
+#define CLOCKS_MONO			CLOCK_MONOTONIC
 
 /*
  * The various flags for setting POSIX.1b interval timers.
  */
-
-#define TIMER_ABSTIME 0x01
-
+#define TIMER_ABSTIME			0x01
 
 #endif
-- 
cgit v1.2.3-71-gd317


From 57a558757bdbb877b54ed5ea15bd0892e02a707d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 9 Jan 2006 20:52:26 -0800
Subject: [PATCH] hrtimer: coding style and white space cleanup

style and whitespace cleanup of the rest of time.h.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/time.h | 61 ++++++++++++++++++++++++++--------------------------
 1 file changed, 31 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index aded44c48d42..4d49cabb9b47 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -4,7 +4,7 @@
 #include <linux/types.h>
 
 #ifdef __KERNEL__
-#include <linux/seqlock.h>
+# include <linux/seqlock.h>
 #endif
 
 #ifndef _STRUCT_TIMESPEC
@@ -13,7 +13,7 @@ struct timespec {
 	time_t	tv_sec;		/* seconds */
 	long	tv_nsec;	/* nanoseconds */
 };
-#endif /* _STRUCT_TIMESPEC */
+#endif
 
 struct timeval {
 	time_t		tv_sec;		/* seconds */
@@ -27,16 +27,16 @@ struct timezone {
 
 #ifdef __KERNEL__
 
-/* Parameters used to convert the timespec values */
-#define MSEC_PER_SEC (1000L)
-#define USEC_PER_SEC (1000000L)
-#define NSEC_PER_SEC (1000000000L)
-#define NSEC_PER_USEC (1000L)
+/* Parameters used to convert the timespec values: */
+#define MSEC_PER_SEC		1000L
+#define USEC_PER_SEC		1000000L
+#define NSEC_PER_SEC		1000000000L
+#define NSEC_PER_USEC		1000L
 
-static __inline__ int timespec_equal(struct timespec *a, struct timespec *b) 
-{ 
+static __inline__ int timespec_equal(struct timespec *a, struct timespec *b)
+{
 	return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec);
-} 
+}
 
 extern unsigned long mktime(const unsigned int year, const unsigned int mon,
 			    const unsigned int day, const unsigned int hour,
@@ -49,25 +49,26 @@ extern struct timespec wall_to_monotonic;
 extern seqlock_t xtime_lock;
 
 static inline unsigned long get_seconds(void)
-{ 
+{
 	return xtime.tv_sec;
 }
 
 struct timespec current_kernel_time(void);
 
-#define CURRENT_TIME (current_kernel_time())
-#define CURRENT_TIME_SEC ((struct timespec) { xtime.tv_sec, 0 })
+#define CURRENT_TIME		(current_kernel_time())
+#define CURRENT_TIME_SEC	((struct timespec) { xtime.tv_sec, 0 })
 
 extern void do_gettimeofday(struct timeval *tv);
 extern int do_settimeofday(struct timespec *tv);
 extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz);
-extern void clock_was_set(void); // call when ever the clock is set
+extern void clock_was_set(void); // call whenever the clock is set
 extern int do_posix_clock_monotonic_gettime(struct timespec *tp);
-extern long do_utimes(char __user * filename, struct timeval * times);
+extern long do_utimes(char __user *filename, struct timeval *times);
 struct itimerval;
-extern int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue);
+extern int do_setitimer(int which, struct itimerval *value,
+			struct itimerval *ovalue);
 extern int do_getitimer(int which, struct itimerval *value);
-extern void getnstimeofday (struct timespec *tv);
+extern void getnstimeofday(struct timespec *tv);
 extern void getnstimestamp(struct timespec *ts);
 
 extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
@@ -84,24 +85,24 @@ extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
 
 /*
  * Names of the interval timers, and structure
- * defining a timer setting.
+ * defining a timer setting:
  */
-#define	ITIMER_REAL	0
-#define	ITIMER_VIRTUAL	1
-#define	ITIMER_PROF	2
+#define	ITIMER_REAL		0
+#define	ITIMER_VIRTUAL		1
+#define	ITIMER_PROF		2
 
-struct  itimerspec {
-        struct  timespec it_interval;    /* timer period */
-        struct  timespec it_value;       /* timer expiration */
+struct itimerspec {
+	struct timespec it_interval;	/* timer period */
+	struct timespec it_value;	/* timer expiration */
 };
 
-struct	itimerval {
-	struct	timeval it_interval;	/* timer interval */
-	struct	timeval it_value;	/* current value */
+struct itimerval {
+	struct timeval it_interval;	/* timer interval */
+	struct timeval it_value;	/* current value */
 };
 
 /*
- * The IDs of the various system clocks (for POSIX.1b interval timers).
+ * The IDs of the various system clocks (for POSIX.1b interval timers):
  */
 #define CLOCK_REALTIME			0
 #define CLOCK_MONOTONIC			1
@@ -109,7 +110,7 @@ struct	itimerval {
 #define CLOCK_THREAD_CPUTIME_ID		3
 
 /*
- * The IDs of various hardware clocks
+ * The IDs of various hardware clocks:
  */
 #define CLOCK_SGI_CYCLE			10
 #define MAX_CLOCKS			16
@@ -117,7 +118,7 @@ struct	itimerval {
 #define CLOCKS_MONO			CLOCK_MONOTONIC
 
 /*
- * The various flags for setting POSIX.1b interval timers.
+ * The various flags for setting POSIX.1b interval timers:
  */
 #define TIMER_ABSTIME			0x01
 
-- 
cgit v1.2.3-71-gd317


From a924b04ddea9788e09f387fe19ccbede5f09ddd8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jan 2006 20:52:27 -0800
Subject: [PATCH] hrtimer: make clockid_t arguments const

add const arguments to the posix-timers.h API functions

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/posix-timers.h | 22 +++++++++++-----------
 kernel/posix-cpu-timers.c    | 40 ++++++++++++++++++++++------------------
 kernel/posix-timers.c        | 38 +++++++++++++++++++++-----------------
 3 files changed, 54 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index f942e2bad8e3..ecda38e07899 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -72,12 +72,12 @@ struct k_clock_abs {
 };
 struct k_clock {
 	int res;		/* in nano seconds */
-	int (*clock_getres) (clockid_t which_clock, struct timespec *tp);
+	int (*clock_getres) (const clockid_t which_clock, struct timespec *tp);
 	struct k_clock_abs *abs_struct;
-	int (*clock_set) (clockid_t which_clock, struct timespec * tp);
-	int (*clock_get) (clockid_t which_clock, struct timespec * tp);
+	int (*clock_set) (const clockid_t which_clock, struct timespec * tp);
+	int (*clock_get) (const clockid_t which_clock, struct timespec * tp);
 	int (*timer_create) (struct k_itimer *timer);
-	int (*nsleep) (clockid_t which_clock, int flags, struct timespec *);
+	int (*nsleep) (const clockid_t which_clock, int flags, struct timespec *);
 	int (*timer_set) (struct k_itimer * timr, int flags,
 			  struct itimerspec * new_setting,
 			  struct itimerspec * old_setting);
@@ -87,12 +87,12 @@ struct k_clock {
 			   struct itimerspec * cur_setting);
 };
 
-void register_posix_clock(clockid_t clock_id, struct k_clock *new_clock);
+void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock);
 
 /* Error handlers for timer_create, nanosleep and settime */
 int do_posix_clock_notimer_create(struct k_itimer *timer);
-int do_posix_clock_nonanosleep(clockid_t, int flags, struct timespec *);
-int do_posix_clock_nosettime(clockid_t, struct timespec *tp);
+int do_posix_clock_nonanosleep(const clockid_t, int flags, struct timespec *);
+int do_posix_clock_nosettime(const clockid_t, struct timespec *tp);
 
 /* function to call to trigger timer event */
 int posix_timer_event(struct k_itimer *timr, int si_private);
@@ -117,11 +117,11 @@ struct now_struct {
               }								\
             }while (0)
 
-int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *);
-int posix_cpu_clock_get(clockid_t which_clock, struct timespec *);
-int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp);
+int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *);
+int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *);
+int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp);
 int posix_cpu_timer_create(struct k_itimer *);
-int posix_cpu_nsleep(clockid_t, int, struct timespec *);
+int posix_cpu_nsleep(const clockid_t, int, struct timespec *);
 int posix_cpu_timer_set(struct k_itimer *, int,
 			struct itimerspec *, struct itimerspec *);
 int posix_cpu_timer_del(struct k_itimer *);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 4c68edff900b..abf6990c6eb5 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -7,7 +7,7 @@
 #include <asm/uaccess.h>
 #include <linux/errno.h>
 
-static int check_clock(clockid_t which_clock)
+static int check_clock(const clockid_t which_clock)
 {
 	int error = 0;
 	struct task_struct *p;
@@ -31,7 +31,7 @@ static int check_clock(clockid_t which_clock)
 }
 
 static inline union cpu_time_count
-timespec_to_sample(clockid_t which_clock, const struct timespec *tp)
+timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
 {
 	union cpu_time_count ret;
 	ret.sched = 0;		/* high half always zero when .cpu used */
@@ -43,7 +43,7 @@ timespec_to_sample(clockid_t which_clock, const struct timespec *tp)
 	return ret;
 }
 
-static void sample_to_timespec(clockid_t which_clock,
+static void sample_to_timespec(const clockid_t which_clock,
 			       union cpu_time_count cpu,
 			       struct timespec *tp)
 {
@@ -55,7 +55,7 @@ static void sample_to_timespec(clockid_t which_clock,
 	}
 }
 
-static inline int cpu_time_before(clockid_t which_clock,
+static inline int cpu_time_before(const clockid_t which_clock,
 				  union cpu_time_count now,
 				  union cpu_time_count then)
 {
@@ -65,7 +65,7 @@ static inline int cpu_time_before(clockid_t which_clock,
 		return cputime_lt(now.cpu, then.cpu);
 	}
 }
-static inline void cpu_time_add(clockid_t which_clock,
+static inline void cpu_time_add(const clockid_t which_clock,
 				union cpu_time_count *acc,
 			        union cpu_time_count val)
 {
@@ -75,7 +75,7 @@ static inline void cpu_time_add(clockid_t which_clock,
 		acc->cpu = cputime_add(acc->cpu, val.cpu);
 	}
 }
-static inline union cpu_time_count cpu_time_sub(clockid_t which_clock,
+static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
 						union cpu_time_count a,
 						union cpu_time_count b)
 {
@@ -151,7 +151,7 @@ static inline unsigned long long sched_ns(struct task_struct *p)
 	return (p == current) ? current_sched_time(p) : p->sched_time;
 }
 
-int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
 {
 	int error = check_clock(which_clock);
 	if (!error) {
@@ -169,7 +169,7 @@ int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
 	return error;
 }
 
-int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp)
+int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
 {
 	/*
 	 * You can never reset a CPU clock, but we check for other errors
@@ -186,7 +186,7 @@ int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp)
 /*
  * Sample a per-thread clock for the given task.
  */
-static int cpu_clock_sample(clockid_t which_clock, struct task_struct *p,
+static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 			    union cpu_time_count *cpu)
 {
 	switch (CPUCLOCK_WHICH(which_clock)) {
@@ -248,7 +248,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
  * Sample a process (thread group) clock for the given group_leader task.
  * Must be called with tasklist_lock held for reading.
  */
-static int cpu_clock_sample_group(clockid_t which_clock,
+static int cpu_clock_sample_group(const clockid_t which_clock,
 				  struct task_struct *p,
 				  union cpu_time_count *cpu)
 {
@@ -262,7 +262,7 @@ static int cpu_clock_sample_group(clockid_t which_clock,
 }
 
 
-int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
 {
 	const pid_t pid = CPUCLOCK_PID(which_clock);
 	int error = -EINVAL;
@@ -1399,7 +1399,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 
 static long posix_cpu_clock_nanosleep_restart(struct restart_block *);
 
-int posix_cpu_nsleep(clockid_t which_clock, int flags,
+int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 		     struct timespec *rqtp)
 {
 	struct restart_block *restart_block =
@@ -1503,11 +1503,13 @@ posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
 #define PROCESS_CLOCK	MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
 #define THREAD_CLOCK	MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
 
-static int process_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+static int process_cpu_clock_getres(const clockid_t which_clock,
+				    struct timespec *tp)
 {
 	return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
 }
-static int process_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+static int process_cpu_clock_get(const clockid_t which_clock,
+				 struct timespec *tp)
 {
 	return posix_cpu_clock_get(PROCESS_CLOCK, tp);
 }
@@ -1516,16 +1518,18 @@ static int process_cpu_timer_create(struct k_itimer *timer)
 	timer->it_clock = PROCESS_CLOCK;
 	return posix_cpu_timer_create(timer);
 }
-static int process_cpu_nsleep(clockid_t which_clock, int flags,
+static int process_cpu_nsleep(const clockid_t which_clock, int flags,
 			      struct timespec *rqtp)
 {
 	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
 }
-static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+static int thread_cpu_clock_getres(const clockid_t which_clock,
+				   struct timespec *tp)
 {
 	return posix_cpu_clock_getres(THREAD_CLOCK, tp);
 }
-static int thread_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+static int thread_cpu_clock_get(const clockid_t which_clock,
+				struct timespec *tp)
 {
 	return posix_cpu_clock_get(THREAD_CLOCK, tp);
 }
@@ -1534,7 +1538,7 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
 	timer->it_clock = THREAD_CLOCK;
 	return posix_cpu_timer_create(timer);
 }
-static int thread_cpu_nsleep(clockid_t which_clock, int flags,
+static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
 			      struct timespec *rqtp)
 {
 	return -EINVAL;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index fdb710777439..69d5a4b5395b 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -151,7 +151,7 @@ static void posix_timer_fn(unsigned long);
 static u64 do_posix_clock_monotonic_gettime_parts(
 	struct timespec *tp, struct timespec *mo);
 int do_posix_clock_monotonic_gettime(struct timespec *tp);
-static int do_posix_clock_monotonic_get(clockid_t, struct timespec *tp);
+static int do_posix_clock_monotonic_get(const clockid_t, struct timespec *tp);
 
 static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
 
@@ -176,7 +176,7 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
  * the function pointer CALL in struct k_clock.
  */
 
-static inline int common_clock_getres(clockid_t which_clock,
+static inline int common_clock_getres(const clockid_t which_clock,
 				      struct timespec *tp)
 {
 	tp->tv_sec = 0;
@@ -184,13 +184,15 @@ static inline int common_clock_getres(clockid_t which_clock,
 	return 0;
 }
 
-static inline int common_clock_get(clockid_t which_clock, struct timespec *tp)
+static inline int common_clock_get(const clockid_t which_clock,
+				   struct timespec *tp)
 {
 	getnstimeofday(tp);
 	return 0;
 }
 
-static inline int common_clock_set(clockid_t which_clock, struct timespec *tp)
+static inline int common_clock_set(const clockid_t which_clock,
+				   struct timespec *tp)
 {
 	return do_sys_settimeofday(tp, NULL);
 }
@@ -207,7 +209,7 @@ static inline int common_timer_create(struct k_itimer *new_timer)
 /*
  * These ones are defined below.
  */
-static int common_nsleep(clockid_t, int flags, struct timespec *t);
+static int common_nsleep(const clockid_t, int flags, struct timespec *t);
 static void common_timer_get(struct k_itimer *, struct itimerspec *);
 static int common_timer_set(struct k_itimer *, int,
 			    struct itimerspec *, struct itimerspec *);
@@ -216,7 +218,7 @@ static int common_timer_del(struct k_itimer *timer);
 /*
  * Return nonzero iff we know a priori this clockid_t value is bogus.
  */
-static inline int invalid_clockid(clockid_t which_clock)
+static inline int invalid_clockid(const clockid_t which_clock)
 {
 	if (which_clock < 0)	/* CPU clock, posix_cpu_* will check it */
 		return 0;
@@ -522,7 +524,7 @@ static inline struct task_struct * good_sigevent(sigevent_t * event)
 	return rtn;
 }
 
-void register_posix_clock(clockid_t clock_id, struct k_clock *new_clock)
+void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
 {
 	if ((unsigned) clock_id >= MAX_CLOCKS) {
 		printk("POSIX clock register failed for clock_id %d\n",
@@ -568,7 +570,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
 /* Create a POSIX.1b interval timer. */
 
 asmlinkage long
-sys_timer_create(clockid_t which_clock,
+sys_timer_create(const clockid_t which_clock,
 		 struct sigevent __user *timer_event_spec,
 		 timer_t __user * created_timer_id)
 {
@@ -1195,7 +1197,8 @@ static u64 do_posix_clock_monotonic_gettime_parts(
 	return jiff;
 }
 
-static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp)
+static int do_posix_clock_monotonic_get(const clockid_t clock,
+					struct timespec *tp)
 {
 	struct timespec wall_to_mono;
 
@@ -1212,7 +1215,7 @@ int do_posix_clock_monotonic_gettime(struct timespec *tp)
 	return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp);
 }
 
-int do_posix_clock_nosettime(clockid_t clockid, struct timespec *tp)
+int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
 {
 	return -EINVAL;
 }
@@ -1224,7 +1227,8 @@ int do_posix_clock_notimer_create(struct k_itimer *timer)
 }
 EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create);
 
-int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t)
+int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
+			       struct timespec *t)
 {
 #ifndef ENOTSUP
 	return -EOPNOTSUPP;	/* aka ENOTSUP in userland for POSIX */
@@ -1234,8 +1238,8 @@ int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t)
 }
 EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
 
-asmlinkage long
-sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp)
+asmlinkage long sys_clock_settime(const clockid_t which_clock,
+				  const struct timespec __user *tp)
 {
 	struct timespec new_tp;
 
@@ -1248,7 +1252,7 @@ sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp)
 }
 
 asmlinkage long
-sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp)
+sys_clock_gettime(const clockid_t which_clock, struct timespec __user *tp)
 {
 	struct timespec kernel_tp;
 	int error;
@@ -1265,7 +1269,7 @@ sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp)
 }
 
 asmlinkage long
-sys_clock_getres(clockid_t which_clock, struct timespec __user *tp)
+sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp)
 {
 	struct timespec rtn_tp;
 	int error;
@@ -1387,7 +1391,7 @@ void clock_was_set(void)
 long clock_nanosleep_restart(struct restart_block *restart_block);
 
 asmlinkage long
-sys_clock_nanosleep(clockid_t which_clock, int flags,
+sys_clock_nanosleep(const clockid_t which_clock, int flags,
 		    const struct timespec __user *rqtp,
 		    struct timespec __user *rmtp)
 {
@@ -1419,7 +1423,7 @@ sys_clock_nanosleep(clockid_t which_clock, int flags,
 }
 
 
-static int common_nsleep(clockid_t which_clock,
+static int common_nsleep(const clockid_t which_clock,
 			 int flags, struct timespec *tsave)
 {
 	struct timespec t, dum;
-- 
cgit v1.2.3-71-gd317


From 2a698971941bf5e6ebe96275f7d5318b2cf91ccf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jan 2006 20:52:28 -0800
Subject: [PATCH] hrtimer: coding style and white space cleanup 2

style/whitespace/macro cleanups of posix-timers.h

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/posix-timers.h | 80 ++++++++++++++++++++++++--------------------
 1 file changed, 44 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index ecda38e07899..ae51473d3d48 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -42,7 +42,7 @@ struct k_itimer {
 	timer_t it_id;			/* timer id */
 	int it_overrun;			/* overrun on pending signal  */
 	int it_overrun_last;		/* overrun on last delivered signal */
-	int it_requeue_pending;         /* waiting to requeue this timer */
+	int it_requeue_pending;		/* waiting to requeue this timer */
 #define REQUEUE_PENDING 1
 	int it_sigev_notify;		/* notify word of sigevent struct */
 	int it_sigev_signo;		/* signo word of sigevent struct */
@@ -52,8 +52,10 @@ struct k_itimer {
 	union {
 		struct {
 			struct timer_list timer;
-			struct list_head abs_timer_entry; /* clock abs_timer_list */
-			struct timespec wall_to_prev;   /* wall_to_monotonic used when set */
+			/* clock abs_timer_list: */
+			struct list_head abs_timer_entry;
+			/* wall_to_monotonic used when set: */
+			struct timespec wall_to_prev;
 			unsigned long incr; /* interval in jiffies */
 		} real;
 		struct cpu_timer_list cpu;
@@ -70,14 +72,16 @@ struct k_clock_abs {
 	struct list_head list;
 	spinlock_t lock;
 };
+
 struct k_clock {
-	int res;		/* in nano seconds */
+	int res;		/* in nanoseconds */
 	int (*clock_getres) (const clockid_t which_clock, struct timespec *tp);
 	struct k_clock_abs *abs_struct;
 	int (*clock_set) (const clockid_t which_clock, struct timespec * tp);
 	int (*clock_get) (const clockid_t which_clock, struct timespec * tp);
 	int (*timer_create) (struct k_itimer *timer);
-	int (*nsleep) (const clockid_t which_clock, int flags, struct timespec *);
+	int (*nsleep) (const clockid_t which_clock, int flags,
+		       struct timespec *);
 	int (*timer_set) (struct k_itimer * timr, int flags,
 			  struct itimerspec * new_setting,
 			  struct itimerspec * old_setting);
@@ -89,7 +93,7 @@ struct k_clock {
 
 void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock);
 
-/* Error handlers for timer_create, nanosleep and settime */
+/* error handlers for timer_create, nanosleep and settime */
 int do_posix_clock_notimer_create(struct k_itimer *timer);
 int do_posix_clock_nonanosleep(const clockid_t, int flags, struct timespec *);
 int do_posix_clock_nosettime(const clockid_t, struct timespec *tp);
@@ -101,39 +105,43 @@ struct now_struct {
 	unsigned long jiffies;
 };
 
-#define posix_get_now(now) (now)->jiffies = jiffies;
+#define posix_get_now(now) \
+	do { (now)->jiffies = jiffies; } while (0)
+
 #define posix_time_before(timer, now) \
                       time_before((timer)->expires, (now)->jiffies)
 
 #define posix_bump_timer(timr, now)					\
-         do {								\
-              long delta, orun;						\
-	      delta = now.jiffies - (timr)->it.real.timer.expires;	\
-              if (delta >= 0) {						\
-	           orun = 1 + (delta / (timr)->it.real.incr);		\
-	          (timr)->it.real.timer.expires +=			\
-			 orun * (timr)->it.real.incr;			\
-                  (timr)->it_overrun += orun;				\
-              }								\
-            }while (0)
-
-int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *);
-int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *);
-int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp);
-int posix_cpu_timer_create(struct k_itimer *);
-int posix_cpu_nsleep(const clockid_t, int, struct timespec *);
-int posix_cpu_timer_set(struct k_itimer *, int,
-			struct itimerspec *, struct itimerspec *);
-int posix_cpu_timer_del(struct k_itimer *);
-void posix_cpu_timer_get(struct k_itimer *, struct itimerspec *);
-
-void posix_cpu_timer_schedule(struct k_itimer *);
-
-void run_posix_cpu_timers(struct task_struct *);
-void posix_cpu_timers_exit(struct task_struct *);
-void posix_cpu_timers_exit_group(struct task_struct *);
-
-void set_process_cpu_timer(struct task_struct *, unsigned int,
-			   cputime_t *, cputime_t *);
+	do {								\
+		long delta, orun;					\
+									\
+		delta = (now).jiffies - (timr)->it.real.timer.expires;	\
+		if (delta >= 0) {					\
+			orun = 1 + (delta / (timr)->it.real.incr);	\
+			(timr)->it.real.timer.expires +=		\
+				orun * (timr)->it.real.incr;		\
+			(timr)->it_overrun += orun;			\
+		}							\
+	} while (0)
+
+int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *ts);
+int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *ts);
+int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *ts);
+int posix_cpu_timer_create(struct k_itimer *timer);
+int posix_cpu_nsleep(const clockid_t which_clock, int flags,
+		     struct timespec *ts);
+int posix_cpu_timer_set(struct k_itimer *timer, int flags,
+			struct itimerspec *new, struct itimerspec *old);
+int posix_cpu_timer_del(struct k_itimer *timer);
+void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp);
+
+void posix_cpu_timer_schedule(struct k_itimer *timer);
+
+void run_posix_cpu_timers(struct task_struct *task);
+void posix_cpu_timers_exit(struct task_struct *task);
+void posix_cpu_timers_exit_group(struct task_struct *task);
+
+void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
+			   cputime_t *newval, cputime_t *oldval);
 
 #endif
-- 
cgit v1.2.3-71-gd317


From 5f82b2b77e66d452c3037cc47f436d2d76fd5f06 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jan 2006 20:52:29 -0800
Subject: [PATCH] hrtimer: create and use timespec_valid macro

add timespec_valid(ts) [returns false if the timespec is denorm]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/time.h  | 6 ++++++
 kernel/posix-timers.c | 5 ++---
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index 4d49cabb9b47..64e797464589 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -44,6 +44,12 @@ extern unsigned long mktime(const unsigned int year, const unsigned int mon,
 
 extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec);
 
+/*
+ * Returns true if the timespec is norm, false if denorm:
+ */
+#define timespec_valid(ts) \
+	(((ts)->tv_sec >= 0) && (((unsigned) (ts)->tv_nsec) < NSEC_PER_SEC))
+
 extern struct timespec xtime;
 extern struct timespec wall_to_monotonic;
 extern seqlock_t xtime_lock;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 69d5a4b5395b..6b851a1bf4b0 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -712,8 +712,7 @@ out:
  */
 static int good_timespec(const struct timespec *ts)
 {
-	if ((!ts) || (ts->tv_sec < 0) ||
-			((unsigned) ts->tv_nsec >= NSEC_PER_SEC))
+	if ((!ts) || !timespec_valid(ts))
 		return 0;
 	return 1;
 }
@@ -1406,7 +1405,7 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags,
 	if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
 		return -EFAULT;
 
-	if ((unsigned) t.tv_nsec >= NSEC_PER_SEC || t.tv_sec < 0)
+	if (!timespec_valid(&t))
 		return -EINVAL;
 
 	/*
-- 
cgit v1.2.3-71-gd317


From f8f46da3b4cbb03b43a102b1eb92b63419e10f90 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jan 2006 20:52:30 -0800
Subject: [PATCH] hrtimer: introduce nsec_t type and conversion functions

- introduce the nsec_t type

- basic nsec conversion routines: timespec_to_ns(), timeval_to_ns(),
  ns_to_timespec(), ns_to_timeval().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/time.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/time.c        | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index 64e797464589..f639fde29253 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -50,6 +50,12 @@ extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec);
 #define timespec_valid(ts) \
 	(((ts)->tv_sec >= 0) && (((unsigned) (ts)->tv_nsec) < NSEC_PER_SEC))
 
+/*
+ * 64-bit nanosec type. Large enough to span 292+ years in nanosecond
+ * resolution. Ought to be enough for a while.
+ */
+typedef s64 nsec_t;
+
 extern struct timespec xtime;
 extern struct timespec wall_to_monotonic;
 extern seqlock_t xtime_lock;
@@ -79,6 +85,47 @@ extern void getnstimestamp(struct timespec *ts);
 
 extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
 
+/**
+ * timespec_to_ns - Convert timespec to nanoseconds
+ * @ts:		pointer to the timespec variable to be converted
+ *
+ * Returns the scalar nanosecond representation of the timespec
+ * parameter.
+ */
+static inline nsec_t timespec_to_ns(const struct timespec *ts)
+{
+	return ((nsec_t) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
+}
+
+/**
+ * timeval_to_ns - Convert timeval to nanoseconds
+ * @ts:		pointer to the timeval variable to be converted
+ *
+ * Returns the scalar nanosecond representation of the timeval
+ * parameter.
+ */
+static inline nsec_t timeval_to_ns(const struct timeval *tv)
+{
+	return ((nsec_t) tv->tv_sec * NSEC_PER_SEC) +
+		tv->tv_usec * NSEC_PER_USEC;
+}
+
+/**
+ * ns_to_timespec - Convert nanoseconds to timespec
+ * @nsec:	the nanoseconds value to be converted
+ *
+ * Returns the timespec representation of the nsec parameter.
+ */
+extern struct timespec ns_to_timespec(const nsec_t nsec);
+
+/**
+ * ns_to_timeval - Convert nanoseconds to timeval
+ * @nsec:	the nanoseconds value to be converted
+ *
+ * Returns the timeval representation of the nsec parameter.
+ */
+extern struct timeval ns_to_timeval(const nsec_t nsec);
+
 #endif /* __KERNEL__ */
 
 #define NFDBITS			__NFDBITS
diff --git a/kernel/time.c b/kernel/time.c
index c689b53297cf..cf5a4582a672 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -652,6 +652,42 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
 	ts->tv_nsec = nsec;
 }
 
+/**
+ * ns_to_timespec - Convert nanoseconds to timespec
+ * @nsec:       the nanoseconds value to be converted
+ *
+ * Returns the timespec representation of the nsec parameter.
+ */
+inline struct timespec ns_to_timespec(const nsec_t nsec)
+{
+	struct timespec ts;
+
+	if (nsec)
+		ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC,
+						     &ts.tv_nsec);
+	else
+		ts.tv_sec = ts.tv_nsec = 0;
+
+	return ts;
+}
+
+/**
+ * ns_to_timeval - Convert nanoseconds to timeval
+ * @nsec:       the nanoseconds value to be converted
+ *
+ * Returns the timeval representation of the nsec parameter.
+ */
+struct timeval ns_to_timeval(const nsec_t nsec)
+{
+	struct timespec ts = ns_to_timespec(nsec);
+	struct timeval tv;
+
+	tv.tv_sec = ts.tv_sec;
+	tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000;
+
+	return tv;
+}
+
 #if (BITS_PER_LONG < 64)
 u64 get_jiffies_64(void)
 {
-- 
cgit v1.2.3-71-gd317


From 97fc79f97b1111c80010d34ee66312b88f531e41 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jan 2006 20:52:31 -0800
Subject: [PATCH] hrtimer: introduce ktime_t time format

- introduce ktime_t: nanosecond-resolution time format.

- eliminate the plain s64 scalar type, and always use the union.
  This simplifies the arithmetics. Idea from Roman Zippel.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig     |   4 +
 include/linux/ktime.h | 269 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 273 insertions(+)
 create mode 100644 include/linux/ktime.h

(limited to 'include/linux')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 554ce3f344c9..815878ebd30f 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -1060,3 +1060,7 @@ config X86_TRAMPOLINE
 	bool
 	depends on X86_SMP || (X86_VOYAGER && SMP)
 	default y
+
+config KTIME_SCALAR
+	bool
+	default y
diff --git a/include/linux/ktime.h b/include/linux/ktime.h
new file mode 100644
index 000000000000..5b9a9eb82baa
--- /dev/null
+++ b/include/linux/ktime.h
@@ -0,0 +1,269 @@
+/*
+ *  include/linux/ktime.h
+ *
+ *  ktime_t - nanosecond-resolution time format.
+ *
+ *   Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
+ *   Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
+ *
+ *  data type definitions, declarations, prototypes and macros.
+ *
+ *  Started by: Thomas Gleixner and Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+#ifndef _LINUX_KTIME_H
+#define _LINUX_KTIME_H
+
+#include <linux/time.h>
+#include <linux/jiffies.h>
+
+/*
+ * ktime_t:
+ *
+ * On 64-bit CPUs a single 64-bit variable is used to store the hrtimers
+ * internal representation of time values in scalar nanoseconds. The
+ * design plays out best on 64-bit CPUs, where most conversions are
+ * NOPs and most arithmetic ktime_t operations are plain arithmetic
+ * operations.
+ *
+ * On 32-bit CPUs an optimized representation of the timespec structure
+ * is used to avoid expensive conversions from and to timespecs. The
+ * endian-aware order of the tv struct members is choosen to allow
+ * mathematical operations on the tv64 member of the union too, which
+ * for certain operations produces better code.
+ *
+ * For architectures with efficient support for 64/32-bit conversions the
+ * plain scalar nanosecond based representation can be selected by the
+ * config switch CONFIG_KTIME_SCALAR.
+ */
+typedef union {
+	s64	tv64;
+#if BITS_PER_LONG != 64 && !defined(CONFIG_KTIME_SCALAR)
+	struct {
+# ifdef __BIG_ENDIAN
+	s32	sec, nsec;
+# else
+	s32	nsec, sec;
+# endif
+	} tv;
+#endif
+} ktime_t;
+
+#define KTIME_MAX			(~((u64)1 << 63))
+
+/*
+ * ktime_t definitions when using the 64-bit scalar representation:
+ */
+
+#if (BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)
+
+/* Define a ktime_t variable and initialize it to zero: */
+#define DEFINE_KTIME(kt)		ktime_t kt = { .tv64 = 0 }
+
+/**
+ * ktime_set - Set a ktime_t variable from a seconds/nanoseconds value
+ *
+ * @secs:	seconds to set
+ * @nsecs:	nanoseconds to set
+ *
+ * Return the ktime_t representation of the value
+ */
+static inline ktime_t ktime_set(const long secs, const unsigned long nsecs)
+{
+	return (ktime_t) { .tv64 = (s64)secs * NSEC_PER_SEC + (s64)nsecs };
+}
+
+/* Subtract two ktime_t variables. rem = lhs -rhs: */
+#define ktime_sub(lhs, rhs) \
+		({ (ktime_t){ .tv64 = (lhs).tv64 - (rhs).tv64 }; })
+
+/* Add two ktime_t variables. res = lhs + rhs: */
+#define ktime_add(lhs, rhs) \
+		({ (ktime_t){ .tv64 = (lhs).tv64 + (rhs).tv64 }; })
+
+/*
+ * Add a ktime_t variable and a scalar nanosecond value.
+ * res = kt + nsval:
+ */
+#define ktime_add_ns(kt, nsval) \
+		({ (ktime_t){ .tv64 = (kt).tv64 + (nsval) }; })
+
+/* convert a timespec to ktime_t format: */
+#define timespec_to_ktime(ts)		ktime_set((ts).tv_sec, (ts).tv_nsec)
+
+/* convert a timeval to ktime_t format: */
+#define timeval_to_ktime(tv)		ktime_set((tv).tv_sec, (tv).tv_usec * 1000)
+
+/* Map the ktime_t to timespec conversion to ns_to_timespec function */
+#define ktime_to_timespec(kt)		ns_to_timespec((kt).tv64)
+
+/* Map the ktime_t to timeval conversion to ns_to_timeval function */
+#define ktime_to_timeval(kt)		ns_to_timeval((kt).tv64)
+
+/* Map the ktime_t to clock_t conversion to the inline in jiffies.h: */
+#define ktime_to_clock_t(kt)		nsec_to_clock_t((kt).tv64)
+
+/* Convert ktime_t to nanoseconds - NOP in the scalar storage format: */
+#define ktime_to_ns(kt)			((kt).tv64)
+
+#else
+
+/*
+ * Helper macros/inlines to get the ktime_t math right in the timespec
+ * representation. The macros are sometimes ugly - their actual use is
+ * pretty okay-ish, given the circumstances. We do all this for
+ * performance reasons. The pure scalar nsec_t based code was nice and
+ * simple, but created too many 64-bit / 32-bit conversions and divisions.
+ *
+ * Be especially aware that negative values are represented in a way
+ * that the tv.sec field is negative and the tv.nsec field is greater
+ * or equal to zero but less than nanoseconds per second. This is the
+ * same representation which is used by timespecs.
+ *
+ *   tv.sec < 0 and 0 >= tv.nsec < NSEC_PER_SEC
+ */
+
+/* Define a ktime_t variable and initialize it to zero: */
+#define DEFINE_KTIME(kt)		ktime_t kt = { .tv64 = 0 }
+
+/* Set a ktime_t variable to a value in sec/nsec representation: */
+static inline ktime_t ktime_set(const long secs, const unsigned long nsecs)
+{
+	return (ktime_t) { .tv = { .sec = secs, .nsec = nsecs } };
+}
+
+/**
+ * ktime_sub - subtract two ktime_t variables
+ *
+ * @lhs:	minuend
+ * @rhs:	subtrahend
+ *
+ * Returns the remainder of the substraction
+ */
+static inline ktime_t ktime_sub(const ktime_t lhs, const ktime_t rhs)
+{
+	ktime_t res;
+
+	res.tv64 = lhs.tv64 - rhs.tv64;
+	if (res.tv.nsec < 0)
+		res.tv.nsec += NSEC_PER_SEC;
+
+	return res;
+}
+
+/**
+ * ktime_add - add two ktime_t variables
+ *
+ * @add1:	addend1
+ * @add2:	addend2
+ *
+ * Returns the sum of addend1 and addend2
+ */
+static inline ktime_t ktime_add(const ktime_t add1, const ktime_t add2)
+{
+	ktime_t res;
+
+	res.tv64 = add1.tv64 + add2.tv64;
+	/*
+	 * performance trick: the (u32) -NSEC gives 0x00000000Fxxxxxxx
+	 * so we subtract NSEC_PER_SEC and add 1 to the upper 32 bit.
+	 *
+	 * it's equivalent to:
+	 *   tv.nsec -= NSEC_PER_SEC
+	 *   tv.sec ++;
+	 */
+	if (res.tv.nsec >= NSEC_PER_SEC)
+		res.tv64 += (u32)-NSEC_PER_SEC;
+
+	return res;
+}
+
+/**
+ * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
+ *
+ * @kt:		addend
+ * @nsec:	the scalar nsec value to add
+ *
+ * Returns the sum of kt and nsec in ktime_t format
+ */
+extern ktime_t ktime_add_ns(const ktime_t kt, u64 nsec);
+
+/**
+ * timespec_to_ktime - convert a timespec to ktime_t format
+ *
+ * @ts:		the timespec variable to convert
+ *
+ * Returns a ktime_t variable with the converted timespec value
+ */
+static inline ktime_t timespec_to_ktime(const struct timespec ts)
+{
+	return (ktime_t) { .tv = { .sec = (s32)ts.tv_sec,
+			   	   .nsec = (s32)ts.tv_nsec } };
+}
+
+/**
+ * timeval_to_ktime - convert a timeval to ktime_t format
+ *
+ * @tv:		the timeval variable to convert
+ *
+ * Returns a ktime_t variable with the converted timeval value
+ */
+static inline ktime_t timeval_to_ktime(const struct timeval tv)
+{
+	return (ktime_t) { .tv = { .sec = (s32)tv.tv_sec,
+				   .nsec = (s32)tv.tv_usec * 1000 } };
+}
+
+/**
+ * ktime_to_timespec - convert a ktime_t variable to timespec format
+ *
+ * @kt:		the ktime_t variable to convert
+ *
+ * Returns the timespec representation of the ktime value
+ */
+static inline struct timespec ktime_to_timespec(const ktime_t kt)
+{
+	return (struct timespec) { .tv_sec = (time_t) kt.tv.sec,
+				   .tv_nsec = (long) kt.tv.nsec };
+}
+
+/**
+ * ktime_to_timeval - convert a ktime_t variable to timeval format
+ *
+ * @kt:		the ktime_t variable to convert
+ *
+ * Returns the timeval representation of the ktime value
+ */
+static inline struct timeval ktime_to_timeval(const ktime_t kt)
+{
+	return (struct timeval) {
+		.tv_sec = (time_t) kt.tv.sec,
+		.tv_usec = (suseconds_t) (kt.tv.nsec / NSEC_PER_USEC) };
+}
+
+/**
+ * ktime_to_clock_t - convert a ktime_t variable to clock_t format
+ * @kt:		the ktime_t variable to convert
+ *
+ * Returns a clock_t variable with the converted value
+ */
+static inline clock_t ktime_to_clock_t(const ktime_t kt)
+{
+	return nsec_to_clock_t( (u64) kt.tv.sec * NSEC_PER_SEC + kt.tv.nsec);
+}
+
+/**
+ * ktime_to_ns - convert a ktime_t variable to scalar nanoseconds
+ * @kt:		the ktime_t variable to convert
+ *
+ * Returns the scalar nanoseconds representation of kt
+ */
+static inline u64 ktime_to_ns(const ktime_t kt)
+{
+	return (u64) kt.tv.sec * NSEC_PER_SEC + kt.tv.nsec;
+}
+
+#endif
+
+#endif
-- 
cgit v1.2.3-71-gd317


From c0a3132963db68f1fbbd0e316b73de100fee3f08 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jan 2006 20:52:32 -0800
Subject: [PATCH] hrtimer: hrtimer core code

hrtimer subsystem core.  It is initialized at bootup and expired by the timer
interrupt, but is otherwise not utilized by any other subsystem yet.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hrtimer.h | 130 +++++++++
 include/linux/ktime.h   |  15 ++
 init/main.c             |   1 +
 kernel/Makefile         |   3 +-
 kernel/hrtimer.c        | 679 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/timer.c          |   1 +
 6 files changed, 828 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/hrtimer.h
 create mode 100644 kernel/hrtimer.c

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
new file mode 100644
index 000000000000..64f8d554fbb8
--- /dev/null
+++ b/include/linux/hrtimer.h
@@ -0,0 +1,130 @@
+/*
+ *  include/linux/hrtimer.h
+ *
+ *  hrtimers - High-resolution kernel timers
+ *
+ *   Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
+ *   Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
+ *
+ *  data type definitions, declarations, prototypes
+ *
+ *  Started by: Thomas Gleixner and Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+#ifndef _LINUX_HRTIMER_H
+#define _LINUX_HRTIMER_H
+
+#include <linux/rbtree.h>
+#include <linux/ktime.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+
+/*
+ * Mode arguments of xxx_hrtimer functions:
+ */
+enum hrtimer_mode {
+	HRTIMER_ABS,	/* Time value is absolute */
+	HRTIMER_REL,	/* Time value is relative to now */
+};
+
+enum hrtimer_restart {
+	HRTIMER_NORESTART,
+	HRTIMER_RESTART,
+};
+
+/*
+ * Timer states:
+ */
+enum hrtimer_state {
+	HRTIMER_INACTIVE,	/* Timer is inactive */
+	HRTIMER_EXPIRED,		/* Timer is expired */
+	HRTIMER_PENDING,		/* Timer is pending */
+};
+
+struct hrtimer_base;
+
+/**
+ * struct hrtimer - the basic hrtimer structure
+ *
+ * @node:	red black tree node for time ordered insertion
+ * @list:	list head for easier access to the time ordered list,
+ *		without walking the red black tree.
+ * @expires:	the absolute expiry time in the hrtimers internal
+ *		representation. The time is related to the clock on
+ *		which the timer is based.
+ * @state:	state of the timer
+ * @function:	timer expiry callback function
+ * @data:	argument for the callback function
+ * @base:	pointer to the timer base (per cpu and per clock)
+ *
+ * The hrtimer structure must be initialized by init_hrtimer_#CLOCKTYPE()
+ */
+struct hrtimer {
+	struct rb_node		node;
+	struct list_head	list;
+	ktime_t			expires;
+	enum hrtimer_state	state;
+	int			(*function)(void *);
+	void			*data;
+	struct hrtimer_base	*base;
+};
+
+/**
+ * struct hrtimer_base - the timer base for a specific clock
+ *
+ * @index:	clock type index for per_cpu support when moving a timer
+ *		to a base on another cpu.
+ * @lock:	lock protecting the base and associated timers
+ * @active:	red black tree root node for the active timers
+ * @pending:	list of pending timers for simple time ordered access
+ * @resolution:	the resolution of the clock, in nanoseconds
+ * @get_time:	function to retrieve the current time of the clock
+ * @curr_timer:	the timer which is executing a callback right now
+ */
+struct hrtimer_base {
+	clockid_t		index;
+	spinlock_t		lock;
+	struct rb_root		active;
+	struct list_head	pending;
+	unsigned long		resolution;
+	ktime_t			(*get_time)(void);
+	struct hrtimer		*curr_timer;
+};
+
+/* Exported timer functions: */
+
+/* Initialize timers: */
+extern void hrtimer_init(struct hrtimer *timer, const clockid_t which_clock);
+extern void hrtimer_rebase(struct hrtimer *timer, const clockid_t which_clock);
+
+
+/* Basic timer operations: */
+extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
+			 const enum hrtimer_mode mode);
+extern int hrtimer_cancel(struct hrtimer *timer);
+extern int hrtimer_try_to_cancel(struct hrtimer *timer);
+
+#define hrtimer_restart(timer) hrtimer_start((timer), (timer)->expires, HRTIMER_ABS)
+
+/* Query timers: */
+extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
+extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp);
+
+static inline int hrtimer_active(const struct hrtimer *timer)
+{
+	return timer->state == HRTIMER_PENDING;
+}
+
+/* Forward a hrtimer so it expires after now: */
+extern unsigned long hrtimer_forward(struct hrtimer *timer,
+				     const ktime_t interval);
+
+/* Soft interrupt function to run the hrtimer queues: */
+extern void hrtimer_run_queues(void);
+
+/* Bootup initialization: */
+extern void __init hrtimers_init(void);
+
+#endif
diff --git a/include/linux/ktime.h b/include/linux/ktime.h
index 5b9a9eb82baa..222a047cc145 100644
--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@@ -266,4 +266,19 @@ static inline u64 ktime_to_ns(const ktime_t kt)
 
 #endif
 
+/*
+ * The resolution of the clocks. The resolution value is returned in
+ * the clock_getres() system call to give application programmers an
+ * idea of the (in)accuracy of timers. Timer values are rounded up to
+ * this resolution values.
+ */
+#define KTIME_REALTIME_RES	(NSEC_PER_SEC/HZ)
+#define KTIME_MONOTONIC_RES	(NSEC_PER_SEC/HZ)
+
+/* Get the monotonic time in timespec format: */
+extern void ktime_get_ts(struct timespec *ts);
+
+/* Get the real (wall-) time in timespec format: */
+#define ktime_get_real_ts(ts)	getnstimeofday(ts)
+
 #endif
diff --git a/init/main.c b/init/main.c
index 8342c2890b16..e092b1979a90 100644
--- a/init/main.c
+++ b/init/main.c
@@ -485,6 +485,7 @@ asmlinkage void __init start_kernel(void)
 	init_IRQ();
 	pidhash_init();
 	init_timers();
+	hrtimers_init();
 	softirq_init();
 	time_init();
 
diff --git a/kernel/Makefile b/kernel/Makefile
index 1e039700c0ad..355126606d1b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,8 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    sysctl.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o intermodule.o extable.o params.o posix-timers.o \
-	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o
+	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
+	    hrtimer.o
 
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_FUTEX) += futex.o
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
new file mode 100644
index 000000000000..690efd9d9adf
--- /dev/null
+++ b/kernel/hrtimer.c
@@ -0,0 +1,679 @@
+/*
+ *  linux/kernel/hrtimer.c
+ *
+ *  Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
+ *
+ *  High-resolution kernel timers
+ *
+ *  In contrast to the low-resolution timeout API implemented in
+ *  kernel/timer.c, hrtimers provide finer resolution and accuracy
+ *  depending on system configuration and capabilities.
+ *
+ *  These timers are currently used for:
+ *   - itimers
+ *   - POSIX timers
+ *   - nanosleep
+ *   - precise in-kernel timing
+ *
+ *  Started by: Thomas Gleixner and Ingo Molnar
+ *
+ *  Credits:
+ *	based on kernel/timer.c
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/hrtimer.h>
+#include <linux/notifier.h>
+#include <linux/syscalls.h>
+#include <linux/interrupt.h>
+
+#include <asm/uaccess.h>
+
+/**
+ * ktime_get - get the monotonic time in ktime_t format
+ *
+ * returns the time in ktime_t format
+ */
+static ktime_t ktime_get(void)
+{
+	struct timespec now;
+
+	ktime_get_ts(&now);
+
+	return timespec_to_ktime(now);
+}
+
+/**
+ * ktime_get_real - get the real (wall-) time in ktime_t format
+ *
+ * returns the time in ktime_t format
+ */
+static ktime_t ktime_get_real(void)
+{
+	struct timespec now;
+
+	getnstimeofday(&now);
+
+	return timespec_to_ktime(now);
+}
+
+EXPORT_SYMBOL_GPL(ktime_get_real);
+
+/*
+ * The timer bases:
+ */
+
+#define MAX_HRTIMER_BASES 2
+
+static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
+{
+	{
+		.index = CLOCK_REALTIME,
+		.get_time = &ktime_get_real,
+		.resolution = KTIME_REALTIME_RES,
+	},
+	{
+		.index = CLOCK_MONOTONIC,
+		.get_time = &ktime_get,
+		.resolution = KTIME_MONOTONIC_RES,
+	},
+};
+
+/**
+ * ktime_get_ts - get the monotonic clock in timespec format
+ *
+ * @ts:		pointer to timespec variable
+ *
+ * The function calculates the monotonic clock from the realtime
+ * clock and the wall_to_monotonic offset and stores the result
+ * in normalized timespec format in the variable pointed to by ts.
+ */
+void ktime_get_ts(struct timespec *ts)
+{
+	struct timespec tomono;
+	unsigned long seq;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		getnstimeofday(ts);
+		tomono = wall_to_monotonic;
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
+				ts->tv_nsec + tomono.tv_nsec);
+}
+
+/*
+ * Functions and macros which are different for UP/SMP systems are kept in a
+ * single place
+ */
+#ifdef CONFIG_SMP
+
+#define set_curr_timer(b, t)		do { (b)->curr_timer = (t); } while (0)
+
+/*
+ * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
+ * means that all timers which are tied to this base via timer->base are
+ * locked, and the base itself is locked too.
+ *
+ * So __run_timers/migrate_timers can safely modify all timers which could
+ * be found on the lists/queues.
+ *
+ * When the timer's base is locked, and the timer removed from list, it is
+ * possible to set timer->base = NULL and drop the lock: the timer remains
+ * locked.
+ */
+static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer,
+					      unsigned long *flags)
+{
+	struct hrtimer_base *base;
+
+	for (;;) {
+		base = timer->base;
+		if (likely(base != NULL)) {
+			spin_lock_irqsave(&base->lock, *flags);
+			if (likely(base == timer->base))
+				return base;
+			/* The timer has migrated to another CPU: */
+			spin_unlock_irqrestore(&base->lock, *flags);
+		}
+		cpu_relax();
+	}
+}
+
+/*
+ * Switch the timer base to the current CPU when possible.
+ */
+static inline struct hrtimer_base *
+switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
+{
+	struct hrtimer_base *new_base;
+
+	new_base = &__get_cpu_var(hrtimer_bases[base->index]);
+
+	if (base != new_base) {
+		/*
+		 * We are trying to schedule the timer on the local CPU.
+		 * However we can't change timer's base while it is running,
+		 * so we keep it on the same CPU. No hassle vs. reprogramming
+		 * the event source in the high resolution case. The softirq
+		 * code will take care of this when the timer function has
+		 * completed. There is no conflict as we hold the lock until
+		 * the timer is enqueued.
+		 */
+		if (unlikely(base->curr_timer == timer))
+			return base;
+
+		/* See the comment in lock_timer_base() */
+		timer->base = NULL;
+		spin_unlock(&base->lock);
+		spin_lock(&new_base->lock);
+		timer->base = new_base;
+	}
+	return new_base;
+}
+
+#else /* CONFIG_SMP */
+
+#define set_curr_timer(b, t)		do { } while (0)
+
+static inline struct hrtimer_base *
+lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+{
+	struct hrtimer_base *base = timer->base;
+
+	spin_lock_irqsave(&base->lock, *flags);
+
+	return base;
+}
+
+#define switch_hrtimer_base(t, b)	(b)
+
+#endif	/* !CONFIG_SMP */
+
+/*
+ * Functions for the union type storage format of ktime_t which are
+ * too large for inlining:
+ */
+#if BITS_PER_LONG < 64
+# ifndef CONFIG_KTIME_SCALAR
+/**
+ * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
+ *
+ * @kt:		addend
+ * @nsec:	the scalar nsec value to add
+ *
+ * Returns the sum of kt and nsec in ktime_t format
+ */
+ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
+{
+	ktime_t tmp;
+
+	if (likely(nsec < NSEC_PER_SEC)) {
+		tmp.tv64 = nsec;
+	} else {
+		unsigned long rem = do_div(nsec, NSEC_PER_SEC);
+
+		tmp = ktime_set((long)nsec, rem);
+	}
+
+	return ktime_add(kt, tmp);
+}
+
+#else /* CONFIG_KTIME_SCALAR */
+
+# endif /* !CONFIG_KTIME_SCALAR */
+
+/*
+ * Divide a ktime value by a nanosecond value
+ */
+static unsigned long ktime_divns(const ktime_t kt, nsec_t div)
+{
+	u64 dclc, inc, dns;
+	int sft = 0;
+
+	dclc = dns = ktime_to_ns(kt);
+	inc = div;
+	/* Make sure the divisor is less than 2^32: */
+	while (div >> 32) {
+		sft++;
+		div >>= 1;
+	}
+	dclc >>= sft;
+	do_div(dclc, (unsigned long) div);
+
+	return (unsigned long) dclc;
+}
+
+#else /* BITS_PER_LONG < 64 */
+# define ktime_divns(kt, div)		(unsigned long)((kt).tv64 / (div))
+#endif /* BITS_PER_LONG >= 64 */
+
+/*
+ * Counterpart to lock_timer_base above:
+ */
+static inline
+void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+{
+	spin_unlock_irqrestore(&timer->base->lock, *flags);
+}
+
+/**
+ * hrtimer_forward - forward the timer expiry
+ *
+ * @timer:	hrtimer to forward
+ * @interval:	the interval to forward
+ *
+ * Forward the timer expiry so it will expire in the future.
+ * The number of overruns is added to the overrun field.
+ */
+unsigned long
+hrtimer_forward(struct hrtimer *timer, const ktime_t interval)
+{
+	unsigned long orun = 1;
+	ktime_t delta, now;
+
+	now = timer->base->get_time();
+
+	delta = ktime_sub(now, timer->expires);
+
+	if (delta.tv64 < 0)
+		return 0;
+
+	if (unlikely(delta.tv64 >= interval.tv64)) {
+		nsec_t incr = ktime_to_ns(interval);
+
+		orun = ktime_divns(delta, incr);
+		timer->expires = ktime_add_ns(timer->expires, incr * orun);
+		if (timer->expires.tv64 > now.tv64)
+			return orun;
+		/*
+		 * This (and the ktime_add() below) is the
+		 * correction for exact:
+		 */
+		orun++;
+	}
+	timer->expires = ktime_add(timer->expires, interval);
+
+	return orun;
+}
+
+/*
+ * enqueue_hrtimer - internal function to (re)start a timer
+ *
+ * The timer is inserted in expiry order. Insertion into the
+ * red black tree is O(log(n)). Must hold the base lock.
+ */
+static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
+{
+	struct rb_node **link = &base->active.rb_node;
+	struct list_head *prev = &base->pending;
+	struct rb_node *parent = NULL;
+	struct hrtimer *entry;
+
+	/*
+	 * Find the right place in the rbtree:
+	 */
+	while (*link) {
+		parent = *link;
+		entry = rb_entry(parent, struct hrtimer, node);
+		/*
+		 * We dont care about collisions. Nodes with
+		 * the same expiry time stay together.
+		 */
+		if (timer->expires.tv64 < entry->expires.tv64)
+			link = &(*link)->rb_left;
+		else {
+			link = &(*link)->rb_right;
+			prev = &entry->list;
+		}
+	}
+
+	/*
+	 * Insert the timer to the rbtree and to the sorted list:
+	 */
+	rb_link_node(&timer->node, parent, link);
+	rb_insert_color(&timer->node, &base->active);
+	list_add(&timer->list, prev);
+
+	timer->state = HRTIMER_PENDING;
+}
+
+
+/*
+ * __remove_hrtimer - internal function to remove a timer
+ *
+ * Caller must hold the base lock.
+ */
+static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
+{
+	/*
+	 * Remove the timer from the sorted list and from the rbtree:
+	 */
+	list_del(&timer->list);
+	rb_erase(&timer->node, &base->active);
+}
+
+/*
+ * remove hrtimer, called with base lock held
+ */
+static inline int
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
+{
+	if (hrtimer_active(timer)) {
+		__remove_hrtimer(timer, base);
+		timer->state = HRTIMER_INACTIVE;
+		return 1;
+	}
+	return 0;
+}
+
+/**
+ * hrtimer_start - (re)start an relative timer on the current CPU
+ *
+ * @timer:	the timer to be added
+ * @tim:	expiry time
+ * @mode:	expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ *
+ * Returns:
+ *  0 on success
+ *  1 when the timer was active
+ */
+int
+hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+{
+	struct hrtimer_base *base, *new_base;
+	unsigned long flags;
+	int ret;
+
+	base = lock_hrtimer_base(timer, &flags);
+
+	/* Remove an active timer from the queue: */
+	ret = remove_hrtimer(timer, base);
+
+	/* Switch the timer base, if necessary: */
+	new_base = switch_hrtimer_base(timer, base);
+
+	if (mode == HRTIMER_REL)
+		tim = ktime_add(tim, new_base->get_time());
+	timer->expires = tim;
+
+	enqueue_hrtimer(timer, new_base);
+
+	unlock_hrtimer_base(timer, &flags);
+
+	return ret;
+}
+
+/**
+ * hrtimer_try_to_cancel - try to deactivate a timer
+ *
+ * @timer:	hrtimer to stop
+ *
+ * Returns:
+ *  0 when the timer was not active
+ *  1 when the timer was active
+ * -1 when the timer is currently excuting the callback function and
+ *    can not be stopped
+ */
+int hrtimer_try_to_cancel(struct hrtimer *timer)
+{
+	struct hrtimer_base *base;
+	unsigned long flags;
+	int ret = -1;
+
+	base = lock_hrtimer_base(timer, &flags);
+
+	if (base->curr_timer != timer)
+		ret = remove_hrtimer(timer, base);
+
+	unlock_hrtimer_base(timer, &flags);
+
+	return ret;
+
+}
+
+/**
+ * hrtimer_cancel - cancel a timer and wait for the handler to finish.
+ *
+ * @timer:	the timer to be cancelled
+ *
+ * Returns:
+ *  0 when the timer was not active
+ *  1 when the timer was active
+ */
+int hrtimer_cancel(struct hrtimer *timer)
+{
+	for (;;) {
+		int ret = hrtimer_try_to_cancel(timer);
+
+		if (ret >= 0)
+			return ret;
+	}
+}
+
+/**
+ * hrtimer_get_remaining - get remaining time for the timer
+ *
+ * @timer:	the timer to read
+ */
+ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
+{
+	struct hrtimer_base *base;
+	unsigned long flags;
+	ktime_t rem;
+
+	base = lock_hrtimer_base(timer, &flags);
+	rem = ktime_sub(timer->expires, timer->base->get_time());
+	unlock_hrtimer_base(timer, &flags);
+
+	return rem;
+}
+
+/**
+ * hrtimer_rebase - rebase an initialized hrtimer to a different base
+ *
+ * @timer:	the timer to be rebased
+ * @clock_id:	the clock to be used
+ */
+void hrtimer_rebase(struct hrtimer *timer, const clockid_t clock_id)
+{
+	struct hrtimer_base *bases;
+
+	bases = per_cpu(hrtimer_bases, raw_smp_processor_id());
+	timer->base = &bases[clock_id];
+}
+
+/**
+ * hrtimer_init - initialize a timer to the given clock
+ *
+ * @timer:	the timer to be initialized
+ * @clock_id:	the clock to be used
+ */
+void hrtimer_init(struct hrtimer *timer, const clockid_t clock_id)
+{
+	memset(timer, 0, sizeof(struct hrtimer));
+	hrtimer_rebase(timer, clock_id);
+}
+
+/**
+ * hrtimer_get_res - get the timer resolution for a clock
+ *
+ * @which_clock: which clock to query
+ * @tp:		 pointer to timespec variable to store the resolution
+ *
+ * Store the resolution of the clock selected by which_clock in the
+ * variable pointed to by tp.
+ */
+int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
+{
+	struct hrtimer_base *bases;
+
+	tp->tv_sec = 0;
+	bases = per_cpu(hrtimer_bases, raw_smp_processor_id());
+	tp->tv_nsec = bases[which_clock].resolution;
+
+	return 0;
+}
+
+/*
+ * Expire the per base hrtimer-queue:
+ */
+static inline void run_hrtimer_queue(struct hrtimer_base *base)
+{
+	ktime_t now = base->get_time();
+
+	spin_lock_irq(&base->lock);
+
+	while (!list_empty(&base->pending)) {
+		struct hrtimer *timer;
+		int (*fn)(void *);
+		int restart;
+		void *data;
+
+		timer = list_entry(base->pending.next, struct hrtimer, list);
+		if (now.tv64 <= timer->expires.tv64)
+			break;
+
+		fn = timer->function;
+		data = timer->data;
+		set_curr_timer(base, timer);
+		__remove_hrtimer(timer, base);
+		spin_unlock_irq(&base->lock);
+
+		/*
+		 * fn == NULL is special case for the simplest timer
+		 * variant - wake up process and do not restart:
+		 */
+		if (!fn) {
+			wake_up_process(data);
+			restart = HRTIMER_NORESTART;
+		} else
+			restart = fn(data);
+
+		spin_lock_irq(&base->lock);
+
+		if (restart == HRTIMER_RESTART)
+			enqueue_hrtimer(timer, base);
+		else
+			timer->state = HRTIMER_EXPIRED;
+	}
+	set_curr_timer(base, NULL);
+	spin_unlock_irq(&base->lock);
+}
+
+/*
+ * Called from timer softirq every jiffy, expire hrtimers:
+ */
+void hrtimer_run_queues(void)
+{
+	struct hrtimer_base *base = __get_cpu_var(hrtimer_bases);
+	int i;
+
+	for (i = 0; i < MAX_HRTIMER_BASES; i++)
+		run_hrtimer_queue(&base[i]);
+}
+
+/*
+ * Functions related to boot-time initialization:
+ */
+static void __devinit init_hrtimers_cpu(int cpu)
+{
+	struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu);
+	int i;
+
+	for (i = 0; i < MAX_HRTIMER_BASES; i++) {
+		spin_lock_init(&base->lock);
+		INIT_LIST_HEAD(&base->pending);
+		base++;
+	}
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void migrate_hrtimer_list(struct hrtimer_base *old_base,
+				struct hrtimer_base *new_base)
+{
+	struct hrtimer *timer;
+	struct rb_node *node;
+
+	while ((node = rb_first(&old_base->active))) {
+		timer = rb_entry(node, struct hrtimer, node);
+		__remove_hrtimer(timer, old_base);
+		timer->base = new_base;
+		enqueue_hrtimer(timer, new_base);
+	}
+}
+
+static void migrate_hrtimers(int cpu)
+{
+	struct hrtimer_base *old_base, *new_base;
+	int i;
+
+	BUG_ON(cpu_online(cpu));
+	old_base = per_cpu(hrtimer_bases, cpu);
+	new_base = get_cpu_var(hrtimer_bases);
+
+	local_irq_disable();
+
+	for (i = 0; i < MAX_HRTIMER_BASES; i++) {
+
+		spin_lock(&new_base->lock);
+		spin_lock(&old_base->lock);
+
+		BUG_ON(old_base->curr_timer);
+
+		migrate_hrtimer_list(old_base, new_base);
+
+		spin_unlock(&old_base->lock);
+		spin_unlock(&new_base->lock);
+		old_base++;
+		new_base++;
+	}
+
+	local_irq_enable();
+	put_cpu_var(hrtimer_bases);
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
+					unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;
+
+	switch (action) {
+
+	case CPU_UP_PREPARE:
+		init_hrtimers_cpu(cpu);
+		break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+		migrate_hrtimers(cpu);
+		break;
+#endif
+
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata hrtimers_nb = {
+	.notifier_call = hrtimer_cpu_notify,
+};
+
+void __init hrtimers_init(void)
+{
+	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
+			  (void *)(long)smp_processor_id());
+	register_cpu_notifier(&hrtimers_nb);
+}
+
diff --git a/kernel/timer.c b/kernel/timer.c
index 074b4bd5cfd8..80bf2acf6b08 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -858,6 +858,7 @@ static void run_timer_softirq(struct softirq_action *h)
 {
 	tvec_base_t *base = &__get_cpu_var(tvec_bases);
 
+ 	hrtimer_run_queues();
 	if (time_after_eq(jiffies, base->timer_jiffies))
 		__run_timers(base);
 }
-- 
cgit v1.2.3-71-gd317


From 2ff678b8da6478d861c1b0ecb3ac14575760e906 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jan 2006 20:52:34 -0800
Subject: [PATCH] hrtimer: switch itimers to hrtimer

switch itimers to a hrtimers-based implementation

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c             |   6 +--
 fs/proc/array.c       |   6 +--
 include/linux/sched.h |   5 ++-
 include/linux/timer.h |   2 +-
 kernel/exit.c         |   2 +-
 kernel/fork.c         |   6 +--
 kernel/itimer.c       | 106 ++++++++++++++++++++++++--------------------------
 7 files changed, 65 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index fd02ea4a81e9..b5bcf1aae0ab 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -632,10 +632,10 @@ static inline int de_thread(struct task_struct *tsk)
 		 * synchronize with any firing (by calling del_timer_sync)
 		 * before we can safely let the old group leader die.
 		 */
-		sig->real_timer.data = (unsigned long)current;
+		sig->real_timer.data = current;
 		spin_unlock_irq(lock);
-		if (del_timer_sync(&sig->real_timer))
-			add_timer(&sig->real_timer);
+		if (hrtimer_cancel(&sig->real_timer))
+			hrtimer_restart(&sig->real_timer);
 		spin_lock_irq(lock);
 	}
 	while (atomic_read(&sig->count) > count) {
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 5e9251f65317..7eb1bd7f800c 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -330,7 +330,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
 	unsigned long  min_flt = 0,  maj_flt = 0;
 	cputime_t cutime, cstime, utime, stime;
 	unsigned long rsslim = 0;
-	unsigned long it_real_value = 0;
+	DEFINE_KTIME(it_real_value);
 	struct task_struct *t;
 	char tcomm[sizeof(task->comm)];
 
@@ -386,7 +386,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
 			utime = cputime_add(utime, task->signal->utime);
 			stime = cputime_add(stime, task->signal->stime);
 		}
-		it_real_value = task->signal->it_real_value;
+		it_real_value = task->signal->real_timer.expires;
 	}
 	ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0;
 	read_unlock(&tasklist_lock);
@@ -435,7 +435,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
 		priority,
 		nice,
 		num_threads,
-		jiffies_to_clock_t(it_real_value),
+		(long) ktime_to_clock_t(it_real_value),
 		start_time,
 		vsize,
 		mm ? get_mm_rss(mm) : 0,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 85b53f87c703..ee4677ad204e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -105,6 +105,7 @@ extern unsigned long nr_iowait(void);
 #include <linux/param.h>
 #include <linux/resource.h>
 #include <linux/timer.h>
+#include <linux/hrtimer.h>
 
 #include <asm/processor.h>
 
@@ -398,8 +399,8 @@ struct signal_struct {
 	struct list_head posix_timers;
 
 	/* ITIMER_REAL timer for the process */
-	struct timer_list real_timer;
-	unsigned long it_real_value, it_real_incr;
+	struct hrtimer real_timer;
+	ktime_t it_real_incr;
 
 	/* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */
 	cputime_t it_prof_expires, it_virt_expires;
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 72f3a7781106..9b9877fd2505 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -96,6 +96,6 @@ static inline void add_timer(struct timer_list *timer)
 
 extern void init_timers(void);
 extern void run_local_timers(void);
-extern void it_real_fn(unsigned long);
+extern int it_real_fn(void *);
 
 #endif
diff --git a/kernel/exit.c b/kernel/exit.c
index 309a46fa16f8..e75a51f33768 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -842,7 +842,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	}
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
- 		del_timer_sync(&tsk->signal->real_timer);
+ 		hrtimer_cancel(&tsk->signal->real_timer);
 		exit_itimers(tsk->signal);
 		acct_process(code);
 	}
diff --git a/kernel/fork.c b/kernel/fork.c
index b18d64554feb..3bdcab49998d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -801,10 +801,10 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	init_sigpending(&sig->shared_pending);
 	INIT_LIST_HEAD(&sig->posix_timers);
 
-	sig->it_real_value = sig->it_real_incr = 0;
+	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC);
+	sig->it_real_incr.tv64 = 0;
 	sig->real_timer.function = it_real_fn;
-	sig->real_timer.data = (unsigned long) tsk;
-	init_timer(&sig->real_timer);
+	sig->real_timer.data = tsk;
 
 	sig->it_virt_expires = cputime_zero;
 	sig->it_virt_incr = cputime_zero;
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 7c1b25e25e47..c2c05c4ff28d 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -12,36 +12,46 @@
 #include <linux/syscalls.h>
 #include <linux/time.h>
 #include <linux/posix-timers.h>
+#include <linux/hrtimer.h>
 
 #include <asm/uaccess.h>
 
-static unsigned long it_real_value(struct signal_struct *sig)
+/**
+ * itimer_get_remtime - get remaining time for the timer
+ *
+ * @timer: the timer to read
+ *
+ * Returns the delta between the expiry time and now, which can be
+ * less than zero or 1usec for an pending expired timer
+ */
+static struct timeval itimer_get_remtime(struct hrtimer *timer)
 {
-	unsigned long val = 0;
-	if (timer_pending(&sig->real_timer)) {
-		val = sig->real_timer.expires - jiffies;
+	ktime_t rem = hrtimer_get_remaining(timer);
 
-		/* look out for negative/zero itimer.. */
-		if ((long) val <= 0)
-			val = 1;
-	}
-	return val;
+	/*
+	 * Racy but safe: if the itimer expires after the above
+	 * hrtimer_get_remtime() call but before this condition
+	 * then we return 0 - which is correct.
+	 */
+	if (hrtimer_active(timer)) {
+		if (rem.tv64 <= 0)
+			rem.tv64 = NSEC_PER_USEC;
+	} else
+		rem.tv64 = 0;
+
+	return ktime_to_timeval(rem);
 }
 
 int do_getitimer(int which, struct itimerval *value)
 {
 	struct task_struct *tsk = current;
-	unsigned long interval, val;
 	cputime_t cinterval, cval;
 
 	switch (which) {
 	case ITIMER_REAL:
-		spin_lock_irq(&tsk->sighand->siglock);
-		interval = tsk->signal->it_real_incr;
-		val = it_real_value(tsk->signal);
-		spin_unlock_irq(&tsk->sighand->siglock);
-		jiffies_to_timeval(val, &value->it_value);
-		jiffies_to_timeval(interval, &value->it_interval);
+		value->it_value = itimer_get_remtime(&tsk->signal->real_timer);
+		value->it_interval =
+			ktime_to_timeval(tsk->signal->it_real_incr);
 		break;
 	case ITIMER_VIRTUAL:
 		read_lock(&tasklist_lock);
@@ -113,59 +123,45 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
 }
 
 
-void it_real_fn(unsigned long __data)
+/*
+ * The timer is automagically restarted, when interval != 0
+ */
+int it_real_fn(void *data)
 {
-	struct task_struct * p = (struct task_struct *) __data;
-	unsigned long inc = p->signal->it_real_incr;
+	struct task_struct *tsk = (struct task_struct *) data;
 
-	send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p);
+	send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk);
 
-	/*
-	 * Now restart the timer if necessary.  We don't need any locking
-	 * here because do_setitimer makes sure we have finished running
-	 * before it touches anything.
-	 * Note, we KNOW we are (or should be) at a jiffie edge here so
-	 * we don't need the +1 stuff.  Also, we want to use the prior
-	 * expire value so as to not "slip" a jiffie if we are late.
-	 * Deal with requesting a time prior to "now" here rather than
-	 * in add_timer.
-	 */
-	if (!inc)
-		return;
-	while (time_before_eq(p->signal->real_timer.expires, jiffies))
-		p->signal->real_timer.expires += inc;
-	add_timer(&p->signal->real_timer);
+	if (tsk->signal->it_real_incr.tv64 != 0) {
+		hrtimer_forward(&tsk->signal->real_timer,
+			       tsk->signal->it_real_incr);
+
+		return HRTIMER_RESTART;
+	}
+	return HRTIMER_NORESTART;
 }
 
 int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
 {
 	struct task_struct *tsk = current;
- 	unsigned long val, interval, expires;
+	struct hrtimer *timer;
+	ktime_t expires;
 	cputime_t cval, cinterval, nval, ninterval;
 
 	switch (which) {
 	case ITIMER_REAL:
-again:
-		spin_lock_irq(&tsk->sighand->siglock);
-		interval = tsk->signal->it_real_incr;
-		val = it_real_value(tsk->signal);
-		/* We are sharing ->siglock with it_real_fn() */
-		if (try_to_del_timer_sync(&tsk->signal->real_timer) < 0) {
-			spin_unlock_irq(&tsk->sighand->siglock);
-			goto again;
-		}
-		tsk->signal->it_real_incr =
-			timeval_to_jiffies(&value->it_interval);
-		expires = timeval_to_jiffies(&value->it_value);
-		if (expires)
-			mod_timer(&tsk->signal->real_timer,
-				  jiffies + 1 + expires);
-		spin_unlock_irq(&tsk->sighand->siglock);
+		timer = &tsk->signal->real_timer;
+		hrtimer_cancel(timer);
 		if (ovalue) {
-			jiffies_to_timeval(val, &ovalue->it_value);
-			jiffies_to_timeval(interval,
-					   &ovalue->it_interval);
+			ovalue->it_value = itimer_get_remtime(timer);
+			ovalue->it_interval
+				= ktime_to_timeval(tsk->signal->it_real_incr);
 		}
+		tsk->signal->it_real_incr =
+			timeval_to_ktime(value->it_interval);
+		expires = timeval_to_ktime(value->it_value);
+		if (expires.tv64 != 0)
+			hrtimer_start(timer, expires, HRTIMER_REL);
 		break;
 	case ITIMER_VIRTUAL:
 		nval = timeval_to_cputime(&value->it_value);
-- 
cgit v1.2.3-71-gd317


From 10c94ec16dd187f8d8dfdbb088e98330c05bf03c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jan 2006 20:52:35 -0800
Subject: [PATCH] hrtimer: create hrtimer nanosleep API

introduce the hrtimer_nanosleep() and hrtimer_nanosleep_real() APIs.  Not yet
used by any code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hrtimer.h |   6 +++
 kernel/hrtimer.c        | 127 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 64f8d554fbb8..2ac20b48b2f3 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -121,6 +121,12 @@ static inline int hrtimer_active(const struct hrtimer *timer)
 extern unsigned long hrtimer_forward(struct hrtimer *timer,
 				     const ktime_t interval);
 
+/* Precise sleep: */
+extern long hrtimer_nanosleep(struct timespec *rqtp,
+			      struct timespec __user *rmtp,
+			      const enum hrtimer_mode mode,
+			      const clockid_t clockid);
+
 /* Soft interrupt function to run the hrtimer queues: */
 extern void hrtimer_run_queues(void);
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 690efd9d9adf..64d37a3c5948 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -580,6 +580,133 @@ void hrtimer_run_queues(void)
 		run_hrtimer_queue(&base[i]);
 }
 
+/*
+ * Sleep related functions:
+ */
+
+/**
+ * schedule_hrtimer - sleep until timeout
+ *
+ * @timer:	hrtimer variable initialized with the correct clock base
+ * @mode:	timeout value is abs/rel
+ *
+ * Make the current task sleep until @timeout is
+ * elapsed.
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout is guaranteed to
+ * pass before the routine returns. The routine will return 0
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task. In this case the remaining time
+ * will be returned
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ */
+static ktime_t __sched
+schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode)
+{
+	/* fn stays NULL, meaning single-shot wakeup: */
+	timer->data = current;
+
+	hrtimer_start(timer, timer->expires, mode);
+
+	schedule();
+	hrtimer_cancel(timer);
+
+	/* Return the remaining time: */
+	if (timer->state != HRTIMER_EXPIRED)
+		return ktime_sub(timer->expires, timer->base->get_time());
+	else
+		return (ktime_t) {.tv64 = 0 };
+}
+
+static inline ktime_t __sched
+schedule_hrtimer_interruptible(struct hrtimer *timer,
+			       const enum hrtimer_mode mode)
+{
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	return schedule_hrtimer(timer, mode);
+}
+
+static long __sched
+nanosleep_restart(struct restart_block *restart, clockid_t clockid)
+{
+	struct timespec __user *rmtp, tu;
+	void *rfn_save = restart->fn;
+	struct hrtimer timer;
+	ktime_t rem;
+
+	restart->fn = do_no_restart_syscall;
+
+	hrtimer_init(&timer, clockid);
+
+	timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
+
+	rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS);
+
+	if (rem.tv64 <= 0)
+		return 0;
+
+	rmtp = (struct timespec __user *) restart->arg2;
+	tu = ktime_to_timespec(rem);
+	if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
+		return -EFAULT;
+
+	restart->fn = rfn_save;
+
+	/* The other values in restart are already filled in */
+	return -ERESTART_RESTARTBLOCK;
+}
+
+static long __sched nanosleep_restart_mono(struct restart_block *restart)
+{
+	return nanosleep_restart(restart, CLOCK_MONOTONIC);
+}
+
+static long __sched nanosleep_restart_real(struct restart_block *restart)
+{
+	return nanosleep_restart(restart, CLOCK_REALTIME);
+}
+
+long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
+		       const enum hrtimer_mode mode, const clockid_t clockid)
+{
+	struct restart_block *restart;
+	struct hrtimer timer;
+	struct timespec tu;
+	ktime_t rem;
+
+	hrtimer_init(&timer, clockid);
+
+	timer.expires = timespec_to_ktime(*rqtp);
+
+	rem = schedule_hrtimer_interruptible(&timer, mode);
+	if (rem.tv64 <= 0)
+		return 0;
+
+	/* Absolute timers do not update the rmtp value: */
+	if (mode == HRTIMER_ABS)
+		return -ERESTARTNOHAND;
+
+	tu = ktime_to_timespec(rem);
+
+	if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
+		return -EFAULT;
+
+	restart = &current_thread_info()->restart_block;
+	restart->fn = (clockid == CLOCK_MONOTONIC) ?
+		nanosleep_restart_mono : nanosleep_restart_real;
+	restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF;
+	restart->arg1 = timer.expires.tv64 >> 32;
+	restart->arg2 = (unsigned long) rmtp;
+
+	return -ERESTART_RESTARTBLOCK;
+}
+
 /*
  * Functions related to boot-time initialization:
  */
-- 
cgit v1.2.3-71-gd317


From 97735f25d2ba898ec5e13746451525580631c834 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jan 2006 20:52:37 -0800
Subject: [PATCH] hrtimer: switch clock_nanosleep to hrtimer nanosleep API

Switch clock_nanosleep to use the new nanosleep functions in hrtimer.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/posix-timers.h |   7 +-
 kernel/posix-cpu-timers.c    |  23 ++++---
 kernel/posix-timers.c        | 151 ++++++++-----------------------------------
 3 files changed, 45 insertions(+), 136 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index ae51473d3d48..3c0a5beb7f0d 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -81,7 +81,7 @@ struct k_clock {
 	int (*clock_get) (const clockid_t which_clock, struct timespec * tp);
 	int (*timer_create) (struct k_itimer *timer);
 	int (*nsleep) (const clockid_t which_clock, int flags,
-		       struct timespec *);
+		       struct timespec *, struct timespec __user *);
 	int (*timer_set) (struct k_itimer * timr, int flags,
 			  struct itimerspec * new_setting,
 			  struct itimerspec * old_setting);
@@ -95,7 +95,8 @@ void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock);
 
 /* error handlers for timer_create, nanosleep and settime */
 int do_posix_clock_notimer_create(struct k_itimer *timer);
-int do_posix_clock_nonanosleep(const clockid_t, int flags, struct timespec *);
+int do_posix_clock_nonanosleep(const clockid_t, int flags, struct timespec *,
+			       struct timespec __user *);
 int do_posix_clock_nosettime(const clockid_t, struct timespec *tp);
 
 /* function to call to trigger timer event */
@@ -129,7 +130,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *ts);
 int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *ts);
 int posix_cpu_timer_create(struct k_itimer *timer);
 int posix_cpu_nsleep(const clockid_t which_clock, int flags,
-		     struct timespec *ts);
+		     struct timespec *rqtp, struct timespec __user *rmtp);
 int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 			struct itimerspec *new, struct itimerspec *old);
 int posix_cpu_timer_del(struct k_itimer *timer);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index abf6990c6eb5..520f6c59948d 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1400,7 +1400,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 static long posix_cpu_clock_nanosleep_restart(struct restart_block *);
 
 int posix_cpu_nsleep(const clockid_t which_clock, int flags,
-		     struct timespec *rqtp)
+		     struct timespec *rqtp, struct timespec __user *rmtp)
 {
 	struct restart_block *restart_block =
 	    &current_thread_info()->restart_block;
@@ -1425,7 +1425,6 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 	error = posix_cpu_timer_create(&timer);
 	timer.it_process = current;
 	if (!error) {
-		struct timespec __user *rmtp;
 		static struct itimerspec zero_it;
 		struct itimerspec it = { .it_value = *rqtp,
 					 .it_interval = {} };
@@ -1472,7 +1471,6 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 		/*
 		 * Report back to the user the time still remaining.
 		 */
-		rmtp = (struct timespec __user *) restart_block->arg1;
 		if (rmtp != NULL && !(flags & TIMER_ABSTIME) &&
 		    copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
 			return -EFAULT;
@@ -1480,6 +1478,7 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 		restart_block->fn = posix_cpu_clock_nanosleep_restart;
 		/* Caller already set restart_block->arg1 */
 		restart_block->arg0 = which_clock;
+		restart_block->arg1 = (unsigned long) rmtp;
 		restart_block->arg2 = rqtp->tv_sec;
 		restart_block->arg3 = rqtp->tv_nsec;
 
@@ -1493,10 +1492,15 @@ static long
 posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
 {
 	clockid_t which_clock = restart_block->arg0;
-	struct timespec t = { .tv_sec = restart_block->arg2,
-			      .tv_nsec = restart_block->arg3 };
+	struct timespec __user *rmtp;
+	struct timespec t;
+
+	rmtp = (struct timespec __user *) restart_block->arg1;
+	t.tv_sec = restart_block->arg2;
+	t.tv_nsec = restart_block->arg3;
+
 	restart_block->fn = do_no_restart_syscall;
-	return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t);
+	return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp);
 }
 
 
@@ -1519,9 +1523,10 @@ static int process_cpu_timer_create(struct k_itimer *timer)
 	return posix_cpu_timer_create(timer);
 }
 static int process_cpu_nsleep(const clockid_t which_clock, int flags,
-			      struct timespec *rqtp)
+			      struct timespec *rqtp,
+			      struct timespec __user *rmtp)
 {
-	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
+	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
 }
 static int thread_cpu_clock_getres(const clockid_t which_clock,
 				   struct timespec *tp)
@@ -1539,7 +1544,7 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
 	return posix_cpu_timer_create(timer);
 }
 static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
-			      struct timespec *rqtp)
+			      struct timespec *rqtp, struct timespec __user *rmtp)
 {
 	return -EINVAL;
 }
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 6b851a1bf4b0..ba900587b815 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -209,7 +209,8 @@ static inline int common_timer_create(struct k_itimer *new_timer)
 /*
  * These ones are defined below.
  */
-static int common_nsleep(const clockid_t, int flags, struct timespec *t);
+static int common_nsleep(const clockid_t, int flags, struct timespec *t,
+			 struct timespec __user *rmtp);
 static void common_timer_get(struct k_itimer *, struct itimerspec *);
 static int common_timer_set(struct k_itimer *, int,
 			    struct itimerspec *, struct itimerspec *);
@@ -1227,7 +1228,7 @@ int do_posix_clock_notimer_create(struct k_itimer *timer)
 EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create);
 
 int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
-			       struct timespec *t)
+			       struct timespec *t, struct timespec __user *r)
 {
 #ifndef ENOTSUP
 	return -EOPNOTSUPP;	/* aka ENOTSUP in userland for POSIX */
@@ -1387,7 +1388,28 @@ void clock_was_set(void)
 	up(&clock_was_set_lock);
 }
 
-long clock_nanosleep_restart(struct restart_block *restart_block);
+/*
+ * nanosleep for monotonic and realtime clocks
+ */
+static int common_nsleep(const clockid_t which_clock, int flags,
+			 struct timespec *tsave, struct timespec __user *rmtp)
+{
+	int mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL;
+	int clockid = which_clock;
+
+	switch (which_clock) {
+	case CLOCK_REALTIME:
+		/* Posix madness. Only absolute timers on clock realtime
+		   are affected by clock set. */
+		if (mode == HRTIMER_ABS)
+			clockid = CLOCK_MONOTONIC;
+	case CLOCK_MONOTONIC:
+		break;
+	default:
+		return -EINVAL;
+	}
+	return hrtimer_nanosleep(tsave, rmtp, mode, clockid);
+}
 
 asmlinkage long
 sys_clock_nanosleep(const clockid_t which_clock, int flags,
@@ -1395,9 +1417,6 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags,
 		    struct timespec __user *rmtp)
 {
 	struct timespec t;
-	struct restart_block *restart_block =
-	    &(current_thread_info()->restart_block);
-	int ret;
 
 	if (invalid_clockid(which_clock))
 		return -EINVAL;
@@ -1408,122 +1427,6 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags,
 	if (!timespec_valid(&t))
 		return -EINVAL;
 
-	/*
-	 * Do this here as nsleep function does not have the real address.
-	 */
-	restart_block->arg1 = (unsigned long)rmtp;
-
-	ret = CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t));
-
-	if ((ret == -ERESTART_RESTARTBLOCK) && rmtp &&
-					copy_to_user(rmtp, &t, sizeof (t)))
-		return -EFAULT;
-	return ret;
-}
-
-
-static int common_nsleep(const clockid_t which_clock,
-			 int flags, struct timespec *tsave)
-{
-	struct timespec t, dum;
-	DECLARE_WAITQUEUE(abs_wqueue, current);
-	u64 rq_time = (u64)0;
-	s64 left;
-	int abs;
-	struct restart_block *restart_block =
-	    &current_thread_info()->restart_block;
-
-	abs_wqueue.flags = 0;
-	abs = flags & TIMER_ABSTIME;
-
-	if (restart_block->fn == clock_nanosleep_restart) {
-		/*
-		 * Interrupted by a non-delivered signal, pick up remaining
-		 * time and continue.  Remaining time is in arg2 & 3.
-		 */
-		restart_block->fn = do_no_restart_syscall;
-
-		rq_time = restart_block->arg3;
-		rq_time = (rq_time << 32) + restart_block->arg2;
-		if (!rq_time)
-			return -EINTR;
-		left = rq_time - get_jiffies_64();
-		if (left <= (s64)0)
-			return 0;	/* Already passed */
-	}
-
-	if (abs && (posix_clocks[which_clock].clock_get !=
-			    posix_clocks[CLOCK_MONOTONIC].clock_get))
-		add_wait_queue(&nanosleep_abs_wqueue, &abs_wqueue);
-
-	do {
-		t = *tsave;
-		if (abs || !rq_time) {
-			adjust_abs_time(&posix_clocks[which_clock], &t, abs,
-					&rq_time, &dum);
-		}
-
-		left = rq_time - get_jiffies_64();
-		if (left >= (s64)MAX_JIFFY_OFFSET)
-			left = (s64)MAX_JIFFY_OFFSET;
-		if (left < (s64)0)
-			break;
-
-		schedule_timeout_interruptible(left);
-
-		left = rq_time - get_jiffies_64();
-	} while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING));
-
-	if (abs_wqueue.task_list.next)
-		finish_wait(&nanosleep_abs_wqueue, &abs_wqueue);
-
-	if (left > (s64)0) {
-
-		/*
-		 * Always restart abs calls from scratch to pick up any
-		 * clock shifting that happened while we are away.
-		 */
-		if (abs)
-			return -ERESTARTNOHAND;
-
-		left *= TICK_NSEC;
-		tsave->tv_sec = div_long_long_rem(left, 
-						  NSEC_PER_SEC, 
-						  &tsave->tv_nsec);
-		/*
-		 * Restart works by saving the time remaing in 
-		 * arg2 & 3 (it is 64-bits of jiffies).  The other
-		 * info we need is the clock_id (saved in arg0). 
-		 * The sys_call interface needs the users 
-		 * timespec return address which _it_ saves in arg1.
-		 * Since we have cast the nanosleep call to a clock_nanosleep
-		 * both can be restarted with the same code.
-		 */
-		restart_block->fn = clock_nanosleep_restart;
-		restart_block->arg0 = which_clock;
-		/*
-		 * Caller sets arg1
-		 */
-		restart_block->arg2 = rq_time & 0xffffffffLL;
-		restart_block->arg3 = rq_time >> 32;
-
-		return -ERESTART_RESTARTBLOCK;
-	}
-
-	return 0;
-}
-/*
- * This will restart clock_nanosleep.
- */
-long
-clock_nanosleep_restart(struct restart_block *restart_block)
-{
-	struct timespec t;
-	int ret = common_nsleep(restart_block->arg0, 0, &t);
-
-	if ((ret == -ERESTART_RESTARTBLOCK) && restart_block->arg1 &&
-	    copy_to_user((struct timespec __user *)(restart_block->arg1), &t,
-			 sizeof (t)))
-		return -EFAULT;
-	return ret;
+	return CLOCK_DISPATCH(which_clock, nsleep,
+			      (which_clock, flags, &t, rmtp));
 }
-- 
cgit v1.2.3-71-gd317


From becf8b5d00f4b47e847f98322cdaf8cd16243861 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jan 2006 20:52:38 -0800
Subject: [PATCH] hrtimer: convert posix timers completely

- convert posix-timers.c to use hrtimers

- remove the now obsolete abslist code

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/kernel/time.c        |   7 +-
 include/linux/hrtimer.h      |   7 +
 include/linux/posix-timers.h |  37 +--
 include/linux/time.h         |   3 +-
 kernel/posix-timers.c        | 717 ++++++++-----------------------------------
 5 files changed, 149 insertions(+), 622 deletions(-)

(limited to 'include/linux')

diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c
index 11f518a7e156..8fa2ae7f3026 100644
--- a/arch/um/kernel/time.c
+++ b/arch/um/kernel/time.c
@@ -99,7 +99,8 @@ void uml_idle_timer(void)
 	set_interval(ITIMER_REAL);
 }
 
-extern int do_posix_clock_monotonic_gettime(struct timespec *tp);
+extern void ktime_get_ts(struct timespec *ts);
+#define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts)
 
 void time_init(void)
 {
@@ -114,8 +115,8 @@ void time_init(void)
 	wall_to_monotonic.tv_nsec = -now.tv_nsec;
 }
 
-/* Declared in linux/time.h, which can't be included here */
-extern void clock_was_set(void);
+/* Defined in linux/ktimer.h, which can't be included here */
+#define clock_was_set()		do { } while (0)
 
 void do_gettimeofday(struct timeval *tv)
 {
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 2ac20b48b2f3..cf5cfdf8d613 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -93,6 +93,13 @@ struct hrtimer_base {
 	struct hrtimer		*curr_timer;
 };
 
+/*
+ * clock_was_set() is a NOP for non- high-resolution systems. The
+ * time-sorted order guarantees that a timer does not expire early and
+ * is expired in the next softirq when the clock was advanced.
+ */
+#define clock_was_set()		do { } while (0)
+
 /* Exported timer functions: */
 
 /* Initialize timers: */
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 3c0a5beb7f0d..54faf5236da0 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -51,12 +51,8 @@ struct k_itimer {
 	struct sigqueue *sigq;		/* signal queue entry. */
 	union {
 		struct {
-			struct timer_list timer;
-			/* clock abs_timer_list: */
-			struct list_head abs_timer_entry;
-			/* wall_to_monotonic used when set: */
-			struct timespec wall_to_prev;
-			unsigned long incr; /* interval in jiffies */
+			struct hrtimer timer;
+			ktime_t interval;
 		} real;
 		struct cpu_timer_list cpu;
 		struct {
@@ -68,15 +64,9 @@ struct k_itimer {
 	} it;
 };
 
-struct k_clock_abs {
-	struct list_head list;
-	spinlock_t lock;
-};
-
 struct k_clock {
 	int res;		/* in nanoseconds */
 	int (*clock_getres) (const clockid_t which_clock, struct timespec *tp);
-	struct k_clock_abs *abs_struct;
 	int (*clock_set) (const clockid_t which_clock, struct timespec * tp);
 	int (*clock_get) (const clockid_t which_clock, struct timespec * tp);
 	int (*timer_create) (struct k_itimer *timer);
@@ -102,29 +92,6 @@ int do_posix_clock_nosettime(const clockid_t, struct timespec *tp);
 /* function to call to trigger timer event */
 int posix_timer_event(struct k_itimer *timr, int si_private);
 
-struct now_struct {
-	unsigned long jiffies;
-};
-
-#define posix_get_now(now) \
-	do { (now)->jiffies = jiffies; } while (0)
-
-#define posix_time_before(timer, now) \
-                      time_before((timer)->expires, (now)->jiffies)
-
-#define posix_bump_timer(timr, now)					\
-	do {								\
-		long delta, orun;					\
-									\
-		delta = (now).jiffies - (timr)->it.real.timer.expires;	\
-		if (delta >= 0) {					\
-			orun = 1 + (delta / (timr)->it.real.incr);	\
-			(timr)->it.real.timer.expires +=		\
-				orun * (timr)->it.real.incr;		\
-			(timr)->it_overrun += orun;			\
-		}							\
-	} while (0)
-
 int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *ts);
 int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *ts);
 int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *ts);
diff --git a/include/linux/time.h b/include/linux/time.h
index f639fde29253..1201155b2202 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -73,8 +73,7 @@ struct timespec current_kernel_time(void);
 extern void do_gettimeofday(struct timeval *tv);
 extern int do_settimeofday(struct timespec *tv);
 extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz);
-extern void clock_was_set(void); // call whenever the clock is set
-extern int do_posix_clock_monotonic_gettime(struct timespec *tp);
+#define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts)
 extern long do_utimes(char __user *filename, struct timeval *times);
 struct itimerval;
 extern int do_setitimer(int which, struct itimerval *value,
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ba900587b815..9e66e614862a 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -35,7 +35,6 @@
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include <linux/time.h>
-#include <linux/calc64.h>
 
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
@@ -49,12 +48,6 @@
 #include <linux/workqueue.h>
 #include <linux/module.h>
 
-#define CLOCK_REALTIME_RES TICK_NSEC  /* In nano seconds. */
-
-static inline u64  mpy_l_X_l_ll(unsigned long mpy1,unsigned long mpy2)
-{
-	return (u64)mpy1 * mpy2;
-}
 /*
  * Management arrays for POSIX timers.	 Timers are kept in slab memory
  * Timer ids are allocated by an external routine that keeps track of the
@@ -140,18 +133,18 @@ static DEFINE_SPINLOCK(idr_lock);
  */
 
 static struct k_clock posix_clocks[MAX_CLOCKS];
+
 /*
- * We only have one real clock that can be set so we need only one abs list,
- * even if we should want to have several clocks with differing resolutions.
+ * These ones are defined below.
  */
-static struct k_clock_abs abs_list = {.list = LIST_HEAD_INIT(abs_list.list),
-				      .lock = SPIN_LOCK_UNLOCKED};
+static int common_nsleep(const clockid_t, int flags, struct timespec *t,
+			 struct timespec __user *rmtp);
+static void common_timer_get(struct k_itimer *, struct itimerspec *);
+static int common_timer_set(struct k_itimer *, int,
+			    struct itimerspec *, struct itimerspec *);
+static int common_timer_del(struct k_itimer *timer);
 
-static void posix_timer_fn(unsigned long);
-static u64 do_posix_clock_monotonic_gettime_parts(
-	struct timespec *tp, struct timespec *mo);
-int do_posix_clock_monotonic_gettime(struct timespec *tp);
-static int do_posix_clock_monotonic_get(const clockid_t, struct timespec *tp);
+static int posix_timer_fn(void *data);
 
 static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
 
@@ -184,10 +177,12 @@ static inline int common_clock_getres(const clockid_t which_clock,
 	return 0;
 }
 
-static inline int common_clock_get(const clockid_t which_clock,
-				   struct timespec *tp)
+/*
+ * Get real time for posix timers
+ */
+static int common_clock_get(clockid_t which_clock, struct timespec *tp)
 {
-	getnstimeofday(tp);
+	ktime_get_real_ts(tp);
 	return 0;
 }
 
@@ -199,25 +194,14 @@ static inline int common_clock_set(const clockid_t which_clock,
 
 static inline int common_timer_create(struct k_itimer *new_timer)
 {
-	INIT_LIST_HEAD(&new_timer->it.real.abs_timer_entry);
-	init_timer(&new_timer->it.real.timer);
-	new_timer->it.real.timer.data = (unsigned long) new_timer;
+	hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock);
+	new_timer->it.real.timer.data = new_timer;
 	new_timer->it.real.timer.function = posix_timer_fn;
 	return 0;
 }
 
 /*
- * These ones are defined below.
- */
-static int common_nsleep(const clockid_t, int flags, struct timespec *t,
-			 struct timespec __user *rmtp);
-static void common_timer_get(struct k_itimer *, struct itimerspec *);
-static int common_timer_set(struct k_itimer *, int,
-			    struct itimerspec *, struct itimerspec *);
-static int common_timer_del(struct k_itimer *timer);
-
-/*
- * Return nonzero iff we know a priori this clockid_t value is bogus.
+ * Return nonzero if we know a priori this clockid_t value is bogus.
  */
 static inline int invalid_clockid(const clockid_t which_clock)
 {
@@ -227,26 +211,32 @@ static inline int invalid_clockid(const clockid_t which_clock)
 		return 1;
 	if (posix_clocks[which_clock].clock_getres != NULL)
 		return 0;
-#ifndef CLOCK_DISPATCH_DIRECT
 	if (posix_clocks[which_clock].res != 0)
 		return 0;
-#endif
 	return 1;
 }
 
+/*
+ * Get monotonic time for posix timers
+ */
+static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
+{
+	ktime_get_ts(tp);
+	return 0;
+}
 
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
 static __init int init_posix_timers(void)
 {
-	struct k_clock clock_realtime = {.res = CLOCK_REALTIME_RES,
-					 .abs_struct = &abs_list
+	struct k_clock clock_realtime = {
+		.clock_getres = hrtimer_get_res,
 	};
-	struct k_clock clock_monotonic = {.res = CLOCK_REALTIME_RES,
-		.abs_struct = NULL,
-		.clock_get = do_posix_clock_monotonic_get,
-		.clock_set = do_posix_clock_nosettime
+	struct k_clock clock_monotonic = {
+		.clock_getres = hrtimer_get_res,
+		.clock_get = posix_ktime_get_ts,
+		.clock_set = do_posix_clock_nosettime,
 	};
 
 	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
@@ -260,117 +250,17 @@ static __init int init_posix_timers(void)
 
 __initcall(init_posix_timers);
 
-static void tstojiffie(struct timespec *tp, int res, u64 *jiff)
-{
-	long sec = tp->tv_sec;
-	long nsec = tp->tv_nsec + res - 1;
-
-	if (nsec >= NSEC_PER_SEC) {
-		sec++;
-		nsec -= NSEC_PER_SEC;
-	}
-
-	/*
-	 * The scaling constants are defined in <linux/time.h>
-	 * The difference between there and here is that we do the
-	 * res rounding and compute a 64-bit result (well so does that
-	 * but it then throws away the high bits).
-  	 */
-	*jiff =  (mpy_l_X_l_ll(sec, SEC_CONVERSION) +
-		  (mpy_l_X_l_ll(nsec, NSEC_CONVERSION) >> 
-		   (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
-}
-
-/*
- * This function adjusts the timer as needed as a result of the clock
- * being set.  It should only be called for absolute timers, and then
- * under the abs_list lock.  It computes the time difference and sets
- * the new jiffies value in the timer.  It also updates the timers
- * reference wall_to_monotonic value.  It is complicated by the fact
- * that tstojiffies() only handles positive times and it needs to work
- * with both positive and negative times.  Also, for negative offsets,
- * we need to defeat the res round up.
- *
- * Return is true if there is a new time, else false.
- */
-static long add_clockset_delta(struct k_itimer *timr,
-			       struct timespec *new_wall_to)
-{
-	struct timespec delta;
-	int sign = 0;
-	u64 exp;
-
-	set_normalized_timespec(&delta,
-				new_wall_to->tv_sec -
-				timr->it.real.wall_to_prev.tv_sec,
-				new_wall_to->tv_nsec -
-				timr->it.real.wall_to_prev.tv_nsec);
-	if (likely(!(delta.tv_sec | delta.tv_nsec)))
-		return 0;
-	if (delta.tv_sec < 0) {
-		set_normalized_timespec(&delta,
-					-delta.tv_sec,
-					1 - delta.tv_nsec -
-					posix_clocks[timr->it_clock].res);
-		sign++;
-	}
-	tstojiffie(&delta, posix_clocks[timr->it_clock].res, &exp);
-	timr->it.real.wall_to_prev = *new_wall_to;
-	timr->it.real.timer.expires += (sign ? -exp : exp);
-	return 1;
-}
-
-static void remove_from_abslist(struct k_itimer *timr)
-{
-	if (!list_empty(&timr->it.real.abs_timer_entry)) {
-		spin_lock(&abs_list.lock);
-		list_del_init(&timr->it.real.abs_timer_entry);
-		spin_unlock(&abs_list.lock);
-	}
-}
-
 static void schedule_next_timer(struct k_itimer *timr)
 {
-	struct timespec new_wall_to;
-	struct now_struct now;
-	unsigned long seq;
-
-	/*
-	 * Set up the timer for the next interval (if there is one).
-	 * Note: this code uses the abs_timer_lock to protect
-	 * it.real.wall_to_prev and must hold it until exp is set, not exactly
-	 * obvious...
-
-	 * This function is used for CLOCK_REALTIME* and
-	 * CLOCK_MONOTONIC* timers.  If we ever want to handle other
-	 * CLOCKs, the calling code (do_schedule_next_timer) would need
-	 * to pull the "clock" info from the timer and dispatch the
-	 * "other" CLOCKs "next timer" code (which, I suppose should
-	 * also be added to the k_clock structure).
-	 */
-	if (!timr->it.real.incr)
+	if (timr->it.real.interval.tv64 == 0)
 		return;
 
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		new_wall_to =	wall_to_monotonic;
-		posix_get_now(&now);
-	} while (read_seqretry(&xtime_lock, seq));
-
-	if (!list_empty(&timr->it.real.abs_timer_entry)) {
-		spin_lock(&abs_list.lock);
-		add_clockset_delta(timr, &new_wall_to);
-
-		posix_bump_timer(timr, now);
-
-		spin_unlock(&abs_list.lock);
-	} else {
-		posix_bump_timer(timr, now);
-	}
+	timr->it_overrun += hrtimer_forward(&timr->it.real.timer,
+					    timr->it.real.interval);
 	timr->it_overrun_last = timr->it_overrun;
 	timr->it_overrun = -1;
 	++timr->it_requeue_pending;
-	add_timer(&timr->it.real.timer);
+	hrtimer_restart(&timr->it.real.timer);
 }
 
 /*
@@ -391,31 +281,23 @@ void do_schedule_next_timer(struct siginfo *info)
 
 	timr = lock_timer(info->si_tid, &flags);
 
-	if (!timr || timr->it_requeue_pending != info->si_sys_private)
-		goto exit;
+	if (timr && timr->it_requeue_pending == info->si_sys_private) {
+		if (timr->it_clock < 0)
+			posix_cpu_timer_schedule(timr);
+		else
+			schedule_next_timer(timr);
 
-	if (timr->it_clock < 0)	/* CPU clock */
-		posix_cpu_timer_schedule(timr);
-	else
-		schedule_next_timer(timr);
-	info->si_overrun = timr->it_overrun_last;
-exit:
-	if (timr)
-		unlock_timer(timr, flags);
+		info->si_overrun = timr->it_overrun_last;
+	}
+
+	unlock_timer(timr, flags);
 }
 
 int posix_timer_event(struct k_itimer *timr,int si_private)
 {
 	memset(&timr->sigq->info, 0, sizeof(siginfo_t));
 	timr->sigq->info.si_sys_private = si_private;
-	/*
-	 * Send signal to the process that owns this timer.
-
-	 * This code assumes that all the possible abs_lists share the
-	 * same lock (there is only one list at this time). If this is
-	 * not the case, the CLOCK info would need to be used to find
-	 * the proper abs list lock.
-	 */
+	/* Send signal to the process that owns this timer.*/
 
 	timr->sigq->info.si_signo = timr->it_sigev_signo;
 	timr->sigq->info.si_errno = 0;
@@ -449,64 +331,35 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
 
  * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
  */
-static void posix_timer_fn(unsigned long __data)
+static int posix_timer_fn(void *data)
 {
-	struct k_itimer *timr = (struct k_itimer *) __data;
+	struct k_itimer *timr = data;
 	unsigned long flags;
-	unsigned long seq;
-	struct timespec delta, new_wall_to;
-	u64 exp = 0;
-	int do_notify = 1;
+	int si_private = 0;
+	int ret = HRTIMER_NORESTART;
 
 	spin_lock_irqsave(&timr->it_lock, flags);
-	if (!list_empty(&timr->it.real.abs_timer_entry)) {
-		spin_lock(&abs_list.lock);
-		do {
-			seq = read_seqbegin(&xtime_lock);
-			new_wall_to =	wall_to_monotonic;
-		} while (read_seqretry(&xtime_lock, seq));
-		set_normalized_timespec(&delta,
-					new_wall_to.tv_sec -
-					timr->it.real.wall_to_prev.tv_sec,
-					new_wall_to.tv_nsec -
-					timr->it.real.wall_to_prev.tv_nsec);
-		if (likely((delta.tv_sec | delta.tv_nsec ) == 0)) {
-			/* do nothing, timer is on time */
-		} else if (delta.tv_sec < 0) {
-			/* do nothing, timer is already late */
-		} else {
-			/* timer is early due to a clock set */
-			tstojiffie(&delta,
-				   posix_clocks[timr->it_clock].res,
-				   &exp);
-			timr->it.real.wall_to_prev = new_wall_to;
-			timr->it.real.timer.expires += exp;
-			add_timer(&timr->it.real.timer);
-			do_notify = 0;
-		}
-		spin_unlock(&abs_list.lock);
 
-	}
-	if (do_notify)  {
-		int si_private=0;
+	if (timr->it.real.interval.tv64 != 0)
+		si_private = ++timr->it_requeue_pending;
 
-		if (timr->it.real.incr)
-			si_private = ++timr->it_requeue_pending;
-		else {
-			remove_from_abslist(timr);
+	if (posix_timer_event(timr, si_private)) {
+		/*
+		 * signal was not sent because of sig_ignor
+		 * we will not get a call back to restart it AND
+		 * it should be restarted.
+		 */
+		if (timr->it.real.interval.tv64 != 0) {
+			timr->it_overrun +=
+				hrtimer_forward(&timr->it.real.timer,
+						timr->it.real.interval);
+			ret = HRTIMER_RESTART;
 		}
-
-		if (posix_timer_event(timr, si_private))
-			/*
-			 * signal was not sent because of sig_ignor
-			 * we will not get a call back to restart it AND
-			 * it should be restarted.
-			 */
-			schedule_next_timer(timr);
 	}
-	unlock_timer(timr, flags); /* hold thru abs lock to keep irq off */
-}
 
+	unlock_timer(timr, flags);
+	return ret;
+}
 
 static inline struct task_struct * good_sigevent(sigevent_t * event)
 {
@@ -597,8 +450,7 @@ sys_timer_create(const clockid_t which_clock,
 		goto out;
 	}
 	spin_lock_irq(&idr_lock);
-	error = idr_get_new(&posix_timers_id,
-			    (void *) new_timer,
+	error = idr_get_new(&posix_timers_id, (void *) new_timer,
 			    &new_timer_id);
 	spin_unlock_irq(&idr_lock);
 	if (error == -EAGAIN)
@@ -698,26 +550,6 @@ out:
 	return error;
 }
 
-/*
- * good_timespec
- *
- * This function checks the elements of a timespec structure.
- *
- * Arguments:
- * ts	     : Pointer to the timespec structure to check
- *
- * Return value:
- * If a NULL pointer was passed in, or the tv_nsec field was less than 0
- * or greater than NSEC_PER_SEC, or the tv_sec field was less than 0,
- * this function returns 0. Otherwise it returns 1.
- */
-static int good_timespec(const struct timespec *ts)
-{
-	if ((!ts) || !timespec_valid(ts))
-		return 0;
-	return 1;
-}
-
 /*
  * Locking issues: We need to protect the result of the id look up until
  * we get the timer locked down so it is not deleted under us.  The
@@ -770,39 +602,39 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
 static void
 common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
 {
-	unsigned long expires;
-	struct now_struct now;
-
-	do
-		expires = timr->it.real.timer.expires;
-	while ((volatile long) (timr->it.real.timer.expires) != expires);
-
-	posix_get_now(&now);
-
-	if (expires &&
-	    ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) &&
-	    !timr->it.real.incr &&
-	    posix_time_before(&timr->it.real.timer, &now))
-		timr->it.real.timer.expires = expires = 0;
-	if (expires) {
-		if (timr->it_requeue_pending & REQUEUE_PENDING ||
-		    (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
-			posix_bump_timer(timr, now);
-			expires = timr->it.real.timer.expires;
-		}
-		else
-			if (!timer_pending(&timr->it.real.timer))
-				expires = 0;
-		if (expires)
-			expires -= now.jiffies;
-	}
-	jiffies_to_timespec(expires, &cur_setting->it_value);
-	jiffies_to_timespec(timr->it.real.incr, &cur_setting->it_interval);
+	ktime_t remaining;
+	struct hrtimer *timer = &timr->it.real.timer;
 
-	if (cur_setting->it_value.tv_sec < 0) {
-		cur_setting->it_value.tv_nsec = 1;
-		cur_setting->it_value.tv_sec = 0;
+	memset(cur_setting, 0, sizeof(struct itimerspec));
+	remaining = hrtimer_get_remaining(timer);
+
+	/* Time left ? or timer pending */
+	if (remaining.tv64 > 0 || hrtimer_active(timer))
+		goto calci;
+	/* interval timer ? */
+	if (timr->it.real.interval.tv64 == 0)
+		return;
+	/*
+	 * When a requeue is pending or this is a SIGEV_NONE timer
+	 * move the expiry time forward by intervals, so expiry is >
+	 * now.
+	 */
+	if (timr->it_requeue_pending & REQUEUE_PENDING ||
+	    (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
+		timr->it_overrun +=
+			hrtimer_forward(timer, timr->it.real.interval);
+		remaining = hrtimer_get_remaining(timer);
 	}
+ calci:
+	/* interval timer ? */
+	if (timr->it.real.interval.tv64 != 0)
+		cur_setting->it_interval =
+			ktime_to_timespec(timr->it.real.interval);
+	/* Return 0 only, when the timer is expired and not pending */
+	if (remaining.tv64 <= 0)
+		cur_setting->it_value.tv_nsec = 1;
+	else
+		cur_setting->it_value = ktime_to_timespec(remaining);
 }
 
 /* Get the time remaining on a POSIX.1b interval timer. */
@@ -826,6 +658,7 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting)
 
 	return 0;
 }
+
 /*
  * Get the number of overruns of a POSIX.1b interval timer.  This is to
  * be the overrun of the timer last delivered.  At the same time we are
@@ -835,7 +668,6 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting)
  * the call back to do_schedule_next_timer().  So all we need to do is
  * to pick up the frozen overrun.
  */
-
 asmlinkage long
 sys_timer_getoverrun(timer_t timer_id)
 {
@@ -852,84 +684,6 @@ sys_timer_getoverrun(timer_t timer_id)
 
 	return overrun;
 }
-/*
- * Adjust for absolute time
- *
- * If absolute time is given and it is not CLOCK_MONOTONIC, we need to
- * adjust for the offset between the timer clock (CLOCK_MONOTONIC) and
- * what ever clock he is using.
- *
- * If it is relative time, we need to add the current (CLOCK_MONOTONIC)
- * time to it to get the proper time for the timer.
- */
-static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, 
-			   int abs, u64 *exp, struct timespec *wall_to)
-{
-	struct timespec now;
-	struct timespec oc = *tp;
-	u64 jiffies_64_f;
-	int rtn =0;
-
-	if (abs) {
-		/*
-		 * The mask pick up the 4 basic clocks 
-		 */
-		if (!((clock - &posix_clocks[0]) & ~CLOCKS_MASK)) {
-			jiffies_64_f = do_posix_clock_monotonic_gettime_parts(
-				&now,  wall_to);
-			/*
-			 * If we are doing a MONOTONIC clock
-			 */
-			if((clock - &posix_clocks[0]) & CLOCKS_MONO){
-				now.tv_sec += wall_to->tv_sec;
-				now.tv_nsec += wall_to->tv_nsec;
-			}
-		} else {
-			/*
-			 * Not one of the basic clocks
-			 */
-			clock->clock_get(clock - posix_clocks, &now);
-			jiffies_64_f = get_jiffies_64();
-		}
-		/*
-		 * Take away now to get delta and normalize
-		 */
-		set_normalized_timespec(&oc, oc.tv_sec - now.tv_sec,
-					oc.tv_nsec - now.tv_nsec);
-	}else{
-		jiffies_64_f = get_jiffies_64();
-	}
-	/*
-	 * Check if the requested time is prior to now (if so set now)
-	 */
-	if (oc.tv_sec < 0)
-		oc.tv_sec = oc.tv_nsec = 0;
-
-	if (oc.tv_sec | oc.tv_nsec)
-		set_normalized_timespec(&oc, oc.tv_sec,
-					oc.tv_nsec + clock->res);
-	tstojiffie(&oc, clock->res, exp);
-
-	/*
-	 * Check if the requested time is more than the timer code
-	 * can handle (if so we error out but return the value too).
-	 */
-	if (*exp > ((u64)MAX_JIFFY_OFFSET))
-			/*
-			 * This is a considered response, not exactly in
-			 * line with the standard (in fact it is silent on
-			 * possible overflows).  We assume such a large 
-			 * value is ALMOST always a programming error and
-			 * try not to compound it by setting a really dumb
-			 * value.
-			 */
-			rtn = -EINVAL;
-	/*
-	 * return the actual jiffies expire time, full 64 bits
-	 */
-	*exp += jiffies_64_f;
-	return rtn;
-}
 
 /* Set a POSIX.1b interval timer. */
 /* timr->it_lock is taken. */
@@ -937,68 +691,48 @@ static inline int
 common_timer_set(struct k_itimer *timr, int flags,
 		 struct itimerspec *new_setting, struct itimerspec *old_setting)
 {
-	struct k_clock *clock = &posix_clocks[timr->it_clock];
-	u64 expire_64;
+	struct hrtimer *timer = &timr->it.real.timer;
 
 	if (old_setting)
 		common_timer_get(timr, old_setting);
 
 	/* disable the timer */
-	timr->it.real.incr = 0;
+	timr->it.real.interval.tv64 = 0;
 	/*
 	 * careful here.  If smp we could be in the "fire" routine which will
 	 * be spinning as we hold the lock.  But this is ONLY an SMP issue.
 	 */
-	if (try_to_del_timer_sync(&timr->it.real.timer) < 0) {
-#ifdef CONFIG_SMP
-		/*
-		 * It can only be active if on an other cpu.  Since
-		 * we have cleared the interval stuff above, it should
-		 * clear once we release the spin lock.  Of course once
-		 * we do that anything could happen, including the
-		 * complete melt down of the timer.  So return with
-		 * a "retry" exit status.
-		 */
+	if (hrtimer_try_to_cancel(timer) < 0)
 		return TIMER_RETRY;
-#endif
-	}
-
-	remove_from_abslist(timr);
 
 	timr->it_requeue_pending = (timr->it_requeue_pending + 2) & 
 		~REQUEUE_PENDING;
 	timr->it_overrun_last = 0;
-	timr->it_overrun = -1;
-	/*
-	 *switch off the timer when it_value is zero
-	 */
-	if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) {
-		timr->it.real.timer.expires = 0;
-		return 0;
-	}
 
-	if (adjust_abs_time(clock,
-			    &new_setting->it_value, flags & TIMER_ABSTIME, 
-			    &expire_64, &(timr->it.real.wall_to_prev))) {
-		return -EINVAL;
-	}
-	timr->it.real.timer.expires = (unsigned long)expire_64;
-	tstojiffie(&new_setting->it_interval, clock->res, &expire_64);
-	timr->it.real.incr = (unsigned long)expire_64;
+	/* switch off the timer when it_value is zero */
+	if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
+		return 0;
 
-	/*
-	 * We do not even queue SIGEV_NONE timers!  But we do put them
-	 * in the abs list so we can do that right.
+	/* Posix madness. Only absolute CLOCK_REALTIME timers
+	 * are affected by clock sets. So we must reiniatilize
+	 * the timer.
 	 */
-	if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE))
-		add_timer(&timr->it.real.timer);
-
-	if (flags & TIMER_ABSTIME && clock->abs_struct) {
-		spin_lock(&clock->abs_struct->lock);
-		list_add_tail(&(timr->it.real.abs_timer_entry),
-			      &(clock->abs_struct->list));
-		spin_unlock(&clock->abs_struct->lock);
-	}
+	if (timr->it_clock == CLOCK_REALTIME && (flags & TIMER_ABSTIME))
+		hrtimer_rebase(timer, CLOCK_REALTIME);
+	else
+		hrtimer_rebase(timer, CLOCK_MONOTONIC);
+
+	timer->expires = timespec_to_ktime(new_setting->it_value);
+
+	/* Convert interval */
+	timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
+
+	/* SIGEV_NONE timers are not queued ! See common_timer_get */
+	if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
+		return 0;
+
+	hrtimer_start(timer, timer->expires, (flags & TIMER_ABSTIME) ?
+		      HRTIMER_ABS : HRTIMER_REL);
 	return 0;
 }
 
@@ -1020,8 +754,8 @@ sys_timer_settime(timer_t timer_id, int flags,
 	if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
 		return -EFAULT;
 
-	if ((!good_timespec(&new_spec.it_interval)) ||
-	    (!good_timespec(&new_spec.it_value)))
+	if (!timespec_valid(&new_spec.it_interval) ||
+	    !timespec_valid(&new_spec.it_value))
 		return -EINVAL;
 retry:
 	timr = lock_timer(timer_id, &flag);
@@ -1037,8 +771,8 @@ retry:
 		goto retry;
 	}
 
-	if (old_setting && !error && copy_to_user(old_setting,
-						  &old_spec, sizeof (old_spec)))
+	if (old_setting && !error &&
+	    copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
 		error = -EFAULT;
 
 	return error;
@@ -1046,24 +780,10 @@ retry:
 
 static inline int common_timer_del(struct k_itimer *timer)
 {
-	timer->it.real.incr = 0;
+	timer->it.real.interval.tv64 = 0;
 
-	if (try_to_del_timer_sync(&timer->it.real.timer) < 0) {
-#ifdef CONFIG_SMP
-		/*
-		 * It can only be active if on an other cpu.  Since
-		 * we have cleared the interval stuff above, it should
-		 * clear once we release the spin lock.  Of course once
-		 * we do that anything could happen, including the
-		 * complete melt down of the timer.  So return with
-		 * a "retry" exit status.
-		 */
+	if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0)
 		return TIMER_RETRY;
-#endif
-	}
-
-	remove_from_abslist(timer);
-
 	return 0;
 }
 
@@ -1079,24 +799,16 @@ sys_timer_delete(timer_t timer_id)
 	struct k_itimer *timer;
 	long flags;
 
-#ifdef CONFIG_SMP
-	int error;
 retry_delete:
-#endif
 	timer = lock_timer(timer_id, &flags);
 	if (!timer)
 		return -EINVAL;
 
-#ifdef CONFIG_SMP
-	error = timer_delete_hook(timer);
-
-	if (error == TIMER_RETRY) {
+	if (timer_delete_hook(timer) == TIMER_RETRY) {
 		unlock_timer(timer, flags);
 		goto retry_delete;
 	}
-#else
-	timer_delete_hook(timer);
-#endif
+
 	spin_lock(&current->sighand->siglock);
 	list_del(&timer->list);
 	spin_unlock(&current->sighand->siglock);
@@ -1113,6 +825,7 @@ retry_delete:
 	release_posix_timer(timer, IT_ID_SET);
 	return 0;
 }
+
 /*
  * return timer owned by the process, used by exit_itimers
  */
@@ -1120,22 +833,13 @@ static inline void itimer_delete(struct k_itimer *timer)
 {
 	unsigned long flags;
 
-#ifdef CONFIG_SMP
-	int error;
 retry_delete:
-#endif
 	spin_lock_irqsave(&timer->it_lock, flags);
 
-#ifdef CONFIG_SMP
-	error = timer_delete_hook(timer);
-
-	if (error == TIMER_RETRY) {
+	if (timer_delete_hook(timer) == TIMER_RETRY) {
 		unlock_timer(timer, flags);
 		goto retry_delete;
 	}
-#else
-	timer_delete_hook(timer);
-#endif
 	list_del(&timer->list);
 	/*
 	 * This keeps any tasks waiting on the spin lock from thinking
@@ -1164,57 +868,7 @@ void exit_itimers(struct signal_struct *sig)
 	}
 }
 
-/*
- * And now for the "clock" calls
- *
- * These functions are called both from timer functions (with the timer
- * spin_lock_irq() held and from clock calls with no locking.	They must
- * use the save flags versions of locks.
- */
-
-/*
- * We do ticks here to avoid the irq lock ( they take sooo long).
- * The seqlock is great here.  Since we a reader, we don't really care
- * if we are interrupted since we don't take lock that will stall us or
- * any other cpu. Voila, no irq lock is needed.
- *
- */
-
-static u64 do_posix_clock_monotonic_gettime_parts(
-	struct timespec *tp, struct timespec *mo)
-{
-	u64 jiff;
-	unsigned int seq;
-
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		getnstimeofday(tp);
-		*mo = wall_to_monotonic;
-		jiff = jiffies_64;
-
-	} while(read_seqretry(&xtime_lock, seq));
-
-	return jiff;
-}
-
-static int do_posix_clock_monotonic_get(const clockid_t clock,
-					struct timespec *tp)
-{
-	struct timespec wall_to_mono;
-
-	do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono);
-
-	set_normalized_timespec(tp, tp->tv_sec + wall_to_mono.tv_sec,
-				tp->tv_nsec + wall_to_mono.tv_nsec);
-
-	return 0;
-}
-
-int do_posix_clock_monotonic_gettime(struct timespec *tp)
-{
-	return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp);
-}
-
+/* Not available / possible... functions */
 int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
 {
 	return -EINVAL;
@@ -1287,107 +941,6 @@ sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp)
 	return error;
 }
 
-/*
- * The standard says that an absolute nanosleep call MUST wake up at
- * the requested time in spite of clock settings.  Here is what we do:
- * For each nanosleep call that needs it (only absolute and not on
- * CLOCK_MONOTONIC* (as it can not be set)) we thread a little structure
- * into the "nanosleep_abs_list".  All we need is the task_struct pointer.
- * When ever the clock is set we just wake up all those tasks.	 The rest
- * is done by the while loop in clock_nanosleep().
- *
- * On locking, clock_was_set() is called from update_wall_clock which
- * holds (or has held for it) a write_lock_irq( xtime_lock) and is
- * called from the timer bh code.  Thus we need the irq save locks.
- *
- * Also, on the call from update_wall_clock, that is done as part of a
- * softirq thing.  We don't want to delay the system that much (possibly
- * long list of timers to fix), so we defer that work to keventd.
- */
-
-static DECLARE_WAIT_QUEUE_HEAD(nanosleep_abs_wqueue);
-static DECLARE_WORK(clock_was_set_work, (void(*)(void*))clock_was_set, NULL);
-
-static DECLARE_MUTEX(clock_was_set_lock);
-
-void clock_was_set(void)
-{
-	struct k_itimer *timr;
-	struct timespec new_wall_to;
-	LIST_HEAD(cws_list);
-	unsigned long seq;
-
-
-	if (unlikely(in_interrupt())) {
-		schedule_work(&clock_was_set_work);
-		return;
-	}
-	wake_up_all(&nanosleep_abs_wqueue);
-
-	/*
-	 * Check if there exist TIMER_ABSTIME timers to correct.
-	 *
-	 * Notes on locking: This code is run in task context with irq
-	 * on.  We CAN be interrupted!  All other usage of the abs list
-	 * lock is under the timer lock which holds the irq lock as
-	 * well.  We REALLY don't want to scan the whole list with the
-	 * interrupt system off, AND we would like a sequence lock on
-	 * this code as well.  Since we assume that the clock will not
-	 * be set often, it seems ok to take and release the irq lock
-	 * for each timer.  In fact add_timer will do this, so this is
-	 * not an issue.  So we know when we are done, we will move the
-	 * whole list to a new location.  Then as we process each entry,
-	 * we will move it to the actual list again.  This way, when our
-	 * copy is empty, we are done.  We are not all that concerned
-	 * about preemption so we will use a semaphore lock to protect
-	 * aginst reentry.  This way we will not stall another
-	 * processor.  It is possible that this may delay some timers
-	 * that should have expired, given the new clock, but even this
-	 * will be minimal as we will always update to the current time,
-	 * even if it was set by a task that is waiting for entry to
-	 * this code.  Timers that expire too early will be caught by
-	 * the expire code and restarted.
-
-	 * Absolute timers that repeat are left in the abs list while
-	 * waiting for the task to pick up the signal.  This means we
-	 * may find timers that are not in the "add_timer" list, but are
-	 * in the abs list.  We do the same thing for these, save
-	 * putting them back in the "add_timer" list.  (Note, these are
-	 * left in the abs list mainly to indicate that they are
-	 * ABSOLUTE timers, a fact that is used by the re-arm code, and
-	 * for which we have no other flag.)
-
-	 */
-
-	down(&clock_was_set_lock);
-	spin_lock_irq(&abs_list.lock);
-	list_splice_init(&abs_list.list, &cws_list);
-	spin_unlock_irq(&abs_list.lock);
-	do {
-		do {
-			seq = read_seqbegin(&xtime_lock);
-			new_wall_to =	wall_to_monotonic;
-		} while (read_seqretry(&xtime_lock, seq));
-
-		spin_lock_irq(&abs_list.lock);
-		if (list_empty(&cws_list)) {
-			spin_unlock_irq(&abs_list.lock);
-			break;
-		}
-		timr = list_entry(cws_list.next, struct k_itimer,
-				  it.real.abs_timer_entry);
-
-		list_del_init(&timr->it.real.abs_timer_entry);
-		if (add_clockset_delta(timr, &new_wall_to) &&
-		    del_timer(&timr->it.real.timer))  /* timer run yet? */
-			add_timer(&timr->it.real.timer);
-		list_add(&timr->it.real.abs_timer_entry, &abs_list.list);
-		spin_unlock_irq(&abs_list.lock);
-	} while (1);
-
-	up(&clock_was_set_lock);
-}
-
 /*
  * nanosleep for monotonic and realtime clocks
  */
@@ -1401,7 +954,7 @@ static int common_nsleep(const clockid_t which_clock, int flags,
 	case CLOCK_REALTIME:
 		/* Posix madness. Only absolute timers on clock realtime
 		   are affected by clock set. */
-		if (mode == HRTIMER_ABS)
+		if (mode != HRTIMER_ABS)
 			clockid = CLOCK_MONOTONIC;
 	case CLOCK_MONOTONIC:
 		break;
-- 
cgit v1.2.3-71-gd317


From d1c0b8f835aeba85aa428aaec6d521ef4639c7fa Mon Sep 17 00:00:00 2001
From: Matt Helsley <matthltc@us.ibm.com>
Date: Mon, 9 Jan 2006 20:52:40 -0800
Subject: [PATCH] Remove getnstimestamp()

Remove getnstimestamp() in favor of ktime.h's ktime_get_ts()

Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/time.h |  1 -
 kernel/time.c        | 22 ----------------------
 2 files changed, 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index 1201155b2202..f2aca7ec6325 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -80,7 +80,6 @@ extern int do_setitimer(int which, struct itimerval *value,
 			struct itimerval *ovalue);
 extern int do_getitimer(int which, struct itimerval *value);
 extern void getnstimeofday(struct timespec *tv);
-extern void getnstimestamp(struct timespec *ts);
 
 extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
 
diff --git a/kernel/time.c b/kernel/time.c
index cf5a4582a672..169e8329e0b6 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -564,28 +564,6 @@ void getnstimeofday(struct timespec *tv)
 EXPORT_SYMBOL_GPL(getnstimeofday);
 #endif
 
-void getnstimestamp(struct timespec *ts)
-{
-	unsigned int seq;
-	struct timespec wall2mono;
-
-	/* synchronize with settimeofday() changes */
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		getnstimeofday(ts);
-		wall2mono = wall_to_monotonic;
-	} while(unlikely(read_seqretry(&xtime_lock, seq)));
-
-	/* adjust to monotonicaly-increasing values */
-	ts->tv_sec += wall2mono.tv_sec;
-	ts->tv_nsec += wall2mono.tv_nsec;
-	while (unlikely(ts->tv_nsec >= NSEC_PER_SEC)) {
-		ts->tv_nsec -= NSEC_PER_SEC;
-		ts->tv_sec++;
-	}
-}
-EXPORT_SYMBOL_GPL(getnstimestamp);
-
 /* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
  * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
  * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
-- 
cgit v1.2.3-71-gd317


From 49a2a1b83ba6fa40c41968d6a28ba16e7ed0c3f7 Mon Sep 17 00:00:00 2001
From: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Date: Mon, 9 Jan 2006 20:52:43 -0800
Subject: [PATCH] kprobes: changed from using spinlock to mutex

Since Kprobes runtime exception handlers is now lock free as this code path is
now using RCU to walk through the list, there is no need for the
register/unregister{_kprobe} to use spin_{lock/unlock}_isr{save/restore}.  The
serialization during registration/unregistration is now possible using just a
mutex.

In the above process, this patch also fixes a minor memory leak for x86_64 and
powerpc.

Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/kprobes.c    |  6 +--
 arch/powerpc/kernel/kprobes.c | 14 +++----
 arch/sparc64/kernel/kprobes.c |  6 +--
 arch/x86_64/kernel/kprobes.c  |  7 +---
 include/asm-ia64/kprobes.h    |  5 ---
 include/linux/kprobes.h       |  1 -
 kernel/kprobes.c              | 91 ++++++++++++++++++++-----------------------
 7 files changed, 53 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index 19edcd526ba4..68fe10250486 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -57,14 +57,10 @@ static inline int is_IF_modifier(kprobe_opcode_t opcode)
 }
 
 int __kprobes arch_prepare_kprobe(struct kprobe *p)
-{
-	return 0;
-}
-
-void __kprobes arch_copy_kprobe(struct kprobe *p)
 {
 	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
 	p->opcode = *p->addr;
+	return 0;
 }
 
 void __kprobes arch_arm_kprobe(struct kprobe *p)
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 5368f9c2e6bf..331e169e8629 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -60,13 +60,13 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
 		if (!p->ainsn.insn)
 			ret = -ENOMEM;
 	}
-	return ret;
-}
 
-void __kprobes arch_copy_kprobe(struct kprobe *p)
-{
-	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-	p->opcode = *p->addr;
+	if (!ret) {
+		memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+		p->opcode = *p->addr;
+	}
+
+	return ret;
 }
 
 void __kprobes arch_arm_kprobe(struct kprobe *p)
@@ -85,9 +85,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
-	down(&kprobe_mutex);
 	free_insn_slot(p->ainsn.insn);
-	up(&kprobe_mutex);
 }
 
 static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
diff --git a/arch/sparc64/kernel/kprobes.c b/arch/sparc64/kernel/kprobes.c
index a97b0f0727ab..bbd5aa6818ea 100644
--- a/arch/sparc64/kernel/kprobes.c
+++ b/arch/sparc64/kernel/kprobes.c
@@ -42,15 +42,11 @@ DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
 DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
 int __kprobes arch_prepare_kprobe(struct kprobe *p)
-{
-	return 0;
-}
-
-void __kprobes arch_copy_kprobe(struct kprobe *p)
 {
 	p->ainsn.insn[0] = *p->addr;
 	p->ainsn.insn[1] = BREAKPOINT_INSTRUCTION_2;
 	p->opcode = *p->addr;
+	return 0;
 }
 
 void __kprobes arch_arm_kprobe(struct kprobe *p)
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index afe11f4fbd1d..8b8943bfb89e 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -42,8 +42,8 @@
 #include <asm/pgtable.h>
 #include <asm/kdebug.h>
 
-static DECLARE_MUTEX(kprobe_mutex);
 void jprobe_return_end(void);
+void __kprobes arch_copy_kprobe(struct kprobe *p);
 
 DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
 DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
@@ -69,12 +69,11 @@ static inline int is_IF_modifier(kprobe_opcode_t *insn)
 int __kprobes arch_prepare_kprobe(struct kprobe *p)
 {
 	/* insn: must be on special executable page on x86_64. */
-	down(&kprobe_mutex);
 	p->ainsn.insn = get_insn_slot();
-	up(&kprobe_mutex);
 	if (!p->ainsn.insn) {
 		return -ENOMEM;
 	}
+	arch_copy_kprobe(p);
 	return 0;
 }
 
@@ -223,9 +222,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
-	down(&kprobe_mutex);
 	free_insn_slot(p->ainsn.insn);
-	up(&kprobe_mutex);
 }
 
 static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h
index 5b26462674a7..12a0b93020da 100644
--- a/include/asm-ia64/kprobes.h
+++ b/include/asm-ia64/kprobes.h
@@ -110,11 +110,6 @@ struct arch_specific_insn {
  	unsigned short target_br_reg;
 };
 
-/* ia64 does not need this */
-static inline void arch_copy_kprobe(struct kprobe *p)
-{
-}
-
 extern int kprobe_exceptions_notify(struct notifier_block *self,
 				    unsigned long val, void *data);
 
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index c03f2dc933de..ad6e4fe970fd 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -150,7 +150,6 @@ struct kretprobe_instance {
 
 extern spinlock_t kretprobe_lock;
 extern int arch_prepare_kprobe(struct kprobe *p);
-extern void arch_copy_kprobe(struct kprobe *p);
 extern void arch_arm_kprobe(struct kprobe *p);
 extern void arch_disarm_kprobe(struct kprobe *p);
 extern void arch_remove_kprobe(struct kprobe *p);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3897630d2335..f14ccd35e9b6 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -48,7 +48,7 @@
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 
-static DEFINE_SPINLOCK(kprobe_lock);	/* Protects kprobe_table */
+static DECLARE_MUTEX(kprobe_mutex);	/* Protects kprobe_table */
 DEFINE_SPINLOCK(kretprobe_lock);	/* Protects kretprobe_inst_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
 
@@ -167,7 +167,7 @@ static inline void reset_kprobe_instance(void)
 
 /*
  * This routine is called either:
- * 	- under the kprobe_lock spinlock - during kprobe_[un]register()
+ * 	- under the kprobe_mutex - during kprobe_[un]register()
  * 				OR
  * 	- with preemption disabled - from arch/xxx/kernel/kprobes.c
  */
@@ -420,7 +420,6 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
 /*
  * This is the second or subsequent kprobe at the address - handle
  * the intricacies
- * TODO: Move kcalloc outside the spin_lock
  */
 static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
 					  struct kprobe *p)
@@ -442,25 +441,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
 	return ret;
 }
 
-/* kprobe removal house-keeping routines */
-static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
-{
-	arch_disarm_kprobe(p);
-	hlist_del_rcu(&p->hlist);
-	spin_unlock_irqrestore(&kprobe_lock, flags);
-	arch_remove_kprobe(p);
-}
-
-static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
-		struct kprobe *p, unsigned long flags)
-{
-	list_del_rcu(&p->list);
-	if (list_empty(&old_p->list))
-		cleanup_kprobe(old_p, flags);
-	else
-		spin_unlock_irqrestore(&kprobe_lock, flags);
-}
-
 static int __kprobes in_kprobes_functions(unsigned long addr)
 {
 	if (addr >= (unsigned long)__kprobes_text_start
@@ -472,7 +452,6 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
 int __kprobes register_kprobe(struct kprobe *p)
 {
 	int ret = 0;
-	unsigned long flags = 0;
 	struct kprobe *old_p;
 	struct module *mod;
 
@@ -484,18 +463,17 @@ int __kprobes register_kprobe(struct kprobe *p)
 			(unlikely(!try_module_get(mod))))
 		return -EINVAL;
 
-	if ((ret = arch_prepare_kprobe(p)) != 0)
-		goto rm_kprobe;
-
 	p->nmissed = 0;
-	spin_lock_irqsave(&kprobe_lock, flags);
+	down(&kprobe_mutex);
 	old_p = get_kprobe(p->addr);
 	if (old_p) {
 		ret = register_aggr_kprobe(old_p, p);
 		goto out;
 	}
 
-	arch_copy_kprobe(p);
+	if ((ret = arch_prepare_kprobe(p)) != 0)
+		goto out;
+
 	INIT_HLIST_NODE(&p->hlist);
 	hlist_add_head_rcu(&p->hlist,
 		       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
@@ -503,10 +481,8 @@ int __kprobes register_kprobe(struct kprobe *p)
   	arch_arm_kprobe(p);
 
 out:
-	spin_unlock_irqrestore(&kprobe_lock, flags);
-rm_kprobe:
-	if (ret == -EEXIST)
-		arch_remove_kprobe(p);
+	up(&kprobe_mutex);
+
 	if (ret && mod)
 		module_put(mod);
 	return ret;
@@ -514,29 +490,48 @@ rm_kprobe:
 
 void __kprobes unregister_kprobe(struct kprobe *p)
 {
-	unsigned long flags;
-	struct kprobe *old_p;
 	struct module *mod;
+	struct kprobe *old_p, *cleanup_p;
 
-	spin_lock_irqsave(&kprobe_lock, flags);
+	down(&kprobe_mutex);
 	old_p = get_kprobe(p->addr);
-	if (old_p) {
-		/* cleanup_*_kprobe() does the spin_unlock_irqrestore */
-		if (old_p->pre_handler == aggr_pre_handler)
-			cleanup_aggr_kprobe(old_p, p, flags);
-		else
-			cleanup_kprobe(p, flags);
+	if (unlikely(!old_p)) {
+		up(&kprobe_mutex);
+		return;
+	}
 
-		synchronize_sched();
+	if ((old_p->pre_handler == aggr_pre_handler) &&
+		(p->list.next == &old_p->list) &&
+		(p->list.prev == &old_p->list)) {
+		/* Only one element in the aggregate list */
+		arch_disarm_kprobe(p);
+		hlist_del_rcu(&old_p->hlist);
+		cleanup_p = old_p;
+	} else if (old_p == p) {
+		/* Only one kprobe element in the hash list */
+		arch_disarm_kprobe(p);
+		hlist_del_rcu(&p->hlist);
+		cleanup_p = p;
+	} else {
+		list_del_rcu(&p->list);
+		cleanup_p = NULL;
+	}
 
-		if ((mod = module_text_address((unsigned long)p->addr)))
-			module_put(mod);
+	up(&kprobe_mutex);
 
-		if (old_p->pre_handler == aggr_pre_handler &&
-				list_empty(&old_p->list))
+	synchronize_sched();
+	if ((mod = module_text_address((unsigned long)p->addr)))
+		module_put(mod);
+
+	if (cleanup_p) {
+		if (cleanup_p->pre_handler == aggr_pre_handler) {
+			list_del_rcu(&p->list);
 			kfree(old_p);
-	} else
-		spin_unlock_irqrestore(&kprobe_lock, flags);
+		}
+		down(&kprobe_mutex);
+		arch_remove_kprobe(p);
+		up(&kprobe_mutex);
+	}
 }
 
 static struct notifier_block kprobe_exceptions_nb = {
-- 
cgit v1.2.3-71-gd317


From e597c2984c64609c6e1e1ac803f00f7550705860 Mon Sep 17 00:00:00 2001
From: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Date: Mon, 9 Jan 2006 20:52:45 -0800
Subject: [PATCH] kprobes: arch_remove_kprobe

Currently arch_remove_kprobes() is only implemented/required for x86_64 and
powerpc.  All other architecture like IA64, i386 and sparc64 implementes a
dummy function which is being called from arch independent kprobes.c file.

This patch removes the dummy functions and replaces it with
#define arch_remove_kprobe(p, s)	do { } while(0)

Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/kprobes.c    | 4 ----
 arch/ia64/kernel/kprobes.c    | 4 ----
 arch/powerpc/kernel/kprobes.c | 4 +++-
 arch/sparc64/kernel/kprobes.c | 4 ----
 arch/x86_64/kernel/kprobes.c  | 4 +++-
 include/asm-i386/kprobes.h    | 1 +
 include/asm-ia64/kprobes.h    | 1 +
 include/asm-powerpc/kprobes.h | 1 +
 include/asm-sparc64/kprobes.h | 1 +
 include/asm-x86_64/kprobes.h  | 1 +
 include/linux/kprobes.h       | 1 -
 kernel/kprobes.c              | 4 +---
 12 files changed, 12 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index 68fe10250486..2f372dbd34fd 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -77,10 +77,6 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
 
-void __kprobes arch_remove_kprobe(struct kprobe *p)
-{
-}
-
 static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
 {
 	kcb->prev_kprobe.kp = kprobe_running();
diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
index 89a70400c4f6..4de7f6759093 100644
--- a/arch/ia64/kernel/kprobes.c
+++ b/arch/ia64/kernel/kprobes.c
@@ -467,10 +467,6 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 	flush_icache_range(arm_addr, arm_addr + sizeof(bundle_t));
 }
 
-void __kprobes arch_remove_kprobe(struct kprobe *p)
-{
-}
-
 /*
  * We are resuming execution after a single step fault, so the pt_regs
  * structure reflects the register state after we executed the instruction
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 2cd32dd6898b..93444e32fccd 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -80,9 +80,11 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
 
-void __kprobes arch_remove_kprobe(struct kprobe *p)
+void __kprobes arch_remove_kprobe(struct kprobe *p, struct semaphore *s)
 {
+	down(s);
 	free_insn_slot(p->ainsn.insn);
+	up(s);
 }
 
 static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
diff --git a/arch/sparc64/kernel/kprobes.c b/arch/sparc64/kernel/kprobes.c
index bbd5aa6818ea..ff5e9d5cad50 100644
--- a/arch/sparc64/kernel/kprobes.c
+++ b/arch/sparc64/kernel/kprobes.c
@@ -61,10 +61,6 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 	flushi(p->addr);
 }
 
-void __kprobes arch_remove_kprobe(struct kprobe *p)
-{
-}
-
 static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
 {
 	kcb->prev_kprobe.kp = kprobe_running();
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index 128e18190f99..61a6a9369cbd 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -220,9 +220,11 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
 
-void __kprobes arch_remove_kprobe(struct kprobe *p)
+void __kprobes arch_remove_kprobe(struct kprobe *p, struct semaphore *s)
 {
+	down(s);
 	free_insn_slot(p->ainsn.insn);
+	up(s);
 }
 
 static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
diff --git a/include/asm-i386/kprobes.h b/include/asm-i386/kprobes.h
index f9e150fa1d44..dc559267ce3e 100644
--- a/include/asm-i386/kprobes.h
+++ b/include/asm-i386/kprobes.h
@@ -40,6 +40,7 @@ typedef u8 kprobe_opcode_t;
 
 #define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)pentry
 #define ARCH_SUPPORTS_KRETPROBES
+#define arch_remove_kprobe(p, s)	do { } while(0)
 
 void kretprobe_trampoline(void);
 
diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h
index 12a0b93020da..698508f4a0cf 100644
--- a/include/asm-ia64/kprobes.h
+++ b/include/asm-ia64/kprobes.h
@@ -89,6 +89,7 @@ struct kprobe_ctlblk {
 #define IP_RELATIVE_PREDICT_OPCODE	(7)
 #define LONG_BRANCH_OPCODE		(0xC)
 #define LONG_CALL_OPCODE		(0xD)
+#define arch_remove_kprobe(p, s)	do { } while(0)
 
 typedef struct kprobe_opcode {
 	bundle_t bundle;
diff --git a/include/asm-powerpc/kprobes.h b/include/asm-powerpc/kprobes.h
index 42ece411435a..89dee13c2a4c 100644
--- a/include/asm-powerpc/kprobes.h
+++ b/include/asm-powerpc/kprobes.h
@@ -50,6 +50,7 @@ typedef unsigned int kprobe_opcode_t;
 
 #define ARCH_SUPPORTS_KRETPROBES
 void kretprobe_trampoline(void);
+extern void arch_remove_kprobe(struct kprobe *p, struct semaphore *s);
 
 /* Architecture specific copy of original instruction */
 struct arch_specific_insn {
diff --git a/include/asm-sparc64/kprobes.h b/include/asm-sparc64/kprobes.h
index 1b47e0d2ee93..27fbdcba724b 100644
--- a/include/asm-sparc64/kprobes.h
+++ b/include/asm-sparc64/kprobes.h
@@ -12,6 +12,7 @@ typedef u32 kprobe_opcode_t;
 #define MAX_INSN_SIZE 2
 
 #define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)pentry
+#define arch_remove_kprobe(p, s)	do { } while(0)
 
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
diff --git a/include/asm-x86_64/kprobes.h b/include/asm-x86_64/kprobes.h
index 9e2532adf42c..3a19ad179220 100644
--- a/include/asm-x86_64/kprobes.h
+++ b/include/asm-x86_64/kprobes.h
@@ -78,6 +78,7 @@ static inline void restore_interrupts(struct pt_regs *regs)
 		local_irq_enable();
 }
 
+extern void arch_remove_kprobe(struct kprobe *p, struct semaphore *s);
 extern int post_kprobe_handler(struct pt_regs *regs);
 extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
 extern int kprobe_handler(struct pt_regs *regs);
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index ad6e4fe970fd..59bf240cdb9d 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -152,7 +152,6 @@ extern spinlock_t kretprobe_lock;
 extern int arch_prepare_kprobe(struct kprobe *p);
 extern void arch_arm_kprobe(struct kprobe *p);
 extern void arch_disarm_kprobe(struct kprobe *p);
-extern void arch_remove_kprobe(struct kprobe *p);
 extern int arch_init_kprobes(void);
 extern void show_registers(struct pt_regs *regs);
 extern kprobe_opcode_t *get_insn_slot(void);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index f1c0e61a2cb4..19c42cbf91a0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -532,9 +532,7 @@ valid_p:
 			list_del_rcu(&p->list);
 			kfree(old_p);
 		}
-		down(&kprobe_mutex);
-		arch_remove_kprobe(p);
-		up(&kprobe_mutex);
+		arch_remove_kprobe(p, &kprobe_mutex);
 	}
 }
 
-- 
cgit v1.2.3-71-gd317


From 0498b63504f818e5ab39c818cd6f7b41319a1187 Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Mon, 9 Jan 2006 20:52:46 -0800
Subject: [PATCH] kprobes: fix build breakage

The following patch (against 2.6.15-rc5-mm3) fixes a kprobes build break
due to changes introduced in the kprobe locking in 2.6.15-rc5-mm3.  In
addition, the patch reverts back the open-coding of kprobe_mutex.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Acked-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/powerpc/kernel/kprobes.c | 6 +++---
 arch/x86_64/kernel/kprobes.c  | 6 +++---
 include/asm-i386/kprobes.h    | 2 +-
 include/asm-ia64/kprobes.h    | 2 +-
 include/asm-powerpc/kprobes.h | 3 ++-
 include/asm-sparc64/kprobes.h | 2 +-
 include/asm-x86_64/kprobes.h  | 3 ++-
 include/linux/kprobes.h       | 1 +
 kernel/kprobes.c              | 4 ++--
 9 files changed, 16 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 93444e32fccd..27b0c40601fb 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -80,11 +80,11 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
 
-void __kprobes arch_remove_kprobe(struct kprobe *p, struct semaphore *s)
+void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
-	down(s);
+	down(&kprobe_mutex);
 	free_insn_slot(p->ainsn.insn);
-	up(s);
+	up(&kprobe_mutex);
 }
 
 static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index 61a6a9369cbd..b7dc1f816d13 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -220,11 +220,11 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
 
-void __kprobes arch_remove_kprobe(struct kprobe *p, struct semaphore *s)
+void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
-	down(s);
+	down(&kprobe_mutex);
 	free_insn_slot(p->ainsn.insn);
-	up(s);
+	up(&kprobe_mutex);
 }
 
 static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
diff --git a/include/asm-i386/kprobes.h b/include/asm-i386/kprobes.h
index dc559267ce3e..27cac050a60e 100644
--- a/include/asm-i386/kprobes.h
+++ b/include/asm-i386/kprobes.h
@@ -40,7 +40,7 @@ typedef u8 kprobe_opcode_t;
 
 #define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)pentry
 #define ARCH_SUPPORTS_KRETPROBES
-#define arch_remove_kprobe(p, s)	do { } while(0)
+#define arch_remove_kprobe(p)	do {} while (0)
 
 void kretprobe_trampoline(void);
 
diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h
index 698508f4a0cf..a74b68104559 100644
--- a/include/asm-ia64/kprobes.h
+++ b/include/asm-ia64/kprobes.h
@@ -89,7 +89,7 @@ struct kprobe_ctlblk {
 #define IP_RELATIVE_PREDICT_OPCODE	(7)
 #define LONG_BRANCH_OPCODE		(0xC)
 #define LONG_CALL_OPCODE		(0xD)
-#define arch_remove_kprobe(p, s)	do { } while(0)
+#define arch_remove_kprobe(p)		do {} while (0)
 
 typedef struct kprobe_opcode {
 	bundle_t bundle;
diff --git a/include/asm-powerpc/kprobes.h b/include/asm-powerpc/kprobes.h
index 89dee13c2a4c..f466bc804f41 100644
--- a/include/asm-powerpc/kprobes.h
+++ b/include/asm-powerpc/kprobes.h
@@ -33,6 +33,7 @@
 #define  __ARCH_WANT_KPROBES_INSN_SLOT
 
 struct pt_regs;
+struct kprobe;
 
 typedef unsigned int kprobe_opcode_t;
 #define BREAKPOINT_INSTRUCTION	0x7fe00008	/* trap */
@@ -50,7 +51,7 @@ typedef unsigned int kprobe_opcode_t;
 
 #define ARCH_SUPPORTS_KRETPROBES
 void kretprobe_trampoline(void);
-extern void arch_remove_kprobe(struct kprobe *p, struct semaphore *s);
+extern void arch_remove_kprobe(struct kprobe *p);
 
 /* Architecture specific copy of original instruction */
 struct arch_specific_insn {
diff --git a/include/asm-sparc64/kprobes.h b/include/asm-sparc64/kprobes.h
index 27fbdcba724b..e4efe652b54b 100644
--- a/include/asm-sparc64/kprobes.h
+++ b/include/asm-sparc64/kprobes.h
@@ -12,7 +12,7 @@ typedef u32 kprobe_opcode_t;
 #define MAX_INSN_SIZE 2
 
 #define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)pentry
-#define arch_remove_kprobe(p, s)	do { } while(0)
+#define arch_remove_kprobe(p)	do {} while (0)
 
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
diff --git a/include/asm-x86_64/kprobes.h b/include/asm-x86_64/kprobes.h
index 3a19ad179220..98a1e95ddb98 100644
--- a/include/asm-x86_64/kprobes.h
+++ b/include/asm-x86_64/kprobes.h
@@ -30,6 +30,7 @@
 #define  __ARCH_WANT_KPROBES_INSN_SLOT
 
 struct pt_regs;
+struct kprobe;
 
 typedef u8 kprobe_opcode_t;
 #define BREAKPOINT_INSTRUCTION	0xcc
@@ -44,6 +45,7 @@ typedef u8 kprobe_opcode_t;
 #define ARCH_SUPPORTS_KRETPROBES
 
 void kretprobe_trampoline(void);
+extern void arch_remove_kprobe(struct kprobe *p);
 
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
@@ -78,7 +80,6 @@ static inline void restore_interrupts(struct pt_regs *regs)
 		local_irq_enable();
 }
 
-extern void arch_remove_kprobe(struct kprobe *p, struct semaphore *s);
 extern int post_kprobe_handler(struct pt_regs *regs);
 extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
 extern int kprobe_handler(struct pt_regs *regs);
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 59bf240cdb9d..10005bc92a31 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -149,6 +149,7 @@ struct kretprobe_instance {
 };
 
 extern spinlock_t kretprobe_lock;
+extern struct semaphore kprobe_mutex;
 extern int arch_prepare_kprobe(struct kprobe *p);
 extern void arch_arm_kprobe(struct kprobe *p);
 extern void arch_disarm_kprobe(struct kprobe *p);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 19c42cbf91a0..f24cbab558f1 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -48,7 +48,7 @@
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 
-static DECLARE_MUTEX(kprobe_mutex);	/* Protects kprobe_table */
+DECLARE_MUTEX(kprobe_mutex);		/* Protects kprobe_table */
 DEFINE_SPINLOCK(kretprobe_lock);	/* Protects kretprobe_inst_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
 
@@ -532,7 +532,7 @@ valid_p:
 			list_del_rcu(&p->list);
 			kfree(old_p);
 		}
-		arch_remove_kprobe(p, &kprobe_mutex);
+		arch_remove_kprobe(p);
 	}
 }
 
-- 
cgit v1.2.3-71-gd317


From 2b4f2f4b0132afa9f441171285cca354377bf5d0 Mon Sep 17 00:00:00 2001
From: "Antonino A. Daplas" <adaplas@gmail.com>
Date: Mon, 9 Jan 2006 20:52:54 -0800
Subject: [PATCH] vesafb: Drop blank hook

From: Bugzilla Bug 5351

"After resuming from S3 (suspended while in X), the LCD panel stays black .
 However, the laptop is up again, and I can SSH into it from another
machine.

I can get the panel working again, when I first direct video output to the
CRT output of the laptop, and then back to LCD (done by repeatedly hitting
Fn+F5 buttons on the Toshiba, which directs output to either LCD, CRT or
TV) None of this ever happened with older kernels."

This bug is due to the recently added vesafb_blank() method in vesafb.  It
works with CRT displays, but has a high incidence of problems in laptop
users.  Since CRT users don't really get that much benefit from hardware
blanking, drop support for this.

Signed-off-by: Antonino Daplas <adaplas@pol.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/boot/video.S      |  5 -----
 drivers/video/vesafb.c      | 37 -------------------------------------
 include/linux/screen_info.h |  3 +--
 3 files changed, 1 insertion(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/boot/video.S b/arch/i386/boot/video.S
index 92f669470142..2ac40c8244c4 100644
--- a/arch/i386/boot/video.S
+++ b/arch/i386/boot/video.S
@@ -97,7 +97,6 @@
 #define PARAM_VESAPM_OFF	0x30
 #define PARAM_LFB_PAGES		0x32
 #define PARAM_VESA_ATTRIB	0x34
-#define PARAM_CAPABILITIES      0x36
 
 /* Define DO_STORE according to CONFIG_VIDEO_RETAIN */
 #ifdef CONFIG_VIDEO_RETAIN
@@ -234,10 +233,6 @@ mopar_gr:
 	movw	18(%di), %ax
 	movl	%eax, %fs:(PARAM_LFB_SIZE)
 
-# store mode capabilities
-	movl    10(%di), %eax
-	movl    %eax, %fs:(PARAM_CAPABILITIES)
-
 # switching the DAC to 8-bit is for <= 8 bpp only
 	movw	%fs:(PARAM_LFB_DEPTH), %ax
 	cmpw	$8, %ax
diff --git a/drivers/video/vesafb.c b/drivers/video/vesafb.c
index 3e58ddc2bc38..55e28ba57b43 100644
--- a/drivers/video/vesafb.c
+++ b/drivers/video/vesafb.c
@@ -57,7 +57,6 @@ static unsigned short  *pmi_base  = NULL;
 static void            (*pmi_start)(void);
 static void            (*pmi_pal)(void);
 static int             depth;
-static int             vga_compat;
 
 /* --------------------------------------------------------------------- */
 
@@ -90,37 +89,6 @@ static int vesafb_pan_display(struct fb_var_screeninfo *var,
 	return 0;
 }
 
-static int vesafb_blank(int blank, struct fb_info *info)
-{
-	int err = 1;
-
-	if (vga_compat) {
-		int loop = 10000;
-		u8 seq = 0, crtc17 = 0;
-
-		if (blank == FB_BLANK_POWERDOWN) {
-			seq = 0x20;
-			crtc17 = 0x00;
-			err = 0;
-		} else {
-			seq = 0x00;
-			crtc17 = 0x80;
-			err = (blank == FB_BLANK_UNBLANK) ? 0 : -EINVAL;
-		}
-
-		vga_wseq(NULL, 0x00, 0x01);
-		seq |= vga_rseq(NULL, 0x01) & ~0x20;
-		vga_wseq(NULL, 0x00, seq);
-
-		crtc17 |= vga_rcrt(NULL, 0x17) & ~0x80;
-		while (loop--);
-		vga_wcrt(NULL, 0x17, crtc17);
-		vga_wseq(NULL, 0x00, 0x03);
-	}
-
-	return err;
-}
-
 static void vesa_setpalette(int regno, unsigned red, unsigned green,
 			    unsigned blue)
 {
@@ -205,7 +173,6 @@ static struct fb_ops vesafb_ops = {
 	.owner		= THIS_MODULE,
 	.fb_setcolreg	= vesafb_setcolreg,
 	.fb_pan_display	= vesafb_pan_display,
-	.fb_blank       = vesafb_blank,
 	.fb_fillrect	= cfb_fillrect,
 	.fb_copyarea	= cfb_copyarea,
 	.fb_imageblit	= cfb_imageblit,
@@ -459,10 +426,6 @@ static int __init vesafb_probe(struct platform_device *dev)
 	info->flags = FBINFO_FLAG_DEFAULT |
 		(ypan) ? FBINFO_HWACCEL_YPAN : 0;
 
-	vga_compat = (screen_info.capabilities & 2) ? 0 : 1;
-	printk("vesafb: Mode is %sVGA compatible\n",
-	       (vga_compat) ? "" : "not ");
-
 	if (fb_alloc_cmap(&info->cmap, 256, 0) < 0) {
 		err = -ENOMEM;
 		goto err;
diff --git a/include/linux/screen_info.h b/include/linux/screen_info.h
index 76850b75b3f6..6336987dae62 100644
--- a/include/linux/screen_info.h
+++ b/include/linux/screen_info.h
@@ -41,8 +41,7 @@ struct screen_info {
 	u16 vesapm_off;		/* 0x30 */
 	u16 pages;		/* 0x32 */
 	u16 vesa_attributes;	/* 0x34 */
-	u32  capabilities;      /* 0x36 */
-				/* 0x3a -- 0x3f reserved for future expansion */
+				/* 0x36 -- 0x3f reserved for future expansion */
 };
 
 extern struct screen_info screen_info;
-- 
cgit v1.2.3-71-gd317


From c549dc6422e4b720fed6702d70fddd8cee0f5c9a Mon Sep 17 00:00:00 2001
From: "Antonino A. Daplas" <adaplas@gmail.com>
Date: Mon, 9 Jan 2006 20:53:33 -0800
Subject: [PATCH] nvidiafb: Add support for some pci-e chipsets

Chipsets with PCI device ids & 0xf0 == 0x00f0 has their actual chipset type in
offset 0x1800 of the mmio space.  Add support for this.

Signed-off-by: Antonino Daplas <adaplas@pol.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/video/nvidia/nvidia.c | 75 +++++++++++++++++++++++++++++++------------
 include/linux/pci_ids.h       |  5 +++
 2 files changed, 59 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/nvidia/nvidia.c b/drivers/video/nvidia/nvidia.c
index dcf2e7a7d215..099d64af37be 100644
--- a/drivers/video/nvidia/nvidia.c
+++ b/drivers/video/nvidia/nvidia.c
@@ -284,6 +284,16 @@ static struct pci_device_id nvidiafb_pci_tbl[] = {
 	 PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
 	{PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_6200,
 	 PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{PCI_VENDOR_ID_NVIDIA, PCIE_DEVICE_ID_NVIDIA_GEFORCE_6800_ALT1,
+	 PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{PCI_VENDOR_ID_NVIDIA, PCIE_DEVICE_ID_NVIDIA_GEFORCE_6600_ALT1,
+	 PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{PCI_VENDOR_ID_NVIDIA, PCIE_DEVICE_ID_NVIDIA_GEFORCE_6600_ALT2,
+	 PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{PCI_VENDOR_ID_NVIDIA, PCIE_DEVICE_ID_NVIDIA_GEFORCE_6200_ALT1,
+	 PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{PCI_VENDOR_ID_NVIDIA, PCIE_DEVICE_ID_NVIDIA_GEFORCE_6800_GT,
+	 PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
 	{PCI_VENDOR_ID_NVIDIA, 0x0252,
 	 PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
 	{PCI_VENDOR_ID_NVIDIA, 0x0313,
@@ -1448,11 +1458,34 @@ static int __devinit nvidia_set_fbinfo(struct fb_info *info)
 	return nvidiafb_check_var(&info->var, info);
 }
 
-static u32 __devinit nvidia_get_arch(struct pci_dev *pd)
+static u32 __devinit nvidia_get_chipset(struct fb_info *info)
 {
+	struct nvidia_par *par = info->par;
+	u32 id = (par->pci_dev->vendor << 16) | par->pci_dev->device;
+
+	printk("nvidiafb: PCI id - %x\n", id);
+	if ((id & 0xfff0) == 0x00f0) {
+		/* pci-e */
+		printk("nvidiafb: PCI-E card\n");
+		id = NV_RD32(par->REGS, 0x1800);
+
+		if ((id & 0x0000ffff) == 0x000010DE)
+			id = 0x10DE0000 | (id >> 16);
+		else if ((id & 0xffff0000) == 0xDE100000) /* wrong endian */
+			id = 0x10DE0000 | ((id << 8) & 0x0000ff00) |
+                            ((id >> 8) & 0x000000ff);
+	}
+
+	printk("nvidiafb: Actual id - %x\n", id);
+	return id;
+}
+
+static u32 __devinit nvidia_get_arch(struct fb_info *info)
+{
+	struct nvidia_par *par = info->par;
 	u32 arch = 0;
 
-	switch (pd->device & 0x0ff0) {
+	switch (par->Chipset & 0x0ff0) {
 	case 0x0100:		/* GeForce 256 */
 	case 0x0110:		/* GeForce2 MX */
 	case 0x0150:		/* GeForce2 */
@@ -1535,18 +1568,6 @@ static int __devinit nvidiafb_probe(struct pci_dev *pd,
 		goto err_out_request;
 	}
 
-	par->Architecture = nvidia_get_arch(pd);
-
-	par->Chipset = (pd->vendor << 16) | pd->device;
-	printk(KERN_INFO PFX "nVidia device/chipset %X\n", par->Chipset);
-
-	if (par->Architecture == 0) {
-		printk(KERN_ERR PFX "unknown NV_ARCH\n");
-		goto err_out_free_base0;
-	}
-
-	sprintf(nvidiafb_fix.id, "NV%x", (pd->device & 0x0ff0) >> 4);
-
 	par->FlatPanel = flatpanel;
 	if (flatpanel == 1)
 		printk(KERN_INFO PFX "flatpanel support enabled\n");
@@ -1572,6 +1593,17 @@ static int __devinit nvidiafb_probe(struct pci_dev *pd,
 		goto err_out_free_base0;
 	}
 
+	par->Chipset = nvidia_get_chipset(info);
+	printk(KERN_INFO PFX "nVidia device/chipset %X\n", par->Chipset);
+	par->Architecture = nvidia_get_arch(info);
+
+	if (par->Architecture == 0) {
+		printk(KERN_ERR PFX "unknown NV_ARCH\n");
+		goto err_out_arch;
+	}
+
+	sprintf(nvidiafb_fix.id, "NV%x", (pd->device & 0x0ff0) >> 4);
+
 	NVCommonSetup(info);
 
 	par->FbAddress = nvidiafb_fix.smem_start;
@@ -1647,21 +1679,22 @@ static int __devinit nvidiafb_probe(struct pci_dev *pd,
 	NVTRACE_LEAVE();
 	return 0;
 
-      err_out_iounmap_fb:
+err_out_iounmap_fb:
 	iounmap(info->screen_base);
-      err_out_free_base1:
+err_out_free_base1:
 	fb_destroy_modedb(info->monspecs.modedb);
 	nvidia_delete_i2c_busses(par);
+err_out_arch:
 	iounmap(par->REGS);
-      err_out_free_base0:
+err_out_free_base0:
 	pci_release_regions(pd);
-      err_out_request:
+err_out_request:
 	pci_disable_device(pd);
-      err_out_enable:
+err_out_enable:
 	kfree(info->pixmap.addr);
-      err_out_kfree:
+err_out_kfree:
 	framebuffer_release(info);
-      err_out:
+err_out:
 	return -ENODEV;
 }
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 100eba0f4771..f55c98a68aa9 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1050,6 +1050,11 @@
 #define PCI_DEVICE_ID_NVIDIA_NVENET_6		0x00e6
 #define PCI_DEVICE_ID_NVIDIA_CK8S_AUDIO		0x00ea
 #define PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA2	0x00ee
+#define PCIE_DEVICE_ID_NVIDIA_GEFORCE_6800_ALT1 0x00f0
+#define PCIE_DEVICE_ID_NVIDIA_GEFORCE_6600_ALT1 0x00f1
+#define PCIE_DEVICE_ID_NVIDIA_GEFORCE_6600_ALT2 0x00f2
+#define PCIE_DEVICE_ID_NVIDIA_GEFORCE_6200_ALT1 0x00f3
+#define PCIE_DEVICE_ID_NVIDIA_GEFORCE_6800_GT   0x00f9
 #define PCI_DEVICE_ID_NVIDIA_GEFORCE_SDR	0x0100
 #define PCI_DEVICE_ID_NVIDIA_GEFORCE_DDR	0x0101
 #define PCI_DEVICE_ID_NVIDIA_QUADRO		0x0103
-- 
cgit v1.2.3-71-gd317


From 0863afb32b77fc89c7110b3d10fb048cb56bb1b5 Mon Sep 17 00:00:00 2001
From: Martin Waitz <tali@admingilde.org>
Date: Mon, 9 Jan 2006 20:53:55 -0800
Subject: [PATCH] DocBook: fix kernel-doc comments

Fix typos in comments to remove kernel-doc warnings.

Signed-off-by: Martin Waitz <tali@admingilde.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/base/core.c     | 4 ++--
 include/linux/rio_drv.h | 4 ++--
 include/sound/core.h    | 3 +--
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index fd8059920dbf..6b355bd7816d 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -161,8 +161,8 @@ static ssize_t store_uevent(struct device *dev, struct device_attribute *attr,
 	return count;
 }
 
-/**
- *	device_subsys - structure to be registered with kobject core.
+/*
+ *	devices_subsys - structure to be registered with kobject core.
  */
 
 decl_subsys(devices, &ktype_device, &device_uevent_ops);
diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h
index 157d7e3236b5..f54772d0e7f8 100644
--- a/include/linux/rio_drv.h
+++ b/include/linux/rio_drv.h
@@ -337,8 +337,8 @@ static inline void rio_init_dbell_res(struct resource *res, u16 start, u16 end)
 
 /**
  * RIO_DEVICE - macro used to describe a specific RIO device
- * @vid: the 16 bit RIO vendor ID
- * @did: the 16 bit RIO device ID
+ * @dev: the 16 bit RIO device ID
+ * @ven: the 16 bit RIO vendor ID
  *
  * This macro is used to create a struct rio_device_id that matches a
  * specific device.  The assembly vendor and assembly device fields
diff --git a/include/sound/core.h b/include/sound/core.h
index 90ac6132ea3b..3093e3ddcf36 100644
--- a/include/sound/core.h
+++ b/include/sound/core.h
@@ -317,7 +317,7 @@ void snd_verbose_printd(const char *file, int line, const char *format, ...)
 #ifdef CONFIG_SND_VERBOSE_PRINTK
 /**
  * snd_printd - debug printk
- * @format: format string
+ * @fmt: format string
  *
  * Compiled only when Works like snd_printk() for debugging purpose.
  * Ignored when CONFIG_SND_DEBUG is not set.
@@ -331,7 +331,6 @@ void snd_verbose_printd(const char *file, int line, const char *format, ...)
 /**
  * snd_assert - run-time assertion macro
  * @expr: expression
- * @args...: the action
  *
  * This macro checks the expression in run-time and invokes the commands
  * given in the rest arguments if the assertion is failed.
-- 
cgit v1.2.3-71-gd317


From 87c2ce3b9305b9b723faeedf6e32ef703ec9b33a Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 9 Jan 2006 20:54:07 -0800
Subject: [PATCH] lib/zlib*: cleanups

This patch contains the following possible cleanups:
- #if 0 the following unused functions:
  - zlib_deflate/deflate.c: zlib_deflateSetDictionary
  - zlib_deflate/deflate.c: zlib_deflateParams
  - zlib_deflate/deflate.c: zlib_deflateCopy
  - zlib_inflate/infblock.c: zlib_inflate_set_dictionary
  - zlib_inflate/infblock.c: zlib_inflate_blocks_sync_point
  - zlib_inflate/inflate_sync.c: zlib_inflateSync
  - zlib_inflate/inflate_sync.c: zlib_inflateSyncPoint
- remove the following unneeded EXPORT_SYMBOL's:
  - zlib_deflate/deflate_syms.c: zlib_deflateCopy
  - zlib_deflate/deflate_syms.c: zlib_deflateParams
  - zlib_inflate/inflate_syms.c: zlib_inflateSync
  - zlib_inflate/inflate_syms.c: zlib_inflateSyncPoint

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/zlib.h            | 11 +++++++++++
 lib/zlib_deflate/deflate.c      |  6 ++++++
 lib/zlib_deflate/deflate_syms.c |  2 --
 lib/zlib_inflate/infblock.c     |  4 ++++
 lib/zlib_inflate/infblock.h     |  4 ++++
 lib/zlib_inflate/inflate_syms.c |  2 --
 lib/zlib_inflate/inflate_sync.c |  4 ++++
 7 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/zlib.h b/include/linux/zlib.h
index 74f7b78c22d2..4fa32f0d4df8 100644
--- a/include/linux/zlib.h
+++ b/include/linux/zlib.h
@@ -442,9 +442,11 @@ extern int deflateInit2 (z_streamp strm,
    not perform any compression: this will be done by deflate().
 */
                             
+#if 0
 extern int zlib_deflateSetDictionary (z_streamp strm,
 						     const Byte *dictionary,
 						     uInt  dictLength);
+#endif
 /*
      Initializes the compression dictionary from the given byte sequence
    without producing any compressed output. This function must be called
@@ -478,7 +480,10 @@ extern int zlib_deflateSetDictionary (z_streamp strm,
    perform any compression: this will be done by deflate().
 */
 
+#if 0
 extern int zlib_deflateCopy (z_streamp dest, z_streamp source);
+#endif
+
 /*
      Sets the destination stream as a complete copy of the source stream.
 
@@ -511,7 +516,9 @@ static inline unsigned long deflateBound(unsigned long s)
 	return s + ((s + 7) >> 3) + ((s + 63) >> 6) + 11;
 }
 
+#if 0
 extern int zlib_deflateParams (z_streamp strm, int level, int strategy);
+#endif
 /*
      Dynamically update the compression level and compression strategy.  The
    interpretation of level and strategy is as in deflateInit2.  This can be
@@ -571,7 +578,9 @@ extern int zlib_inflateSetDictionary (z_streamp strm,
    inflate().
 */
 
+#if 0
 extern int zlib_inflateSync (z_streamp strm);
+#endif
 /* 
     Skips invalid compressed data until a full flush point (see above the
   description of deflate with Z_FULL_FLUSH) can be found, or until all
@@ -636,7 +645,9 @@ extern int zlib_inflateInit2_ (z_streamp strm, int  windowBits,
 #endif
 
 extern const char  * zlib_zError           (int err);
+#if 0
 extern int           zlib_inflateSyncPoint (z_streamp z);
+#endif
 extern const uLong * zlib_get_crc_table    (void);
 
 #endif /* _ZLIB_H */
diff --git a/lib/zlib_deflate/deflate.c b/lib/zlib_deflate/deflate.c
index ad9a1bf4fc63..1653dd9bb01a 100644
--- a/lib/zlib_deflate/deflate.c
+++ b/lib/zlib_deflate/deflate.c
@@ -255,6 +255,7 @@ int zlib_deflateInit2_(
 }
 
 /* ========================================================================= */
+#if 0
 int zlib_deflateSetDictionary(
 	z_streamp strm,
 	const Byte *dictionary,
@@ -297,6 +298,7 @@ int zlib_deflateSetDictionary(
     if (hash_head) hash_head = 0;  /* to make compiler happy */
     return Z_OK;
 }
+#endif  /*  0  */
 
 /* ========================================================================= */
 int zlib_deflateReset(
@@ -330,6 +332,7 @@ int zlib_deflateReset(
 }
 
 /* ========================================================================= */
+#if 0
 int zlib_deflateParams(
 	z_streamp strm,
 	int level,
@@ -365,6 +368,7 @@ int zlib_deflateParams(
     s->strategy = strategy;
     return err;
 }
+#endif  /*  0  */
 
 /* =========================================================================
  * Put a short in the pending buffer. The 16-bit value is put in MSB order.
@@ -572,6 +576,7 @@ int zlib_deflateEnd(
 /* =========================================================================
  * Copy the source state to the destination state.
  */
+#if 0
 int zlib_deflateCopy (
 	z_streamp dest,
 	z_streamp source
@@ -624,6 +629,7 @@ int zlib_deflateCopy (
     return Z_OK;
 #endif
 }
+#endif  /*  0  */
 
 /* ===========================================================================
  * Read a new buffer from the current input stream, update the adler32
diff --git a/lib/zlib_deflate/deflate_syms.c b/lib/zlib_deflate/deflate_syms.c
index 5985b28c8e30..767b573d1ef6 100644
--- a/lib/zlib_deflate/deflate_syms.c
+++ b/lib/zlib_deflate/deflate_syms.c
@@ -16,6 +16,4 @@ EXPORT_SYMBOL(zlib_deflateInit_);
 EXPORT_SYMBOL(zlib_deflateInit2_);
 EXPORT_SYMBOL(zlib_deflateEnd);
 EXPORT_SYMBOL(zlib_deflateReset);
-EXPORT_SYMBOL(zlib_deflateCopy);
-EXPORT_SYMBOL(zlib_deflateParams);
 MODULE_LICENSE("GPL");
diff --git a/lib/zlib_inflate/infblock.c b/lib/zlib_inflate/infblock.c
index 50f21ca4ef7f..c16cdeff51aa 100644
--- a/lib/zlib_inflate/infblock.c
+++ b/lib/zlib_inflate/infblock.c
@@ -338,6 +338,7 @@ int zlib_inflate_blocks_free(
 }
 
 
+#if 0
 void zlib_inflate_set_dictionary(
 	inflate_blocks_statef *s,
 	const Byte *d,
@@ -347,15 +348,18 @@ void zlib_inflate_set_dictionary(
   memcpy(s->window, d, n);
   s->read = s->write = s->window + n;
 }
+#endif  /*  0  */
 
 
 /* Returns true if inflate is currently at the end of a block generated
  * by Z_SYNC_FLUSH or Z_FULL_FLUSH. 
  * IN assertion: s != NULL
  */
+#if 0
 int zlib_inflate_blocks_sync_point(
 	inflate_blocks_statef *s
 )
 {
   return s->mode == LENS;
 }
+#endif  /*  0  */
diff --git a/lib/zlib_inflate/infblock.h b/lib/zlib_inflate/infblock.h
index f5221ddf6054..ceee60b5107c 100644
--- a/lib/zlib_inflate/infblock.h
+++ b/lib/zlib_inflate/infblock.h
@@ -33,12 +33,16 @@ extern int zlib_inflate_blocks_free (
     inflate_blocks_statef *,
     z_streamp);
 
+#if 0
 extern void zlib_inflate_set_dictionary (
     inflate_blocks_statef *s,
     const Byte *d,  /* dictionary */
     uInt  n);       /* dictionary length */
+#endif  /*  0  */
 
+#if 0
 extern int zlib_inflate_blocks_sync_point (
     inflate_blocks_statef *s);
+#endif  /*  0  */
 
 #endif /* _INFBLOCK_H */
diff --git a/lib/zlib_inflate/inflate_syms.c b/lib/zlib_inflate/inflate_syms.c
index aa1b08189121..ef49738f57ec 100644
--- a/lib/zlib_inflate/inflate_syms.c
+++ b/lib/zlib_inflate/inflate_syms.c
@@ -15,8 +15,6 @@ EXPORT_SYMBOL(zlib_inflate);
 EXPORT_SYMBOL(zlib_inflateInit_);
 EXPORT_SYMBOL(zlib_inflateInit2_);
 EXPORT_SYMBOL(zlib_inflateEnd);
-EXPORT_SYMBOL(zlib_inflateSync);
 EXPORT_SYMBOL(zlib_inflateReset);
-EXPORT_SYMBOL(zlib_inflateSyncPoint);
 EXPORT_SYMBOL(zlib_inflateIncomp); 
 MODULE_LICENSE("GPL");
diff --git a/lib/zlib_inflate/inflate_sync.c b/lib/zlib_inflate/inflate_sync.c
index e07bdb21f55c..61411ff89d61 100644
--- a/lib/zlib_inflate/inflate_sync.c
+++ b/lib/zlib_inflate/inflate_sync.c
@@ -7,6 +7,7 @@
 #include "infblock.h"
 #include "infutil.h"
 
+#if 0
 int zlib_inflateSync(
 	z_streamp z
 )
@@ -57,6 +58,7 @@ int zlib_inflateSync(
   z->state->mode = BLOCKS;
   return Z_OK;
 }
+#endif  /*  0  */
 
 
 /* Returns true if inflate is currently at the end of a block generated
@@ -66,6 +68,7 @@ int zlib_inflateSync(
  * decompressing, PPP checks that at the end of input packet, inflate is
  * waiting for these length bytes.
  */
+#if 0
 int zlib_inflateSyncPoint(
 	z_streamp z
 )
@@ -74,6 +77,7 @@ int zlib_inflateSyncPoint(
     return Z_STREAM_ERROR;
   return zlib_inflate_blocks_sync_point(z->state->blocks);
 }
+#endif  /*  0  */
 
 /*
  * This subroutine adds the data at next_in/avail_in to the output history
-- 
cgit v1.2.3-71-gd317


From 33f0f88f1c51ae5c2d593d26960c760ea154c2e2 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Mon, 9 Jan 2006 20:54:13 -0800
Subject: [PATCH] TTY layer buffering revamp

The API and code have been through various bits of initial review by
serial driver people but they definitely need to live somewhere for a
while so the unconverted drivers can get knocked into shape, existing
drivers that have been updated can be better tuned and bugs whacked out.

This replaces the tty flip buffers with kmalloc objects in rings. In the
normal situation for an IRQ driven serial port at typical speeds the
behaviour is pretty much the same, two buffers end up allocated and the
kernel cycles between them as before.

When there are delays or at high speed we now behave far better as the
buffer pool can grow a bit rather than lose characters. This also means
that we can operate at higher speeds reliably.

For drivers that receive characters in blocks (DMA based, USB and
especially virtualisation) the layer allows a lot of driver specific
code that works around the tty layer with private secondary queues to be
removed. The IBM folks need this sort of layer, the smart serial port
people do, the virtualisers do (because a virtualised tty typically
operates at infinite speed rather than emulating 9600 baud).

Finally many drivers had invalid and unsafe attempts to avoid buffer
overflows by directly invoking tty methods extracted out of the innards
of work queue structs. These are no longer needed and all go away. That
fixes various random hangs with serial ports on overflow.

The other change in here is to optimise the receive_room path that is
used by some callers. It turns out that only one ldisc uses receive room
except asa constant and it updates it far far less than the value is
read. We thus make it a variable not a function call.

I expect the code to contain bugs due to the size alone but I'll be
watching and squashing them and feeding out new patches as it goes.

Because the buffers now dynamically expand you should only run out of
buffering when the kernel runs out of memory for real.  That means a lot of
the horrible hacks high performance drivers used to do just aren't needed any
more.

Description:

tty_insert_flip_char is an old API and continues to work as before, as does
tty_flip_buffer_push() [this is why many drivers dont need modification].  It
does now also return the number of chars inserted

There are also

tty_buffer_request_room(tty, len)

which asks for a buffer block of the length requested and returns the space
found.  This improves efficiency with hardware that knows how much to
transfer.

and tty_insert_flip_string_flags(tty, str, flags, len)

to insert a string of characters and flags

For a smart interface the usual code is

    len = tty_request_buffer_room(tty, amount_hardware_says);
    tty_insert_flip_string(tty, buffer_from_card, len);

More description!

At the moment tty buffers are attached directly to the tty.  This is causing a
lot of the problems related to tty layer locking, also problems at high speed
and also with bursty data (such as occurs in virtualised environments)

I'm working on ripping out the flip buffers and replacing them with a pool of
dynamically allocated buffers.  This allows both for old style "byte I/O"
devices and also helps virtualisation and smart devices where large blocks of
data suddenely materialise and need storing.

So far so good.  Lots of drivers reference tty->flip.*.  Several of them also
call directly and unsafely into function pointers it provides.  This will all
break.  Most drivers can use tty_insert_flip_char which can be kept as an API
but others need more.

At the moment I've added the following interfaces, if people think more will
be needed now is a good time to say

 int tty_buffer_request_room(tty, size)

Try and ensure at least size bytes are available, returns actual room (may be
zero).  At the moment it just uses the flipbuf space but that will change.
Repeated calls without characters being added are not cumulative.  (ie if you
call it with 1, 1, 1, and then 4 you'll have four characters of space.  The
other functions will also try and grow buffers in future but this will be a
more efficient way when you know block sizes.

 int tty_insert_flip_char(tty, ch, flag)

As before insert a character if there is room.  Now returns 1 for success, 0
for failure.

 int tty_insert_flip_string(tty, str, len)

Insert a block of non error characters.  Returns the number inserted.

 int tty_prepare_flip_string(tty, strptr, len)

Adjust the buffer to allow len characters to be added.  Returns a buffer
pointer in strptr and the length available.  This allows for hardware that
needs to use functions like insl or mencpy_fromio.

Signed-off-by: Alan Cox <alan@redhat.com>
Cc: Paul Fulghum <paulkf@microgate.com>
Signed-off-by: Hirokazu Takata <takata@linux-m32r.org>
Signed-off-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: John Hawkes <hawkes@sgi.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/chan_kern.c           |   6 +-
 drivers/bluetooth/hci_ldisc.c         |  16 +-
 drivers/char/Kconfig                  |   6 +-
 drivers/char/amiserial.c              |  33 ++---
 drivers/char/cyclades.c               |  89 ++++--------
 drivers/char/epca.c                   |  17 +--
 drivers/char/esp.c                    |  67 ++++-----
 drivers/char/hvc_console.c            |   6 +-
 drivers/char/hvcs.c                   |  10 +-
 drivers/char/isicom.c                 |  24 ++-
 drivers/char/istallion.c              |  26 ++--
 drivers/char/moxa.c                   |  73 ++++------
 drivers/char/mxser.c                  |   2 +-
 drivers/char/n_hdlc.c                 |  18 +--
 drivers/char/n_r3964.c                |  10 +-
 drivers/char/n_tty.c                  |  66 +++++----
 drivers/char/pcmcia/synclink_cs.c     |  28 ++--
 drivers/char/pty.c                    |   4 +-
 drivers/char/rio/riointr.c            |  13 +-
 drivers/char/riscom8.c                |  39 ++---
 drivers/char/rocket.c                 |  19 +--
 drivers/char/selection.c              |   5 +-
 drivers/char/ser_a2232.c              |   8 +-
 drivers/char/serial167.c              |  35 ++---
 drivers/char/specialix.c              |  38 ++---
 drivers/char/stallion.c               |  50 ++-----
 drivers/char/sx.c                     |  13 +-
 drivers/char/synclink.c               |  43 ++----
 drivers/char/synclink_gt.c            |  35 +++--
 drivers/char/synclinkmp.c             |  34 ++---
 drivers/char/tty_io.c                 | 266 ++++++++++++++++++++++++++++++----
 drivers/char/viocons.c                |   3 +-
 drivers/char/vme_scc.c                |  16 +-
 drivers/input/serio/serport.c         |  13 +-
 drivers/isdn/capi/capi.c              |   3 +-
 drivers/isdn/i4l/isdn_common.c        | 112 ++++++++++++++
 drivers/isdn/i4l/isdn_common.h        |   1 +
 drivers/isdn/i4l/isdn_tty.c           |  75 +++++-----
 drivers/net/hamradio/6pack.c          |   7 +-
 drivers/net/hamradio/mkiss.c          |   7 +-
 drivers/net/irda/irtty-sir.c          |  18 +--
 drivers/net/ppp_async.c               |   9 +-
 drivers/net/ppp_synctty.c             |   9 +-
 drivers/net/slip.c                    |  11 +-
 drivers/net/wan/pc300_tty.c           |   2 +-
 drivers/net/wan/x25_asy.c             |   7 +-
 drivers/net/wireless/strip.c          |  10 +-
 drivers/s390/char/con3215.c           |  25 +---
 drivers/s390/char/sclp_tty.c          |  21 +--
 drivers/s390/char/sclp_vt220.c        |  12 +-
 drivers/s390/net/ctctty.c             |  28 +---
 drivers/serial/21285.c                |   9 --
 drivers/serial/68328serial.c          |  18 +--
 drivers/serial/68360serial.c          |  54 ++-----
 drivers/serial/8250.c                 |  13 --
 drivers/serial/amba-pl010.c           |   9 --
 drivers/serial/amba-pl011.c           |   9 --
 drivers/serial/au1x00_uart.c          |  31 ++--
 drivers/serial/clps711x.c             |   2 -
 drivers/serial/dz.c                   |   2 -
 drivers/serial/icom.c                 |  57 +++-----
 drivers/serial/imx.c                  |   3 -
 drivers/serial/ioc4_serial.c          |  10 +-
 drivers/serial/ip22zilog.c            |  34 ++---
 drivers/serial/m32r_sio.c             |  31 ++--
 drivers/serial/mcfserial.c            |  23 +--
 drivers/serial/mpc52xx_uart.c         |  46 ++----
 drivers/serial/mpsc.c                 |   6 +-
 drivers/serial/mux.c                  |   9 +-
 drivers/serial/pmac_zilog.c           |  39 +----
 drivers/serial/pxa.c                  |   8 -
 drivers/serial/s3c2410.c              |  10 --
 drivers/serial/sa1100.c               |   2 -
 drivers/serial/serial_lh7a40x.c       |   9 --
 drivers/serial/serial_txx9.c          |  11 --
 drivers/serial/sh-sci.c               |  81 ++++-------
 drivers/serial/sn_console.c           |   6 +-
 drivers/serial/sunsab.c               |  38 ++---
 drivers/serial/sunsu.c                |  32 ++--
 drivers/serial/sunzilog.c             |  34 +----
 drivers/serial/vr41xx_siu.c           |   5 -
 drivers/usb/class/cdc-acm.c           |  11 +-
 drivers/usb/gadget/serial.c           |  19 +--
 drivers/usb/serial/Kconfig            |   2 +-
 drivers/usb/serial/cyberjack.c        |  11 +-
 drivers/usb/serial/cypress_m8.c       |   4 +-
 drivers/usb/serial/digi_acceleport.c  |  28 ++--
 drivers/usb/serial/empeg.c            |  16 +-
 drivers/usb/serial/ftdi_sio.c         |  15 +-
 drivers/usb/serial/garmin_gps.c       |  13 +-
 drivers/usb/serial/generic.c          |  11 +-
 drivers/usb/serial/io_edgeport.c      |  20 +--
 drivers/usb/serial/io_ti.c            |  20 +--
 drivers/usb/serial/ipaq.c             |  12 +-
 drivers/usb/serial/ipw.c              |  11 +-
 drivers/usb/serial/kl5kusb105.c       |  13 +-
 drivers/usb/serial/kobil_sct.c        |  11 +-
 drivers/usb/serial/option.c           |   9 +-
 drivers/usb/serial/pl2303.c           |   8 +-
 drivers/usb/serial/ti_usb_3410_5052.c |  20 +--
 drivers/usb/serial/visor.c            |  11 +-
 drivers/usb/serial/whiteheat.c        |  11 +-
 include/linux/kbd_kern.h              |   2 +-
 include/linux/tty.h                   |  25 ++--
 include/linux/tty_flip.h              |  20 ++-
 include/linux/tty_ldisc.h             |   9 --
 net/bluetooth/rfcomm/tty.c            |   9 +-
 107 files changed, 1005 insertions(+), 1465 deletions(-)

(limited to 'include/linux')

diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c
index cd13b91b9ff6..ab0d0b170816 100644
--- a/arch/um/drivers/chan_kern.c
+++ b/arch/um/drivers/chan_kern.c
@@ -186,9 +186,6 @@ static void tty_receive_char(struct tty_struct *tty, char ch)
 		}
 	}
 
-	if((tty->flip.flag_buf_ptr == NULL) ||
-	   (tty->flip.char_buf_ptr == NULL))
-		return;
 	tty_insert_flip_char(tty, ch, TTY_NORMAL);
 }
 
@@ -653,8 +650,7 @@ void chan_interrupt(struct list_head *chans, struct work_struct *task,
 		chan = list_entry(ele, struct chan, list);
 		if(!chan->input || (chan->ops->read == NULL)) continue;
 		do {
-			if((tty != NULL) &&
-			   (tty->flip.count >= TTY_FLIPBUF_SIZE)){
+			if (tty && !tty_buffer_request_room(tty, 1)) {
 				schedule_delayed_work(task, 1);
 				goto out;
 			}
diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index 573ff6c1be5f..613673b12fa6 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -279,6 +279,7 @@ static int hci_uart_tty_open(struct tty_struct *tty)
 
 	tty->disc_data = hu;
 	hu->tty = tty;
+	tty->receive_room = 65536;
 
 	spin_lock_init(&hu->rx_lock);
 
@@ -348,20 +349,6 @@ static void hci_uart_tty_wakeup(struct tty_struct *tty)
 		hci_uart_tx_wakeup(hu);
 }
 
-/* hci_uart_tty_room()
- * 
- *    Callback function from tty driver. Return the amount of 
- *    space left in the receiver's buffer to decide if remote
- *    transmitter is to be throttled.
- *
- * Arguments:        tty    pointer to associated tty instance data
- * Return Value:    number of bytes left in receive buffer
- */
-static int hci_uart_tty_room (struct tty_struct *tty)
-{
-	return 65536;
-}
-
 /* hci_uart_tty_receive()
  * 
  *     Called by tty low level driver when receive data is
@@ -544,7 +531,6 @@ static int __init hci_uart_init(void)
 	hci_uart_ldisc.write		= hci_uart_tty_write;
 	hci_uart_ldisc.ioctl		= hci_uart_tty_ioctl;
 	hci_uart_ldisc.poll		= hci_uart_tty_poll;
-	hci_uart_ldisc.receive_room	= hci_uart_tty_room;
 	hci_uart_ldisc.receive_buf	= hci_uart_tty_receive;
 	hci_uart_ldisc.write_wakeup	= hci_uart_tty_wakeup;
 	hci_uart_ldisc.owner		= THIS_MODULE;
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 977a74e16efb..d6fcd0a36f9f 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -80,7 +80,7 @@ config SERIAL_NONSTANDARD
 
 config COMPUTONE
 	tristate "Computone IntelliPort Plus serial support"
-	depends on SERIAL_NONSTANDARD && BROKEN_ON_SMP
+	depends on SERIAL_NONSTANDARD
 	---help---
 	  This driver supports the entire family of Intelliport II/Plus
 	  controllers with the exception of the MicroChannel controllers and
@@ -153,7 +153,7 @@ config DIGIEPCA
 
 config ESPSERIAL
 	tristate "Hayes ESP serial port support"
-	depends on SERIAL_NONSTANDARD && ISA && BROKEN_ON_SMP && ISA_DMA_API
+	depends on SERIAL_NONSTANDARD && ISA && ISA_DMA_API
 	help
 	  This is a driver which supports Hayes ESP serial ports.  Both single
 	  port cards and multiport cards are supported.  Make sure to read
@@ -166,7 +166,7 @@ config ESPSERIAL
 
 config MOXA_INTELLIO
 	tristate "Moxa Intellio support"
-	depends on SERIAL_NONSTANDARD && BROKEN_ON_SMP
+	depends on SERIAL_NONSTANDARD
 	help
 	  Say Y here if you have a Moxa Intellio multiport serial card.
 
diff --git a/drivers/char/amiserial.c b/drivers/char/amiserial.c
index 10c81ecdace8..869518e4035f 100644
--- a/drivers/char/amiserial.c
+++ b/drivers/char/amiserial.c
@@ -265,8 +265,9 @@ static _INLINE_ void receive_chars(struct async_struct *info)
         int status;
 	int serdatr;
 	struct tty_struct *tty = info->tty;
-	unsigned char ch;
+	unsigned char ch, flag;
 	struct	async_icount *icount;
+	int oe = 0;
 
 	icount = &info->state->icount;
 
@@ -282,15 +283,12 @@ static _INLINE_ void receive_chars(struct async_struct *info)
 	    status |= UART_LSR_OE;
 
 	ch = serdatr & 0xff;
-	if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-	  goto ignore_char;
-	*tty->flip.char_buf_ptr = ch;
 	icount->rx++;
 
 #ifdef SERIAL_DEBUG_INTR
 	printk("DR%02x:%02x...", ch, status);
 #endif
-	*tty->flip.flag_buf_ptr = 0;
+	flag = TTY_NORMAL;
 
 	/*
 	 * We don't handle parity or frame errors - but I have left
@@ -319,7 +317,7 @@ static _INLINE_ void receive_chars(struct async_struct *info)
 	   * should be ignored.
 	   */
 	  if (status & info->ignore_status_mask)
-	    goto ignore_char;
+	    goto out;
 
 	  status &= info->read_status_mask;
 
@@ -327,33 +325,28 @@ static _INLINE_ void receive_chars(struct async_struct *info)
 #ifdef SERIAL_DEBUG_INTR
 	    printk("handling break....");
 #endif
-	    *tty->flip.flag_buf_ptr = TTY_BREAK;
+	    flag = TTY_BREAK;
 	    if (info->flags & ASYNC_SAK)
 	      do_SAK(tty);
 	  } else if (status & UART_LSR_PE)
-	    *tty->flip.flag_buf_ptr = TTY_PARITY;
+	    flag = TTY_PARITY;
 	  else if (status & UART_LSR_FE)
-	    *tty->flip.flag_buf_ptr = TTY_FRAME;
+	    flag = TTY_FRAME;
 	  if (status & UART_LSR_OE) {
 	    /*
 	     * Overrun is special, since it's
 	     * reported immediately, and doesn't
 	     * affect the current character
 	     */
-	    if (tty->flip.count < TTY_FLIPBUF_SIZE) {
-	      tty->flip.count++;
-	      tty->flip.flag_buf_ptr++;
-	      tty->flip.char_buf_ptr++;
-	      *tty->flip.flag_buf_ptr = TTY_OVERRUN;
-	    }
+	     oe = 1;
 	  }
 	}
-	tty->flip.flag_buf_ptr++;
-	tty->flip.char_buf_ptr++;
-	tty->flip.count++;
- ignore_char:
-
+	tty_insert_flip_char(tty, ch, flag);
+	if (oe == 1)
+		tty_insert_flip_char(tty, 0, TTY_OVERRUN);
 	tty_flip_buffer_push(tty);
+out:
+	return;
 }
 
 static _INLINE_ void transmit_chars(struct async_struct *info)
diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index c8e7daecad72..39c61a71176e 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -641,6 +641,7 @@ static char rcsid[] =
 #include <linux/timer.h>
 #include <linux/interrupt.h>
 #include <linux/tty.h>
+#include <linux/tty_flip.h>
 #include <linux/serial.h>
 #include <linux/major.h>
 #include <linux/string.h>
@@ -1086,7 +1087,7 @@ cyy_interrupt(int irq, void *dev_id, struct pt_regs *regs)
   int had_work;
   int mdm_change;
   int mdm_status;
-
+  int len;
     if((cinfo = (struct cyclades_card *)dev_id) == 0){
 #ifdef CY_DEBUG_INTERRUPTS
 	printk("cyy_interrupt: spurious interrupt %d\n\r", irq);
@@ -1163,63 +1164,43 @@ cyy_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 				info->icount.rx++;
                                 continue;
                             }
-                            if (tty->flip.count < TTY_FLIPBUF_SIZE){
-                                tty->flip.count++;
+                            if (tty_buffer_request_room(tty, 1)) {
                                 if (data & info->read_status_mask){
                                     if(data & CyBREAK){
-                                        *tty->flip.flag_buf_ptr++ =
-	    						    TTY_BREAK;
-                                        *tty->flip.char_buf_ptr++ =
-					  cy_readb(base_addr+(CyRDSR<<index));
+                                        tty_insert_flip_char(tty, cy_readb(base_addr+(CyRDSR<<index)), TTY_BREAK);
 					info->icount.rx++;
                                         if (info->flags & ASYNC_SAK){
                                             do_SAK(tty);
                                         }
                                     }else if(data & CyFRAME){
-                                        *tty->flip.flag_buf_ptr++ =
-							    TTY_FRAME;
-                                        *tty->flip.char_buf_ptr++ =
-					  cy_readb(base_addr+(CyRDSR<<index));
+                                        tty_insert_flip_char(tty, cy_readb(base_addr+(CyRDSR<<index)), TTY_FRAME);
 					info->icount.rx++;
 					info->idle_stats.frame_errs++;
                                     }else if(data & CyPARITY){
-                                        *tty->flip.flag_buf_ptr++ =
-							    TTY_PARITY;
-                                        *tty->flip.char_buf_ptr++ =
-					  cy_readb(base_addr+(CyRDSR<<index));
+					/* Pieces of seven... */
+                                        tty_insert_flip_char(tty, cy_readb(base_addr+(CyRDSR<<index)), TTY_PARITY);
 					info->icount.rx++;
 					info->idle_stats.parity_errs++;
                                     }else if(data & CyOVERRUN){
-                                        *tty->flip.flag_buf_ptr++ =
-							    TTY_OVERRUN;
-                                        *tty->flip.char_buf_ptr++ = 0;
+                                        tty_insert_flip_char(tty, 0, TTY_OVERRUN);
 					info->icount.rx++;
                                         /* If the flip buffer itself is
                                            overflowing, we still lose
                                            the next incoming character.
                                          */
-                                        if(tty->flip.count
-					           < TTY_FLIPBUF_SIZE){
-                                            tty->flip.count++;
-                                            *tty->flip.flag_buf_ptr++ =
-							     TTY_NORMAL;
-                                           *tty->flip.char_buf_ptr++ =
-					    cy_readb(base_addr+(CyRDSR<<index));
-					    info->icount.rx++;
-                                        }
+                                        tty_insert_flip_char(tty, cy_readb(base_addr+(CyRDSR<<index)), TTY_FRAME);
+				        info->icount.rx++;
 					info->idle_stats.overruns++;
                                     /* These two conditions may imply */
                                     /* a normal read should be done. */
                                     /* }else if(data & CyTIMEOUT){ */
                                     /* }else if(data & CySPECHAR){ */
-                                    }else{
-                                        *tty->flip.flag_buf_ptr++ = 0;
-                                        *tty->flip.char_buf_ptr++ = 0;
-					info->icount.rx++;
+                                    }else {
+					tty_insert_flip_char(tty, 0, TTY_NORMAL);
+				        info->icount.rx++;
                                     }
                                 }else{
-                                    *tty->flip.flag_buf_ptr++ = 0;
-                                    *tty->flip.char_buf_ptr++ = 0;
+				    tty_insert_flip_char(tty, 0, TTY_NORMAL);
 				    info->icount.rx++;
                                 }
                             }else{
@@ -1240,14 +1221,10 @@ cyy_interrupt(int irq, void *dev_id, struct pt_regs *regs)
                                info->mon.char_max = char_count;
                             info->mon.char_last = char_count;
 #endif
-                            while(char_count--){
-                                if (tty->flip.count >= TTY_FLIPBUF_SIZE){
-                                        break;
-                                }
-                                tty->flip.count++;
+			    len = tty_buffer_request_room(tty, char_count);
+                            while(len--){
                                 data = cy_readb(base_addr+(CyRDSR<<index));
-                                *tty->flip.flag_buf_ptr++ = TTY_NORMAL;
-                                *tty->flip.char_buf_ptr++ = data;
+				tty_insert_flip_char(tty, data, TTY_NORMAL);
 				info->idle_stats.recv_bytes++;
 				info->icount.rx++;
 #ifdef CY_16Y_HACK
@@ -1256,7 +1233,7 @@ cyy_interrupt(int irq, void *dev_id, struct pt_regs *regs)
                             }
                              info->idle_stats.recv_idle = jiffies;
                         }
-                        schedule_delayed_work(&tty->flip.work, 1);
+                        schedule_delayed_work(&tty->buf.work, 1);
                     }
                     /* end of service */
                     cy_writeb(base_addr+(CyRIR<<index), (save_xir & 0x3f));
@@ -1551,6 +1528,7 @@ cyz_handle_rx(struct cyclades_port *info,
   struct cyclades_card *cinfo = &cy_card[info->card];
   struct tty_struct *tty = info->tty;
   volatile int char_count;
+  int len;
 #ifdef BLOCKMOVE
   int small_count;
 #else
@@ -1606,18 +1584,11 @@ cyz_handle_rx(struct cyclades_port *info,
 		tty->flip.count += small_count;
 	    }
 #else
-	    while(char_count--){
-		if (tty->flip.count >= N_TTY_BUF_SIZE - tty->read_cnt)
-                    break;
-
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-		    break;
-
+	    len = tty_buffer_request_room(tty, char_count);
+	    while(len--){
 		data = cy_readb(cinfo->base_addr + rx_bufaddr + new_rx_get);
 		new_rx_get = (new_rx_get + 1) & (rx_bufsize - 1);
-		tty->flip.count++;
-		*tty->flip.flag_buf_ptr++ = TTY_NORMAL;
-		*tty->flip.char_buf_ptr++ = data;
+		tty_insert_flip_char(tty, data, TTY_NORMAL);
 		info->idle_stats.recv_bytes++;
 		info->icount.rx++;
 	    }
@@ -1635,7 +1606,7 @@ cyz_handle_rx(struct cyclades_port *info,
 	    }
 #endif
 	    info->idle_stats.recv_idle = jiffies;
-	    schedule_delayed_work(&tty->flip.work, 1);
+	    schedule_delayed_work(&tty->buf.work, 1);
 	}
 	/* Update rx_get */
 	cy_writel(&buf_ctrl->rx_get, new_rx_get);
@@ -1763,23 +1734,17 @@ cyz_handle_cmd(struct cyclades_card *cinfo)
 
 	switch(cmd) {
 	    case C_CM_PR_ERROR:
-		tty->flip.count++;
-		*tty->flip.flag_buf_ptr++ = TTY_PARITY;
-		*tty->flip.char_buf_ptr++ = 0;
+		tty_insert_flip_char(tty, 0, TTY_PARITY);
 		info->icount.rx++;
 		special_count++;
 		break;
 	    case C_CM_FR_ERROR:
-		tty->flip.count++;
-		*tty->flip.flag_buf_ptr++ = TTY_FRAME;
-		*tty->flip.char_buf_ptr++ = 0;
+		tty_insert_flip_char(tty, 0, TTY_FRAME);
 		info->icount.rx++;
 		special_count++;
 		break;
 	    case C_CM_RXBRK:
-		tty->flip.count++;
-		*tty->flip.flag_buf_ptr++ = TTY_BREAK;
-		*tty->flip.char_buf_ptr++ = 0;
+		tty_insert_flip_char(tty, 0, TTY_BREAK);
 		info->icount.rx++;
 		special_count++;
 		break;
@@ -1844,7 +1809,7 @@ cyz_handle_cmd(struct cyclades_card *cinfo)
 	if(delta_count)
 	    cy_sched_event(info, Cy_EVENT_DELTA_WAKEUP);
 	if(special_count)
-	    schedule_delayed_work(&tty->flip.work, 1);
+	    schedule_delayed_work(&tty->buf.work, 1);
     }
 }
 
diff --git a/drivers/char/epca.c b/drivers/char/epca.c
index 407708a001e4..765c5c108bf4 100644
--- a/drivers/char/epca.c
+++ b/drivers/char/epca.c
@@ -1786,9 +1786,7 @@ static void doevent(int crd)
 		if (tty)  { /* Begin if valid tty */
 			if (event & BREAK_IND)  { /* Begin if BREAK_IND */
 				/* A break has been indicated */
-				tty->flip.count++;
-				*tty->flip.flag_buf_ptr++ = TTY_BREAK;
-				*tty->flip.char_buf_ptr++ = 0;
+				tty_insert_flip_char(tty, 0, TTY_BREAK);
 				tty_schedule_flip(tty); 
 			} else if (event & LOWTX_IND)  { /* Begin LOWTX_IND */
 				if (ch->statusflags & LOWWAIT) 
@@ -2124,7 +2122,6 @@ static void receive_data(struct channel *ch)
 	int dataToRead, wrapgap, bytesAvailable;
 	unsigned int tail, head;
 	unsigned int wrapmask;
-	int rc;
 
 	/* ---------------------------------------------------------------
 		This routine is called by doint when a receive data event 
@@ -2162,16 +2159,15 @@ static void receive_data(struct channel *ch)
 		return;
 	}
 
-	if (tty->flip.count == TTY_FLIPBUF_SIZE) 
+	if (tty_buffer_request_room(tty, bytesAvailable + 1) == 0)
 		return;
 
 	if (readb(&bc->orun)) {
 		writeb(0, &bc->orun);
 		printk(KERN_WARNING "epca; overrun! DigiBoard device %s\n",tty->name);
+		tty_insert_flip_char(tty, 0, TTY_OVERRUN);
 	}
 	rxwinon(ch);
-	rptr = tty->flip.char_buf_ptr;
-	rc = tty->flip.count;
 	while (bytesAvailable > 0)  { /* Begin while there is data on the card */
 		wrapgap = (head >= tail) ? head - tail : ch->rxbufsize - tail;
 		/* ---------------------------------------------------------------
@@ -2183,8 +2179,7 @@ static void receive_data(struct channel *ch)
 		/* --------------------------------------------------------------
 		   Make sure we don't overflow the buffer
 		----------------------------------------------------------------- */
-		if ((rc + dataToRead) > TTY_FLIPBUF_SIZE)
-			dataToRead = TTY_FLIPBUF_SIZE - rc;
+		dataToRead = tty_prepare_flip_string(tty, &rptr, dataToRead);
 		if (dataToRead == 0)
 			break;
 		/* ---------------------------------------------------------------
@@ -2192,13 +2187,9 @@ static void receive_data(struct channel *ch)
 			for translation if necessary.
 		------------------------------------------------------------------ */
 		memcpy_fromio(rptr, ch->rxptr + tail, dataToRead);
-		rc   += dataToRead;
-		rptr += dataToRead;
 		tail = (tail + dataToRead) & wrapmask;
 		bytesAvailable -= dataToRead;
 	} /* End while there is data on the card */
-	tty->flip.count = rc;
-	tty->flip.char_buf_ptr = rptr;
 	globalwinon(ch);
 	writew(tail, &bc->rout);
 	/* Must be called with global data */
diff --git a/drivers/char/esp.c b/drivers/char/esp.c
index 9f53d2fcc360..e469f641c728 100644
--- a/drivers/char/esp.c
+++ b/drivers/char/esp.c
@@ -345,26 +345,22 @@ static inline void receive_chars_pio(struct esp_struct *info, int num_bytes)
 
 	for (i = 0; i < num_bytes; i++) {
 		if (!(err_buf->data[i] & status_mask)) {
-			*(tty->flip.char_buf_ptr++) = pio_buf->data[i];
+			int flag = 0;
 
 			if (err_buf->data[i] & 0x04) {
-				*(tty->flip.flag_buf_ptr++) = TTY_BREAK;
-
+				flag = TTY_BREAK;
 				if (info->flags & ASYNC_SAK)
 					do_SAK(tty);
 			}
 			else if (err_buf->data[i] & 0x02)
-				*(tty->flip.flag_buf_ptr++) = TTY_FRAME;
+				flag = TTY_FRAME;
 			else if (err_buf->data[i] & 0x01)
-				*(tty->flip.flag_buf_ptr++) = TTY_PARITY;
-			else
-				*(tty->flip.flag_buf_ptr++) = 0;
-		
-			tty->flip.count++;
+				flag = TTY_PARITY;
+			tty_insert_flip_char(tty, pio_buf->data[i], flag);
 		}
 	}
 
-	schedule_delayed_work(&tty->flip.work, 1);
+	schedule_delayed_work(&tty->buf.work, 1);
 
 	info->stat_flags &= ~ESP_STAT_RX_TIMEOUT;
 	release_pio_buffer(pio_buf);
@@ -397,7 +393,6 @@ static inline void receive_chars_dma_done(struct esp_struct *info,
 	int num_bytes;
 	unsigned long flags;
 	
-	
 	flags=claim_dma_lock();
 	disable_dma(dma);
 	clear_dma_ff(dma);
@@ -408,38 +403,31 @@ static inline void receive_chars_dma_done(struct esp_struct *info,
 	
 	info->icount.rx += num_bytes;
 
-	memcpy(tty->flip.char_buf_ptr, dma_buffer, num_bytes);
-	tty->flip.char_buf_ptr += num_bytes;
-	tty->flip.count += num_bytes;
-	memset(tty->flip.flag_buf_ptr, 0, num_bytes);
-	tty->flip.flag_buf_ptr += num_bytes;
-
 	if (num_bytes > 0) {
-		tty->flip.flag_buf_ptr--;
+		tty_insert_flip_string(tty, dma_buffer, num_bytes - 1);
 
 		status &= (0x1c & info->read_status_mask);
+		
+		/* Is the status significant or do we throw the last byte ? */
+		if (!(status & info->ignore_status_mask)) {
+			int statflag = 0;
 
-		if (status & info->ignore_status_mask) {
-			tty->flip.count--;
-			tty->flip.char_buf_ptr--;
-			tty->flip.flag_buf_ptr--;
-		} else if (status & 0x10) {
-			*tty->flip.flag_buf_ptr = TTY_BREAK;
-			(info->icount.brk)++;
-			if (info->flags & ASYNC_SAK)
-				do_SAK(tty);
-		} else if (status & 0x08) {
-			*tty->flip.flag_buf_ptr = TTY_FRAME;
-			(info->icount.frame)++;
-		}
-		else if (status & 0x04) {
-			*tty->flip.flag_buf_ptr = TTY_PARITY;
-			(info->icount.parity)++;
+			if (status & 0x10) {
+				statflag = TTY_BREAK;
+				(info->icount.brk)++;
+				if (info->flags & ASYNC_SAK)
+					do_SAK(tty);
+			} else if (status & 0x08) {
+				statflag = TTY_FRAME;
+				(info->icount.frame)++;
+			}
+			else if (status & 0x04) {
+				statflag = TTY_PARITY;
+				(info->icount.parity)++;
+			}
+			tty_insert_flip_char(tty, dma_buffer[num_bytes - 1], statflag);
 		}
-
-		tty->flip.flag_buf_ptr++;
-		
-		schedule_delayed_work(&tty->flip.work, 1);
+		schedule_delayed_work(&tty->buf.work, 1);
 	}
 
 	if (dma_bytes != num_bytes) {
@@ -693,8 +681,7 @@ static irqreturn_t rs_interrupt_single(int irq, void *dev_id,
 		num_bytes = serial_in(info, UART_ESI_STAT1) << 8;
 		num_bytes |= serial_in(info, UART_ESI_STAT2);
 
-		if (num_bytes > (TTY_FLIPBUF_SIZE - info->tty->flip.count))
-		  num_bytes = TTY_FLIPBUF_SIZE - info->tty->flip.count;
+		num_bytes = tty_buffer_request_room(info->tty, num_bytes);
 
 		if (num_bytes) {
 			if (dma_bytes ||
diff --git a/drivers/char/hvc_console.c b/drivers/char/hvc_console.c
index f92177634677..1994a92d4733 100644
--- a/drivers/char/hvc_console.c
+++ b/drivers/char/hvc_console.c
@@ -597,9 +597,7 @@ static int hvc_poll(struct hvc_struct *hp)
 
 	/* Read data if any */
 	for (;;) {
-		int count = N_INBUF;
-		if (count > (TTY_FLIPBUF_SIZE - tty->flip.count))
-			count = TTY_FLIPBUF_SIZE - tty->flip.count;
+		int count = tty_buffer_request_room(tty, N_INBUF);
 
 		/* If flip is full, just reschedule a later read */
 		if (count == 0) {
@@ -635,7 +633,7 @@ static int hvc_poll(struct hvc_struct *hp)
 			tty_insert_flip_char(tty, buf[i], 0);
 		}
 
-		if (tty->flip.count)
+		if (count)
 			tty_schedule_flip(tty);
 
 		/*
diff --git a/drivers/char/hvcs.c b/drivers/char/hvcs.c
index 53dc77c760fc..831eb4e8d9d3 100644
--- a/drivers/char/hvcs.c
+++ b/drivers/char/hvcs.c
@@ -456,12 +456,11 @@ static int hvcs_io(struct hvcs_struct *hvcsd)
 	/* remove the read masks */
 	hvcsd->todo_mask &= ~(HVCS_READ_MASK);
 
-	if ((tty->flip.count + HVCS_BUFF_LEN) < TTY_FLIPBUF_SIZE) {
+	if (tty_buffer_request_room(tty, HVCS_BUFF_LEN) >= HVCS_BUFF_LEN) {
 		got = hvc_get_chars(unit_address,
 				&buf[0],
 				HVCS_BUFF_LEN);
-		for (i=0;got && i<got;i++)
-			tty_insert_flip_char(tty, buf[i], TTY_NORMAL);
+		tty_insert_flip_string(tty, buf, got);
 	}
 
 	/* Give the TTY time to process the data we just sent. */
@@ -469,10 +468,9 @@ static int hvcs_io(struct hvcs_struct *hvcsd)
 		hvcsd->todo_mask |= HVCS_QUICK_READ;
 
 	spin_unlock_irqrestore(&hvcsd->lock, flags);
-	if (tty->flip.count) {
-		/* This is synch because tty->low_latency == 1 */
+	/* This is synch because tty->low_latency == 1 */
+	if(got)
 		tty_flip_buffer_push(tty);
-	}
 
 	if (!got) {
 		/* Do this _after_ the flip_buffer_push */
diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c
index 1bbf507adda5..86033bed5d6c 100644
--- a/drivers/char/isicom.c
+++ b/drivers/char/isicom.c
@@ -115,6 +115,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/tty.h>
+#include <linux/tty_flip.h>
 #include <linux/termios.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
@@ -773,6 +774,7 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id,
 	unsigned short base, header, word_count, count;
 	unsigned char channel;
 	short byte_count;
+	unsigned char *rp;
 	
 	card = (struct isi_board *) dev_id;
 
@@ -903,14 +905,10 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id,
 				break;
 				
 			case 1:	/* Received Break !!!	 */
-				if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-					break;
-				*tty->flip.flag_buf_ptr++ = TTY_BREAK;
-				*tty->flip.char_buf_ptr++ = 0;
-				tty->flip.count++;
+				tty_insert_flip_char(tty, 0, TTY_BREAK);
 				if (port->flags & ASYNC_SAK)
 					do_SAK(tty);
-				schedule_delayed_work(&tty->flip.work, 1);
+				tty_flip_buffer_push(tty);
 				break;
 				
 			case 2:	/* Statistics		 */
@@ -923,23 +921,19 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id,
 		}	 
 	}
 	else {				/* Data   Packet */
-		count = min_t(unsigned short, byte_count, (TTY_FLIPBUF_SIZE - tty->flip.count));
+
+		count = tty_prepare_flip_string(tty, &rp, byte_count & ~1);
 #ifdef ISICOM_DEBUG
 		printk(KERN_DEBUG "ISICOM: Intr: Can rx %d of %d bytes.\n", 
 					count, byte_count);
 #endif			
 		word_count = count >> 1;
-		insw(base, tty->flip.char_buf_ptr, word_count);
-		tty->flip.char_buf_ptr += (word_count << 1);		
+		insw(base, rp, word_count);
 		byte_count -= (word_count << 1);
 		if (count & 0x0001) {
-			*tty->flip.char_buf_ptr++ = (char)(inw(base) & 0xff);
+			tty_insert_flip_char(tty,  inw(base) & 0xff, TTY_NORMAL);
 			byte_count -= 2;
 		}	
-		memset(tty->flip.flag_buf_ptr, 0, count);
-		tty->flip.flag_buf_ptr += count;
-		tty->flip.count += count;
-		
 		if (byte_count > 0) {
 			printk(KERN_DEBUG "ISICOM: Intr(0x%x:%d): Flip buffer overflow! dropping bytes...\n",
 					base, channel+1);
@@ -948,7 +942,7 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id,
 				byte_count -= 2;
 			}
 		}
-		schedule_delayed_work(&tty->flip.work, 1);
+		tty_flip_buffer_push(tty);
 	}
 	if (card->isa == YES)
 		ClearInterrupt(base);
diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c
index 24435f8daa68..28c5a3193b81 100644
--- a/drivers/char/istallion.c
+++ b/drivers/char/istallion.c
@@ -2711,17 +2711,13 @@ static void stli_read(stlibrd_t *brdp, stliport_t *portp)
 		stlen = size - tail;
 	}
 
-	len = MIN(len, (TTY_FLIPBUF_SIZE - tty->flip.count));
+	len = tty_buffer_request_room(tty, len);
+	/* FIXME : iomap ? */
 	shbuf = (volatile char *) EBRDGETMEMPTR(brdp, portp->rxoffset);
 
 	while (len > 0) {
 		stlen = MIN(len, stlen);
-		memcpy(tty->flip.char_buf_ptr, (char *) (shbuf + tail), stlen);
-		memset(tty->flip.flag_buf_ptr, 0, stlen);
-		tty->flip.char_buf_ptr += stlen;
-		tty->flip.flag_buf_ptr += stlen;
-		tty->flip.count += stlen;
-
+		tty_insert_flip_string(tty, (char *)(shbuf + tail), stlen);
 		len -= stlen;
 		tail += stlen;
 		if (tail >= size) {
@@ -2906,16 +2902,12 @@ static int stli_hostcmd(stlibrd_t *brdp, stliport_t *portp)
 
 		if ((nt.data & DT_RXBREAK) && (portp->rxmarkmsk & BRKINT)) {
 			if (tty != (struct tty_struct *) NULL) {
-				if (tty->flip.count < TTY_FLIPBUF_SIZE) {
-					tty->flip.count++;
-					*tty->flip.flag_buf_ptr++ = TTY_BREAK;
-					*tty->flip.char_buf_ptr++ = 0;
-					if (portp->flags & ASYNC_SAK) {
-						do_SAK(tty);
-						EBRDENABLE(brdp);
-					}
-					tty_schedule_flip(tty);
+				tty_insert_flip_char(tty, 0, TTY_BREAK);
+				if (portp->flags & ASYNC_SAK) {
+					do_SAK(tty);
+					EBRDENABLE(brdp);
 				}
+				tty_schedule_flip(tty);
 			}
 		}
 
@@ -4940,7 +4932,7 @@ static int stli_portcmdstats(stliport_t *portp)
 	if (portp->tty != (struct tty_struct *) NULL) {
 		if (portp->tty->driver_data == portp) {
 			stli_comstats.ttystate = portp->tty->flags;
-			stli_comstats.rxbuffered = portp->tty->flip.count;
+			stli_comstats.rxbuffered = -1 /*portp->tty->flip.count*/;
 			if (portp->tty->termios != (struct termios *) NULL) {
 				stli_comstats.cflags = portp->tty->termios->c_cflag;
 				stli_comstats.iflags = portp->tty->termios->c_iflag;
diff --git a/drivers/char/moxa.c b/drivers/char/moxa.c
index 46a3a8ccd65f..5e3ef5522194 100644
--- a/drivers/char/moxa.c
+++ b/drivers/char/moxa.c
@@ -269,7 +269,7 @@ static int MoxaPortDCDChange(int);
 static int MoxaPortDCDON(int);
 static void MoxaPortFlushData(int, int);
 static int MoxaPortWriteData(int, unsigned char *, int);
-static int MoxaPortReadData(int, unsigned char *, int);
+static int MoxaPortReadData(int, struct tty_struct *tty);
 static int MoxaPortTxQueue(int);
 static int MoxaPortRxQueue(int);
 static int MoxaPortTxFree(int);
@@ -301,6 +301,8 @@ static struct tty_operations moxa_ops = {
 	.tiocmset = moxa_tiocmset,
 };
 
+static spinlock_t moxa_lock = SPIN_LOCK_UNLOCKED;
+
 #ifdef CONFIG_PCI
 static int moxa_get_PCI_conf(struct pci_dev *p, int board_type, moxa_board_conf * board)
 {
@@ -645,10 +647,10 @@ static int moxa_write(struct tty_struct *tty,
 	if (ch == NULL)
 		return (0);
 	port = ch->port;
-	save_flags(flags);
-	cli();
+
+	spin_lock_irqsave(&moxa_lock, flags);
 	len = MoxaPortWriteData(port, (unsigned char *) buf, count);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&moxa_lock, flags);
 
 	/*********************************************
 	if ( !(ch->statusflags & LOWWAIT) &&
@@ -723,11 +725,10 @@ static void moxa_put_char(struct tty_struct *tty, unsigned char c)
 	if (ch == NULL)
 		return;
 	port = ch->port;
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&moxa_lock, flags);
 	moxaXmitBuff[0] = c;
 	MoxaPortWriteData(port, moxaXmitBuff, 1);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&moxa_lock, flags);
 	/************************************************
 	if ( !(ch->statusflags & LOWWAIT) && (MoxaPortTxFree(port) <= 100) )
 	*************************************************/
@@ -1030,12 +1031,12 @@ static int block_till_ready(struct tty_struct *tty, struct file *filp,
 	printk("block_til_ready before block: ttys%d, count = %d\n",
 	       ch->line, ch->count);
 #endif
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&moxa_lock, flags);
 	if (!tty_hung_up_p(filp))
 		ch->count--;
-	restore_flags(flags);
 	ch->blocked_open++;
+	spin_unlock_irqrestore(&moxa_lock, flags);
+
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (tty_hung_up_p(filp) ||
@@ -1062,17 +1063,21 @@ static int block_till_ready(struct tty_struct *tty, struct file *filp,
 	}
 	set_current_state(TASK_RUNNING);
 	remove_wait_queue(&ch->open_wait, &wait);
+
+	spin_lock_irqsave(&moxa_lock, flags);
 	if (!tty_hung_up_p(filp))
 		ch->count++;
 	ch->blocked_open--;
+	spin_unlock_irqrestore(&moxa_lock, flags);
 #ifdef SERIAL_DEBUG_OPEN
 	printk("block_til_ready after blocking: ttys%d, count = %d\n",
 	       ch->line, ch->count);
 #endif
 	if (retval)
 		return (retval);
+	/* FIXME: review to see if we need to use set_bit on these */
 	ch->asyncflags |= ASYNC_NORMAL_ACTIVE;
-	return (0);
+	return 0;
 }
 
 static void setup_empty_event(struct tty_struct *tty)
@@ -1080,15 +1085,14 @@ static void setup_empty_event(struct tty_struct *tty)
 	struct moxa_str *ch = tty->driver_data;
 	unsigned long flags;
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&moxa_lock, flags);
 	ch->statusflags |= EMPTYWAIT;
 	moxaEmptyTimer_on[ch->port] = 0;
 	del_timer(&moxaEmptyTimer[ch->port]);
 	moxaEmptyTimer[ch->port].expires = jiffies + HZ;
 	moxaEmptyTimer_on[ch->port] = 1;
 	add_timer(&moxaEmptyTimer[ch->port]);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&moxa_lock, flags);
 }
 
 static void check_xmit_empty(unsigned long data)
@@ -1135,8 +1139,6 @@ static void receive_data(struct moxa_str *ch)
 {
 	struct tty_struct *tp;
 	struct termios *ts;
-	int i, count, rc, space;
-	unsigned char *charptr, *flagptr;
 	unsigned long flags;
 
 	ts = NULL;
@@ -1150,24 +1152,10 @@ static void receive_data(struct moxa_str *ch)
 		MoxaPortFlushData(ch->port, 0);
 		return;
 	}
-	space = TTY_FLIPBUF_SIZE - tp->flip.count;
-	if (space <= 0)
-		return;
-	charptr = tp->flip.char_buf_ptr;
-	flagptr = tp->flip.flag_buf_ptr;
-	rc = tp->flip.count;
-	save_flags(flags);
-	cli();
-	count = MoxaPortReadData(ch->port, charptr, space);
-	restore_flags(flags);
-	for (i = 0; i < count; i++)
-		*flagptr++ = 0;
-	charptr += count;
-	rc += count;
-	tp->flip.count = rc;
-	tp->flip.char_buf_ptr = charptr;
-	tp->flip.flag_buf_ptr = flagptr;
-	tty_schedule_flip(ch->tty);
+	spin_lock_irqsave(&moxa_lock, flags);
+	MoxaPortReadData(ch->port, tp);
+	spin_unlock_irqrestore(&moxa_lock, flags);
+	tty_schedule_flip(tp);
 }
 
 #define Magic_code	0x404
@@ -1774,7 +1762,7 @@ int MoxaPortsOfCard(int cardno)
  *	14. MoxaPortDCDON(int port);					     *
  *	15. MoxaPortFlushData(int port, int mode);	                     *
  *	16. MoxaPortWriteData(int port, unsigned char * buffer, int length); *
- *	17. MoxaPortReadData(int port, unsigned char * buffer, int length);  *
+ *	17. MoxaPortReadData(int port, struct tty_struct *tty); 	     *
  *	18. MoxaPortTxBufSize(int port);				     *
  *	19. MoxaPortRxBufSize(int port);				     *
  *	20. MoxaPortTxQueue(int port);					     *
@@ -2003,10 +1991,9 @@ int MoxaPortsOfCard(int cardno)
  *
  *      Function 21:    Read data.
  *      Syntax:
- *      int  MoxaPortReadData(int port, unsigned char * buffer, int length);
+ *      int  MoxaPortReadData(int port, struct tty_struct *tty);
  *           int port           : port number (0 - 127)
- *           unsigned char * buffer     : pointer to read data buffer.
- *           int length         : read data buffer length
+ *	     struct tty_struct *tty : tty for data
  *
  *           return:    0 - length      : real read data length
  *
@@ -2504,7 +2491,7 @@ int MoxaPortWriteData(int port, unsigned char * buffer, int len)
 	return (total);
 }
 
-int MoxaPortReadData(int port, unsigned char * buffer, int space)
+int MoxaPortReadData(int port, struct tty_struct *tty)
 {
 	register ushort head, pageofs;
 	int i, count, cnt, len, total, remain;
@@ -2522,9 +2509,9 @@ int MoxaPortReadData(int port, unsigned char * buffer, int space)
 	count = (tail >= head) ? (tail - head)
 	    : (tail - head + rx_mask + 1);
 	if (count == 0)
-		return (0);
+		return 0;
 
-	total = (space > count) ? count : space;
+	total = count;
 	remain = count - total;
 	moxaLog.rxcnt[port] += total;
 	count = total;
@@ -2539,7 +2526,7 @@ int MoxaPortReadData(int port, unsigned char * buffer, int space)
 			len = (count > len) ? len : count;
 			ofs = baseAddr + DynPage_addr + bufhead + head;
 			for (i = 0; i < len; i++)
-				*buffer++ = readb(ofs + i);
+				tty_insert_flip_char(tty, readb(ofs + i), TTY_NORMAL);
 			head = (head + len) & rx_mask;
 			count -= len;
 		}
@@ -2556,7 +2543,7 @@ int MoxaPortReadData(int port, unsigned char * buffer, int space)
 			writew(pageno, baseAddr + Control_reg);
 			ofs = baseAddr + DynPage_addr + pageofs;
 			for (i = 0; i < cnt; i++)
-				*buffer++ = readb(ofs + i);
+				tty_insert_flip_char(tty, readb(ofs + i), TTY_NORMAL);
 			if (count == 0) {
 				writew((head + len) & rx_mask, ofsAddr + RXrptr);
 				break;
diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c
index 51bb2a3cf8b3..ea725a9964e2 100644
--- a/drivers/char/mxser.c
+++ b/drivers/char/mxser.c
@@ -1982,7 +1982,7 @@ static void mxser_receive_chars(struct mxser_struct *info, int *status)
 
 	spin_lock_irqsave(&info->slock, flags);
 
-	recv_room = tty->ldisc.receive_room(tty);
+	recv_room = tty->receive_room;
 	if ((recv_room == 0) && (!info->ldisc_stop_rx)) {
 		//mxser_throttle(tty);
 		mxser_stoprx(tty);
diff --git a/drivers/char/n_hdlc.c b/drivers/char/n_hdlc.c
index a133a62f3d55..70f487dd7b8d 100644
--- a/drivers/char/n_hdlc.c
+++ b/drivers/char/n_hdlc.c
@@ -212,7 +212,6 @@ static struct tty_ldisc n_hdlc_ldisc = {
 	.ioctl		= n_hdlc_tty_ioctl,
 	.poll		= n_hdlc_tty_poll,
 	.receive_buf	= n_hdlc_tty_receive,
-	.receive_room	= n_hdlc_tty_room,
 	.write_wakeup	= n_hdlc_tty_wakeup,
 };
 
@@ -337,6 +336,7 @@ static int n_hdlc_tty_open (struct tty_struct *tty)
 		
 	tty->disc_data = n_hdlc;
 	n_hdlc->tty    = tty;
+	tty->receive_room = 65536;
 	
 #if defined(TTY_NO_WRITE_SPLIT)
 	/* change tty_io write() to not split large writes into 8K chunks */
@@ -477,22 +477,6 @@ static void n_hdlc_tty_wakeup(struct tty_struct *tty)
 		
 }	/* end of n_hdlc_tty_wakeup() */
 
-/**
- * n_hdlc_tty_room - Return the amount of space left in the receiver's buffer
- * @tty	- pointer to associated tty instance data
- *
- * Callback function from tty driver. Return the amount of space left in the
- * receiver's buffer to decide if remote transmitter is to be throttled.
- */
-static int n_hdlc_tty_room(struct tty_struct *tty)
-{
-	if (debuglevel >= DEBUG_LEVEL_INFO)	
-		printk("%s(%d)n_hdlc_tty_room() called\n",__FILE__,__LINE__);
-	/* always return a larger number to prevent */
-	/* throttling of remote transmitter. */
-	return 65536;
-}	/* end of n_hdlc_tty_root() */
-
 /**
  * n_hdlc_tty_receive - Called by tty driver when receive data is available
  * @tty	- pointer to tty instance data
diff --git a/drivers/char/n_r3964.c b/drivers/char/n_r3964.c
index 853c98cee64f..c48de09d68f0 100644
--- a/drivers/char/n_r3964.c
+++ b/drivers/char/n_r3964.c
@@ -147,7 +147,6 @@ static unsigned int r3964_poll(struct tty_struct * tty, struct file * file,
 		      struct poll_table_struct  *wait);
 static void r3964_receive_buf(struct tty_struct *tty, const unsigned char *cp,
                               char *fp, int count);
-static int  r3964_receive_room(struct tty_struct *tty);
 
 static struct tty_ldisc tty_ldisc_N_R3964 = {
 	.owner	 = THIS_MODULE,
@@ -161,7 +160,6 @@ static struct tty_ldisc tty_ldisc_N_R3964 = {
 	.set_termios = r3964_set_termios,
 	.poll	= r3964_poll,            
 	.receive_buf = r3964_receive_buf,
-	.receive_room = r3964_receive_room,
 };
 
 
@@ -1119,6 +1117,7 @@ static int r3964_open(struct tty_struct *tty)
    pInfo->nRetry = 0;
    
    tty->disc_data = pInfo;
+   tty->receive_room = 65536;
 
    init_timer(&pInfo->tmr);
    pInfo->tmr.data = (unsigned long)pInfo;
@@ -1405,12 +1404,5 @@ static void r3964_receive_buf(struct tty_struct *tty, const unsigned char *cp,
     }
 }
 
-static int r3964_receive_room(struct tty_struct *tty)
-{
-   TRACE_L("receive_room");
-   return -1;
-}
-
-
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_LDISC(N_R3964);
diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c
index c556f4d3ccd7..ccad7ae94541 100644
--- a/drivers/char/n_tty.c
+++ b/drivers/char/n_tty.c
@@ -78,7 +78,32 @@ static inline void free_buf(unsigned char *buf)
 		free_page((unsigned long) buf);
 }
 
-static inline void put_tty_queue_nolock(unsigned char c, struct tty_struct *tty)
+/**
+ *	n_tty_set__room	-	receive space
+ *	@tty: terminal
+ *
+ *	Called by the driver to find out how much data it is
+ *	permitted to feed to the line discipline without any being lost
+ *	and thus to manage flow control. Not serialized. Answers for the
+ *	"instant".
+ */
+
+static void n_tty_set_room(struct tty_struct *tty)
+{
+	int	left = N_TTY_BUF_SIZE - tty->read_cnt - 1;
+
+	/*
+	 * If we are doing input canonicalization, and there are no
+	 * pending newlines, let characters through without limit, so
+	 * that erase characters will be handled.  Other excess
+	 * characters will be beeped.
+	 */
+	if (left <= 0)
+		left = tty->icanon && !tty->canon_data;
+	tty->receive_room = left;
+}
+
+static void put_tty_queue_nolock(unsigned char c, struct tty_struct *tty)
 {
 	if (tty->read_cnt < N_TTY_BUF_SIZE) {
 		tty->read_buf[tty->read_head] = c;
@@ -87,7 +112,7 @@ static inline void put_tty_queue_nolock(unsigned char c, struct tty_struct *tty)
 	}
 }
 
-static inline void put_tty_queue(unsigned char c, struct tty_struct *tty)
+static void put_tty_queue(unsigned char c, struct tty_struct *tty)
 {
 	unsigned long flags;
 	/*
@@ -136,6 +161,7 @@ static void reset_buffer_flags(struct tty_struct *tty)
 	spin_unlock_irqrestore(&tty->read_lock, flags);
 	tty->canon_head = tty->canon_data = tty->erasing = 0;
 	memset(&tty->read_flags, 0, sizeof tty->read_flags);
+	n_tty_set_room(tty);
 	check_unthrottle(tty);
 }
 
@@ -838,30 +864,6 @@ send_signal:
 	put_tty_queue(c, tty);
 }	
 
-/**
- *	n_tty_receive_room	-	receive space
- *	@tty: terminal
- *
- *	Called by the driver to find out how much data it is
- *	permitted to feed to the line discipline without any being lost
- *	and thus to manage flow control. Not serialized. Answers for the
- *	"instant".
- */
- 
-static int n_tty_receive_room(struct tty_struct *tty)
-{
-	int	left = N_TTY_BUF_SIZE - tty->read_cnt - 1;
-
-	/*
-	 * If we are doing input canonicalization, and there are no
-	 * pending newlines, let characters through without limit, so
-	 * that erase characters will be handled.  Other excess
-	 * characters will be beeped.
-	 */
-	if (left <= 0)
-		left = tty->icanon && !tty->canon_data;
-	return left;
-}
 
 /**
  *	n_tty_write_wakeup	-	asynchronous I/O notifier
@@ -953,6 +955,8 @@ static void n_tty_receive_buf(struct tty_struct *tty, const unsigned char *cp,
 			tty->driver->flush_chars(tty);
 	}
 
+	n_tty_set_room(tty);
+
 	if (!tty->icanon && (tty->read_cnt >= tty->minimum_to_wake)) {
 		kill_fasync(&tty->fasync, SIGIO, POLL_IN);
 		if (waitqueue_active(&tty->read_wait))
@@ -964,7 +968,7 @@ static void n_tty_receive_buf(struct tty_struct *tty, const unsigned char *cp,
 	 * mode.  We don't want to throttle the driver if we're in
 	 * canonical mode and don't have a newline yet!
 	 */
-	if (n_tty_receive_room(tty) < TTY_THRESHOLD_THROTTLE) {
+	if (tty->receive_room < TTY_THRESHOLD_THROTTLE) {
 		/* check TTY_THROTTLED first so it indicates our state */
 		if (!test_and_set_bit(TTY_THROTTLED, &tty->flags) &&
 		    tty->driver->throttle)
@@ -999,6 +1003,7 @@ static void n_tty_set_termios(struct tty_struct *tty, struct termios * old)
 	if (test_bit(TTY_HW_COOK_IN, &tty->flags)) {
 		tty->raw = 1;
 		tty->real_raw = 1;
+		n_tty_set_room(tty);
 		return;
 	}
 	if (I_ISTRIP(tty) || I_IUCLC(tty) || I_IGNCR(tty) ||
@@ -1051,6 +1056,7 @@ static void n_tty_set_termios(struct tty_struct *tty, struct termios * old)
 		else
 			tty->real_raw = 0;
 	}
+	n_tty_set_room(tty);
 }
 
 /**
@@ -1130,7 +1136,7 @@ static inline int input_available_p(struct tty_struct *tty, int amt)
  *
  */
  
-static inline int copy_from_read_buf(struct tty_struct *tty,
+static int copy_from_read_buf(struct tty_struct *tty,
 				      unsigned char __user **b,
 				      size_t *nr)
 
@@ -1308,6 +1314,7 @@ do_it_again:
 				retval = -ERESTARTSYS;
 				break;
 			}
+			n_tty_set_room(tty);
 			clear_bit(TTY_DONT_FLIP, &tty->flags);
 			timeout = schedule_timeout(timeout);
 			set_bit(TTY_DONT_FLIP, &tty->flags);
@@ -1401,6 +1408,8 @@ do_it_again:
 	} else if (test_and_clear_bit(TTY_PUSH, &tty->flags))
 		 goto do_it_again;
 
+	n_tty_set_room(tty);
+
 	return retval;
 }
 
@@ -1553,7 +1562,6 @@ struct tty_ldisc tty_ldisc_N_TTY = {
 	normal_poll,		/* poll */
 	NULL,			/* hangup */
 	n_tty_receive_buf,	/* receive_buf */
-	n_tty_receive_room,	/* receive_room */
 	n_tty_write_wakeup	/* write_wakeup */
 };
 
diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index 9fb10c9fec88..8a8ca32822ba 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -1007,8 +1007,9 @@ static void rx_ready_hdlc(MGSLPC_INFO *info, int eom)
 
 static void rx_ready_async(MGSLPC_INFO *info, int tcd)
 {
-	unsigned char data, status;
+	unsigned char data, status, flag;
 	int fifo_count;
+	int work = 0;
  	struct tty_struct *tty = info->tty;
  	struct mgsl_icount *icount = &info->icount;
 
@@ -1023,20 +1024,16 @@ static void rx_ready_async(MGSLPC_INFO *info, int tcd)
 			fifo_count = 32;
 	} else
 		fifo_count = 32;
-	
+
+	tty_buffer_request_room(tty, fifo_count);
 	/* Flush received async data to receive data buffer. */ 
 	while (fifo_count) {
 		data   = read_reg(info, CHA + RXFIFO);
 		status = read_reg(info, CHA + RXFIFO);
 		fifo_count -= 2;
 
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-			break;
-			
-		*tty->flip.char_buf_ptr = data;
 		icount->rx++;
-		
-		*tty->flip.flag_buf_ptr = 0;
+		flag = TTY_NORMAL;
 
 		// if no frameing/crc error then save data
 		// BIT7:parity error
@@ -1055,26 +1052,23 @@ static void rx_ready_async(MGSLPC_INFO *info, int tcd)
 			status &= info->read_status_mask;
 
 			if (status & BIT7)
-				*tty->flip.flag_buf_ptr = TTY_PARITY;
+				flag = TTY_PARITY;
 			else if (status & BIT6)
-				*tty->flip.flag_buf_ptr = TTY_FRAME;
+				flag = TTY_FRAME;
 		}
-		
-		tty->flip.flag_buf_ptr++;
-		tty->flip.char_buf_ptr++;
-		tty->flip.count++;
+		work += tty_insert_flip_char(tty, data, flag);
 	}
 	issue_command(info, CHA, CMD_RXFIFO);
 
 	if (debug_level >= DEBUG_LEVEL_ISR) {
-		printk("%s(%d):rx_ready_async count=%d\n",
-			__FILE__,__LINE__,tty->flip.count);
+		printk("%s(%d):rx_ready_async",
+			__FILE__,__LINE__);
 		printk("%s(%d):rx=%d brk=%d parity=%d frame=%d overrun=%d\n",
 			__FILE__,__LINE__,icount->rx,icount->brk,
 			icount->parity,icount->frame,icount->overrun);
 	}
 			
-	if (tty->flip.count)
+	if (work)
 		tty_flip_buffer_push(tty);
 }
 
diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index 49f3997fd251..9b5a2c0e7008 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -111,7 +111,7 @@ static int pty_write(struct tty_struct * tty, const unsigned char *buf, int coun
 	if (!to || tty->stopped)
 		return 0;
 
-	c = to->ldisc.receive_room(to);
+	c = to->receive_room;
 	if (c > count)
 		c = count;
 	to->ldisc.receive_buf(to, buf, NULL, c);
@@ -126,7 +126,7 @@ static int pty_write_room(struct tty_struct *tty)
 	if (!to || tty->stopped)
 		return 0;
 
-	return to->ldisc.receive_room(to);
+	return to->receive_room;
 }
 
 /*
diff --git a/drivers/char/rio/riointr.c b/drivers/char/rio/riointr.c
index e42e7b50bf6b..ddda9c14e059 100644
--- a/drivers/char/rio/riointr.c
+++ b/drivers/char/rio/riointr.c
@@ -38,6 +38,7 @@ static char *_riointr_c_sccs_ = "@(#)riointr.c	1.2";
 #include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/tty.h>
+#include <linux/tty_flip.h>
 #include <asm/io.h>
 #include <asm/system.h>
 #include <asm/string.h>
@@ -560,6 +561,7 @@ struct Port *		PortP;
   struct PKT *PacketP;
   register uint	DataCnt;
   uchar *	ptr;
+  unsigned char *buf;
   int copied =0;
 
   static int intCount, RxIntCnt;
@@ -657,8 +659,7 @@ struct Port *		PortP;
 	  ** and available space.
 	  */
 			
-	  transCount = min_t(unsigned int, PacketP->len & PKT_LEN_MASK,
-			   TTY_FLIPBUF_SIZE - TtyP->flip.count);
+	  transCount = tty_buffer_request_room(TtyP, PacketP->len & PKT_LEN_MASK);
 	  rio_dprintk (RIO_DEBUG_REC,  "port %d: Copy %d bytes\n", 
 				      PortP->PortNum, transCount);
 	  /*
@@ -678,9 +679,8 @@ struct Port *		PortP;
 #endif
 	  ptr = (uchar *) PacketP->data + PortP->RxDataStart;
 
-	  rio_memcpy_fromio (TtyP->flip.char_buf_ptr, ptr, transCount);
-	  memset(TtyP->flip.flag_buf_ptr, TTY_NORMAL, transCount);
-
+	  tty_prepare_flip_string(TtyP, &buf, transCount);
+	  rio_memcpy_fromio (buf, ptr, transCount);
 #ifdef STATS
 	  /*
 	  ** keep a count for statistical purposes
@@ -690,9 +690,6 @@ struct Port *		PortP;
 	  PortP->RxDataStart	+= transCount;
 	  PacketP->len		-= transCount;
 	  copied += transCount;
-	  TtyP->flip.count += transCount;
-	  TtyP->flip.char_buf_ptr += transCount;
-	  TtyP->flip.flag_buf_ptr += transCount;
 
 
 #ifdef ___DEBUG_IT___
diff --git a/drivers/char/riscom8.c b/drivers/char/riscom8.c
index 5dae32521620..050e70ee5920 100644
--- a/drivers/char/riscom8.c
+++ b/drivers/char/riscom8.c
@@ -46,6 +46,7 @@
 #include <linux/major.h>
 #include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/tty_flip.h>
 
 #include <asm/uaccess.h>
 
@@ -354,28 +355,17 @@ static inline void rc_receive_exc(struct riscom_board const * bp)
 	struct riscom_port *port;
 	struct tty_struct *tty;
 	unsigned char status;
-	unsigned char ch;
+	unsigned char ch, flag;
 	
 	if (!(port = rc_get_port(bp, "Receive")))
 		return;
 
 	tty = port->tty;
-	if (tty->flip.count >= TTY_FLIPBUF_SIZE)  {
-		printk(KERN_WARNING "rc%d: port %d: Working around flip "
-				    "buffer overflow.\n",
-		       board_No(bp), port_No(port));
-		return;
-	}
 	
 #ifdef RC_REPORT_OVERRUN	
 	status = rc_in(bp, CD180_RCSR);
-	if (status & RCSR_OE)  {
+	if (status & RCSR_OE)
 		port->overrun++;
-#if 0		
-		printk(KERN_ERR "rc%d: port %d: Overrun. Total %ld overruns\n", 
-		       board_No(bp), port_No(port), port->overrun);
-#endif		
-	}
 	status &= port->mark_mask;
 #else	
 	status = rc_in(bp, CD180_RCSR) & port->mark_mask;
@@ -393,25 +383,24 @@ static inline void rc_receive_exc(struct riscom_board const * bp)
 	} else if (status & RCSR_BREAK)  {
 		printk(KERN_INFO "rc%d: port %d: Handling break...\n",
 		       board_No(bp), port_No(port));
-		*tty->flip.flag_buf_ptr++ = TTY_BREAK;
+		flag = TTY_BREAK;
 		if (port->flags & ASYNC_SAK)
 			do_SAK(tty);
 		
 	} else if (status & RCSR_PE) 
-		*tty->flip.flag_buf_ptr++ = TTY_PARITY;
+		flag = TTY_PARITY;
 	
 	else if (status & RCSR_FE) 
-		*tty->flip.flag_buf_ptr++ = TTY_FRAME;
+		flag = TTY_FRAME;
 	
         else if (status & RCSR_OE)
-		*tty->flip.flag_buf_ptr++ = TTY_OVERRUN;
+		flag = TTY_OVERRUN;
 	
 	else
-		*tty->flip.flag_buf_ptr++ = 0;
+		flag = TTY_NORMAL;
 	
-	*tty->flip.char_buf_ptr++ = ch;
-	tty->flip.count++;
-	schedule_delayed_work(&tty->flip.work, 1);
+	tty_insert_flip_char(tty, ch, flag);
+	tty_flip_buffer_push(tty);
 }
 
 static inline void rc_receive(struct riscom_board const * bp)
@@ -432,17 +421,15 @@ static inline void rc_receive(struct riscom_board const * bp)
 #endif	
 	
 	while (count--)  {
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE)  {
+		if (tty_buffer_request_room(tty, 1) == 0)  {
 			printk(KERN_WARNING "rc%d: port %d: Working around "
 					    "flip buffer overflow.\n",
 			       board_No(bp), port_No(port));
 			break;
 		}
-		*tty->flip.char_buf_ptr++ = rc_in(bp, CD180_RDR);
-		*tty->flip.flag_buf_ptr++ = 0;
-		tty->flip.count++;
+		tty_insert_flip_char(tty, rc_in(bp, CD180_RDR), TTY_NORMAL);
 	}
-	schedule_delayed_work(&tty->flip.work, 1);
+	tty_flip_buffer_push(tty);
 }
 
 static inline void rc_transmit(struct riscom_board const * bp)
diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c
index d3bc731fbb27..0949dcef0697 100644
--- a/drivers/char/rocket.c
+++ b/drivers/char/rocket.c
@@ -325,19 +325,16 @@ static void rp_do_receive(struct r_port *info,
 {
 	unsigned int CharNStat;
 	int ToRecv, wRecv, space = 0, count;
-	unsigned char *cbuf;
-	char *fbuf;
+	unsigned char *cbuf, *chead;
+	char *fbuf, *fhead;
 	struct tty_ldisc *ld;
 
 	ld = tty_ldisc_ref(tty);
 
 	ToRecv = sGetRxCnt(cp);
-	if (ld)
-		space = ld->receive_room(tty);
+	space = tty->receive_room;
 	if (space > 2 * TTY_FLIPBUF_SIZE)
 		space = 2 * TTY_FLIPBUF_SIZE;
-	cbuf = tty->flip.char_buf;
-	fbuf = tty->flip.flag_buf;
 	count = 0;
 #ifdef ROCKET_DEBUG_INTR
 	printk(KERN_INFO "rp_do_receive(%d, %d)...", ToRecv, space);
@@ -350,9 +347,13 @@ static void rp_do_receive(struct r_port *info,
 	if (ToRecv > space)
 		ToRecv = space;
 
+	ToRecv = tty_prepare_flip_string_flags(tty, &chead, &fhead, ToRecv);
 	if (ToRecv <= 0)
 		goto done;
 
+	cbuf = chead;
+	fbuf = fhead;
+
 	/*
 	 * if status indicates there are errored characters in the
 	 * FIFO, then enter status mode (a word in FIFO holds
@@ -399,7 +400,7 @@ static void rp_do_receive(struct r_port *info,
 			else if (CharNStat & STMRCVROVRH)
 				*fbuf++ = TTY_OVERRUN;
 			else
-				*fbuf++ = 0;
+				*fbuf++ = TTY_NORMAL;
 			*cbuf++ = CharNStat & 0xff;
 			count++;
 			ToRecv--;
@@ -426,13 +427,13 @@ static void rp_do_receive(struct r_port *info,
 			sInStrW(sGetTxRxDataIO(cp), (unsigned short *) cbuf, wRecv);
 		if (ToRecv & 1)
 			cbuf[ToRecv - 1] = sInB(sGetTxRxDataIO(cp));
-		memset(fbuf, 0, ToRecv);
+		memset(fbuf, TTY_NORMAL, ToRecv);
 		cbuf += ToRecv;
 		fbuf += ToRecv;
 		count += ToRecv;
 	}
 	/*  Push the data up to the tty layer */
-	ld->receive_buf(tty, tty->flip.char_buf, tty->flip.flag_buf, count);
+	ld->receive_buf(tty, cbuf, fbuf, count);
 done:
 	tty_ldisc_deref(ld);
 }
diff --git a/drivers/char/selection.c b/drivers/char/selection.c
index 5b187c895c18..71093a9fc462 100644
--- a/drivers/char/selection.c
+++ b/drivers/char/selection.c
@@ -275,7 +275,8 @@ int set_selection(const struct tiocl_selection __user *sel, struct tty_struct *t
 int paste_selection(struct tty_struct *tty)
 {
 	struct vc_data *vc = (struct vc_data *)tty->driver_data;
-	int	pasted = 0, count;
+	int	pasted = 0;
+	unsigned int count;
 	struct  tty_ldisc *ld;
 	DECLARE_WAITQUEUE(wait, current);
 
@@ -293,7 +294,7 @@ int paste_selection(struct tty_struct *tty)
 			continue;
 		}
 		count = sel_buffer_lth - pasted;
-		count = min(count, tty->ldisc.receive_room(tty));
+		count = min(count, tty->receive_room);
 		tty->ldisc.receive_buf(tty, sel_buffer + pasted, NULL, count);
 		pasted += count;
 	}
diff --git a/drivers/char/ser_a2232.c b/drivers/char/ser_a2232.c
index dda30e42ec79..80a5b840e22f 100644
--- a/drivers/char/ser_a2232.c
+++ b/drivers/char/ser_a2232.c
@@ -194,11 +194,6 @@ static inline void a2232_receive_char(struct a2232_port *port, int ch, int err)
 */
 	struct tty_struct *tty = port->gs.tty;
 
-	if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-		return;
-
-	tty->flip.count++;
-
 #if 0
 	switch(err) {
 	case TTY_BREAK:
@@ -212,8 +207,7 @@ static inline void a2232_receive_char(struct a2232_port *port, int ch, int err)
 	}
 #endif
 
-	*tty->flip.flag_buf_ptr++ = err;
-	*tty->flip.char_buf_ptr++ = ch;
+	tty_insert_flip_char(tty, ch, err);
 	tty_flip_buffer_push(tty);
 }
 
diff --git a/drivers/char/serial167.c b/drivers/char/serial167.c
index a580748b92a1..f36342ae8e7e 100644
--- a/drivers/char/serial167.c
+++ b/drivers/char/serial167.c
@@ -422,45 +422,35 @@ cd2401_rxerr_interrupt(int irq, void *dev_id, struct pt_regs *fp)
 	    base_addr[CyREOIR] = rfoc ? 0 : CyNOTRANS;
 	    return IRQ_HANDLED;
 	}
-	if (tty->flip.count < TTY_FLIPBUF_SIZE){
-	    tty->flip.count++;
+	if (tty_buffer_request_room(tty, 1) != 0){
 	    if (err & info->read_status_mask){
 		if(err & CyBREAK){
-		    *tty->flip.flag_buf_ptr++ = TTY_BREAK;
-		    *tty->flip.char_buf_ptr++ = data;
+		    tty_insert_flip_char(tty, data, TTY_BREAK);
 		    if (info->flags & ASYNC_SAK){
 			do_SAK(tty);
 		    }
 		}else if(err & CyFRAME){
-		    *tty->flip.flag_buf_ptr++ = TTY_FRAME;
-		    *tty->flip.char_buf_ptr++ = data;
+		    tty_insert_flip_char(tty, data, TTY_FRAME);
 		}else if(err & CyPARITY){
-		    *tty->flip.flag_buf_ptr++ = TTY_PARITY;
-		    *tty->flip.char_buf_ptr++ = data;
+		    tty_insert_flip_char(tty, data, TTY_PARITY);
 		}else if(err & CyOVERRUN){
-		    *tty->flip.flag_buf_ptr++ = TTY_OVERRUN;
-		    *tty->flip.char_buf_ptr++ = 0;
+		    tty_insert_flip_char(tty, 0, TTY_OVERRUN);
 		    /*
 		       If the flip buffer itself is
 		       overflowing, we still loose
 		       the next incoming character.
 		     */
-		    if(tty->flip.count < TTY_FLIPBUF_SIZE){
-			tty->flip.count++;
-			*tty->flip.flag_buf_ptr++ = TTY_NORMAL;
-			*tty->flip.char_buf_ptr++ = data;
-		    }
+		    tty_insert_flip_char(tty, data, TTY_NORMAL);
+		}
 		/* These two conditions may imply */
 		/* a normal read should be done. */
 		/* else if(data & CyTIMEOUT) */
 		/* else if(data & CySPECHAR) */
 		}else{
-		    *tty->flip.flag_buf_ptr++ = 0;
-		    *tty->flip.char_buf_ptr++ = 0;
+		    tty_insert_flip_char(tty, 0, TTY_NORMAL);
 		}
 	    }else{
-		*tty->flip.flag_buf_ptr++ = 0;
-		*tty->flip.char_buf_ptr++ = 0;
+		    tty_insert_flip_char(tty, data, TTY_NORMAL);
 	    }
 	}else{
 	    /* there was a software buffer overrun
@@ -692,12 +682,7 @@ cd2401_rx_interrupt(int irq, void *dev_id, struct pt_regs *fp)
 #endif
 	while(char_count--){
 	    data = base_addr[CyRDR];
-	    if (tty->flip.count >= TTY_FLIPBUF_SIZE){
-		continue;
-	    }
-	    tty->flip.count++;
-	    *tty->flip.flag_buf_ptr++ = TTY_NORMAL;
-	    *tty->flip.char_buf_ptr++ = data;
+	    tty_insert_flip_char(tty, data, TTY_NORMAL);
 #ifdef CYCLOM_16Y_HACK
 	    udelay(10L);
 #endif
diff --git a/drivers/char/specialix.c b/drivers/char/specialix.c
index 0bbfce43031c..0a574bdbce36 100644
--- a/drivers/char/specialix.c
+++ b/drivers/char/specialix.c
@@ -85,6 +85,7 @@
 #include <linux/interrupt.h>
 #include <linux/errno.h>
 #include <linux/tty.h>
+#include <linux/tty_flip.h>
 #include <linux/mm.h>
 #include <linux/serial.h>
 #include <linux/fcntl.h>
@@ -665,7 +666,7 @@ static inline void sx_receive_exc(struct specialix_board * bp)
 	struct specialix_port *port;
 	struct tty_struct *tty;
 	unsigned char status;
-	unsigned char ch;
+	unsigned char ch, flag;
 
 	func_enter();
 
@@ -676,8 +677,6 @@ static inline void sx_receive_exc(struct specialix_board * bp)
 		return;
 	}
 	tty = port->tty;
-	dprintk (SX_DEBUG_RX, "port: %p count: %d BUFF_SIZE: %d\n",
-		 port,  tty->flip.count, TTY_FLIPBUF_SIZE);
 
 	status = sx_in(bp, CD186x_RCSR);
 
@@ -691,7 +690,7 @@ static inline void sx_receive_exc(struct specialix_board * bp)
 
 	/* This flip buffer check needs to be below the reading of the
 	   status register to reset the chip's IRQ.... */
-	if (tty->flip.count >= TTY_FLIPBUF_SIZE) {
+	if (tty_buffer_request_room(tty, 1) == 0) {
 		dprintk(SX_DEBUG_FIFO, "sx%d: port %d: Working around flip buffer overflow.\n",
 		       board_No(bp), port_No(port));
 		func_exit();
@@ -712,26 +711,24 @@ static inline void sx_receive_exc(struct specialix_board * bp)
 	} else if (status & RCSR_BREAK) {
 		dprintk(SX_DEBUG_RX, "sx%d: port %d: Handling break...\n",
 		       board_No(bp), port_No(port));
-		*tty->flip.flag_buf_ptr++ = TTY_BREAK;
+		flag = TTY_BREAK;
 		if (port->flags & ASYNC_SAK)
 			do_SAK(tty);
 
 	} else if (status & RCSR_PE)
-		*tty->flip.flag_buf_ptr++ = TTY_PARITY;
+		flag = TTY_PARITY;
 
 	else if (status & RCSR_FE)
-		*tty->flip.flag_buf_ptr++ = TTY_FRAME;
+		flag = TTY_FRAME;
 
 	else if (status & RCSR_OE)
-		*tty->flip.flag_buf_ptr++ = TTY_OVERRUN;
+		flag = TTY_OVERRUN;
 
 	else
-		*tty->flip.flag_buf_ptr++ = 0;
-
-	*tty->flip.char_buf_ptr++ = ch;
-	tty->flip.count++;
-	schedule_delayed_work(&tty->flip.work, 1);
+		flag = TTY_NORMAL;
 
+	if(tty_insert_flip_char(tty, ch, flag))
+		tty_flip_buffer_push(tty);
 	func_exit();
 }
 
@@ -755,18 +752,11 @@ static inline void sx_receive(struct specialix_board * bp)
 	dprintk (SX_DEBUG_RX, "port: %p: count: %d\n", port, count);
 	port->hits[count > 8 ? 9 : count]++;
 
-	while (count--) {
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE) {
-			printk(KERN_INFO "sx%d: port %d: Working around flip buffer overflow.\n",
-			       board_No(bp), port_No(port));
-			break;
-		}
-		*tty->flip.char_buf_ptr++ = sx_in(bp, CD186x_RDR);
-		*tty->flip.flag_buf_ptr++ = 0;
-		tty->flip.count++;
-	}
-	schedule_delayed_work(&tty->flip.work, 1);
+	tty_buffer_request_room(tty, count);
 
+	while (count--)
+		tty_insert_flip_char(tty, sx_in(bp, CD186x_RDR), TTY_NORMAL);
+	tty_flip_buffer_push(tty);
 	func_exit();
 }
 
diff --git a/drivers/char/stallion.c b/drivers/char/stallion.c
index acef2abf3f0d..0e20780d4a29 100644
--- a/drivers/char/stallion.c
+++ b/drivers/char/stallion.c
@@ -2901,7 +2901,8 @@ static int stl_getportstats(stlport_t *portp, comstats_t __user *cp)
 	if (portp->tty != (struct tty_struct *) NULL) {
 		if (portp->tty->driver_data == portp) {
 			portp->stats.ttystate = portp->tty->flags;
-			portp->stats.rxbuffered = portp->tty->flip.count;
+			/* No longer available as a statistic */
+			portp->stats.rxbuffered = 1; /*portp->tty->flip.count; */
 			if (portp->tty->termios != (struct termios *) NULL) {
 				portp->stats.cflags = portp->tty->termios->c_cflag;
 				portp->stats.iflags = portp->tty->termios->c_iflag;
@@ -4045,9 +4046,7 @@ static void stl_cd1400rxisr(stlpanel_t *panelp, int ioaddr)
 	if ((ioack & ACK_TYPMASK) == ACK_TYPRXGOOD) {
 		outb((RDCR + portp->uartaddr), ioaddr);
 		len = inb(ioaddr + EREG_DATA);
-		if ((tty == (struct tty_struct *) NULL) ||
-		    (tty->flip.char_buf_ptr == (char *) NULL) ||
-		    ((buflen = TTY_FLIPBUF_SIZE - tty->flip.count) == 0)) {
+		if (tty == NULL || (buflen = tty_buffer_request_room(tty, len)) == 0) {
 			len = MIN(len, sizeof(stl_unwanted));
 			outb((RDSR + portp->uartaddr), ioaddr);
 			insb((ioaddr + EREG_DATA), &stl_unwanted[0], len);
@@ -4056,12 +4055,10 @@ static void stl_cd1400rxisr(stlpanel_t *panelp, int ioaddr)
 		} else {
 			len = MIN(len, buflen);
 			if (len > 0) {
+				unsigned char *ptr;
 				outb((RDSR + portp->uartaddr), ioaddr);
-				insb((ioaddr + EREG_DATA), tty->flip.char_buf_ptr, len);
-				memset(tty->flip.flag_buf_ptr, 0, len);
-				tty->flip.flag_buf_ptr += len;
-				tty->flip.char_buf_ptr += len;
-				tty->flip.count += len;
+				tty_prepare_flip_string(tty, &ptr, len);
+				insb((ioaddr + EREG_DATA), ptr, len);
 				tty_schedule_flip(tty);
 				portp->stats.rxtotal += len;
 			}
@@ -4085,8 +4082,7 @@ static void stl_cd1400rxisr(stlpanel_t *panelp, int ioaddr)
 				portp->stats.txxoff++;
 			goto stl_rxalldone;
 		}
-		if ((tty != (struct tty_struct *) NULL) &&
-		    ((portp->rxignoremsk & status) == 0)) {
+		if (tty != NULL && (portp->rxignoremsk & status) == 0) {
 			if (portp->rxmarkmsk & status) {
 				if (status & ST_BREAK) {
 					status = TTY_BREAK;
@@ -4106,14 +4102,8 @@ static void stl_cd1400rxisr(stlpanel_t *panelp, int ioaddr)
 			} else {
 				status = 0;
 			}
-			if (tty->flip.char_buf_ptr != (char *) NULL) {
-				if (tty->flip.count < TTY_FLIPBUF_SIZE) {
-					*tty->flip.flag_buf_ptr++ = status;
-					*tty->flip.char_buf_ptr++ = ch;
-					tty->flip.count++;
-				}
-				tty_schedule_flip(tty);
-			}
+			tty_insert_flip_char(tty, ch, status);
+			tty_schedule_flip(tty);
 		}
 	} else {
 		printk("STALLION: bad RX interrupt ack value=%x\n", ioack);
@@ -5012,9 +5002,7 @@ static void stl_sc26198rxisr(stlport_t *portp, unsigned int iack)
 	len = inb(ioaddr + XP_DATA) + 1;
 
 	if ((iack & IVR_TYPEMASK) == IVR_RXDATA) {
-		if ((tty == (struct tty_struct *) NULL) ||
-		    (tty->flip.char_buf_ptr == (char *) NULL) ||
-		    ((buflen = TTY_FLIPBUF_SIZE - tty->flip.count) == 0)) {
+		if (tty == NULL || (buflen = tty_buffer_request_room(tty, len)) == 0) {
 			len = MIN(len, sizeof(stl_unwanted));
 			outb(GRXFIFO, (ioaddr + XP_ADDR));
 			insb((ioaddr + XP_DATA), &stl_unwanted[0], len);
@@ -5023,12 +5011,10 @@ static void stl_sc26198rxisr(stlport_t *portp, unsigned int iack)
 		} else {
 			len = MIN(len, buflen);
 			if (len > 0) {
+				unsigned char *ptr;
 				outb(GRXFIFO, (ioaddr + XP_ADDR));
-				insb((ioaddr + XP_DATA), tty->flip.char_buf_ptr, len);
-				memset(tty->flip.flag_buf_ptr, 0, len);
-				tty->flip.flag_buf_ptr += len;
-				tty->flip.char_buf_ptr += len;
-				tty->flip.count += len;
+				tty_prepare_flip_string(tty, &ptr, len);
+				insb((ioaddr + XP_DATA), ptr, len);
 				tty_schedule_flip(tty);
 				portp->stats.rxtotal += len;
 			}
@@ -5096,14 +5082,8 @@ static inline void stl_sc26198rxbadch(stlport_t *portp, unsigned char status, ch
 			status = 0;
 		}
 
-		if (tty->flip.char_buf_ptr != (char *) NULL) {
-			if (tty->flip.count < TTY_FLIPBUF_SIZE) {
-				*tty->flip.flag_buf_ptr++ = status;
-				*tty->flip.char_buf_ptr++ = ch;
-				tty->flip.count++;
-			}
-			tty_schedule_flip(tty);
-		}
+		tty_insert_flip_char(tty, ch, status);
+		tty_schedule_flip(tty);
 
 		if (status == 0)
 			portp->stats.rxtotal++;
diff --git a/drivers/char/sx.c b/drivers/char/sx.c
index 564f31778eb3..64bf89cb574f 100644
--- a/drivers/char/sx.c
+++ b/drivers/char/sx.c
@@ -1085,6 +1085,7 @@ static inline void sx_receive_chars (struct sx_port *port)
 	int rx_op;
 	struct tty_struct *tty;
 	int copied=0;
+	unsigned char *rp;
 
 	func_enter2 ();
 	tty = port->gs.tty;
@@ -1095,8 +1096,8 @@ static inline void sx_receive_chars (struct sx_port *port)
 		sx_dprintk (SX_DEBUG_RECEIVE, "rxop=%d, c = %d.\n", rx_op, c); 
 
 		/* Don't copy more bytes than there is room for in the buffer */
-		if (tty->flip.count + c > TTY_FLIPBUF_SIZE) 
-			c = TTY_FLIPBUF_SIZE - tty->flip.count;
+
+		c = tty_prepare_flip_string(tty, &rp, c);
 
 		sx_dprintk (SX_DEBUG_RECEIVE, "c = %d.\n", c); 
 
@@ -1111,14 +1112,8 @@ static inline void sx_receive_chars (struct sx_port *port)
 		sx_dprintk (SX_DEBUG_RECEIVE , "Copying over %d chars. First is %d at %lx\n", c, 
 		            read_sx_byte (port->board, CHAN_OFFSET(port,hi_rxbuf) + rx_op),
 		            CHAN_OFFSET(port, hi_rxbuf)); 
-		memcpy_fromio (tty->flip.char_buf_ptr, 
+		memcpy_fromio (rp,
 		               port->board->base + CHAN_OFFSET(port,hi_rxbuf) + rx_op, c);
-		memset(tty->flip.flag_buf_ptr, TTY_NORMAL, c);
-
-		/* Update the kernel buffer end */
-		tty->flip.count += c;
-		tty->flip.char_buf_ptr += c;
-		tty->flip.flag_buf_ptr += c;
 
 		/* This one last. ( Not essential.)
 		   It allows the card to start putting more data into the buffer! 
diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c
index 789572fc002b..9f1b466c4f84 100644
--- a/drivers/char/synclink.c
+++ b/drivers/char/synclink.c
@@ -1467,6 +1467,7 @@ static void mgsl_isr_receive_data( struct mgsl_struct *info )
 {
 	int Fifocount;
 	u16 status;
+	int work = 0;
 	unsigned char DataByte;
  	struct tty_struct *tty = info->tty;
  	struct	mgsl_icount *icount = &info->icount;
@@ -1487,6 +1488,8 @@ static void mgsl_isr_receive_data( struct mgsl_struct *info )
 	/* flush the receive FIFO */
 
 	while( (Fifocount = (usc_InReg(info,RICR) >> 8)) ) {
+		int flag;
+
 		/* read one byte from RxFIFO */
 		outw( (inw(info->io_base + CCAR) & 0x0780) | (RDR+LSBONLY),
 		      info->io_base + CCAR );
@@ -1498,13 +1501,9 @@ static void mgsl_isr_receive_data( struct mgsl_struct *info )
 				RXSTATUS_OVERRUN + RXSTATUS_BREAK_RECEIVED) )
 			usc_UnlatchRxstatusBits(info,RXSTATUS_ALL);
 		
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-			continue;
-			
-		*tty->flip.char_buf_ptr = DataByte;
 		icount->rx++;
 		
-		*tty->flip.flag_buf_ptr = 0;
+		flag = 0;
 		if ( status & (RXSTATUS_FRAMING_ERROR + RXSTATUS_PARITY_ERROR +
 				RXSTATUS_OVERRUN + RXSTATUS_BREAK_RECEIVED) ) {
 			printk("rxerr=%04X\n",status);					
@@ -1530,41 +1529,31 @@ static void mgsl_isr_receive_data( struct mgsl_struct *info )
 			status &= info->read_status_mask;
 		
 			if (status & RXSTATUS_BREAK_RECEIVED) {
-				*tty->flip.flag_buf_ptr = TTY_BREAK;
+				flag = TTY_BREAK;
 				if (info->flags & ASYNC_SAK)
 					do_SAK(tty);
 			} else if (status & RXSTATUS_PARITY_ERROR)
-				*tty->flip.flag_buf_ptr = TTY_PARITY;
+				flag = TTY_PARITY;
 			else if (status & RXSTATUS_FRAMING_ERROR)
-				*tty->flip.flag_buf_ptr = TTY_FRAME;
-			if (status & RXSTATUS_OVERRUN) {
-				/* Overrun is special, since it's
-				 * reported immediately, and doesn't
-				 * affect the current character
-				 */
-				if (tty->flip.count < TTY_FLIPBUF_SIZE) {
-					tty->flip.count++;
-					tty->flip.flag_buf_ptr++;
-					tty->flip.char_buf_ptr++;
-					*tty->flip.flag_buf_ptr = TTY_OVERRUN;
-				}
-			}
+				flag = TTY_FRAME;
 		}	/* end of if (error) */
-		
-		tty->flip.flag_buf_ptr++;
-		tty->flip.char_buf_ptr++;
-		tty->flip.count++;
+		tty_insert_flip_char(tty, DataByte, flag);
+		if (status & RXSTATUS_OVERRUN) {
+			/* Overrun is special, since it's
+			 * reported immediately, and doesn't
+			 * affect the current character
+			 */
+			work += tty_insert_flip_char(tty, 0, TTY_OVERRUN);
+		}
 	}
 
 	if ( debug_level >= DEBUG_LEVEL_ISR ) {
-		printk("%s(%d):mgsl_isr_receive_data flip count=%d\n",
-			__FILE__,__LINE__,tty->flip.count);
 		printk("%s(%d):rx=%d brk=%d parity=%d frame=%d overrun=%d\n",
 			__FILE__,__LINE__,icount->rx,icount->brk,
 			icount->parity,icount->frame,icount->overrun);
 	}
 			
-	if ( tty->flip.count )
+	if(work)
 		tty_flip_buffer_push(tty);
 }
 
diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c
index 41759cd70a4f..79c81def4104 100644
--- a/drivers/char/synclink_gt.c
+++ b/drivers/char/synclink_gt.c
@@ -1749,6 +1749,9 @@ static void rx_async(struct slgt_info *info)
 	unsigned char status;
 	struct slgt_desc *bufs = info->rbufs;
 	int i, count;
+	int chars = 0;
+	int stat;
+	unsigned char ch;
 
 	start = end = info->rbuf_current;
 
@@ -1760,16 +1763,15 @@ static void rx_async(struct slgt_info *info)
 		DBGDATA(info, p, count, "rx");
 
 		for(i=0 ; i < count; i+=2, p+=2) {
-			if (tty) {
-				if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-					tty_flip_buffer_push(tty);
-				if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-					break;
-				*tty->flip.char_buf_ptr = *p;
-				*tty->flip.flag_buf_ptr = 0;
+			if (tty && chars) {
+				tty_flip_buffer_push(tty);
+				chars = 0;
 			}
+			ch = *p;
 			icount->rx++;
 
+			stat = 0;
+
 			if ((status = *(p+1) & (BIT9 + BIT8))) {
 				if (status & BIT9)
 					icount->parity++;
@@ -1778,17 +1780,14 @@ static void rx_async(struct slgt_info *info)
 				/* discard char if tty control flags say so */
 				if (status & info->ignore_status_mask)
 					continue;
-				if (tty) {
-					if (status & BIT9)
-						*tty->flip.flag_buf_ptr = TTY_PARITY;
-					else if (status & BIT8)
-						*tty->flip.flag_buf_ptr = TTY_FRAME;
-				}
+				if (status & BIT9)
+					stat = TTY_PARITY;
+				else if (status & BIT8)
+					stat = TTY_FRAME;
 			}
 			if (tty) {
-				tty->flip.flag_buf_ptr++;
-				tty->flip.char_buf_ptr++;
-				tty->flip.count++;
+				tty_insert_flip_char(tty, ch, stat);
+				chars++;
 			}
 		}
 
@@ -1811,7 +1810,7 @@ static void rx_async(struct slgt_info *info)
 			break;
 	}
 
-	if (tty && tty->flip.count)
+	if (tty && chars)
 		tty_flip_buffer_push(tty);
 }
 
@@ -2029,7 +2028,7 @@ static void isr_serial(struct slgt_info *info)
 			if (info->tty) {
 				if (!(status & info->ignore_status_mask)) {
 					if (info->read_status_mask & MASK_BREAK) {
-						*info->tty->flip.flag_buf_ptr = TTY_BREAK;
+						tty_insert_flip_char(info->tty, 0, TTY_BREAK);
 						if (info->flags & ASYNC_SAK)
 							do_SAK(info->tty);
 					}
diff --git a/drivers/char/synclinkmp.c b/drivers/char/synclinkmp.c
index a9467e7d3747..960adb256fbb 100644
--- a/drivers/char/synclinkmp.c
+++ b/drivers/char/synclinkmp.c
@@ -2196,7 +2196,7 @@ void isr_rxint(SLMP_INFO * info)
 			if ( tty ) {
 				if (!(status & info->ignore_status_mask1)) {
 					if (info->read_status_mask1 & BRKD) {
-						*tty->flip.flag_buf_ptr = TTY_BREAK;
+						tty_insert_flip_char(tty, 0, TTY_BREAK);
 						if (info->flags & ASYNC_SAK)
 							do_SAK(tty);
 					}
@@ -2240,16 +2240,10 @@ void isr_rxrdy(SLMP_INFO * info)
 
 	while((status = read_reg(info,CST0)) & BIT0)
 	{
+		int flag = 0;
+		int over = 0;
 		DataByte = read_reg(info,TRB);
 
-		if ( tty ) {
-			if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-				continue;
-
-			*tty->flip.char_buf_ptr = DataByte;
-			*tty->flip.flag_buf_ptr = 0;
-		}
-
 		icount->rx++;
 
 		if ( status & (PE + FRME + OVRN) ) {
@@ -2272,42 +2266,34 @@ void isr_rxrdy(SLMP_INFO * info)
 
 			if ( tty ) {
 				if (status & PE)
-					*tty->flip.flag_buf_ptr = TTY_PARITY;
+					flag = TTY_PARITY;
 				else if (status & FRME)
-					*tty->flip.flag_buf_ptr = TTY_FRAME;
+					flag = TTY_FRAME;
 				if (status & OVRN) {
 					/* Overrun is special, since it's
 					 * reported immediately, and doesn't
 					 * affect the current character
 					 */
-					if (tty->flip.count < TTY_FLIPBUF_SIZE) {
-						tty->flip.count++;
-						tty->flip.flag_buf_ptr++;
-						tty->flip.char_buf_ptr++;
-						*tty->flip.flag_buf_ptr = TTY_OVERRUN;
-					}
+					over = 1;
 				}
 			}
 		}	/* end of if (error) */
 
 		if ( tty ) {
-			tty->flip.flag_buf_ptr++;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
+			tty_insert_flip_char(tty, DataByte, flag);
+			if (over)
+				tty_insert_flip_char(tty, 0, TTY_OVERRUN);
 		}
 	}
 
 	if ( debug_level >= DEBUG_LEVEL_ISR ) {
-		printk("%s(%d):%s isr_rxrdy() flip count=%d\n",
-			__FILE__,__LINE__,info->device_name,
-			tty ? tty->flip.count : 0);
 		printk("%s(%d):%s rx=%d brk=%d parity=%d frame=%d overrun=%d\n",
 			__FILE__,__LINE__,info->device_name,
 			icount->rx,icount->brk,icount->parity,
 			icount->frame,icount->overrun);
 	}
 
-	if ( tty && tty->flip.count )
+	if ( tty )
 		tty_flip_buffer_push(tty);
 }
 
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 4b1eef51ec59..1eda82b31a61 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -166,9 +166,12 @@ static struct tty_struct *alloc_tty_struct(void)
 	return tty;
 }
 
+static void tty_buffer_free_all(struct tty_struct *);
+
 static inline void free_tty_struct(struct tty_struct *tty)
 {
 	kfree(tty->write_buf);
+	tty_buffer_free_all(tty);
 	kfree(tty);
 }
 
@@ -230,6 +233,201 @@ static int check_tty_count(struct tty_struct *tty, const char *routine)
 	return 0;
 }
 
+/*
+ * Tty buffer allocation management
+ */
+
+static void tty_buffer_free_all(struct tty_struct *tty)
+{
+	struct tty_buffer *thead;
+	while((thead = tty->buf.head) != NULL) {
+		tty->buf.head = thead->next;
+		kfree(thead);
+	}
+	while((thead = tty->buf.free) != NULL) {
+		tty->buf.free = thead->next;
+		kfree(thead);
+	}
+	tty->buf.tail = NULL;
+}
+
+static void tty_buffer_init(struct tty_struct *tty)
+{
+	tty->buf.head = NULL;
+	tty->buf.tail = NULL;
+	tty->buf.free = NULL;
+}
+
+static struct tty_buffer *tty_buffer_alloc(size_t size)
+{
+	struct tty_buffer *p = kmalloc(sizeof(struct tty_buffer) + 2 * size, GFP_ATOMIC);
+	if(p == NULL)
+		return NULL;
+	p->used = 0;
+	p->size = size;
+	p->next = NULL;
+	p->char_buf_ptr = (char *)(p->data);
+	p->flag_buf_ptr = (unsigned char *)p->char_buf_ptr + size;
+/* 	printk("Flip create %p\n", p); */
+	return p;
+}
+
+/* Must be called with the tty_read lock held. This needs to acquire strategy
+   code to decide if we should kfree or relink a given expired buffer */
+
+static void tty_buffer_free(struct tty_struct *tty, struct tty_buffer *b)
+{
+	/* Dumb strategy for now - should keep some stats */
+/* 	printk("Flip dispose %p\n", b); */
+	if(b->size >= 512)
+		kfree(b);
+	else {
+		b->next = tty->buf.free;
+		tty->buf.free = b;
+	}
+}
+
+static struct tty_buffer *tty_buffer_find(struct tty_struct *tty, size_t size)
+{
+	struct tty_buffer **tbh = &tty->buf.free;
+	while((*tbh) != NULL) {
+		struct tty_buffer *t = *tbh;
+		if(t->size >= size) {
+			*tbh = t->next;
+			t->next = NULL;
+			t->used = 0;
+			/* DEBUG ONLY */
+			memset(t->data, '*', size);
+/* 			printk("Flip recycle %p\n", t); */
+			return t;
+		}
+		tbh = &((*tbh)->next);
+	}
+	/* Round the buffer size out */
+	size = (size + 0xFF) & ~ 0xFF;
+	return tty_buffer_alloc(size);
+	/* Should possibly check if this fails for the largest buffer we
+	   have queued and recycle that ? */
+}
+
+int tty_buffer_request_room(struct tty_struct *tty, size_t size)
+{
+	struct tty_buffer *b = tty->buf.head, *n;
+	int left = 0;
+
+	/* OPTIMISATION: We could keep a per tty "zero" sized buffer to
+	   remove this conditional if its worth it. This would be invisible
+	   to the callers */
+	if(b != NULL)
+		left = b->size - b->used;
+	if(left >= size)
+		return size;
+	/* This is the slow path - looking for new buffers to use */
+	n = tty_buffer_find(tty, size);
+	if(n == NULL)
+		return left;
+	n->next = b;
+	if(b != NULL)
+		b->next = n;
+	else
+		tty->buf.head = n;
+	tty->buf.tail = n;
+	return size;
+}
+
+EXPORT_SYMBOL_GPL(tty_buffer_request_room);
+
+int tty_insert_flip_string(struct tty_struct *tty, unsigned char *chars, size_t size)
+{
+	int copied = 0;
+	do {
+		int space = tty_buffer_request_room(tty, size - copied);
+		struct tty_buffer *tb = tty->buf.tail;
+		/* If there is no space then tb may be NULL */
+		if(unlikely(space == 0))
+			break;
+		memcpy(tb->char_buf_ptr + tb->used, chars, space);
+		memset(tb->flag_buf_ptr + tb->used, TTY_NORMAL, space);
+		tb->used += space;
+		copied += space;
+		chars += space;
+/* 		printk("Flip insert %d.\n", space); */
+	}
+	/* There is a small chance that we need to split the data over
+	   several buffers. If this is the case we must loop */
+	while (unlikely(size > copied));
+	return copied;
+}
+
+EXPORT_SYMBOL_GPL(tty_insert_flip_string);
+
+int tty_insert_flip_string_flags(struct tty_struct *tty, unsigned char *chars, char *flags, size_t size)
+{
+	int copied = 0;
+	do {
+		int space = tty_buffer_request_room(tty, size - copied);
+		struct tty_buffer *tb = tty->buf.tail;
+		/* If there is no space then tb may be NULL */
+		if(unlikely(space == 0))
+			break;
+		memcpy(tb->char_buf_ptr + tb->used, chars, space);
+		memcpy(tb->flag_buf_ptr + tb->used, flags, space);
+		tb->used += space;
+		copied += space;
+		chars += space;
+		flags += space;
+	}
+	/* There is a small chance that we need to split the data over
+	   several buffers. If this is the case we must loop */
+	while (unlikely(size > copied));
+	return copied;
+}
+
+EXPORT_SYMBOL_GPL(tty_insert_flip_string_flags);
+
+
+/*
+ *	Prepare a block of space in the buffer for data. Returns the length
+ *	available and buffer pointer to the space which is now allocated and
+ *	accounted for as ready for normal characters. This is used for drivers
+ *	that need their own block copy routines into the buffer. There is no
+ *	guarantee the buffer is a DMA target!
+ */
+
+int tty_prepare_flip_string(struct tty_struct *tty, unsigned char **chars, size_t size)
+{
+	int space = tty_buffer_request_room(tty, size);
+	struct tty_buffer *tb = tty->buf.tail;
+	*chars = tb->char_buf_ptr + tb->used;
+	memset(tb->flag_buf_ptr + tb->used, TTY_NORMAL, space);
+	tb->used += space;
+	return space;
+}
+
+EXPORT_SYMBOL_GPL(tty_prepare_flip_string);
+
+/*
+ *	Prepare a block of space in the buffer for data. Returns the length
+ *	available and buffer pointer to the space which is now allocated and
+ *	accounted for as ready for characters. This is used for drivers
+ *	that need their own block copy routines into the buffer. There is no
+ *	guarantee the buffer is a DMA target!
+ */
+
+int tty_prepare_flip_string_flags(struct tty_struct *tty, unsigned char **chars, char **flags, size_t size)
+{
+	int space = tty_buffer_request_room(tty, size);
+	struct tty_buffer *tb = tty->buf.tail;
+	*chars = tb->char_buf_ptr + tb->used;
+	*flags = tb->flag_buf_ptr + tb->used;
+	tb->used += space;
+	return space;
+}
+
+EXPORT_SYMBOL_GPL(tty_prepare_flip_string_flags);
+
+
+
 /*
  *	This is probably overkill for real world processors but
  *	they are not on hot paths so a little discipline won't do 
@@ -492,6 +690,17 @@ restart:
 	if (ld == NULL)
 		return -EINVAL;
 
+	/*
+	 *	No more input please, we are switching. The new ldisc
+	 *	will update this value in the ldisc open function
+	 */
+
+	tty->receive_room = 0;
+
+	/*
+	 *	Problem: What do we do if this blocks ?
+	 */
+
 	tty_wait_until_sent(tty, 0);
 
 	if (tty->ldisc.num == ldisc) {
@@ -560,9 +769,9 @@ restart:
 	 *	we say so later on.
 	 */
 
-	work = cancel_delayed_work(&tty->flip.work);
+	work = cancel_delayed_work(&tty->buf.work);
 	/*
-	 * Wait for ->hangup_work and ->flip.work handlers to terminate
+	 * Wait for ->hangup_work and ->buf.work handlers to terminate
 	 */
 	 
 	flush_scheduled_work();
@@ -616,7 +825,7 @@ restart:
 	/* Restart it in case no characters kick it off. Safe if
 	   already running */
 	if (work)
-		schedule_delayed_work(&tty->flip.work, 1);
+		schedule_delayed_work(&tty->buf.work, 1);
 	return retval;
 }
 
@@ -1721,10 +1930,10 @@ static void release_dev(struct file * filp)
 	 */
 	clear_bit(TTY_LDISC, &tty->flags);
 	clear_bit(TTY_DONT_FLIP, &tty->flags);
-	cancel_delayed_work(&tty->flip.work);
+	cancel_delayed_work(&tty->buf.work);
 
 	/*
-	 * Wait for ->hangup_work and ->flip.work handlers to terminate
+	 * Wait for ->hangup_work and ->buf.work handlers to terminate
 	 */
 	 
 	flush_scheduled_work();
@@ -2518,17 +2727,15 @@ EXPORT_SYMBOL(do_SAK);
 
 /*
  * This routine is called out of the software interrupt to flush data
- * from the flip buffer to the line discipline. 
+ * from the buffer chain to the line discipline.
  */
  
 static void flush_to_ldisc(void *private_)
 {
 	struct tty_struct *tty = (struct tty_struct *) private_;
-	unsigned char	*cp;
-	char		*fp;
-	int		count;
 	unsigned long 	flags;
 	struct tty_ldisc *disc;
+	struct tty_buffer *tbuf;
 
 	disc = tty_ldisc_ref(tty);
 	if (disc == NULL)	/*  !TTY_LDISC */
@@ -2538,28 +2745,22 @@ static void flush_to_ldisc(void *private_)
 		/*
 		 * Do it after the next timer tick:
 		 */
-		schedule_delayed_work(&tty->flip.work, 1);
+		schedule_delayed_work(&tty->buf.work, 1);
 		goto out;
 	}
 	spin_lock_irqsave(&tty->read_lock, flags);
-	if (tty->flip.buf_num) {
-		cp = tty->flip.char_buf + TTY_FLIPBUF_SIZE;
-		fp = tty->flip.flag_buf + TTY_FLIPBUF_SIZE;
-		tty->flip.buf_num = 0;
-		tty->flip.char_buf_ptr = tty->flip.char_buf;
-		tty->flip.flag_buf_ptr = tty->flip.flag_buf;
-	} else {
-		cp = tty->flip.char_buf;
-		fp = tty->flip.flag_buf;
-		tty->flip.buf_num = 1;
-		tty->flip.char_buf_ptr = tty->flip.char_buf + TTY_FLIPBUF_SIZE;
-		tty->flip.flag_buf_ptr = tty->flip.flag_buf + TTY_FLIPBUF_SIZE;
-	}
-	count = tty->flip.count;
-	tty->flip.count = 0;
+	while((tbuf = tty->buf.head) != NULL) {
+		tty->buf.head = tbuf->next;
+		spin_unlock_irqrestore(&tty->read_lock, flags);
+		/* printk("Process buffer %p for %d\n", tbuf, tbuf->used); */
+		disc->receive_buf(tty, tbuf->char_buf_ptr,
+				       tbuf->flag_buf_ptr,
+				       tbuf->used);
+		spin_lock_irqsave(&tty->read_lock, flags);
+		tty_buffer_free(tty, tbuf);
+	}
+	tty->buf.tail = NULL;
 	spin_unlock_irqrestore(&tty->read_lock, flags);
-
-	disc->receive_buf(tty, cp, fp, count);
 out:
 	tty_ldisc_deref(disc);
 }
@@ -2654,11 +2855,12 @@ void tty_flip_buffer_push(struct tty_struct *tty)
 	if (tty->low_latency)
 		flush_to_ldisc((void *) tty);
 	else
-		schedule_delayed_work(&tty->flip.work, 1);
+		schedule_delayed_work(&tty->buf.work, 1);
 }
 
 EXPORT_SYMBOL(tty_flip_buffer_push);
 
+
 /*
  * This subroutine initializes a tty structure.
  */
@@ -2669,10 +2871,10 @@ static void initialize_tty_struct(struct tty_struct *tty)
 	tty_ldisc_assign(tty, tty_ldisc_get(N_TTY));
 	tty->pgrp = -1;
 	tty->overrun_time = jiffies;
-	tty->flip.char_buf_ptr = tty->flip.char_buf;
-	tty->flip.flag_buf_ptr = tty->flip.flag_buf;
-	INIT_WORK(&tty->flip.work, flush_to_ldisc, tty);
-	init_MUTEX(&tty->flip.pty_sem);
+	tty->buf.head = tty->buf.tail = NULL;
+	tty_buffer_init(tty);
+	INIT_WORK(&tty->buf.work, flush_to_ldisc, tty);
+	init_MUTEX(&tty->buf.pty_sem);
 	init_MUTEX(&tty->termios_sem);
 	init_waitqueue_head(&tty->write_wait);
 	init_waitqueue_head(&tty->read_wait);
diff --git a/drivers/char/viocons.c b/drivers/char/viocons.c
index 4d75c261f98a..cb82ebf4cb07 100644
--- a/drivers/char/viocons.c
+++ b/drivers/char/viocons.c
@@ -993,11 +993,10 @@ static void vioHandleData(struct HvLpEvent *event)
 		 * Don't attempt to copy more data into the buffer than we
 		 * have room for because it would fail without indication.
 		 */
-		if ((tty->flip.count + 1) > TTY_FLIPBUF_SIZE) {
+		if(tty_insert_flip_char(tty, cevent->data[index], TTY_NORMAL) == 0) {
 			printk(VIOCONS_KERN_WARN "input buffer overflow!\n");
 			break;
 		}
-		tty_insert_flip_char(tty, cevent->data[index], TTY_NORMAL);
 	}
 
 	/* if cevent->len == 0 then no data was added to the buffer and flip.count == 0 */
diff --git a/drivers/char/vme_scc.c b/drivers/char/vme_scc.c
index 33e71e23b212..d9325281e482 100644
--- a/drivers/char/vme_scc.c
+++ b/drivers/char/vme_scc.c
@@ -434,13 +434,7 @@ static irqreturn_t scc_rx_int(int irq, void *data, struct pt_regs *fp)
 		SCCwrite_NB(COMMAND_REG, CR_HIGHEST_IUS_RESET);
 		return IRQ_HANDLED;
 	}
-	if (tty->flip.count < TTY_FLIPBUF_SIZE) {
-		*tty->flip.char_buf_ptr = ch;
-		*tty->flip.flag_buf_ptr = 0;
-		tty->flip.flag_buf_ptr++;
-		tty->flip.char_buf_ptr++;
-		tty->flip.count++;
-	}
+	tty_insert_flip_char(tty, ch, 0);
 
 	/* Check if another character is already ready; in that case, the
 	 * spcond_int() function must be used, because this character may have an
@@ -487,13 +481,7 @@ static irqreturn_t scc_spcond_int(int irq, void *data, struct pt_regs *fp)
 		else
 			err = 0;
 
-		if (tty->flip.count < TTY_FLIPBUF_SIZE) {
-			*tty->flip.char_buf_ptr = ch;
-			*tty->flip.flag_buf_ptr = err;
-			tty->flip.flag_buf_ptr++;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
-		}
+		tty_insert_flip_char(tty, ch, err);
 
 		/* ++TeSche: *All* errors have to be cleared manually,
 		 * else the condition persists for the next chars
diff --git a/drivers/input/serio/serport.c b/drivers/input/serio/serport.c
index 1bd88fca0542..54a680cc704d 100644
--- a/drivers/input/serio/serport.c
+++ b/drivers/input/serio/serport.c
@@ -96,6 +96,7 @@ static int serport_ldisc_open(struct tty_struct *tty)
 	init_waitqueue_head(&serport->wait);
 
 	tty->disc_data = serport;
+	tty->receive_room = 256;
 	set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
 
 	return 0;
@@ -139,17 +140,6 @@ out:
 	spin_unlock_irqrestore(&serport->lock, flags);
 }
 
-/*
- * serport_ldisc_room() reports how much room we do have for receiving data.
- * Although we in fact have infinite room, we need to specify some value
- * here, and 256 seems to be reasonable.
- */
-
-static int serport_ldisc_room(struct tty_struct *tty)
-{
-	return 256;
-}
-
 /*
  * serport_ldisc_read() just waits indefinitely if everything goes well.
  * However, when the serio driver closes the serio port, it finishes,
@@ -237,7 +227,6 @@ static struct tty_ldisc serport_ldisc = {
 	.read =		serport_ldisc_read,
 	.ioctl =	serport_ldisc_ioctl,
 	.receive_buf =	serport_ldisc_receive,
-	.receive_room =	serport_ldisc_room,
 	.write_wakeup =	serport_ldisc_write_wakeup
 };
 
diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c
index 11ae0fddea04..623adbb0d13a 100644
--- a/drivers/isdn/capi/capi.c
+++ b/drivers/isdn/capi/capi.c
@@ -463,8 +463,7 @@ static int handle_recv_skb(struct capiminor *mp, struct sk_buff *skb)
 #endif
 		goto bad;
 	}
-	if (ld->receive_room &&
-	    ld->receive_room(mp->tty) < datalen) {
+	if (mp->tty->receive_room < datalen) {
 #if defined(_DEBUG_DATAFLOW) || defined(_DEBUG_TTYFUNCS)
 		printk(KERN_DEBUG "capi: no room in tty\n");
 #endif
diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c
index 4643df097bfe..22759c01746a 100644
--- a/drivers/isdn/i4l/isdn_common.c
+++ b/drivers/isdn/i4l/isdn_common.c
@@ -857,6 +857,118 @@ isdn_readbchan(int di, int channel, u_char * buf, u_char * fp, int len, wait_que
 	return count;
 }
 
+/*
+ * isdn_readbchan_tty() tries to get data from the read-queue.
+ * It MUST be called with interrupts off.
+ *
+ * Be aware that this is not an atomic operation when sleep != 0, even though
+ * interrupts are turned off! Well, like that we are currently only called
+ * on behalf of a read system call on raw device files (which are documented
+ * to be dangerous and for for debugging purpose only). The inode semaphore
+ * takes care that this is not called for the same minor device number while
+ * we are sleeping, but access is not serialized against simultaneous read()
+ * from the corresponding ttyI device. Can other ugly events, like changes
+ * of the mapping (di,ch)<->minor, happen during the sleep? --he
+ */
+int
+isdn_readbchan_tty(int di, int channel, struct tty_struct *tty, int cisco_hack)
+{
+	int count;
+	int count_pull;
+	int count_put;
+	int dflag;
+	struct sk_buff *skb;
+	char last = 0;
+	int len;
+
+	if (!dev->drv[di])
+		return 0;
+	if (skb_queue_empty(&dev->drv[di]->rpqueue[channel]))
+			return 0;
+
+	len = tty_buffer_request_room(tty, dev->drv[di]->rcvcount[channel]);
+	if(len == 0)
+		return len;
+
+	count = 0;
+	while (len) {
+		if (!(skb = skb_peek(&dev->drv[di]->rpqueue[channel])))
+			break;
+#ifdef CONFIG_ISDN_AUDIO
+		if (ISDN_AUDIO_SKB_LOCK(skb))
+			break;
+		ISDN_AUDIO_SKB_LOCK(skb) = 1;
+		if ((ISDN_AUDIO_SKB_DLECOUNT(skb)) || (dev->drv[di]->DLEflag & (1 << channel))) {
+			char *p = skb->data;
+			unsigned long DLEmask = (1 << channel);
+
+			dflag = 0;
+			count_pull = count_put = 0;
+			while ((count_pull < skb->len) && (len > 0)) {
+				len--;
+				if (dev->drv[di]->DLEflag & DLEmask) {
+					last = DLE;
+					dev->drv[di]->DLEflag &= ~DLEmask;
+				} else {
+					last = *p;
+					if (last == DLE) {
+						dev->drv[di]->DLEflag |= DLEmask;
+						(ISDN_AUDIO_SKB_DLECOUNT(skb))--;
+					}
+					p++;
+					count_pull++;
+				}
+				count_put++;
+			}
+			if (count_pull >= skb->len)
+				dflag = 1;
+		} else {
+#endif
+			/* No DLE's in buff, so simply copy it */
+			dflag = 1;
+			if ((count_pull = skb->len) > len) {
+				count_pull = len;
+				dflag = 0;
+			}
+			count_put = count_pull;
+			if(count_put > 1)
+				tty_insert_flip_string(tty, skb->data, count_put - 1);
+			last = skb->data[count_put] - 1;
+			len -= count_put;
+#ifdef CONFIG_ISDN_AUDIO
+		}
+#endif
+		count += count_put;
+		if (dflag) {
+			/* We got all the data in this buff.
+			 * Now we can dequeue it.
+			 */
+			if(cisco_hack)
+				tty_insert_flip_char(tty, last, 0xFF);
+			else
+				tty_insert_flip_char(tty, last, TTY_NORMAL);
+#ifdef CONFIG_ISDN_AUDIO
+			ISDN_AUDIO_SKB_LOCK(skb) = 0;
+#endif
+			skb = skb_dequeue(&dev->drv[di]->rpqueue[channel]);
+			dev_kfree_skb(skb);
+		} else {
+			tty_insert_flip_char(tty, last, TTY_NORMAL);
+			/* Not yet emptied this buff, so it
+			 * must stay in the queue, for further calls
+			 * but we pull off the data we got until now.
+			 */
+			skb_pull(skb, count_pull);
+#ifdef CONFIG_ISDN_AUDIO
+			ISDN_AUDIO_SKB_LOCK(skb) = 0;
+#endif
+		}
+		dev->drv[di]->rcvcount[channel] -= count_put;
+	}
+	return count;
+}
+
+
 static __inline int
 isdn_minor2drv(int minor)
 {
diff --git a/drivers/isdn/i4l/isdn_common.h b/drivers/isdn/i4l/isdn_common.h
index e27e9c3a81ed..082735dbb412 100644
--- a/drivers/isdn/i4l/isdn_common.h
+++ b/drivers/isdn/i4l/isdn_common.h
@@ -37,6 +37,7 @@ extern void isdn_timer_ctrl(int tf, int onoff);
 extern void isdn_unexclusive_channel(int di, int ch);
 extern int  isdn_getnum(char **);
 extern int  isdn_readbchan(int, int, u_char *, u_char *, int, wait_queue_head_t *);
+extern int  isdn_readbchan_tty(int, int, struct tty_struct *, int);
 extern int  isdn_get_free_channel(int, int, int, int, int, char *);
 extern int  isdn_writebuf_skb_stub(int, int, int, struct sk_buff *);
 extern int  register_isdn(isdn_if * i);
diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c
index 8c404b4e2482..f190a99604f0 100644
--- a/drivers/isdn/i4l/isdn_tty.c
+++ b/drivers/isdn/i4l/isdn_tty.c
@@ -64,37 +64,42 @@ isdn_tty_try_read(modem_info * info, struct sk_buff *skb)
 	int c;
 	int len;
 	struct tty_struct *tty;
+	char last;
 
 	if (info->online) {
 		if ((tty = info->tty)) {
 			if (info->mcr & UART_MCR_RTS) {
-				c = TTY_FLIPBUF_SIZE - tty->flip.count;
 				len = skb->len
 #ifdef CONFIG_ISDN_AUDIO
 					+ ISDN_AUDIO_SKB_DLECOUNT(skb)
 #endif
 					;
+
+				c = tty_buffer_request_room(tty, len);
 				if (c >= len) {
 #ifdef CONFIG_ISDN_AUDIO
-					if (ISDN_AUDIO_SKB_DLECOUNT(skb))
-						while (skb->len--) {
+					if (ISDN_AUDIO_SKB_DLECOUNT(skb)) {
+						int l = skb->len;
+						unsigned char *dp = skb->data;
+						while (--l) {
 							if (*skb->data == DLE)
 								tty_insert_flip_char(tty, DLE, 0);
-							tty_insert_flip_char(tty, *skb->data++, 0);
+							tty_insert_flip_char(tty, *dp++, 0);
+						}
+						last = *dp;
 					} else {
 #endif
-						memcpy(tty->flip.char_buf_ptr,
-						       skb->data, len);
-						tty->flip.count += len;
-						tty->flip.char_buf_ptr += len;
-						memset(tty->flip.flag_buf_ptr, 0, len);
-						tty->flip.flag_buf_ptr += len;
+						if(len > 1)
+							tty_insert_flip_string(tty, skb->data, len - 1);
+						last = skb->data[len - 1];
 #ifdef CONFIG_ISDN_AUDIO
 					}
 #endif
 					if (info->emu.mdmreg[REG_CPPP] & BIT_CPPP)
-						tty->flip.flag_buf_ptr[len - 1] = 0xff;
-					schedule_delayed_work(&tty->flip.work, 1);
+						tty_insert_flip_char(tty, last, 0xFF);
+					else
+						tty_insert_flip_char(tty, last, TTY_NORMAL);
+					tty_flip_buffer_push(tty);
 					kfree_skb(skb);
 					return 1;
 				}
@@ -114,7 +119,6 @@ isdn_tty_readmodem(void)
 	int resched = 0;
 	int midx;
 	int i;
-	int c;
 	int r;
 	struct tty_struct *tty;
 	modem_info *info;
@@ -131,20 +135,13 @@ isdn_tty_readmodem(void)
 #endif
 				if ((tty = info->tty)) {
 					if (info->mcr & UART_MCR_RTS) {
-						c = TTY_FLIPBUF_SIZE - tty->flip.count;
-						if (c > 0) {
-							r = isdn_readbchan(info->isdn_driver, info->isdn_channel,
-									   tty->flip.char_buf_ptr,
-									   tty->flip.flag_buf_ptr, c, NULL);
-							/* CISCO AsyncPPP Hack */
-							if (!(info->emu.mdmreg[REG_CPPP] & BIT_CPPP))
-								memset(tty->flip.flag_buf_ptr, 0, r);
-							tty->flip.count += r;
-							tty->flip.flag_buf_ptr += r;
-							tty->flip.char_buf_ptr += r;
-							if (r)
-								schedule_delayed_work(&tty->flip.work, 1);
-						}
+						/* CISCO AsyncPPP Hack */
+						if (!(info->emu.mdmreg[REG_CPPP] & BIT_CPPP))
+							r = isdn_readbchan_tty(info->isdn_driver, info->isdn_channel, tty, 0);
+						else
+							r = isdn_readbchan_tty(info->isdn_driver, info->isdn_channel, tty, 1);
+						if (r)
+							tty_flip_buffer_push(tty);
 					} else
 						r = 1;
 				} else
@@ -249,7 +246,7 @@ isdn_tty_rcv_skb(int i, int di, int channel, struct sk_buff *skb)
 	}
 #endif
 #endif
-	/* Try to deliver directly via tty-flip-buf if queue is empty */
+	/* Try to deliver directly via tty-buf if queue is empty */
 	spin_lock_irqsave(&info->readlock, flags);
 	if (skb_queue_empty(&dev->drv[di]->rpqueue[channel]))
 		if (isdn_tty_try_read(info, skb)) {
@@ -534,7 +531,7 @@ isdn_tty_senddown(modem_info * info)
 /* The next routine is called once from within timer-interrupt
  * triggered within isdn_tty_modem_ncarrier(). It calls
  * isdn_tty_modem_result() to stuff a "NO CARRIER" Message
- * into the tty's flip-buffer.
+ * into the tty's buffer.
  */
 static void
 isdn_tty_modem_do_ncarrier(unsigned long data)
@@ -2347,6 +2344,7 @@ isdn_tty_at_cout(char *msg, modem_info * info)
 	u_long flags;
 	struct sk_buff *skb = NULL;
 	char *sp = NULL;
+	int l = strlen(msg);
 
 	if (!msg) {
 		printk(KERN_WARNING "isdn_tty: Null-Message in isdn_tty_at_cout\n");
@@ -2359,16 +2357,16 @@ isdn_tty_at_cout(char *msg, modem_info * info)
 		return;
 	}
 
-	/* use queue instead of direct flip, if online and */
-	/* data is in queue or flip buffer is full */
-	if ((info->online) && (((tty->flip.count + strlen(msg)) >= TTY_FLIPBUF_SIZE) ||
-	    (!skb_queue_empty(&dev->drv[info->isdn_driver]->rpqueue[info->isdn_channel])))) {
-		skb = alloc_skb(strlen(msg), GFP_ATOMIC);
+	/* use queue instead of direct, if online and */
+	/* data is in queue or buffer is full */
+	if ((info->online && tty_buffer_request_room(tty, l) < l) ||
+	    (!skb_queue_empty(&dev->drv[info->isdn_driver]->rpqueue[info->isdn_channel]))) {
+		skb = alloc_skb(l, GFP_ATOMIC);
 		if (!skb) {
 			spin_unlock_irqrestore(&info->readlock, flags);
 			return;
 		}
-		sp = skb_put(skb, strlen(msg));
+		sp = skb_put(skb, l);
 #ifdef CONFIG_ISDN_AUDIO
 		ISDN_AUDIO_SKB_DLECOUNT(skb) = 0;
 		ISDN_AUDIO_SKB_LOCK(skb) = 0;
@@ -2392,9 +2390,8 @@ isdn_tty_at_cout(char *msg, modem_info * info)
 		if (skb) {
 			*sp++ = c;
 		} else {
-			if (tty->flip.count >= TTY_FLIPBUF_SIZE)
+			if(tty_insert_flip_char(tty, c, TTY_NORMAL) == 0)
 				break;
-			tty_insert_flip_char(tty, c, 0);
 		}
 	}
 	if (skb) {
@@ -2402,12 +2399,12 @@ isdn_tty_at_cout(char *msg, modem_info * info)
 		dev->drv[info->isdn_driver]->rcvcount[info->isdn_channel] += skb->len;
 		spin_unlock_irqrestore(&info->readlock, flags);
 		/* Schedule dequeuing */
-		if ((dev->modempoll) && (info->rcvsched))
+		if (dev->modempoll && info->rcvsched)
 			isdn_timer_ctrl(ISDN_TIMER_MODEMREAD, 1);
 
 	} else {
 		spin_unlock_irqrestore(&info->readlock, flags);
-		schedule_delayed_work(&tty->flip.work, 1);
+		tty_flip_buffer_push(tty);
 	}
 }
 
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index 90999867a32c..102c1f0b90da 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -456,11 +456,6 @@ out:
 
 /* ----------------------------------------------------------------------- */
 
-static int sixpack_receive_room(struct tty_struct *tty)
-{
-	return 65536;  /* We can handle an infinite amount of data. :-) */
-}
-
 /*
  * Handle the 'receiver data ready' interrupt.
  * This function is called by the 'tty_io' module in the kernel when
@@ -671,6 +666,7 @@ static int sixpack_open(struct tty_struct *tty)
 
 	/* Done.  We have linked the TTY line to a channel. */
 	tty->disc_data = sp;
+	tty->receive_room = 65536;
 
 	/* Now we're ready to register. */
 	if (register_netdev(dev))
@@ -802,7 +798,6 @@ static struct tty_ldisc sp_ldisc = {
 	.close		= sixpack_close,
 	.ioctl		= sixpack_ioctl,
 	.receive_buf	= sixpack_receive_buf,
-	.receive_room	= sixpack_receive_room,
 	.write_wakeup	= sixpack_write_wakeup,
 };
 
diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c
index f4424cf886c5..dc5e9d59deed 100644
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -753,6 +753,7 @@ static int mkiss_open(struct tty_struct *tty)
 
 	ax->tty = tty;
 	tty->disc_data = ax;
+	tty->receive_room = 65535;
 
 	if (tty->driver->flush_buffer)
 		tty->driver->flush_buffer(tty);
@@ -940,11 +941,6 @@ static void mkiss_receive_buf(struct tty_struct *tty, const unsigned char *cp,
 		tty->driver->unthrottle(tty);
 }
 
-static int mkiss_receive_room(struct tty_struct *tty)
-{
-	return 65536;  /* We can handle an infinite amount of data. :-) */
-}
-
 /*
  * Called by the driver when there's room for more data.  If we have
  * more packets to send, we send them here.
@@ -983,7 +979,6 @@ static struct tty_ldisc ax_ldisc = {
 	.close		= mkiss_close,
 	.ioctl		= mkiss_ioctl,
 	.receive_buf	= mkiss_receive_buf,
-	.receive_room	= mkiss_receive_room,
 	.write_wakeup	= mkiss_write_wakeup
 };
 
diff --git a/drivers/net/irda/irtty-sir.c b/drivers/net/irda/irtty-sir.c
index b8d112348ba4..101750bf210f 100644
--- a/drivers/net/irda/irtty-sir.c
+++ b/drivers/net/irda/irtty-sir.c
@@ -288,22 +288,6 @@ static void irtty_receive_buf(struct tty_struct *tty, const unsigned char *cp,
 	sirdev_receive(dev, cp, count);
 }
 
-/*
- * Function irtty_receive_room (tty)
- *
- *    Used by the TTY to find out how much data we can receive at a time
- * 
-*/
-static int irtty_receive_room(struct tty_struct *tty) 
-{
-	struct sirtty_cb *priv = tty->disc_data;
-
-	IRDA_ASSERT(priv != NULL, return 0;);
-	IRDA_ASSERT(priv->magic == IRTTY_MAGIC, return 0;);
-
-	return 65536;  /* We can handle an infinite amount of data. :-) */
-}
-
 /*
  * Function irtty_write_wakeup (tty)
  *
@@ -534,6 +518,7 @@ static int irtty_open(struct tty_struct *tty)
 
 	dev->priv = priv;
 	tty->disc_data = priv;
+	tty->receive_room = 65536;
 
 	up(&irtty_sem);
 
@@ -605,7 +590,6 @@ static struct tty_ldisc irda_ldisc = {
 	.ioctl		= irtty_ioctl,
  	.poll		= NULL,
 	.receive_buf	= irtty_receive_buf,
-	.receive_room	= irtty_receive_room,
 	.write_wakeup	= irtty_write_wakeup,
 	.owner		= THIS_MODULE,
 };
diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c
index 400f652282d7..aa6540b39466 100644
--- a/drivers/net/ppp_async.c
+++ b/drivers/net/ppp_async.c
@@ -189,7 +189,7 @@ ppp_asynctty_open(struct tty_struct *tty)
 		goto out_free;
 
 	tty->disc_data = ap;
-
+	tty->receive_room = 65536;
 	return 0;
 
  out_free:
@@ -343,12 +343,6 @@ ppp_asynctty_poll(struct tty_struct *tty, struct file *file, poll_table *wait)
 	return 0;
 }
 
-static int
-ppp_asynctty_room(struct tty_struct *tty)
-{
-	return 65535;
-}
-
 /*
  * This can now be called from hard interrupt level as well
  * as soft interrupt level or mainline.
@@ -398,7 +392,6 @@ static struct tty_ldisc ppp_ldisc = {
 	.write	= ppp_asynctty_write,
 	.ioctl	= ppp_asynctty_ioctl,
 	.poll	= ppp_asynctty_poll,
-	.receive_room = ppp_asynctty_room,
 	.receive_buf = ppp_asynctty_receive,
 	.write_wakeup = ppp_asynctty_wakeup,
 };
diff --git a/drivers/net/ppp_synctty.c b/drivers/net/ppp_synctty.c
index 4d51c0c8023d..33cb8254e79d 100644
--- a/drivers/net/ppp_synctty.c
+++ b/drivers/net/ppp_synctty.c
@@ -237,7 +237,7 @@ ppp_sync_open(struct tty_struct *tty)
 		goto out_free;
 
 	tty->disc_data = ap;
-
+	tty->receive_room = 65536;
 	return 0;
 
  out_free:
@@ -384,12 +384,6 @@ ppp_sync_poll(struct tty_struct *tty, struct file *file, poll_table *wait)
 	return 0;
 }
 
-static int
-ppp_sync_room(struct tty_struct *tty)
-{
-	return 65535;
-}
-
 /*
  * This can now be called from hard interrupt level as well
  * as soft interrupt level or mainline.
@@ -439,7 +433,6 @@ static struct tty_ldisc ppp_sync_ldisc = {
 	.write	= ppp_sync_write,
 	.ioctl	= ppp_synctty_ioctl,
 	.poll	= ppp_sync_poll,
-	.receive_room = ppp_sync_room,
 	.receive_buf = ppp_sync_receive,
 	.write_wakeup = ppp_sync_wakeup,
 };
diff --git a/drivers/net/slip.c b/drivers/net/slip.c
index 404ea4297e32..b2e18d28850d 100644
--- a/drivers/net/slip.c
+++ b/drivers/net/slip.c
@@ -651,11 +651,6 @@ static void sl_setup(struct net_device *dev)
  ******************************************/
 
 
-static int slip_receive_room(struct tty_struct *tty)
-{
-	return 65536;  /* We can handle an infinite amount of data. :-) */
-}
-
 /*
  * Handle the 'receiver data ready' interrupt.
  * This function is called by the 'tty_io' module in the kernel when
@@ -869,10 +864,6 @@ static int slip_open(struct tty_struct *tty)
 	sl->line = tty_devnum(tty);
 	sl->pid = current->pid;
 	
-	/* FIXME: already done before we were called - seems this can go */
-	if (tty->driver->flush_buffer)
-		tty->driver->flush_buffer(tty);
-		
 	if (!test_bit(SLF_INUSE, &sl->flags)) {
 		/* Perform the low-level SLIP initialization. */
 		if ((err = sl_alloc_bufs(sl, SL_MTU)) != 0)
@@ -897,6 +888,7 @@ static int slip_open(struct tty_struct *tty)
 
 	/* Done.  We have linked the TTY line to a channel. */
 	rtnl_unlock();
+	tty->receive_room = 65536;	/* We don't flow control */
 	return sl->dev->base_addr;
 
 err_free_bufs:
@@ -1329,7 +1321,6 @@ static struct tty_ldisc	sl_ldisc = {
 	.close	 	= slip_close,
 	.ioctl		= slip_ioctl,
 	.receive_buf	= slip_receive_buf,
-	.receive_room	= slip_receive_room,
 	.write_wakeup	= slip_write_wakeup,
 };
 
diff --git a/drivers/net/wan/pc300_tty.c b/drivers/net/wan/pc300_tty.c
index 52f26b9c69d2..931cbdf6d791 100644
--- a/drivers/net/wan/pc300_tty.c
+++ b/drivers/net/wan/pc300_tty.c
@@ -689,7 +689,7 @@ static void cpc_tty_rx_work(void * data)
 					}
 				}	
 				cpc_tty->buf_rx.first = cpc_tty->buf_rx.first->next;
-				kfree(buf);
+				kfree((void *)buf);
 				buf = cpc_tty->buf_rx.first;
 				flg_rx = 1;
 			}
diff --git a/drivers/net/wan/x25_asy.c b/drivers/net/wan/x25_asy.c
index bdf672c48182..9c3ccc669143 100644
--- a/drivers/net/wan/x25_asy.c
+++ b/drivers/net/wan/x25_asy.c
@@ -515,11 +515,6 @@ static int x25_asy_close(struct net_device *dev)
 	return 0;
 }
 
-static int x25_asy_receive_room(struct tty_struct *tty)
-{
-	return 65536;  /* We can handle an infinite amount of data. :-) */
-}
-
 /*
  * Handle the 'receiver data ready' interrupt.
  * This function is called by the 'tty_io' module in the kernel when
@@ -573,6 +568,7 @@ static int x25_asy_open_tty(struct tty_struct *tty)
 
 	sl->tty = tty;
 	tty->disc_data = sl;
+	tty->receive_room = 65536;
 	if (tty->driver->flush_buffer)  {
 		tty->driver->flush_buffer(tty);
 	}
@@ -779,7 +775,6 @@ static struct tty_ldisc x25_ldisc = {
 	.close		= x25_asy_close_tty,
 	.ioctl		= x25_asy_ioctl,
 	.receive_buf	= x25_asy_receive_buf,
-	.receive_room	= x25_asy_receive_room,
 	.write_wakeup	= x25_asy_write_wakeup,
 };
 
diff --git a/drivers/net/wireless/strip.c b/drivers/net/wireless/strip.c
index d25264ba0c0e..18baacfc5a2c 100644
--- a/drivers/net/wireless/strip.c
+++ b/drivers/net/wireless/strip.c
@@ -1675,11 +1675,6 @@ static int strip_rebuild_header(struct sk_buff *skb)
 /************************************************************************/
 /* Receiving routines							*/
 
-static int strip_receive_room(struct tty_struct *tty)
-{
-	return 0x10000;		/* We can handle an infinite amount of data. :-) */
-}
-
 /*
  * This function parses the response to the ATS300? command,
  * extracting the radio version and serial number.
@@ -2424,7 +2419,7 @@ static struct net_device_stats *strip_get_stats(struct net_device *dev)
 /*
  * Here's the order things happen:
  * When the user runs "slattach -p strip ..."
- *  1. The TTY module calls strip_open
+ *  1. The TTY module calls strip_open;;
  *  2. strip_open calls strip_alloc
  *  3.                  strip_alloc calls register_netdev
  *  4.                  register_netdev calls strip_dev_init
@@ -2652,6 +2647,8 @@ static int strip_open(struct tty_struct *tty)
 
 	strip_info->tty = tty;
 	tty->disc_data = strip_info;
+	tty->receive_room = 65536;
+
 	if (tty->driver->flush_buffer)
 		tty->driver->flush_buffer(tty);
 
@@ -2762,7 +2759,6 @@ static struct tty_ldisc strip_ldisc = {
 	.close = strip_close,
 	.ioctl = strip_ioctl,
 	.receive_buf = strip_receive_buf,
-	.receive_room = strip_receive_room,
 	.write_wakeup = strip_write_some_more,
 };
 
diff --git a/drivers/s390/char/con3215.c b/drivers/s390/char/con3215.c
index 75419cf9d353..1f060914cfa4 100644
--- a/drivers/s390/char/con3215.c
+++ b/drivers/s390/char/con3215.c
@@ -16,6 +16,7 @@
 #include <linux/types.h>
 #include <linux/kdev_t.h>
 #include <linux/tty.h>
+#include <linux/tty_flip.h>
 #include <linux/vt_kern.h>
 #include <linux/init.h>
 #include <linux/console.h>
@@ -432,8 +433,6 @@ raw3215_irq(struct ccw_device *cdev, unsigned long intparm, struct irb *irb)
 				if (count > slen)
 					count = slen;
 			} else
-			if (count >= TTY_FLIPBUF_SIZE - tty->flip.count)
-				count = TTY_FLIPBUF_SIZE - tty->flip.count - 1;
 			EBCASC(raw->inbuf, count);
 			cchar = ctrlchar_handle(raw->inbuf, count, tty);
 			switch (cchar & CTRLCHAR_MASK) {
@@ -441,28 +440,20 @@ raw3215_irq(struct ccw_device *cdev, unsigned long intparm, struct irb *irb)
 				break;
 
 			case CTRLCHAR_CTRL:
-				tty->flip.count++;
-				*tty->flip.flag_buf_ptr++ = TTY_NORMAL;
-				*tty->flip.char_buf_ptr++ = cchar;
+				tty_insert_flip_char(tty, cchar, TTY_NORMAL);
 				tty_flip_buffer_push(raw->tty);
 				break;
 
 			case CTRLCHAR_NONE:
-				memcpy(tty->flip.char_buf_ptr,
-				       raw->inbuf, count);
 				if (count < 2 ||
-				    (strncmp(raw->inbuf+count-2, "^n", 2) &&
-				    strncmp(raw->inbuf+count-2, "\252n", 2)) ) {
-					/* don't add the auto \n */
-					tty->flip.char_buf_ptr[count] = '\n';
-					memset(tty->flip.flag_buf_ptr,
-					       TTY_NORMAL, count + 1);
+				    (strncmp(raw->inbuf+count-2, "\252n", 2) &&
+				     strncmp(raw->inbuf+count-2, "^n", 2)) ) {
+					/* add the auto \n */
+					raw->inbuf[count] = '\n';
 					count++;
 				} else
-					count-=2;
-				tty->flip.char_buf_ptr += count;
-				tty->flip.flag_buf_ptr += count;
-				tty->flip.count += count;
+					count -= 2;
+				tty_insert_flip_string(tty, raw->inbuf, count);
 				tty_flip_buffer_push(raw->tty);
 				break;
 			}
diff --git a/drivers/s390/char/sclp_tty.c b/drivers/s390/char/sclp_tty.c
index a20d7c89341d..6cbf067f1a8f 100644
--- a/drivers/s390/char/sclp_tty.c
+++ b/drivers/s390/char/sclp_tty.c
@@ -13,6 +13,7 @@
 #include <linux/kmod.h>
 #include <linux/tty.h>
 #include <linux/tty_driver.h>
+#include <linux/tty_flip.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
@@ -496,25 +497,19 @@ sclp_tty_input(unsigned char* buf, unsigned int count)
 	case CTRLCHAR_SYSRQ:
 		break;
 	case CTRLCHAR_CTRL:
-		sclp_tty->flip.count++;
-		*sclp_tty->flip.flag_buf_ptr++ = TTY_NORMAL;
-		*sclp_tty->flip.char_buf_ptr++ = cchar;
+		tty_insert_flip_char(sclp_tty, cchar, TTY_NORMAL);
 		tty_flip_buffer_push(sclp_tty);
 		break;
 	case CTRLCHAR_NONE:
 		/* send (normal) input to line discipline */
-		memcpy(sclp_tty->flip.char_buf_ptr, buf, count);
 		if (count < 2 ||
-		    (strncmp ((const char *) buf + count - 2, "^n", 2) &&
-		     strncmp ((const char *) buf + count - 2, "\0252n", 2))) {
-			sclp_tty->flip.char_buf_ptr[count] = '\n';
-			count++;
+		    (strncmp((const char *) buf + count - 2, "^n", 2) &&
+		     strncmp((const char *) buf + count - 2, "\252n", 2))) {
+			/* add the auto \n */
+			tty_insert_flip_string(sclp_tty, buf, count);
+			tty_insert_flip_char(sclp_tty, '\n', TTY_NORMAL);
 		} else
-			count -= 2;
-		memset(sclp_tty->flip.flag_buf_ptr, TTY_NORMAL, count);
-		sclp_tty->flip.char_buf_ptr += count;
-		sclp_tty->flip.flag_buf_ptr += count;
-		sclp_tty->flip.count += count;
+			tty_insert_flip_string(sclp_tty, buf, count - 2);
 		tty_flip_buffer_push(sclp_tty);
 		break;
 	}
diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c
index 06bd85824d7b..9e02625c82cf 100644
--- a/drivers/s390/char/sclp_vt220.c
+++ b/drivers/s390/char/sclp_vt220.c
@@ -16,6 +16,7 @@
 #include <linux/kernel.h>
 #include <linux/tty.h>
 #include <linux/tty_driver.h>
+#include <linux/tty_flip.h>
 #include <linux/sched.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
@@ -482,16 +483,7 @@ sclp_vt220_receiver_fn(struct evbuf_header *evbuf)
 		/* Send input to line discipline */
 		buffer++;
 		count--;
-		/* Prevent buffer overrun by discarding input. Note that
-		 * because buffer_push works asynchronously, we cannot wait
-		 * for the buffer to be emptied. */
-		if (count + sclp_vt220_tty->flip.count > TTY_FLIPBUF_SIZE)
-			count = TTY_FLIPBUF_SIZE - sclp_vt220_tty->flip.count;
-		memcpy(sclp_vt220_tty->flip.char_buf_ptr, buffer, count);
-		memset(sclp_vt220_tty->flip.flag_buf_ptr, TTY_NORMAL, count);
-		sclp_vt220_tty->flip.char_buf_ptr += count;
-		sclp_vt220_tty->flip.flag_buf_ptr += count;
-		sclp_vt220_tty->flip.count += count;
+		tty_insert_flip_string(sclp_vt220_tty, buffer, count);
 		tty_flip_buffer_push(sclp_vt220_tty);
 		break;
 	}
diff --git a/drivers/s390/net/ctctty.c b/drivers/s390/net/ctctty.c
index 968f2c113efe..93d1725eb79b 100644
--- a/drivers/s390/net/ctctty.c
+++ b/drivers/s390/net/ctctty.c
@@ -25,6 +25,7 @@
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/tty.h>
+#include <linux/tty_flip.h>
 #include <linux/serial_reg.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
@@ -101,25 +102,17 @@ static spinlock_t ctc_tty_lock;
 static int
 ctc_tty_try_read(ctc_tty_info * info, struct sk_buff *skb)
 {
-	int c;
 	int len;
 	struct tty_struct *tty;
 
 	DBF_TEXT(trace, 5, __FUNCTION__);
 	if ((tty = info->tty)) {
 		if (info->mcr & UART_MCR_RTS) {
-			c = TTY_FLIPBUF_SIZE - tty->flip.count;
 			len = skb->len;
-			if (c >= len) {
-				memcpy(tty->flip.char_buf_ptr, skb->data, len);
-				memset(tty->flip.flag_buf_ptr, 0, len);
-				tty->flip.count += len;
-				tty->flip.char_buf_ptr += len;
-				tty->flip.flag_buf_ptr += len;
-				tty_flip_buffer_push(tty);
-				kfree_skb(skb);
-				return 1;
-			}
+			tty_insert_flip_string(tty, skb->data, len);
+			tty_flip_buffer_push(tty);
+			kfree_skb(skb);
+			return 1;
 		}
 	}
 	return 0;
@@ -138,19 +131,12 @@ ctc_tty_readmodem(ctc_tty_info *info)
 	DBF_TEXT(trace, 5, __FUNCTION__);
 	if ((tty = info->tty)) {
 		if (info->mcr & UART_MCR_RTS) {
-			int c = TTY_FLIPBUF_SIZE - tty->flip.count;
 			struct sk_buff *skb;
 			
-			if ((c > 0) && (skb = skb_dequeue(&info->rx_queue))) {
+			if ((skb = skb_dequeue(&info->rx_queue))) {
 				int len = skb->len;
-				if (len > c)
-					len = c;
-				memcpy(tty->flip.char_buf_ptr, skb->data, len);
+				tty_insert_flip_string(tty, skb->data, len);
 				skb_pull(skb, len);
-				memset(tty->flip.flag_buf_ptr, 0, len);
-				tty->flip.count += len;
-				tty->flip.char_buf_ptr += len;
-				tty->flip.flag_buf_ptr += len;
 				tty_flip_buffer_push(tty);
 				if (skb->len > 0)
 					skb_queue_head(&info->rx_queue, skb);
diff --git a/drivers/serial/21285.c b/drivers/serial/21285.c
index b5cf39468d18..221999bcf8fe 100644
--- a/drivers/serial/21285.c
+++ b/drivers/serial/21285.c
@@ -94,15 +94,6 @@ static irqreturn_t serial21285_rx_chars(int irq, void *dev_id, struct pt_regs *r
 
 	status = *CSR_UARTFLG;
 	while (!(status & 0x10) && max_count--) {
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE) {
-			if (tty->low_latency)
-				tty_flip_buffer_push(tty);
-			/*
-			 * If this failed then we will throw away the
-			 * bytes but must do so to clear interrupts
-			 */
-		}
-
 		ch = *CSR_UARTDR;
 		flag = TTY_NORMAL;
 		port->icount.rx++;
diff --git a/drivers/serial/68328serial.c b/drivers/serial/68328serial.c
index 67e9afa000c1..4dd5c3f98167 100644
--- a/drivers/serial/68328serial.c
+++ b/drivers/serial/68328serial.c
@@ -294,7 +294,7 @@ static _INLINE_ void receive_chars(struct m68k_serial *info, struct pt_regs *reg
 {
 	struct tty_struct *tty = info->tty;
 	m68328_uart *uart = &uart_addr[info->line];
-	unsigned char ch;
+	unsigned char ch, flag;
 
 	/*
 	 * This do { } while() loop will get ALL chars out of Rx FIFO 
@@ -332,26 +332,24 @@ static _INLINE_ void receive_chars(struct m68k_serial *info, struct pt_regs *reg
 		/*
 		 * Make sure that we do not overflow the buffer
 		 */
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE) {
+		if (tty_request_buffer_room(tty, 1) == 0) {
 			schedule_work(&tty->flip.work);
 			return;
 		}
 
+		flag = TTY_NORMAL;
+
 		if(rx & URX_PARITY_ERROR) {
-			*tty->flip.flag_buf_ptr++ = TTY_PARITY;
+			flag = TTY_PARITY;
 			status_handle(info, rx);
 		} else if(rx & URX_OVRUN) {
-			*tty->flip.flag_buf_ptr++ = TTY_OVERRUN;
+			flag = TTY_OVERRUN;
 			status_handle(info, rx);
 		} else if(rx & URX_FRAME_ERROR) {
-			*tty->flip.flag_buf_ptr++ = TTY_FRAME;
+			flag = TTY_FRAME;
 			status_handle(info, rx);
-		} else {
-			*tty->flip.flag_buf_ptr++ = 0; /* XXX */
 		}
-                *tty->flip.char_buf_ptr++ = ch;
-		tty->flip.count++;
-
+		tty_insert_flip_char(tty, ch, flag);
 #ifndef CONFIG_XCOPILOT_BUGS
 	} while((rx = uart->urx.w) & URX_DATA_READY);
 #endif
diff --git a/drivers/serial/68360serial.c b/drivers/serial/68360serial.c
index 170c9d2a749c..60f5a5dc17f1 100644
--- a/drivers/serial/68360serial.c
+++ b/drivers/serial/68360serial.c
@@ -394,7 +394,7 @@ static void rs_360_start(struct tty_struct *tty)
 static _INLINE_ void receive_chars(ser_info_t *info)
 {
 	struct tty_struct *tty = info->tty;
-	unsigned char ch, *cp;
+	unsigned char ch, flag, *cp;
 	/*int	ignored = 0;*/
 	int	i;
 	ushort	status;
@@ -438,24 +438,15 @@ static _INLINE_ void receive_chars(ser_info_t *info)
 		cp = (char *)bdp->buf;
 		status = bdp->status;
 
-		/* Check to see if there is room in the tty buffer for
-		 * the characters in our BD buffer.  If not, we exit
-		 * now, leaving the BD with the characters.  We'll pick
-		 * them up again on the next receive interrupt (which could
-		 * be a timeout).
-		 */
-		if ((tty->flip.count + i) >= TTY_FLIPBUF_SIZE)
-			break;
-
 		while (i-- > 0) {
 			ch = *cp++;
-			*tty->flip.char_buf_ptr = ch;
 			icount->rx++;
 
 #ifdef SERIAL_DEBUG_INTR
 			printk("DR%02x:%02x...", ch, status);
 #endif
-			*tty->flip.flag_buf_ptr = 0;
+			flag = TTY_NORMAL;
+
 			if (status & (BD_SC_BR | BD_SC_FR |
 				       BD_SC_PR | BD_SC_OV)) {
 				/*
@@ -490,30 +481,18 @@ static _INLINE_ void receive_chars(ser_info_t *info)
 					if (info->flags & ASYNC_SAK)
 						do_SAK(tty);
 				} else if (status & BD_SC_PR)
-					*tty->flip.flag_buf_ptr = TTY_PARITY;
+					flag = TTY_PARITY;
 				else if (status & BD_SC_FR)
-					*tty->flip.flag_buf_ptr = TTY_FRAME;
-				if (status & BD_SC_OV) {
-					/*
-					 * Overrun is special, since it's
-					 * reported immediately, and doesn't
-					 * affect the current character
-					 */
-					if (tty->flip.count < TTY_FLIPBUF_SIZE) {
-						tty->flip.count++;
-						tty->flip.flag_buf_ptr++;
-						tty->flip.char_buf_ptr++;
-						*tty->flip.flag_buf_ptr =
-								TTY_OVERRUN;
-					}
-				}
+					flag = TTY_FRAME;
 			}
-			if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-				break;
-
-			tty->flip.flag_buf_ptr++;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
+			tty_insert_flip_char(tty, ch, flag);
+			if (status & BD_SC_OV)
+				/*
+				 * Overrun is special, since it's
+				 * reported immediately, and doesn't
+				 * affect the current character
+				 */
+				tty_insert_flip_char(tty, 0, TTY_OVERRUN);
 		}
 
 		/* This BD is ready to be used again.  Clear status.
@@ -541,12 +520,7 @@ static _INLINE_ void receive_break(ser_info_t *info)
 	/* Check to see if there is room in the tty buffer for
 	 * the break.  If not, we exit now, losing the break.  FIXME
 	 */
-	if ((tty->flip.count + 1) >= TTY_FLIPBUF_SIZE)
-		return;
-	*(tty->flip.flag_buf_ptr++) = TTY_BREAK;
-	*(tty->flip.char_buf_ptr++) = 0;
-	tty->flip.count++;
-
+	tty_insert_flip_char(tty, 0, TTY_BREAK);
 	schedule_work(&tty->flip.work);
 }
 
diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index e8454611cb65..54e5cc0dd5f8 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -1142,19 +1142,6 @@ receive_chars(struct uart_8250_port *up, int *status, struct pt_regs *regs)
 	char flag;
 
 	do {
-		/* The following is not allowed by the tty layer and
-		   unsafe. It should be fixed ASAP */
-		if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) {
-			if (tty->low_latency) {
-				spin_unlock(&up->port.lock);
-				tty_flip_buffer_push(tty);
-				spin_lock(&up->port.lock);
-			}
-			/*
-			 * If this failed then we will throw away the
-			 * bytes but must do so to clear interrupts
-			 */
-		}
 		ch = serial_inp(up, UART_RX);
 		flag = TTY_NORMAL;
 		up->port.icount.rx++;
diff --git a/drivers/serial/amba-pl010.c b/drivers/serial/amba-pl010.c
index 48f6e872314b..3490022e9fdc 100644
--- a/drivers/serial/amba-pl010.c
+++ b/drivers/serial/amba-pl010.c
@@ -154,15 +154,6 @@ pl010_rx_chars(struct uart_port *port)
 
 	status = UART_GET_FR(port);
 	while (UART_RX_DATA(status) && max_count--) {
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE) {
-			if (tty->low_latency)
-				tty_flip_buffer_push(tty);
-			/*
-			 * If this failed then we will throw away the
-			 * bytes but must do so to clear interrupts.
-			 */
-		}
-
 		ch = UART_GET_CHAR(port);
 		flag = TTY_NORMAL;
 
diff --git a/drivers/serial/amba-pl011.c b/drivers/serial/amba-pl011.c
index 129670556162..034a029e356e 100644
--- a/drivers/serial/amba-pl011.c
+++ b/drivers/serial/amba-pl011.c
@@ -120,15 +120,6 @@ pl011_rx_chars(struct uart_amba_port *uap)
 
 	status = readw(uap->port.membase + UART01x_FR);
 	while ((status & UART01x_FR_RXFE) == 0 && max_count--) {
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE) {
-			if (tty->low_latency)
-				tty_flip_buffer_push(tty);
-			/*
-			 * If this failed then we will throw away the
-			 * bytes but must do so to clear interrupts
-			 */
-		}
-
 		ch = readw(uap->port.membase + UART01x_DR) | UART_DUMMY_DR_RX;
 		flag = TTY_NORMAL;
 		uap->port.icount.rx++;
diff --git a/drivers/serial/au1x00_uart.c b/drivers/serial/au1x00_uart.c
index a274ebf256a1..ceb5d7f37bbd 100644
--- a/drivers/serial/au1x00_uart.c
+++ b/drivers/serial/au1x00_uart.c
@@ -241,18 +241,12 @@ static _INLINE_ void
 receive_chars(struct uart_8250_port *up, int *status, struct pt_regs *regs)
 {
 	struct tty_struct *tty = up->port.info->tty;
-	unsigned char ch;
+	unsigned char ch, flag;
 	int max_count = 256;
 
 	do {
-		if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) {
-			tty->flip.work.func((void *)tty);
-			if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-				return; // if TTY_DONT_FLIP is set
-		}
 		ch = serial_inp(up, UART_RX);
-		*tty->flip.char_buf_ptr = ch;
-		*tty->flip.flag_buf_ptr = TTY_NORMAL;
+		flag = TTY_NORMAL;
 		up->port.icount.rx++;
 
 		if (unlikely(*status & (UART_LSR_BI | UART_LSR_PE |
@@ -292,30 +286,23 @@ receive_chars(struct uart_8250_port *up, int *status, struct pt_regs *regs)
 #endif
 			if (*status & UART_LSR_BI) {
 				DEBUG_INTR("handling break....");
-				*tty->flip.flag_buf_ptr = TTY_BREAK;
+				flag = TTY_BREAK;
 			} else if (*status & UART_LSR_PE)
-				*tty->flip.flag_buf_ptr = TTY_PARITY;
+				flag = TTY_PARITY;
 			else if (*status & UART_LSR_FE)
-				*tty->flip.flag_buf_ptr = TTY_FRAME;
+				flag = TTY_FRAME;
 		}
 		if (uart_handle_sysrq_char(&up->port, ch, regs))
 			goto ignore_char;
-		if ((*status & up->port.ignore_status_mask) == 0) {
-			tty->flip.flag_buf_ptr++;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
-		}
-		if ((*status & UART_LSR_OE) &&
-		    tty->flip.count < TTY_FLIPBUF_SIZE) {
+		if ((*status & up->port.ignore_status_mask) == 0)
+			tty_insert_flip_char(tty, ch, flag);
+		if (*status & UART_LSR_OE)
 			/*
 			 * Overrun is special, since it's reported
 			 * immediately, and doesn't affect the current
 			 * character.
 			 */
-			*tty->flip.flag_buf_ptr = TTY_OVERRUN;
-			tty->flip.flag_buf_ptr++;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
+			tty_insert_flip_char(tty, 0, TTY_OVERRUN);
 		}
 	ignore_char:
 		*status = serial_inp(up, UART_LSR);
diff --git a/drivers/serial/clps711x.c b/drivers/serial/clps711x.c
index 87ef368384fb..8ef999481f93 100644
--- a/drivers/serial/clps711x.c
+++ b/drivers/serial/clps711x.c
@@ -104,8 +104,6 @@ static irqreturn_t clps711xuart_int_rx(int irq, void *dev_id, struct pt_regs *re
 	while (!(status & SYSFLG_URXFE)) {
 		ch = clps_readl(UARTDR(port));
 
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-			goto ignore_char;
 		port->icount.rx++;
 
 		flg = TTY_NORMAL;
diff --git a/drivers/serial/dz.c b/drivers/serial/dz.c
index 4d8516d1bb71..a64ba26a94e8 100644
--- a/drivers/serial/dz.c
+++ b/drivers/serial/dz.c
@@ -216,8 +216,6 @@ static inline void dz_receive_chars(struct dz_port *dport)
 
 		if (!tty)
 			break;
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-			break;
 
 		icount->rx++;
 
diff --git a/drivers/serial/icom.c b/drivers/serial/icom.c
index eb31125c6a30..144a7a352b28 100644
--- a/drivers/serial/icom.c
+++ b/drivers/serial/icom.c
@@ -729,19 +729,20 @@ static void recv_interrupt(u16 port_int_reg, struct icom_port *icom_port)
 	unsigned short int status;
 	struct uart_icount *icount;
 	unsigned long offset;
+	unsigned char flag;
 
 	trace(icom_port, "RCV_COMPLETE", 0);
 	rcv_buff = icom_port->next_rcv;
 
 	status = cpu_to_le16(icom_port->statStg->rcv[rcv_buff].flags);
 	while (status & SA_FL_RCV_DONE) {
+		int first = -1;
 
 		trace(icom_port, "FID_STATUS", status);
 		count = cpu_to_le16(icom_port->statStg->rcv[rcv_buff].leLength);
 
+                count = tty_buffer_request_room(tty, count);
 		trace(icom_port, "RCV_COUNT", count);
-		if (count > (TTY_FLIPBUF_SIZE - tty->flip.count))
-			count = TTY_FLIPBUF_SIZE - tty->flip.count;
 
 		trace(icom_port, "REAL_COUNT", count);
 
@@ -749,15 +750,10 @@ static void recv_interrupt(u16 port_int_reg, struct icom_port *icom_port)
 			cpu_to_le32(icom_port->statStg->rcv[rcv_buff].leBuffer) -
 			icom_port->recv_buf_pci;
 
-		memcpy(tty->flip.char_buf_ptr,(unsigned char *)
-		       ((unsigned long)icom_port->recv_buf + offset), count);
-
+		/* Block copy all but the last byte as this may have status */
 		if (count > 0) {
-			tty->flip.count += count - 1;
-			tty->flip.char_buf_ptr += count - 1;
-
-			memset(tty->flip.flag_buf_ptr, 0, count);
-			tty->flip.flag_buf_ptr += count - 1;
+			first = icom_port->recv_buf[offset];
+			tty_insert_flip_string(tty, icom_port->recv_buf + offset, count - 1);
 		}
 
 		icount = &icom_port->uart_port.icount;
@@ -765,12 +761,14 @@ static void recv_interrupt(u16 port_int_reg, struct icom_port *icom_port)
 
 		/* Break detect logic */
 		if ((status & SA_FLAGS_FRAME_ERROR)
-		    && (tty->flip.char_buf_ptr[0] == 0x00)) {
+		    && first == 0) {
 			status &= ~SA_FLAGS_FRAME_ERROR;
 			status |= SA_FLAGS_BREAK_DET;
 			trace(icom_port, "BREAK_DET", 0);
 		}
 
+		flag = TTY_NORMAL;
+
 		if (status &
 		    (SA_FLAGS_BREAK_DET | SA_FLAGS_PARITY_ERROR |
 		     SA_FLAGS_FRAME_ERROR | SA_FLAGS_OVERRUN)) {
@@ -797,33 +795,26 @@ static void recv_interrupt(u16 port_int_reg, struct icom_port *icom_port)
 			status &= icom_port->read_status_mask;
 
 			if (status & SA_FLAGS_BREAK_DET) {
-				*tty->flip.flag_buf_ptr = TTY_BREAK;
+				flag = TTY_BREAK;
 			} else if (status & SA_FLAGS_PARITY_ERROR) {
 				trace(icom_port, "PARITY_ERROR", 0);
-				*tty->flip.flag_buf_ptr = TTY_PARITY;
+				flag = TTY_PARITY;
 			} else if (status & SA_FLAGS_FRAME_ERROR)
-				*tty->flip.flag_buf_ptr = TTY_FRAME;
-
-			if (status & SA_FLAGS_OVERRUN) {
-				/*
-				 * Overrun is special, since it's
-				 * reported immediately, and doesn't
-				 * affect the current character
-				 */
-				if (tty->flip.count < TTY_FLIPBUF_SIZE) {
-					tty->flip.count++;
-					tty->flip.flag_buf_ptr++;
-					tty->flip.char_buf_ptr++;
-					*tty->flip.flag_buf_ptr = TTY_OVERRUN;
-				}
-			}
+				flag = TTY_FRAME;
+
 		}
 
-		tty->flip.flag_buf_ptr++;
-		tty->flip.char_buf_ptr++;
-		tty->flip.count++;
-		ignore_char:
-			icom_port->statStg->rcv[rcv_buff].flags = 0;
+		tty_insert_flip_char(tty, *(icom_port->recv_buf + offset + count - 1), flag);
+
+		if (status & SA_FLAGS_OVERRUN)
+			/*
+			 * Overrun is special, since it's
+			 * reported immediately, and doesn't
+			 * affect the current character
+			 */
+			tty_insert_flip_char(tty, 0, TTY_OVERRUN);
+ignore_char:
+		icom_port->statStg->rcv[rcv_buff].flags = 0;
 		icom_port->statStg->rcv[rcv_buff].leLength = 0;
 		icom_port->statStg->rcv[rcv_buff].WorkingLength =
 			(unsigned short int) cpu_to_le16(RCV_BUFF_SZ);
diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 83c4c1216587..5c098be9346b 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -256,9 +256,6 @@ static irqreturn_t imx_rxint(int irq, void *dev_id, struct pt_regs *regs)
 	error_return:
 		tty_insert_flip_char(tty, rx, flg);
 
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-			goto out;
-
 	ignore_char:
 		rx = URXD0((u32)sport->port.membase);
 	} while(rx & URXD_CHARRDY);
diff --git a/drivers/serial/ioc4_serial.c b/drivers/serial/ioc4_serial.c
index 771676abee60..1d85533d46d2 100644
--- a/drivers/serial/ioc4_serial.c
+++ b/drivers/serial/ioc4_serial.c
@@ -2327,19 +2327,13 @@ static void receive_chars(struct uart_port *the_port)
 	spin_lock_irqsave(&the_port->lock, pflags);
 	tty = info->tty;
 
-	if (request_count > TTY_FLIPBUF_SIZE - tty->flip.count)
-		request_count = TTY_FLIPBUF_SIZE - tty->flip.count;
+	request_count = tty_buffer_request_room(tty, IOC4_MAX_CHARS - 2);
 
 	if (request_count > 0) {
 		icount = &the_port->icount;
 		read_count = do_read(the_port, ch, request_count);
 		if (read_count > 0) {
-			flip = 1;
-			memcpy(tty->flip.char_buf_ptr, ch, read_count);
-			memset(tty->flip.flag_buf_ptr, TTY_NORMAL, read_count);
-			tty->flip.char_buf_ptr += read_count;
-			tty->flip.flag_buf_ptr += read_count;
-			tty->flip.count += read_count;
+			tty_insert_flip_string(tty, ch, read_count);
 			icount->rx += read_count;
 		}
 	}
diff --git a/drivers/serial/ip22zilog.c b/drivers/serial/ip22zilog.c
index ef132349f310..66f117d15065 100644
--- a/drivers/serial/ip22zilog.c
+++ b/drivers/serial/ip22zilog.c
@@ -259,13 +259,7 @@ static void ip22zilog_receive_chars(struct uart_ip22zilog_port *up,
 	struct tty_struct *tty = up->port.info->tty;	/* XXX info==NULL? */
 
 	while (1) {
-		unsigned char ch, r1;
-
-		if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) {
-			tty->flip.work.func((void *)tty);
-			if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-				return;		/* XXX Ignores SysRq when we need it most. Fix. */
-		}
+		unsigned char ch, r1, flag;
 
 		r1 = read_zsreg(channel, R1);
 		if (r1 & (PAR_ERR | Rx_OVR | CRC_ERR)) {
@@ -303,8 +297,7 @@ static void ip22zilog_receive_chars(struct uart_ip22zilog_port *up,
 		}
 
 		/* A real serial line, record the character and status.  */
-		*tty->flip.char_buf_ptr = ch;
-		*tty->flip.flag_buf_ptr = TTY_NORMAL;
+		flag = TTY_NORMAL;
 		up->port.icount.rx++;
 		if (r1 & (BRK_ABRT | PAR_ERR | Rx_OVR | CRC_ERR)) {
 			if (r1 & BRK_ABRT) {
@@ -321,28 +314,21 @@ static void ip22zilog_receive_chars(struct uart_ip22zilog_port *up,
 				up->port.icount.overrun++;
 			r1 &= up->port.read_status_mask;
 			if (r1 & BRK_ABRT)
-				*tty->flip.flag_buf_ptr = TTY_BREAK;
+				flag = TTY_BREAK;
 			else if (r1 & PAR_ERR)
-				*tty->flip.flag_buf_ptr = TTY_PARITY;
+				flag = TTY_PARITY;
 			else if (r1 & CRC_ERR)
-				*tty->flip.flag_buf_ptr = TTY_FRAME;
+				flag = TTY_FRAME;
 		}
 		if (uart_handle_sysrq_char(&up->port, ch, regs))
 			goto next_char;
 
 		if (up->port.ignore_status_mask == 0xff ||
-		    (r1 & up->port.ignore_status_mask) == 0) {
-			tty->flip.flag_buf_ptr++;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
-		}
-		if ((r1 & Rx_OVR) &&
-		    tty->flip.count < TTY_FLIPBUF_SIZE) {
-			*tty->flip.flag_buf_ptr = TTY_OVERRUN;
-			tty->flip.flag_buf_ptr++;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
-		}
+		    (r1 & up->port.ignore_status_mask) == 0)
+		    	tty_insert_flip_char(tty, ch, flag);
+
+		if (r1 & Rx_OVR)
+			tty_insert_flip_char(tty, 0, TTY_OVERRUN);
 	next_char:
 		ch = readb(&channel->control);
 		ZSDELAY();
diff --git a/drivers/serial/m32r_sio.c b/drivers/serial/m32r_sio.c
index b0ecc7537ce5..b48066a64a7d 100644
--- a/drivers/serial/m32r_sio.c
+++ b/drivers/serial/m32r_sio.c
@@ -331,17 +331,12 @@ static _INLINE_ void receive_chars(struct uart_sio_port *up, int *status,
 {
 	struct tty_struct *tty = up->port.info->tty;
 	unsigned char ch;
+	unsigned char flag;
 	int max_count = 256;
 
 	do {
-		if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) {
-			tty->flip.work.func((void *)tty);
-			if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-				return; // if TTY_DONT_FLIP is set
-		}
 		ch = sio_in(up, SIORXB);
-		*tty->flip.char_buf_ptr = ch;
-		*tty->flip.flag_buf_ptr = TTY_NORMAL;
+		flag = TTY_NORMAL;
 		up->port.icount.rx++;
 
 		if (unlikely(*status & (UART_LSR_BI | UART_LSR_PE |
@@ -380,30 +375,24 @@ static _INLINE_ void receive_chars(struct uart_sio_port *up, int *status,
 
 			if (*status & UART_LSR_BI) {
 				DEBUG_INTR("handling break....");
-				*tty->flip.flag_buf_ptr = TTY_BREAK;
+				flag = TTY_BREAK;
 			} else if (*status & UART_LSR_PE)
-				*tty->flip.flag_buf_ptr = TTY_PARITY;
+				flag = TTY_PARITY;
 			else if (*status & UART_LSR_FE)
-				*tty->flip.flag_buf_ptr = TTY_FRAME;
+				flag = TTY_FRAME;
 		}
 		if (uart_handle_sysrq_char(&up->port, ch, regs))
 			goto ignore_char;
-		if ((*status & up->port.ignore_status_mask) == 0) {
-			tty->flip.flag_buf_ptr++;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
-		}
-		if ((*status & UART_LSR_OE) &&
-		    tty->flip.count < TTY_FLIPBUF_SIZE) {
+		if ((*status & up->port.ignore_status_mask) == 0)
+			tty_insert_flip_char(tty, ch, flag);
+
+		if (*status & UART_LSR_OE) {
 			/*
 			 * Overrun is special, since it's reported
 			 * immediately, and doesn't affect the current
 			 * character.
 			 */
-			*tty->flip.flag_buf_ptr = TTY_OVERRUN;
-			tty->flip.flag_buf_ptr++;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
+			tty_insert_flip_char(tty, 0, TTY_OVERRUN);
 		}
 	ignore_char:
 		*status = serial_in(up, UART_LSR);
diff --git a/drivers/serial/mcfserial.c b/drivers/serial/mcfserial.c
index 47f7404cb045..f2a51e61eec7 100644
--- a/drivers/serial/mcfserial.c
+++ b/drivers/serial/mcfserial.c
@@ -313,7 +313,7 @@ static inline void receive_chars(struct mcf_serial *info)
 {
 	volatile unsigned char	*uartp;
 	struct tty_struct	*tty = info->tty;
-	unsigned char		status, ch;
+	unsigned char		status, ch, flag;
 
 	if (!tty)
 		return;
@@ -321,10 +321,6 @@ static inline void receive_chars(struct mcf_serial *info)
 	uartp = info->addr;
 
 	while ((status = uartp[MCFUART_USR]) & MCFUART_USR_RXREADY) {
-
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-			break;
-
 		ch = uartp[MCFUART_URB];
 		info->stats.rx++;
 
@@ -335,29 +331,24 @@ static inline void receive_chars(struct mcf_serial *info)
 		}
 #endif
 
-		tty->flip.count++;
+		flag = TTY_NORMAL;
 		if (status & MCFUART_USR_RXERR) {
 			uartp[MCFUART_UCR] = MCFUART_UCR_CMDRESETERR;
 			if (status & MCFUART_USR_RXBREAK) {
 				info->stats.rxbreak++;
-				*tty->flip.flag_buf_ptr++ = TTY_BREAK;
+				flag = TTY_BREAK;
 			} else if (status & MCFUART_USR_RXPARITY) {
 				info->stats.rxparity++;
-				*tty->flip.flag_buf_ptr++ = TTY_PARITY;
+				flag = TTY_PARITY;
 			} else if (status & MCFUART_USR_RXOVERRUN) {
 				info->stats.rxoverrun++;
-				*tty->flip.flag_buf_ptr++ = TTY_OVERRUN;
+				flag = TTY_OVERRUN;
 			} else if (status & MCFUART_USR_RXFRAMING) {
 				info->stats.rxframing++;
-				*tty->flip.flag_buf_ptr++ = TTY_FRAME;
-			} else {
-				/* This should never happen... */
-				*tty->flip.flag_buf_ptr++ = 0;
+				flag = TTY_FRAME;
 			}
-		} else {
-			*tty->flip.flag_buf_ptr++ = 0;
 		}
-		*tty->flip.char_buf_ptr++ = ch;
+		tty_insert_flip_char(tty, ch, flag);
 	}
 
 	schedule_work(&tty->flip.work);
diff --git a/drivers/serial/mpc52xx_uart.c b/drivers/serial/mpc52xx_uart.c
index 1288d6203e94..61dd17d7bace 100644
--- a/drivers/serial/mpc52xx_uart.c
+++ b/drivers/serial/mpc52xx_uart.c
@@ -405,17 +405,13 @@ static inline int
 mpc52xx_uart_int_rx_chars(struct uart_port *port, struct pt_regs *regs)
 {
 	struct tty_struct *tty = port->info->tty;
-	unsigned char ch;
+	unsigned char ch, flag;
 	unsigned short status;
 
 	/* While we can read, do so ! */
 	while ( (status = in_be16(&PSC(port)->mpc52xx_psc_status)) &
 	        MPC52xx_PSC_SR_RXRDY) {
 
-		/* If we are full, just stop reading */
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-			break;
-		
 		/* Get the char */
 		ch = in_8(&PSC(port)->mpc52xx_psc_buffer_8);
 
@@ -428,45 +424,35 @@ mpc52xx_uart_int_rx_chars(struct uart_port *port, struct pt_regs *regs)
 #endif
 
 		/* Store it */
-		*tty->flip.char_buf_ptr = ch;
-		*tty->flip.flag_buf_ptr = 0;
+
+		flag = TTY_NORMAL;
 		port->icount.rx++;
 	
 		if ( status & (MPC52xx_PSC_SR_PE |
 		               MPC52xx_PSC_SR_FE |
-		               MPC52xx_PSC_SR_RB |
-		               MPC52xx_PSC_SR_OE) ) {
+		               MPC52xx_PSC_SR_RB) ) {
 			
 			if (status & MPC52xx_PSC_SR_RB) {
-				*tty->flip.flag_buf_ptr = TTY_BREAK;
+				flag = TTY_BREAK;
 				uart_handle_break(port);
 			} else if (status & MPC52xx_PSC_SR_PE)
-				*tty->flip.flag_buf_ptr = TTY_PARITY;
+				flag = TTY_PARITY;
 			else if (status & MPC52xx_PSC_SR_FE)
-				*tty->flip.flag_buf_ptr = TTY_FRAME;
-			if (status & MPC52xx_PSC_SR_OE) {
-				/*
-				 * Overrun is special, since it's
-				 * reported immediately, and doesn't
-				 * affect the current character
-				 */
-				if (tty->flip.count < (TTY_FLIPBUF_SIZE-1)) {
-					tty->flip.flag_buf_ptr++;
-					tty->flip.char_buf_ptr++;
-					tty->flip.count++;
-				}
-				*tty->flip.flag_buf_ptr = TTY_OVERRUN;
-			}
+				flag = TTY_FRAME;
 
 			/* Clear error condition */
 			out_8(&PSC(port)->command,MPC52xx_PSC_RST_ERR_STAT);
 
 		}
-
-		tty->flip.char_buf_ptr++;
-		tty->flip.flag_buf_ptr++;
-		tty->flip.count++;
-
+		tty_insert_flip_char(tty, ch, flag);
+		if (status & MPC52xx_PSC_SR_OE) {
+			/*
+			 * Overrun is special, since it's
+			 * reported immediately, and doesn't
+			 * affect the current character
+			 */
+			tty_insert_flip_char(tty, 0, TTY_OVERRUN);
+		}
 	}
 
 	tty_flip_buffer_push(tty);
diff --git a/drivers/serial/mpsc.c b/drivers/serial/mpsc.c
index 8f83e4007ecd..0ca83ac31d07 100644
--- a/drivers/serial/mpsc.c
+++ b/drivers/serial/mpsc.c
@@ -769,12 +769,12 @@ mpsc_rx_intr(struct mpsc_port_info *pi, struct pt_regs *regs)
 		bytes_in = be16_to_cpu(rxre->bytecnt);
 
 		/* Following use of tty struct directly is deprecated */
-		if (unlikely((tty->flip.count + bytes_in) >= TTY_FLIPBUF_SIZE)){
+		if (unlikely(tty_buffer_request_room(tty, bytes_in) < bytes_in)) {
 			if (tty->low_latency)
 				tty_flip_buffer_push(tty);
 			/*
-			 * If this failed then we will throw awa the bytes
-			 * but mst do so to clear interrupts.
+			 * If this failed then we will throw away the bytes
+			 * but must do so to clear interrupts.
 			 */
 		}
 
diff --git a/drivers/serial/mux.c b/drivers/serial/mux.c
index 7633132a10aa..4e49168c3176 100644
--- a/drivers/serial/mux.c
+++ b/drivers/serial/mux.c
@@ -223,11 +223,6 @@ static void mux_read(struct uart_port *port)
 		if (MUX_EOFIFO(data))
 			break;
 
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-			continue;
-
-		*tty->flip.char_buf_ptr = data & 0xffu;
-		*tty->flip.flag_buf_ptr = TTY_NORMAL;
 		port->icount.rx++;
 
 		if (MUX_BREAK(data)) {
@@ -239,9 +234,7 @@ static void mux_read(struct uart_port *port)
 		if (uart_handle_sysrq_char(port, data & 0xffu, NULL))
 			continue;
 
-		tty->flip.flag_buf_ptr++;
-		tty->flip.char_buf_ptr++;
-		tty->flip.count++;
+		tty_insert_flip_char(tty, data & 0xFF, TTY_NORMAL);
 	}
 	
 	if (start_count != port->icount.rx) {
diff --git a/drivers/serial/pmac_zilog.c b/drivers/serial/pmac_zilog.c
index ea24129eb6b9..f330d6c0e0df 100644
--- a/drivers/serial/pmac_zilog.c
+++ b/drivers/serial/pmac_zilog.c
@@ -210,10 +210,9 @@ static struct tty_struct *pmz_receive_chars(struct uart_pmac_port *uap,
 					    struct pt_regs *regs)
 {
 	struct tty_struct *tty = NULL;
-	unsigned char ch, r1, drop, error;
+	unsigned char ch, r1, drop, error, flag;
 	int loops = 0;
 
- retry:
 	/* The interrupt can be enabled when the port isn't open, typically
 	 * that happens when using one port is open and the other closed (stale
 	 * interrupt) or when one port is used as a console.
@@ -246,20 +245,6 @@ static struct tty_struct *pmz_receive_chars(struct uart_pmac_port *uap,
 		error = 0;
 		drop = 0;
 
-		if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) {
-			/* Have to drop the lock here */
-			pmz_debug("pmz: flip overflow\n");
-			spin_unlock(&uap->port.lock);
-			tty->flip.work.func((void *)tty);
-			spin_lock(&uap->port.lock);
-			if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-				drop = 1;
-			if (ZS_IS_ASLEEP(uap))
-				return NULL;
-			if (!ZS_IS_OPEN(uap))
-				goto retry;
-		}
-
 		r1 = read_zsreg(uap, R1);
 		ch = read_zsdata(uap);
 
@@ -295,8 +280,7 @@ static struct tty_struct *pmz_receive_chars(struct uart_pmac_port *uap,
 		if (drop)
 			goto next_char;
 
-		*tty->flip.char_buf_ptr = ch;
-		*tty->flip.flag_buf_ptr = TTY_NORMAL;
+		flag = TTY_NORMAL;
 		uap->port.icount.rx++;
 
 		if (r1 & (PAR_ERR | Rx_OVR | CRC_ERR | BRK_ABRT)) {
@@ -316,26 +300,19 @@ static struct tty_struct *pmz_receive_chars(struct uart_pmac_port *uap,
 				uap->port.icount.overrun++;
 			r1 &= uap->port.read_status_mask;
 			if (r1 & BRK_ABRT)
-				*tty->flip.flag_buf_ptr = TTY_BREAK;
+				flag = TTY_BREAK;
 			else if (r1 & PAR_ERR)
-				*tty->flip.flag_buf_ptr = TTY_PARITY;
+				flag = TTY_PARITY;
 			else if (r1 & CRC_ERR)
-				*tty->flip.flag_buf_ptr = TTY_FRAME;
+				flag = TTY_FRAME;
 		}
 
 		if (uap->port.ignore_status_mask == 0xff ||
 		    (r1 & uap->port.ignore_status_mask) == 0) {
-			tty->flip.flag_buf_ptr++;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
-		}
-		if ((r1 & Rx_OVR) &&
-		    tty->flip.count < TTY_FLIPBUF_SIZE) {
-			*tty->flip.flag_buf_ptr = TTY_OVERRUN;
-			tty->flip.flag_buf_ptr++;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
+		    	tty_insert_flip_char(tty, ch, flag);
 		}
+		if (r1 & Rx_OVR)
+			tty_insert_flip_char(tty, 0, TTY_OVERRUN);
 	next_char:
 		/* We can get stuck in an infinite loop getting char 0 when the
 		 * line is in a wrong HW state, we break that here.
diff --git a/drivers/serial/pxa.c b/drivers/serial/pxa.c
index cc998b99a19f..10535f00301f 100644
--- a/drivers/serial/pxa.c
+++ b/drivers/serial/pxa.c
@@ -107,14 +107,6 @@ receive_chars(struct uart_pxa_port *up, int *status, struct pt_regs *regs)
 	int max_count = 256;
 
 	do {
-		if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) {
-			if (tty->low_latency)
-				tty_flip_buffer_push(tty);
-			/*
-			 * If this failed then we will throw away the
-			 * bytes but must do so to clear interrupts
-			 */
-		}
 		ch = serial_in(up, UART_RX);
 		flag = TTY_NORMAL;
 		up->port.icount.rx++;
diff --git a/drivers/serial/s3c2410.c b/drivers/serial/s3c2410.c
index fe83ce6fef52..eb4883efb7c6 100644
--- a/drivers/serial/s3c2410.c
+++ b/drivers/serial/s3c2410.c
@@ -323,16 +323,6 @@ s3c24xx_serial_rx_chars(int irq, void *dev_id, struct pt_regs *regs)
 		if (s3c24xx_serial_rx_fifocnt(ourport, ufstat) == 0)
 			break;
 
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE) {
-			if (tty->low_latency)
-				tty_flip_buffer_push(tty);
-
-			/*
-			 * If this failed then we will throw away the
-			 * bytes but must do so to clear interrupts
-			 */
-		}
-
 		uerstat = rd_regl(port, S3C2410_UERSTAT);
 		ch = rd_regb(port, S3C2410_URXH);
 
diff --git a/drivers/serial/sa1100.c b/drivers/serial/sa1100.c
index 25a086458ab9..1bd93168f504 100644
--- a/drivers/serial/sa1100.c
+++ b/drivers/serial/sa1100.c
@@ -201,8 +201,6 @@ sa1100_rx_chars(struct sa1100_port *sport, struct pt_regs *regs)
 	while (status & UTSR1_TO_SM(UTSR1_RNE)) {
 		ch = UART_GET_CHAR(sport);
 
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-			goto ignore_char;
 		sport->port.icount.rx++;
 
 		flg = TTY_NORMAL;
diff --git a/drivers/serial/serial_lh7a40x.c b/drivers/serial/serial_lh7a40x.c
index d01dbe5da3b9..d4a1f0e798c1 100644
--- a/drivers/serial/serial_lh7a40x.c
+++ b/drivers/serial/serial_lh7a40x.c
@@ -148,15 +148,6 @@ lh7a40xuart_rx_chars (struct uart_port* port)
 	unsigned int data, flag;/* Received data and status */
 
 	while (!(UR (port, UART_R_STATUS) & nRxRdy) && --cbRxMax) {
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE) {
-			if (tty->low_latency)
-				tty_flip_buffer_push(tty);
-			/*
-			 * If this failed then we will throw away the
-			 * bytes but must do so to clear interrupts
-			 */
-		}
-
 		data = UR (port, UART_R_DATA);
 		flag = TTY_NORMAL;
 		++port->icount.rx;
diff --git a/drivers/serial/serial_txx9.c b/drivers/serial/serial_txx9.c
index 995d9dd9ddd5..fdd1f1915a42 100644
--- a/drivers/serial/serial_txx9.c
+++ b/drivers/serial/serial_txx9.c
@@ -303,17 +303,6 @@ receive_chars(struct uart_txx9_port *up, unsigned int *status, struct pt_regs *r
 	char flag;
 
 	do {
-		/* The following is not allowed by the tty layer and
-		   unsafe. It should be fixed ASAP */
-		if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) {
-			if (tty->low_latency) {
-				spin_unlock(&up->port.lock);
-				tty_flip_buffer_push(tty);
-				spin_lock(&up->port.lock);
-			}
-			/* If this failed then we will throw away the
-			   bytes but must do so to clear interrupts */
-		}
 		ch = sio_in(up, TXX9_SIRFIFO);
 		flag = TTY_NORMAL;
 		up->port.icount.rx++;
diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 430754ebac8a..a9e070759628 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -482,6 +482,7 @@ static inline void sci_receive_chars(struct uart_port *port,
 	struct tty_struct *tty = port->info->tty;
 	int i, count, copied = 0;
 	unsigned short status;
+	unsigned char flag;
 
 	status = sci_in(port, SCxSR);
 	if (!(status & SCxSR_RDxF(port)))
@@ -499,8 +500,7 @@ static inline void sci_receive_chars(struct uart_port *port,
 #endif
 
 		/* Don't copy more bytes than there is room for in the buffer */
-		if (tty->flip.count + count > TTY_FLIPBUF_SIZE)
-			count = TTY_FLIPBUF_SIZE - tty->flip.count;
+		count = tty_buffer_request_room(tty, count);
 
 		/* If for any reason we can't copy more data, we're done! */
 		if (count == 0)
@@ -512,8 +512,7 @@ static inline void sci_receive_chars(struct uart_port *port,
 			    || uart_handle_sysrq_char(port, c, regs)) {
 				count = 0;
 			} else {
-			    tty->flip.char_buf_ptr[0] = c;
-			    tty->flip.flag_buf_ptr[0] = TTY_NORMAL;
+			    tty_insert_flip_char(tty, c, TTY_NORMAL);
 			}
 		} else {
 			for (i=0; i<count; i++) {
@@ -542,26 +541,21 @@ static inline void sci_receive_chars(struct uart_port *port,
 				}
 
 				/* Store data and status */
-				tty->flip.char_buf_ptr[i] = c;
 				if (status&SCxSR_FER(port)) {
-					tty->flip.flag_buf_ptr[i] = TTY_FRAME;
+					flag = TTY_FRAME;
 					pr_debug("sci: frame error\n");
 				} else if (status&SCxSR_PER(port)) {
-					tty->flip.flag_buf_ptr[i] = TTY_PARITY;
+					flag = TTY_PARITY;
 					pr_debug("sci: parity error\n");
-				} else {
-					tty->flip.flag_buf_ptr[i] = TTY_NORMAL;
-				}
+				} else
+					flag = TTY_NORMAL;
+				tty_insert_flip_char(tty, c, flag);
 			}
 		}
 
 		sci_in(port, SCxSR); /* dummy read */
 		sci_out(port, SCxSR, SCxSR_RDxF_CLEAR(port));
 
-		/* Update the kernel buffer end */
-		tty->flip.count += count;
-		tty->flip.char_buf_ptr += count;
-		tty->flip.flag_buf_ptr += count;
 		copied += count;
 		port->icount.rx += count;
 	}
@@ -608,48 +602,45 @@ static inline int sci_handle_errors(struct uart_port *port)
 	unsigned short status = sci_in(port, SCxSR);
 	struct tty_struct *tty = port->info->tty;
 
-	if (status&SCxSR_ORER(port) && tty->flip.count<TTY_FLIPBUF_SIZE) {
+	if (status&SCxSR_ORER(port)) {
 		/* overrun error */
-		copied++;
-		*tty->flip.flag_buf_ptr++ = TTY_OVERRUN;
+		if(tty_insert_flip_char(tty, 0, TTY_OVERRUN))
+			copied++;
 		pr_debug("sci: overrun error\n");
 	}
 
-	if (status&SCxSR_FER(port) && tty->flip.count<TTY_FLIPBUF_SIZE) {
+	if (status&SCxSR_FER(port)) {
 		if (sci_rxd_in(port) == 0) {
 			/* Notify of BREAK */
 			struct sci_port * sci_port = (struct sci_port *)port;
-                       if(!sci_port->break_flag) {
-	                        sci_port->break_flag = 1;
-                               sci_schedule_break_timer((struct sci_port *)port);
+			if(!sci_port->break_flag) {
+	                	sci_port->break_flag = 1;
+	                	sci_schedule_break_timer((struct sci_port *)port);
 				/* Do sysrq handling. */
-				if(uart_handle_break(port)) {
+				if(uart_handle_break(port))
 					return 0;
-				}
 			        pr_debug("sci: BREAK detected\n");
-			        copied++;
-			        *tty->flip.flag_buf_ptr++ = TTY_BREAK;
+			        if(tty_insert_flip_char(tty, 0, TTY_BREAK))
+				        copied++;
                        }
 		}
 		else {
 			/* frame error */
-			copied++;
-			*tty->flip.flag_buf_ptr++ = TTY_FRAME;
+			if(tty_insert_flip_char(tty, 0, TTY_FRAME))
+				copied++;
 			pr_debug("sci: frame error\n");
 		}
 	}
 
-	if (status&SCxSR_PER(port) && tty->flip.count<TTY_FLIPBUF_SIZE) {
+	if (status&SCxSR_PER(port)) {
+		if(tty_insert_flip_char(tty, 0, TTY_PARITY))
+			copied++;
 		/* parity error */
-		copied++;
-		*tty->flip.flag_buf_ptr++ = TTY_PARITY;
 		pr_debug("sci: parity error\n");
 	}
 
-	if (copied) {
-		tty->flip.count += copied;
+	if (copied)
 		tty_flip_buffer_push(tty);
-	}
 
 	return copied;
 }
@@ -661,15 +652,14 @@ static inline int sci_handle_breaks(struct uart_port *port)
 	struct tty_struct *tty = port->info->tty;
 	struct sci_port *s = &sci_ports[port->line];
 
-	if (!s->break_flag && status & SCxSR_BRK(port) &&
-	    tty->flip.count < TTY_FLIPBUF_SIZE) {
+	if (!s->break_flag && status & SCxSR_BRK(port))
 #if defined(CONFIG_CPU_SH3)
 		/* Debounce break */
 		s->break_flag = 1;
 #endif
 		/* Notify of BREAK */
-		copied++;
-		*tty->flip.flag_buf_ptr++ = TTY_BREAK;
+		if(tty_insert_flip_char(tty, 0, TTY_BREAK))
+			copied++;
 		pr_debug("sci: BREAK detected\n");
 	}
 
@@ -677,19 +667,15 @@ static inline int sci_handle_breaks(struct uart_port *port)
 	/* XXX: Handle SCIF overrun error */
 	if (port->type == PORT_SCIF && (sci_in(port, SCLSR) & SCIF_ORER) != 0) {
 		sci_out(port, SCLSR, 0);
-		if(tty->flip.count<TTY_FLIPBUF_SIZE) {
+		if(tty_insert_flip_char(tty, 0, TTY_OVERRUN)) {
 			copied++;
-			*tty->flip.flag_buf_ptr++ = TTY_OVERRUN;
 			pr_debug("sci: overrun error\n");
 		}
 	}
 #endif
 
-	if (copied) {
-		tty->flip.count += copied;
+	if (copied)
 		tty_flip_buffer_push(tty);
-	}
-
 	return copied;
 }
 
@@ -732,12 +718,9 @@ static irqreturn_t sci_er_interrupt(int irq, void *ptr, struct pt_regs *regs)
 			struct tty_struct *tty = port->info->tty;
 
 			sci_out(port, SCLSR, 0);
-			if(tty->flip.count<TTY_FLIPBUF_SIZE) {
-				*tty->flip.flag_buf_ptr++ = TTY_OVERRUN;
-				tty->flip.count++;
-				tty_flip_buffer_push(tty);
-				pr_debug("scif: overrun error\n");
-			}
+			tty_insert_flip_char(tty, 0, TTY_OVERRUN);
+			tty_flip_buffer_push(tty);
+			pr_debug("scif: overrun error\n");
 		}
 #endif
 		sci_rx_interrupt(irq, ptr, regs);
diff --git a/drivers/serial/sn_console.c b/drivers/serial/sn_console.c
index 313f9df24a2d..5468e5a767e2 100644
--- a/drivers/serial/sn_console.c
+++ b/drivers/serial/sn_console.c
@@ -519,11 +519,7 @@ sn_receive_chars(struct sn_cons_port *port, struct pt_regs *regs,
 
 		/* record the character to pass up to the tty layer */
 		if (tty) {
-			*tty->flip.char_buf_ptr = ch;
-			*tty->flip.flag_buf_ptr = TTY_NORMAL;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
-			if (tty->flip.count == TTY_FLIPBUF_SIZE)
+			if(tty_insert_flip_char(tty, ch, TTY_NORMAL) == 0)
 				break;
 		}
 		port->sc_port.icount.rx++;
diff --git a/drivers/serial/sunsab.c b/drivers/serial/sunsab.c
index ba9381fd3f2d..7e773ff76c61 100644
--- a/drivers/serial/sunsab.c
+++ b/drivers/serial/sunsab.c
@@ -159,21 +159,14 @@ receive_chars(struct uart_sunsab_port *up,
 		saw_console_brk = 1;
 
 	for (i = 0; i < count; i++) {
-		unsigned char ch = buf[i];
+		unsigned char ch = buf[i], flag;
 
 		if (tty == NULL) {
 			uart_handle_sysrq_char(&up->port, ch, regs);
 			continue;
 		}
 
-		if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) {
-			tty->flip.work.func((void *)tty);
-			if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-				return tty; // if TTY_DONT_FLIP is set
-		}
-
-		*tty->flip.char_buf_ptr = ch;
-		*tty->flip.flag_buf_ptr = TTY_NORMAL;
+		flag = TTY_NORMAL;
 		up->port.icount.rx++;
 
 		if (unlikely(stat->sreg.isr0 & (SAB82532_ISR0_PERR |
@@ -209,34 +202,21 @@ receive_chars(struct uart_sunsab_port *up,
 			stat->sreg.isr1 &= ((up->port.read_status_mask >> 8) & 0xff);
 
 			if (stat->sreg.isr1 & SAB82532_ISR1_BRK) {
-				*tty->flip.flag_buf_ptr = TTY_BREAK;
+				flag = TTY_BREAK;
 			} else if (stat->sreg.isr0 & SAB82532_ISR0_PERR)
-				*tty->flip.flag_buf_ptr = TTY_PARITY;
+				flag = TTY_PARITY;
 			else if (stat->sreg.isr0 & SAB82532_ISR0_FERR)
-				*tty->flip.flag_buf_ptr = TTY_FRAME;
+				flag = TTY_FRAME;
 		}
 
 		if (uart_handle_sysrq_char(&up->port, ch, regs))
 			continue;
 
 		if ((stat->sreg.isr0 & (up->port.ignore_status_mask & 0xff)) == 0 &&
-		    (stat->sreg.isr1 & ((up->port.ignore_status_mask >> 8) & 0xff)) == 0){
-			tty->flip.flag_buf_ptr++;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
-		}
-		if ((stat->sreg.isr0 & SAB82532_ISR0_RFO) &&
-		    tty->flip.count < TTY_FLIPBUF_SIZE) {
-			/*
-			 * Overrun is special, since it's reported
-			 * immediately, and doesn't affect the current
-			 * character.
-			 */
-			*tty->flip.flag_buf_ptr = TTY_OVERRUN;
-			tty->flip.flag_buf_ptr++;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
-		}
+		    (stat->sreg.isr1 & ((up->port.ignore_status_mask >> 8) & 0xff)) == 0)
+			tty_insert_flip_char(tty, ch, flag);
+		if (stat->sreg.isr0 & SAB82532_ISR0_RFO)
+			tty_insert_flip_char(tty, 0, TTY_OVERRUN);
 	}
 
 	if (saw_console_brk)
diff --git a/drivers/serial/sunsu.c b/drivers/serial/sunsu.c
index f0738533f39a..9a3665b34d97 100644
--- a/drivers/serial/sunsu.c
+++ b/drivers/serial/sunsu.c
@@ -323,19 +323,13 @@ static _INLINE_ struct tty_struct *
 receive_chars(struct uart_sunsu_port *up, unsigned char *status, struct pt_regs *regs)
 {
 	struct tty_struct *tty = up->port.info->tty;
-	unsigned char ch;
+	unsigned char ch, flag;
 	int max_count = 256;
 	int saw_console_brk = 0;
 
 	do {
-		if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) {
-			tty->flip.work.func((void *)tty);
-			if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-				return tty; // if TTY_DONT_FLIP is set
-		}
 		ch = serial_inp(up, UART_RX);
-		*tty->flip.char_buf_ptr = ch;
-		*tty->flip.flag_buf_ptr = TTY_NORMAL;
+		flag = TTY_NORMAL;
 		up->port.icount.rx++;
 
 		if (unlikely(*status & (UART_LSR_BI | UART_LSR_PE |
@@ -377,31 +371,23 @@ receive_chars(struct uart_sunsu_port *up, unsigned char *status, struct pt_regs
 			}
 
 			if (*status & UART_LSR_BI) {
-				*tty->flip.flag_buf_ptr = TTY_BREAK;
+				flag = TTY_BREAK;
 			} else if (*status & UART_LSR_PE)
-				*tty->flip.flag_buf_ptr = TTY_PARITY;
+				flag = TTY_PARITY;
 			else if (*status & UART_LSR_FE)
-				*tty->flip.flag_buf_ptr = TTY_FRAME;
+				flag = TTY_FRAME;
 		}
 		if (uart_handle_sysrq_char(&up->port, ch, regs))
 			goto ignore_char;
-		if ((*status & up->port.ignore_status_mask) == 0) {
-			tty->flip.flag_buf_ptr++;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
-		}
-		if ((*status & UART_LSR_OE) &&
-		    tty->flip.count < TTY_FLIPBUF_SIZE) {
+		if ((*status & up->port.ignore_status_mask) == 0)
+			tty_insert_flip_char(tty, ch, flag);
+		if (*status & UART_LSR_OE)
 			/*
 			 * Overrun is special, since it's reported
 			 * immediately, and doesn't affect the current
 			 * character.
 			 */
-			*tty->flip.flag_buf_ptr = TTY_OVERRUN;
-			tty->flip.flag_buf_ptr++;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
-		}
+			 tty_insert_flip_char(tty, 0, TTY_OVERRUN);
 	ignore_char:
 		*status = serial_inp(up, UART_LSR);
 	} while ((*status & UART_LSR_DR) && (max_count-- > 0));
diff --git a/drivers/serial/sunzilog.c b/drivers/serial/sunzilog.c
index 7653d6cf05af..3c72484adea7 100644
--- a/drivers/serial/sunzilog.c
+++ b/drivers/serial/sunzilog.c
@@ -319,7 +319,7 @@ sunzilog_receive_chars(struct uart_sunzilog_port *up,
 		       struct pt_regs *regs)
 {
 	struct tty_struct *tty;
-	unsigned char ch, r1;
+	unsigned char ch, r1, flag;
 
 	tty = NULL;
 	if (up->port.info != NULL &&		/* Unopened serial console */
@@ -362,19 +362,8 @@ sunzilog_receive_chars(struct uart_sunzilog_port *up,
 			continue;
 		}
 
-		if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) {
-			tty->flip.work.func((void *)tty);
-			/*
-			 * The 8250 bails out of the loop here,
-			 * but we need to read everything, or die.
-			 */
-			if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-				continue;
-		}
-
 		/* A real serial line, record the character and status.  */
-		*tty->flip.char_buf_ptr = ch;
-		*tty->flip.flag_buf_ptr = TTY_NORMAL;
+		flag = TTY_NORMAL;
 		up->port.icount.rx++;
 		if (r1 & (BRK_ABRT | PAR_ERR | Rx_OVR | CRC_ERR)) {
 			if (r1 & BRK_ABRT) {
@@ -391,28 +380,21 @@ sunzilog_receive_chars(struct uart_sunzilog_port *up,
 				up->port.icount.overrun++;
 			r1 &= up->port.read_status_mask;
 			if (r1 & BRK_ABRT)
-				*tty->flip.flag_buf_ptr = TTY_BREAK;
+				flag = TTY_BREAK;
 			else if (r1 & PAR_ERR)
-				*tty->flip.flag_buf_ptr = TTY_PARITY;
+				flag = TTY_PARITY;
 			else if (r1 & CRC_ERR)
-				*tty->flip.flag_buf_ptr = TTY_FRAME;
+				flag = TTY_FRAME;
 		}
 		if (uart_handle_sysrq_char(&up->port, ch, regs))
 			continue;
 
 		if (up->port.ignore_status_mask == 0xff ||
 		    (r1 & up->port.ignore_status_mask) == 0) {
-			tty->flip.flag_buf_ptr++;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
-		}
-		if ((r1 & Rx_OVR) &&
-		    tty->flip.count < TTY_FLIPBUF_SIZE) {
-			*tty->flip.flag_buf_ptr = TTY_OVERRUN;
-			tty->flip.flag_buf_ptr++;
-			tty->flip.char_buf_ptr++;
-			tty->flip.count++;
+		    	tty_insert_flip_char(tty, ch, flag);
 		}
+		if (r1 & Rx_OVR)
+			tty_insert_flip_char(tty, 0, TTY_OVERRUN);
 	}
 
 	return tty;
diff --git a/drivers/serial/vr41xx_siu.c b/drivers/serial/vr41xx_siu.c
index 865d4dea65df..0a28deeb098d 100644
--- a/drivers/serial/vr41xx_siu.c
+++ b/drivers/serial/vr41xx_siu.c
@@ -371,11 +371,6 @@ static inline void receive_chars(struct uart_port *port, uint8_t *status,
 	lsr = *status;
 
 	do {
-		if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) {
-			if (tty->low_latency)
-				tty_flip_buffer_push(tty);
-		}
-
 		ch = siu_read(port, UART_RX);
 		port->icount.rx++;
 		flag = TTY_NORMAL;
diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 248279e44c99..b9fd39fd1b5b 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -335,14 +335,9 @@ next_buffer:
 
 	dbg("acm_rx_tasklet: procesing buf 0x%p, size = %d\n", buf, buf->size);
 
-	for (i = 0; i < buf->size && !acm->throttle; i++) {
-		/* if we insert more than TTY_FLIPBUF_SIZE characters,
-		   we drop them. */
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE) {
-			tty_flip_buffer_push(tty);
- 		}
-		tty_insert_flip_char(tty, buf->base[i], 0);
- 	}
+	tty_buffer_request_room(tty, buf->size);
+	if (!acm->throttle)
+		tty_insert_flip_string(tty, buf->base, buf->size);
 	tty_flip_buffer_push(tty);
 
 	spin_lock(&acm->throttle_lock);
diff --git a/drivers/usb/gadget/serial.c b/drivers/usb/gadget/serial.c
index 65e084a2c87e..2e6926b33455 100644
--- a/drivers/usb/gadget/serial.c
+++ b/drivers/usb/gadget/serial.c
@@ -1271,6 +1271,7 @@ static int gs_recv_packet(struct gs_dev *dev, char *packet, unsigned int size)
 	unsigned int len;
 	struct gs_port *port;
 	int ret;
+	struct tty_struct *tty;
 
 	/* TEMPORARY -- only port 0 is supported right now */
 	port = dev->dev_port[0];
@@ -1290,7 +1291,10 @@ static int gs_recv_packet(struct gs_dev *dev, char *packet, unsigned int size)
 		goto exit;
 	}
 
-	if (port->port_tty == NULL) {
+
+	tty = port->port_tty;
+
+	if (tty == NULL) {
 		printk(KERN_ERR "gs_recv_packet: port=%d, NULL tty pointer\n",
 			port->port_num);
 		ret = -EIO;
@@ -1304,20 +1308,13 @@ static int gs_recv_packet(struct gs_dev *dev, char *packet, unsigned int size)
 		goto exit;
 	}
 
-	len = (unsigned int)(TTY_FLIPBUF_SIZE - port->port_tty->flip.count);
-	if (len < size)
-		size = len;
-
-	if (size > 0) {
-		memcpy(port->port_tty->flip.char_buf_ptr, packet, size);
-		port->port_tty->flip.char_buf_ptr += size;
-		port->port_tty->flip.count += size;
+	len = tty_buffer_request_room(tty, size);
+	if (len > 0) {
+		tty_insert_flip_string(tty, packet, len);
 		tty_flip_buffer_push(port->port_tty);
 		wake_up_interruptible(&port->port_tty->read_wait);
 	}
-
 	ret = 0;
-
 exit:
 	spin_unlock(&port->port_lock);
 	return ret;
diff --git a/drivers/usb/serial/Kconfig b/drivers/usb/serial/Kconfig
index 14f55fd26a64..be5dc80836c3 100644
--- a/drivers/usb/serial/Kconfig
+++ b/drivers/usb/serial/Kconfig
@@ -84,7 +84,7 @@ config USB_SERIAL_BELKIN
 
 config USB_SERIAL_WHITEHEAT
 	tristate "USB ConnectTech WhiteHEAT Serial Driver"
-	depends on USB_SERIAL && BROKEN_ON_SMP
+	depends on USB_SERIAL
 	help
 	  Say Y here if you want to use a ConnectTech WhiteHEAT 4 port
 	  USB to serial converter device.
diff --git a/drivers/usb/serial/cyberjack.c b/drivers/usb/serial/cyberjack.c
index 6d18d4eaba35..2357b1d102d7 100644
--- a/drivers/usb/serial/cyberjack.c
+++ b/drivers/usb/serial/cyberjack.c
@@ -364,7 +364,6 @@ static void cyberjack_read_bulk_callback (struct urb *urb, struct pt_regs *regs)
 	struct tty_struct *tty;
 	unsigned char *data = urb->transfer_buffer;
 	short todo;
-	int i;
 	int result;
 
 	dbg("%s - port %d", __FUNCTION__, port->number);
@@ -381,14 +380,8 @@ static void cyberjack_read_bulk_callback (struct urb *urb, struct pt_regs *regs)
 		return;
 	}
 	if (urb->actual_length) {
-		for (i = 0; i < urb->actual_length ; ++i) {
-			/* if we insert more than TTY_FLIPBUF_SIZE characters, we drop them. */
-			if(tty->flip.count >= TTY_FLIPBUF_SIZE) {
-				tty_flip_buffer_push(tty);
-			}
-			/* this doesn't actually push the data through unless tty->low_latency is set */
-			tty_insert_flip_char(tty, data[i], 0);
-		}
+		tty_buffer_request_room(tty, urb->actual_length);
+		tty_insert_flip_string(tty, data, urb->actual_length);
 	  	tty_flip_buffer_push(tty);
 	}
 
diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c
index 4e9637eb6137..68067fe117a4 100644
--- a/drivers/usb/serial/cypress_m8.c
+++ b/drivers/usb/serial/cypress_m8.c
@@ -1263,12 +1263,10 @@ static void cypress_read_int_callback(struct urb *urb, struct pt_regs *regs)
 
 	/* process read if there is data other than line status */
 	if (tty && (bytes > i)) {
+		bytes = tty_buffer_request_room(tty, bytes);
 		for (; i < bytes ; ++i) {
 			dbg("pushing byte number %d - %d - %c", i, data[i],
 					data[i]);
-			if(tty->flip.count >= TTY_FLIPBUF_SIZE) {
-				tty_flip_buffer_push(tty);
-			}
 			tty_insert_flip_char(tty, data[i], tty_flag);
 		}
 		tty_flip_buffer_push(port->tty);
diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c
index 8fc414bd5b24..b3f776a90c93 100644
--- a/drivers/usb/serial/digi_acceleport.c
+++ b/drivers/usb/serial/digi_acceleport.c
@@ -946,13 +946,10 @@ dbg( "digi_rx_unthrottle: TOP: port=%d", priv->dp_port_num );
 	spin_lock_irqsave( &priv->dp_port_lock, flags );
 
 	/* send any buffered chars from throttle time on to tty subsystem */
-	len = min(priv->dp_in_buf_len, TTY_FLIPBUF_SIZE - tty->flip.count );
+
+	len = tty_buffer_request_room(tty, priv->dp_in_buf_len);
 	if( len > 0 ) {
-		memcpy( tty->flip.char_buf_ptr, priv->dp_in_buf, len );
-		memcpy( tty->flip.flag_buf_ptr, priv->dp_in_flag_buf, len );
-		tty->flip.char_buf_ptr += len;
-		tty->flip.flag_buf_ptr += len;
-		tty->flip.count += len;
+		tty_insert_flip_string_flags(tty, priv->dp_in_buf, priv->dp_in_flag_buf, len);
 		tty_flip_buffer_push( tty );
 	}
 
@@ -1827,6 +1824,7 @@ static int digi_read_inb_callback( struct urb *urb )
 	int status = ((unsigned char *)urb->transfer_buffer)[2];
 	unsigned char *data = ((unsigned char *)urb->transfer_buffer)+3;
 	int flag,throttled;
+	int i;
 
 	/* do not process callbacks on closed ports */
 	/* but do continue the read chain */
@@ -1885,20 +1883,18 @@ static int digi_read_inb_callback( struct urb *urb )
 			}
 
 		} else {
-
-			len = min( len, TTY_FLIPBUF_SIZE - tty->flip.count );
-
+			len = tty_buffer_request_room(tty, len);
 			if( len > 0 ) {
-				memcpy( tty->flip.char_buf_ptr, data, len );
-				memset( tty->flip.flag_buf_ptr, flag, len );
-				tty->flip.char_buf_ptr += len;
-				tty->flip.flag_buf_ptr += len;
-				tty->flip.count += len;
+				/* Hot path */
+				if(flag == TTY_NORMAL)
+					tty_insert_flip_string(tty, data, len);
+				else {
+					for(i = 0; i < len; i++)
+						tty_insert_flip_char(tty, data[i], flag);
+				}
 				tty_flip_buffer_push( tty );
 			}
-
 		}
-
 	}
 
 	spin_unlock( &priv->dp_port_lock );
diff --git a/drivers/usb/serial/empeg.c b/drivers/usb/serial/empeg.c
index 79a766e9ca23..63f7c78a1152 100644
--- a/drivers/usb/serial/empeg.c
+++ b/drivers/usb/serial/empeg.c
@@ -344,7 +344,6 @@ static void empeg_read_bulk_callback (struct urb *urb, struct pt_regs *regs)
 	struct usb_serial_port *port = (struct usb_serial_port *)urb->context;
 	struct tty_struct *tty;
 	unsigned char *data = urb->transfer_buffer;
-	int i;
 	int result;
 
 	dbg("%s - port %d", __FUNCTION__, port->number);
@@ -359,19 +358,8 @@ static void empeg_read_bulk_callback (struct urb *urb, struct pt_regs *regs)
 	tty = port->tty;
 
 	if (urb->actual_length) {
-		for (i = 0; i < urb->actual_length ; ++i) {
-			/* gb - 2000/11/13
-			 * If we insert too many characters we'll overflow the buffer.
-			 * This means we'll lose bytes - Decidedly bad.
-			 */
-			if(tty->flip.count >= TTY_FLIPBUF_SIZE) {
-				tty_flip_buffer_push(tty);
-				}
-			tty_insert_flip_char(tty, data[i], 0);
-		}
-		/* gb - 2000/11/13
-		 * Goes straight through instead of scheduling - if tty->low_latency is set.
-		 */
+		tty_buffer_request_room(tty, urb->actual_length);
+		tty_insert_flip_string(tty, data, urb->actual_length);
 		tty_flip_buffer_push(tty);
 		bytes_in += urb->actual_length;
 	}
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index eb863b3f2d79..10bc1bf23b35 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -1610,24 +1610,11 @@ static void ftdi_process_read (void *param)
 			length = 0;
 		}
 
-		/* have to make sure we don't overflow the buffer
-		   with tty_insert_flip_char's */
-		if (tty->flip.count+length > TTY_FLIPBUF_SIZE) {
-			tty_flip_buffer_push(tty);
-			need_flip = 0;
-
-			if (tty->flip.count != 0) {
-				/* flip didn't work, this happens when ftdi_process_read() is
-				 * called from ftdi_unthrottle, because TTY_DONT_FLIP is set */
-				dbg("%s - flip buffer push failed", __FUNCTION__);
-				break;
-			}
-		}
 		if (priv->rx_flags & THROTTLED) {
 			dbg("%s - throttled", __FUNCTION__);
 			break;
 		}
-		if (tty->ldisc.receive_room(tty)-tty->flip.count < length) {
+		if (tty_buffer_request_room(tty, length) < length) {
 			/* break out & wait for throttling/unthrottling to happen */
 			dbg("%s - receive room low", __FUNCTION__);
 			break;
diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c
index 452efce72714..d6f55e9dccae 100644
--- a/drivers/usb/serial/garmin_gps.c
+++ b/drivers/usb/serial/garmin_gps.c
@@ -275,23 +275,14 @@ static void send_to_tty(struct usb_serial_port *port,
                         char *data, unsigned int actual_length)
 {
 	struct tty_struct *tty = port->tty;
-	int i;
 
 	if (tty && actual_length) {
 
 		usb_serial_debug_data(debug, &port->dev, 
 					__FUNCTION__, actual_length, data);
 
-		for (i = 0; i < actual_length ; ++i) {
-			/* if we insert more than TTY_FLIPBUF_SIZE characters,
-			   we drop them. */
-			if(tty->flip.count >= TTY_FLIPBUF_SIZE) {
-				tty_flip_buffer_push(tty);
-			}
-			/* this doesn't actually push the data through unless
-			   tty->low_latency is set */
-			tty_insert_flip_char(tty, data[i], 0);
-		}
+		tty_buffer_request_room(tty, actual_length);
+		tty_insert_flip_string(tty, data, actual_length);
 		tty_flip_buffer_push(tty);
 	}
 }
diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c
index 4ddac620fc0c..476cda107f4f 100644
--- a/drivers/usb/serial/generic.c
+++ b/drivers/usb/serial/generic.c
@@ -254,7 +254,6 @@ void usb_serial_generic_read_bulk_callback (struct urb *urb, struct pt_regs *reg
 	struct usb_serial *serial = port->serial;
 	struct tty_struct *tty;
 	unsigned char *data = urb->transfer_buffer;
-	int i;
 	int result;
 
 	dbg("%s - port %d", __FUNCTION__, port->number);
@@ -268,14 +267,8 @@ void usb_serial_generic_read_bulk_callback (struct urb *urb, struct pt_regs *reg
 
 	tty = port->tty;
 	if (tty && urb->actual_length) {
-		for (i = 0; i < urb->actual_length ; ++i) {
-			/* if we insert more than TTY_FLIPBUF_SIZE characters, we drop them. */
-			if(tty->flip.count >= TTY_FLIPBUF_SIZE) {
-				tty_flip_buffer_push(tty);
-			}
-			/* this doesn't actually push the data through unless tty->low_latency is set */
-			tty_insert_flip_char(tty, data[i], 0);
-		}
+		tty_buffer_request_room(tty, urb->actual_length);
+		tty_insert_flip_string(tty, data, urb->actual_length);
 	  	tty_flip_buffer_push(tty);
 	}
 
diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c
index faedbeb6ba49..3f29e6b0fd19 100644
--- a/drivers/usb/serial/io_edgeport.c
+++ b/drivers/usb/serial/io_edgeport.c
@@ -1965,20 +1965,14 @@ static void edge_tty_recv(struct device *dev, struct tty_struct *tty, unsigned c
 	int cnt;
 
 	do {
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE) {
-			tty_flip_buffer_push(tty);
-			if (tty->flip.count >= TTY_FLIPBUF_SIZE) {
-				dev_err(dev, "%s - dropping data, %d bytes lost\n",
-					__FUNCTION__, length);
-				return;
-			}
+		cnt = tty_buffer_request_room(tty, length);
+		if (cnt < length) {
+			dev_err(dev, "%s - dropping data, %d bytes lost\n",
+					__FUNCTION__, length - cnt);
+			if(cnt == 0)
+				break;
 		}
-		cnt = min(length, TTY_FLIPBUF_SIZE - tty->flip.count);
-		memcpy(tty->flip.char_buf_ptr, data, cnt);
-		memset(tty->flip.flag_buf_ptr, 0, cnt);
-		tty->flip.char_buf_ptr += cnt;
-		tty->flip.flag_buf_ptr += cnt;
-		tty->flip.count += cnt;
+		tty_insert_flip_string(tty, data, cnt);
 		data += cnt;
 		length -= cnt;
 	} while (length > 0);
diff --git a/drivers/usb/serial/io_ti.c b/drivers/usb/serial/io_ti.c
index 2edf9cabad20..afc0f34b3a46 100644
--- a/drivers/usb/serial/io_ti.c
+++ b/drivers/usb/serial/io_ti.c
@@ -1865,20 +1865,14 @@ static void edge_tty_recv(struct device *dev, struct tty_struct *tty, unsigned c
 	int cnt;
 
 	do {
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE) {
-			tty_flip_buffer_push(tty);
-			if (tty->flip.count >= TTY_FLIPBUF_SIZE) {
-				dev_err(dev, "%s - dropping data, %d bytes lost\n",
-					__FUNCTION__, length);
-				return;
-			}
+		cnt = tty_buffer_request_room(tty, length);
+		if (cnt < length) {
+			dev_err(dev, "%s - dropping data, %d bytes lost\n",
+				__FUNCTION__, length - cnt);
+			if(cnt == 0)
+				break;
 		}
-		cnt = min(length, TTY_FLIPBUF_SIZE - tty->flip.count);
-		memcpy(tty->flip.char_buf_ptr, data, cnt);
-		memset(tty->flip.flag_buf_ptr, 0, cnt);
-		tty->flip.char_buf_ptr += cnt;
-		tty->flip.flag_buf_ptr += cnt;
-		tty->flip.count += cnt;
+		tty_insert_flip_string(tty, data, cnt);
 		data += cnt;
 		length -= cnt;
 	} while (length > 0);
diff --git a/drivers/usb/serial/ipaq.c b/drivers/usb/serial/ipaq.c
index 06d07cea0b70..9a5c97989562 100644
--- a/drivers/usb/serial/ipaq.c
+++ b/drivers/usb/serial/ipaq.c
@@ -711,7 +711,7 @@ static void ipaq_read_bulk_callback(struct urb *urb, struct pt_regs *regs)
 	struct usb_serial_port	*port = (struct usb_serial_port *)urb->context;
 	struct tty_struct	*tty;
 	unsigned char		*data = urb->transfer_buffer;
-	int			i, result;
+	int			result;
 
 	dbg("%s - port %d", __FUNCTION__, port->number);
 
@@ -724,14 +724,8 @@ static void ipaq_read_bulk_callback(struct urb *urb, struct pt_regs *regs)
 
 	tty = port->tty;
 	if (tty && urb->actual_length) {
-		for (i = 0; i < urb->actual_length ; ++i) {
-			/* if we insert more than TTY_FLIPBUF_SIZE characters, we drop them. */
-			if(tty->flip.count >= TTY_FLIPBUF_SIZE) {
-				tty_flip_buffer_push(tty);
-			}
-			/* this doesn't actually push the data through unless tty->low_latency is set */
-			tty_insert_flip_char(tty, data[i], 0);
-		}
+		tty_buffer_request_room(tty, urb->actual_length);
+		tty_insert_flip_string(tty, data, urb->actual_length);
 		tty_flip_buffer_push(tty);
 		bytes_in += urb->actual_length;
 	}
diff --git a/drivers/usb/serial/ipw.c b/drivers/usb/serial/ipw.c
index 2dd191f5fe76..e760a70242c1 100644
--- a/drivers/usb/serial/ipw.c
+++ b/drivers/usb/serial/ipw.c
@@ -166,7 +166,6 @@ static void ipw_read_bulk_callback(struct urb *urb, struct pt_regs *regs)
 	struct usb_serial_port *port = urb->context;
 	unsigned char *data = urb->transfer_buffer;
 	struct tty_struct *tty;
-	int i;
 	int result;
 
 	dbg("%s - port %d", __FUNCTION__, port->number);
@@ -180,14 +179,8 @@ static void ipw_read_bulk_callback(struct urb *urb, struct pt_regs *regs)
 
 	tty = port->tty;
 	if (tty && urb->actual_length) {
-		for (i = 0; i < urb->actual_length ; ++i) {
-			/* if we insert more than TTY_FLIPBUF_SIZE characters, we drop them. */
-			if(tty->flip.count >= TTY_FLIPBUF_SIZE) {
-				tty_flip_buffer_push(tty);
-			}
-			/* this doesn't actually push the data through unless tty->low_latency is set */
-			tty_insert_flip_char(tty, data[i], 0);
-		}
+		tty_buffer_request_room(tty, urb->actual_length);
+		tty_insert_flip_string(tty, data, urb->actual_length);
 		tty_flip_buffer_push(tty);
 	}
 
diff --git a/drivers/usb/serial/kl5kusb105.c b/drivers/usb/serial/kl5kusb105.c
index 4e2f7dfb58b2..78335a5f7743 100644
--- a/drivers/usb/serial/kl5kusb105.c
+++ b/drivers/usb/serial/kl5kusb105.c
@@ -648,7 +648,6 @@ static void klsi_105_read_bulk_callback (struct urb *urb, struct pt_regs *regs)
 		usb_serial_debug_data(debug, &port->dev, __FUNCTION__,
 				      urb->actual_length, data);
 	} else {
-		int i;
 		int bytes_sent = ((__u8 *) data)[0] +
 				 ((unsigned int) ((__u8 *) data)[1] << 8);
 		tty = port->tty;
@@ -669,16 +668,8 @@ static void klsi_105_read_bulk_callback (struct urb *urb, struct pt_regs *regs)
 			bytes_sent = urb->actual_length - 2;
 		}
 
-		for (i = 2; i < 2+bytes_sent; i++) {
-			/* if we insert more than TTY_FLIPBUF_SIZE characters,
-			 * we drop them. */
-			if(tty->flip.count >= TTY_FLIPBUF_SIZE) {
-				tty_flip_buffer_push(tty);
-			}
-			/* this doesn't actually push the data through unless 
-			 * tty->low_latency is set */
-			tty_insert_flip_char(tty, ((__u8*) data)[i], 0);
-		}
+		tty_buffer_request_room(tty, bytes_sent);
+		tty_insert_flip_string(tty, data + 2, bytes_sent);
 		tty_flip_buffer_push(tty);
 
 		/* again lockless, but debug info only */
diff --git a/drivers/usb/serial/kobil_sct.c b/drivers/usb/serial/kobil_sct.c
index d9c21e275130..b8b213185d0f 100644
--- a/drivers/usb/serial/kobil_sct.c
+++ b/drivers/usb/serial/kobil_sct.c
@@ -365,7 +365,6 @@ static void kobil_close (struct usb_serial_port *port, struct file *filp)
 
 static void kobil_read_int_callback( struct urb *purb, struct pt_regs *regs)
 {
-	int i;
 	int result;
 	struct usb_serial_port *port = (struct usb_serial_port *) purb->context;
 	struct tty_struct *tty;
@@ -397,14 +396,8 @@ static void kobil_read_int_callback( struct urb *purb, struct pt_regs *regs)
 		*/
 		// END DEBUG
 
-		for (i = 0; i < purb->actual_length; ++i) {
-			// if we insert more than TTY_FLIPBUF_SIZE characters, we drop them.
-			if(tty->flip.count >= TTY_FLIPBUF_SIZE) {
-				tty_flip_buffer_push(tty);
-			}
-			// this doesn't actually push the data through unless tty->low_latency is set
-			tty_insert_flip_char(tty, data[i], 0);
-		}
+		tty_buffer_request_room(tty, purb->actual_length);
+		tty_insert_flip_string(tty, data, purb->actual_length);
 		tty_flip_buffer_push(tty);
 	}
 
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index 3fd2405304fd..52bdf6fe46f2 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -321,7 +321,7 @@ static int option_write(struct usb_serial_port *port,
 
 static void option_indat_callback(struct urb *urb, struct pt_regs *regs)
 {
-	int i, err;
+	int err;
 	int endpoint;
 	struct usb_serial_port *port;
 	struct tty_struct *tty;
@@ -338,11 +338,8 @@ static void option_indat_callback(struct urb *urb, struct pt_regs *regs)
 	} else {
 		tty = port->tty;
 		if (urb->actual_length) {
-			for (i = 0; i < urb->actual_length ; ++i) {
-				if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-					tty_flip_buffer_push(tty);
-				tty_insert_flip_char(tty, data[i], 0);
-			}
+			tty_buffer_request_room(tty, urb->actual_length);
+			tty_insert_flip_string(tty, data, urb->actual_length);
 			tty_flip_buffer_push(tty);
 		} else {
 			dbg("%s: empty read urb received", __FUNCTION__);
diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c
index f03721056190..9ffff1938239 100644
--- a/drivers/usb/serial/pl2303.c
+++ b/drivers/usb/serial/pl2303.c
@@ -924,16 +924,12 @@ static void pl2303_read_bulk_callback (struct urb *urb, struct pt_regs *regs)
 
 	tty = port->tty;
 	if (tty && urb->actual_length) {
+		tty_buffer_request_room(tty, urb->actual_length + 1);
 		/* overrun is special, not associated with a char */
 		if (status & UART_OVERRUN_ERROR)
 			tty_insert_flip_char(tty, 0, TTY_OVERRUN);
-
-		for (i = 0; i < urb->actual_length; ++i) {
-			if (tty->flip.count >= TTY_FLIPBUF_SIZE) {
-				tty_flip_buffer_push(tty);
-			}
+		for (i = 0; i < urb->actual_length; ++i)
 			tty_insert_flip_char (tty, data[i], tty_flag);
-		}
 		tty_flip_buffer_push (tty);
 	}
 
diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c
index abb830cb77bd..c18db3257073 100644
--- a/drivers/usb/serial/ti_usb_3410_5052.c
+++ b/drivers/usb/serial/ti_usb_3410_5052.c
@@ -1280,24 +1280,18 @@ static void ti_recv(struct device *dev, struct tty_struct *tty,
 	int cnt;
 
 	do {
-		if (tty->flip.count >= TTY_FLIPBUF_SIZE) {
-			tty_flip_buffer_push(tty);
-			if (tty->flip.count >= TTY_FLIPBUF_SIZE) {
-				dev_err(dev, "%s - dropping data, %d bytes lost\n", __FUNCTION__, length);
-				return;
-			}
+		cnt = tty_buffer_request_room(tty, length);
+		if (cnt < length) {
+			dev_err(dev, "%s - dropping data, %d bytes lost\n", __FUNCTION__, length - cnt);
+			if(cnt == 0)
+				break;
 		}
-		cnt = min(length, TTY_FLIPBUF_SIZE - tty->flip.count);
-		memcpy(tty->flip.char_buf_ptr, data, cnt);
-		memset(tty->flip.flag_buf_ptr, 0, cnt);
-		tty->flip.char_buf_ptr += cnt;
-		tty->flip.flag_buf_ptr += cnt;
-		tty->flip.count += cnt;
+		tty_insert_flip_string(tty, data, cnt);
+		tty_flip_buffer_push(tty);
 		data += cnt;
 		length -= cnt;
 	} while (length > 0);
 
-	tty_flip_buffer_push(tty);
 }
 
 
diff --git a/drivers/usb/serial/visor.c b/drivers/usb/serial/visor.c
index 49b1fbe61f25..bce3d55affd8 100644
--- a/drivers/usb/serial/visor.c
+++ b/drivers/usb/serial/visor.c
@@ -488,7 +488,6 @@ static void visor_read_bulk_callback (struct urb *urb, struct pt_regs *regs)
 	unsigned char *data = urb->transfer_buffer;
 	struct tty_struct *tty;
 	unsigned long flags;
-	int i;
 	int throttled;
 	int result;
 
@@ -503,14 +502,8 @@ static void visor_read_bulk_callback (struct urb *urb, struct pt_regs *regs)
 
 	tty = port->tty;
 	if (tty && urb->actual_length) {
-		for (i = 0; i < urb->actual_length ; ++i) {
-			/* if we insert more than TTY_FLIPBUF_SIZE characters, we drop them. */
-			if(tty->flip.count >= TTY_FLIPBUF_SIZE) {
-				tty_flip_buffer_push(tty);
-			}
-			/* this doesn't actually push the data through unless tty->low_latency is set */
-			tty_insert_flip_char(tty, data[i], 0);
-		}
+		tty_buffer_request_room(tty, urb->actual_length);
+		tty_insert_flip_string(tty, data, urb->actual_length);
 		tty_flip_buffer_push(tty);
 	}
 	spin_lock_irqsave(&priv->lock, flags);
diff --git a/drivers/usb/serial/whiteheat.c b/drivers/usb/serial/whiteheat.c
index a7c3c4734d83..557411c6e7c7 100644
--- a/drivers/usb/serial/whiteheat.c
+++ b/drivers/usb/serial/whiteheat.c
@@ -1434,7 +1434,9 @@ static void rx_data_softint(void *private)
 		urb = wrap->urb;
 
 		if (tty && urb->actual_length) {
-			if (urb->actual_length > TTY_FLIPBUF_SIZE - tty->flip.count) {
+			int len = tty_buffer_request_room(tty, urb->actual_length);
+			/* This stuff can go away now I suspect */
+			if (unlikely(len < urb->actual_length)) {
 				spin_lock_irqsave(&info->lock, flags);
 				list_add(tmp, &info->rx_urb_q);
 				spin_unlock_irqrestore(&info->lock, flags);
@@ -1442,11 +1444,8 @@ static void rx_data_softint(void *private)
 				schedule_work(&info->rx_work);
 				return;
 			}
-
-			memcpy(tty->flip.char_buf_ptr, urb->transfer_buffer, urb->actual_length);
-			tty->flip.char_buf_ptr += urb->actual_length;
-			tty->flip.count += urb->actual_length;
-			sent += urb->actual_length;
+			tty_insert_flip_string(tty, urb->transfer_buffer, len);
+			sent += len;
 		}
 
 		urb->dev = port->serial->dev;
diff --git a/include/linux/kbd_kern.h b/include/linux/kbd_kern.h
index 7428198111eb..45f625d7d0b2 100644
--- a/include/linux/kbd_kern.h
+++ b/include/linux/kbd_kern.h
@@ -151,7 +151,7 @@ extern unsigned int keymap_count;
 
 static inline void con_schedule_flip(struct tty_struct *t)
 {
-	schedule_work(&t->flip.work);
+	schedule_work(&t->buf.work);
 }
 
 #endif
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 57449704a47b..3787102e4b12 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -51,16 +51,22 @@
  */
 #define TTY_FLIPBUF_SIZE 512
 
-struct tty_flip_buffer {
+struct tty_buffer {
+	struct tty_buffer *next;
+	char *char_buf_ptr;
+	unsigned char *flag_buf_ptr;
+	int used;
+	int size;
+	/* Data points here */
+	unsigned long data[0];
+};
+
+struct tty_bufhead {
 	struct work_struct		work;
 	struct semaphore pty_sem;
-	char		*char_buf_ptr;
-	unsigned char	*flag_buf_ptr;
-	int		count;
-	int		buf_num;
-	unsigned char	char_buf[2*TTY_FLIPBUF_SIZE];
-	char		flag_buf[2*TTY_FLIPBUF_SIZE];
-	unsigned char	slop[4]; /* N.B. bug overwrites buffer by 1 */
+	struct tty_buffer *head;	/* Queue head */
+	struct tty_buffer *tail;	/* Active buffer */
+	struct tty_buffer *free;	/* Free queue head */
 };
 /*
  * The pty uses char_buf and flag_buf as a contiguous buffer
@@ -186,10 +192,11 @@ struct tty_struct {
 	unsigned char stopped:1, hw_stopped:1, flow_stopped:1, packet:1;
 	unsigned char low_latency:1, warned:1;
 	unsigned char ctrl_status;
+	unsigned int receive_room;	/* Bytes free for queue */
 
 	struct tty_struct *link;
 	struct fasync_struct *fasync;
-	struct tty_flip_buffer flip;
+	struct tty_bufhead buf;
 	int max_flip_cnt;
 	int alt_speed;		/* For magic substitution of 38400 bps */
 	wait_queue_head_t write_wait;
diff --git a/include/linux/tty_flip.h b/include/linux/tty_flip.h
index abe9bfcf226c..be1400e82482 100644
--- a/include/linux/tty_flip.h
+++ b/include/linux/tty_flip.h
@@ -1,25 +1,33 @@
 #ifndef _LINUX_TTY_FLIP_H
 #define _LINUX_TTY_FLIP_H
 
+extern int tty_buffer_request_room(struct tty_struct *tty, size_t size);
+extern int tty_insert_flip_string(struct tty_struct *tty, unsigned char *chars, size_t size);
+extern int tty_insert_flip_string_flags(struct tty_struct *tty, unsigned char *chars, char *flags, size_t size);
+extern int tty_prepare_flip_string(struct tty_struct *tty, unsigned char **chars, size_t size);
+extern int tty_prepare_flip_string_flags(struct tty_struct *tty, unsigned char **chars, char **flags, size_t size);
+
 #ifdef INCLUDE_INLINE_FUNCS
 #define _INLINE_ extern
 #else
 #define _INLINE_ static __inline__
 #endif
 
-_INLINE_ void tty_insert_flip_char(struct tty_struct *tty,
+_INLINE_ int tty_insert_flip_char(struct tty_struct *tty,
 				   unsigned char ch, char flag)
 {
-	if (tty->flip.count < TTY_FLIPBUF_SIZE) {
-		tty->flip.count++;
-		*tty->flip.flag_buf_ptr++ = flag;
-		*tty->flip.char_buf_ptr++ = ch;
+	struct tty_buffer *tb = tty->buf.tail;
+	if (tb && tb->used < tb->size) {
+		tb->flag_buf_ptr[tb->used] = flag;
+		tb->char_buf_ptr[tb->used++] = ch;
+		return 1;
 	}
+	return tty_insert_flip_string_flags(tty, &ch, &flag, 1);
 }
 
 _INLINE_ void tty_schedule_flip(struct tty_struct *tty)
 {
-	schedule_delayed_work(&tty->flip.work, 1);
+	schedule_delayed_work(&tty->buf.work, 1);
 }
 
 #undef _INLINE_
diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h
index 6066afde5ce4..83c6e6c10ebb 100644
--- a/include/linux/tty_ldisc.h
+++ b/include/linux/tty_ldisc.h
@@ -81,14 +81,6 @@
  * 	pointer of flag bytes which indicate whether a character was
  * 	received with a parity error, etc.
  * 
- * int	(*receive_room)(struct tty_struct *);
- *
- * 	This function is called by the low-level tty driver to
- * 	determine how many characters the line discpline can accept.
- * 	The low-level driver must not send more characters than was
- * 	indicated by receive_room, or the line discpline may drop
- * 	those characters.
- * 
  * void	(*write_wakeup)(struct tty_struct *);
  *
  * 	This function is called by the low-level tty driver to signal
@@ -136,7 +128,6 @@ struct tty_ldisc {
 	 */
 	void	(*receive_buf)(struct tty_struct *, const unsigned char *cp,
 			       char *fp, int count);
-	int	(*receive_room)(struct tty_struct *);
 	void	(*write_wakeup)(struct tty_struct *);
 
 	struct  module *owner;
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 158a9c46d863..f57cde78c3de 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -480,13 +480,8 @@ static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb)
 	BT_DBG("dlc %p tty %p len %d", dlc, tty, skb->len);
 
 	if (test_bit(TTY_DONT_FLIP, &tty->flags)) {
-		register int i;
-		for (i = 0; i < skb->len; i++) {
-			if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-				tty_flip_buffer_push(tty);
-
-			tty_insert_flip_char(tty, skb->data[i], 0);
-		}
+		tty_buffer_request_room(tty, skb->len);
+		tty_insert_flip_string(tty, skb->data, skb->len);
 		tty_flip_buffer_push(tty);
 	} else
 		tty->ldisc.receive_buf(tty, skb->data, NULL, skb->len);
-- 
cgit v1.2.3-71-gd317


From aaa246ea78c68cd205f505070650cda7c5a95d34 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <xslaby@fi.muni.cz>
Date: Mon, 9 Jan 2006 20:54:23 -0800
Subject: [PATCH] char/isicom: Other little changes

Move some code from one place to another.  Get rid of ugly ifdefs in code in
next p[patches, so here create functions and macros to enable it.  Rename some
functions and align some code to 80 chars.

Signed-off-by: Jiri Slaby <xslaby@fi.muni.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/isicom.c  | 163 ++++++++++++++++++++++++-------------------------
 include/linux/isicom.h |   3 -
 2 files changed, 81 insertions(+), 85 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c
index 9ede7c211cc1..49f88c9006cb 100644
--- a/drivers/char/isicom.c
+++ b/drivers/char/isicom.c
@@ -135,6 +135,17 @@
 
 #include <linux/isicom.h>
 
+#define InterruptTheCard(base) outw(0, (base) + 0xc)
+#define ClearInterrupt(base) inw((base) + 0x0a)
+
+#ifdef DEBUG
+#define pr_dbg(str...) printk(KERN_DEBUG "ISICOM: " str)
+#define isicom_paranoia_check(a, b, c) __isicom_paranoia_check((a), (b), (c))
+#else
+#define pr_dbg(str...) do { } while (0)
+#define isicom_paranoia_check(a, b, c) 0
+#endif
+
 static struct pci_device_id isicom_pci_tbl[] = {
 	{ VENDOR_ID, 0x2028, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
 	{ VENDOR_ID, 0x2051, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
@@ -161,7 +172,6 @@ static void isicom_tx(unsigned long _data);
 static void isicom_start(struct tty_struct *tty);
 
 static unsigned char *tmp_buf;
-static DECLARE_MUTEX(tmp_buf_sem);
 
 /*   baud index mappings from linux defns to isi */
 
@@ -599,23 +609,20 @@ static int ISILoad_ioctl(struct inode *inode, struct file *filp,
  *
  */
 
-static inline int isicom_paranoia_check(struct isi_port const *port, char *name,
-	const char *routine)
+static inline int __isicom_paranoia_check(struct isi_port const *port,
+	char *name, const char *routine)
 {
-#ifdef ISICOM_DEBUG
-	static const char *badmagic =
-			KERN_WARNING "ISICOM: Warning: bad isicom magic for dev %s in %s.\n";
-	static const char *badport =
-			KERN_WARNING "ISICOM: Warning: NULL isicom port for dev %s in %s.\n";
 	if (!port) {
-		printk(badport, name, routine);
+		printk(KERN_WARNING "ISICOM: Warning: bad isicom magic for "
+			"dev %s in %s.\n", name, routine);
 		return 1;
 	}
 	if (port->magic != ISICOM_MAGIC) {
-		printk(badmagic, name, routine);
+		printk(KERN_WARNING "ISICOM: Warning: NULL isicom port for "
+			"dev %s in %s.\n", name, routine);
 		return 1;
 	}
-#endif
+
 	return 0;
 }
 
@@ -674,12 +681,10 @@ static void isicom_tx(unsigned long _data)
 			unlock_card(&isi_card[card]);
 			continue;
 		}
-#ifdef ISICOM_DEBUG
-		printk(KERN_DEBUG "ISICOM: txing %d bytes, port%d.\n",
-				txcount, port->channel+1);
-#endif
-		outw((port->channel << isi_card[card].shift_count) | txcount
-					, base);
+		pr_dbg("txing %d bytes, port%d.\n", txcount,
+			port->channel + 1);
+		outw((port->channel << isi_card[card].shift_count) | txcount,
+			base);
 		residue = NO;
 		wrd = 0;
 		while (1) {
@@ -725,8 +730,11 @@ static void isicom_tx(unsigned long _data)
 
 	/*	schedule another tx for hopefully in about 10ms	*/
 sched_again:
-	if (!re_schedule)
-		return;
+	if (!re_schedule) {
+		re_schedule = 2;
+ 		return;
+	}
+
 	init_timer(&tx);
 	tx.expires = jiffies + HZ/100;
 	tx.data = 0;
@@ -830,9 +838,7 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 				if (port->status & ISI_DCD) {
 					if (!(header & ISI_DCD)) {
 					/* Carrier has been lost  */
-#ifdef ISICOM_DEBUG
-						printk(KERN_DEBUG "ISICOM: interrupt: DCD->low.\n");
-#endif
+						pr_dbg("interrupt: DCD->low.\n");
 						port->status &= ~ISI_DCD;
 						schedule_work(&port->hangup_tq);
 					}
@@ -840,9 +846,7 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 				else {
 					if (header & ISI_DCD) {
 					/* Carrier has been detected */
-#ifdef ISICOM_DEBUG
-						printk(KERN_DEBUG "ISICOM: interrupt: DCD->high.\n");
-#endif
+						pr_dbg("interrupt: DCD->high.\n");
 						port->status |= ISI_DCD;
 						wake_up_interruptible(&port->open_wait);
 					}
@@ -899,21 +903,18 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 			break;
 
 		case 2:	/* Statistics		 */
-			printk(KERN_DEBUG "ISICOM: isicom_interrupt: stats!!!.\n");
+			pr_dbg("isicom_interrupt: stats!!!.\n");
 			break;
 
 		default:
-			printk(KERN_WARNING "ISICOM: Intr: Unknown code in status packet.\n");
+			pr_dbg("Intr: Unknown code in status packet.\n");
 			break;
 		}
 	}
 	else {				/* Data   Packet */
 
 		count = tty_prepare_flip_string(tty, &rp, byte_count & ~1);
-#ifdef ISICOM_DEBUG
-		printk(KERN_DEBUG "ISICOM: Intr: Can rx %d of %d bytes.\n",
-					count, byte_count);
-#endif
+		pr_dbg("Intr: Can rx %d of %d bytes.\n", count, byte_count);
 		word_count = count >> 1;
 		insw(base, rp, word_count);
 		byte_count -= (word_count << 1);
@@ -922,8 +923,8 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 			byte_count -= 2;
 		}
 		if (byte_count > 0) {
-			printk(KERN_DEBUG "ISICOM: Intr(0x%lx:%d): Flip buffer overflow! dropping bytes...\n",
-					base, channel+1);
+			pr_dbg("Intr(0x%lx:%d): Flip buffer overflow! dropping "
+				"bytes...\n", base, channel + 1);
 			while(byte_count > 0) { /* drain out unread xtra data */
 				inw(base);
 				byte_count -= 2;
@@ -1116,9 +1117,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp, struct isi
 	/* block if port is in the process of being closed */
 
 	if (tty_hung_up_p(filp) || port->flags & ASYNC_CLOSING) {
-#ifdef ISICOM_DEBUG
-		printk(KERN_DEBUG "ISICOM: block_til_ready: close in progress.\n");
-#endif
+		pr_dbg("block_til_ready: close in progress.\n");
 		interruptible_sleep_on(&port->close_wait);
 		if (port->flags & ASYNC_HUP_NOTIFY)
 			return -EAGAIN;
@@ -1129,9 +1128,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp, struct isi
 	/* if non-blocking mode is set ... */
 
 	if ((filp->f_flags & O_NONBLOCK) || (tty->flags & (1 << TTY_IO_ERROR))) {
-#ifdef ISICOM_DEBUG
-		printk(KERN_DEBUG "ISICOM: block_til_ready: non-block mode.\n");
-#endif
+		pr_dbg("block_til_ready: non-block mode.\n");
 		port->flags |= ASYNC_NORMAL_ACTIVE;
 		return 0;
 	}
@@ -1271,7 +1268,7 @@ static void isicom_shutdown_port(struct isi_port *port)
 		set_bit(TTY_IO_ERROR, &tty->flags);
 
 	if (--card->count < 0) {
-		printk(KERN_DEBUG "ISICOM: isicom_shutdown_port: bad board(0x%lx) count %d.\n",
+		pr_dbg("isicom_shutdown_port: bad board(0x%lx) count %d.\n",
 			card->base, card->count);
 		card->count = 0;
 	}
@@ -1294,9 +1291,7 @@ static void isicom_close(struct tty_struct *tty, struct file *filp)
 	if (isicom_paranoia_check(port, tty->name, "isicom_close"))
 		return;
 
-#ifdef ISICOM_DEBUG
-	printk(KERN_DEBUG "ISICOM: Close start!!!.\n");
-#endif
+	pr_dbg("Close start!!!.\n");
 
 	spin_lock_irqsave(&card->card_lock, flags);
 	if (tty_hung_up_p(filp)) {
@@ -1347,9 +1342,7 @@ static void isicom_close(struct tty_struct *tty, struct file *filp)
 	if (port->blocked_open) {
 		spin_unlock_irqrestore(&card->card_lock, flags);
 		if (port->close_delay) {
-#ifdef ISICOM_DEBUG
-			printk(KERN_DEBUG "ISICOM: scheduling until time out.\n");
-#endif
+			pr_dbg("scheduling until time out.\n");
 			msleep_interruptible(jiffies_to_msecs(port->close_delay));
 		}
 		spin_lock_irqsave(&card->card_lock, flags);
@@ -1786,42 +1779,44 @@ static struct tty_operations isicom_ops = {
 	.tiocmset		= isicom_tiocmset,
 };
 
-static int __devinit register_drivers(void)
+static int __devinit isicom_register_tty_driver(void)
 {
-	int error;
+	int error = -ENOMEM;
 
 	/* tty driver structure initialization */
 	isicom_normal = alloc_tty_driver(PORT_COUNT);
 	if (!isicom_normal)
-		return -ENOMEM;
-
-	isicom_normal->owner	= THIS_MODULE;
-	isicom_normal->name 	= "ttyM";
-	isicom_normal->devfs_name = "isicom/";
-	isicom_normal->major	= ISICOM_NMAJOR;
-	isicom_normal->minor_start	= 0;
-	isicom_normal->type	= TTY_DRIVER_TYPE_SERIAL;
-	isicom_normal->subtype	= SERIAL_TYPE_NORMAL;
-	isicom_normal->init_termios	= tty_std_termios;
-	isicom_normal->init_termios.c_cflag	=
-				B9600 | CS8 | CREAD | HUPCL |CLOCAL;
-	isicom_normal->flags	= TTY_DRIVER_REAL_RAW;
+		goto end;
+
+	isicom_normal->owner			= THIS_MODULE;
+	isicom_normal->name 			= "ttyM";
+	isicom_normal->devfs_name	 	= "isicom/";
+	isicom_normal->major			= ISICOM_NMAJOR;
+	isicom_normal->minor_start		= 0;
+	isicom_normal->type			= TTY_DRIVER_TYPE_SERIAL;
+	isicom_normal->subtype			= SERIAL_TYPE_NORMAL;
+	isicom_normal->init_termios		= tty_std_termios;
+	isicom_normal->init_termios.c_cflag	= B9600 | CS8 | CREAD | HUPCL |
+		CLOCAL;
+	isicom_normal->flags			= TTY_DRIVER_REAL_RAW;
 	tty_set_operations(isicom_normal, &isicom_ops);
 
-	if ((error=tty_register_driver(isicom_normal))!=0) {
-		printk(KERN_DEBUG "ISICOM: Couldn't register the dialin driver, error=%d\n",
+	if ((error = tty_register_driver(isicom_normal))) {
+		pr_dbg("Couldn't register the dialin driver, error=%d\n",
 			error);
 		put_tty_driver(isicom_normal);
-		return error;
 	}
-	return 0;
+end:
+	return error;
 }
 
-static void unregister_drivers(void)
+static void isicom_unregister_tty_driver(void)
 {
-	int error = tty_unregister_driver(isicom_normal);
-	if (error)
-		printk(KERN_DEBUG "ISICOM: couldn't unregister normal driver error=%d.\n",error);
+	int error;
+
+	if ((error = tty_unregister_driver(isicom_normal)))
+		pr_dbg("couldn't unregister normal driver, error=%d.\n", error);
+
 	put_tty_driver(isicom_normal);
 }
 
@@ -1891,7 +1886,7 @@ static int __devinit isicom_init(void)
 		free_page((unsigned long)tmp_buf);
 		return 0;
 	}
-	if (register_drivers())
+	if (isicom_register_tty_driver())
 	{
 		unregister_ioregion();
 		free_page((unsigned long)tmp_buf);
@@ -1899,7 +1894,7 @@ static int __devinit isicom_init(void)
 	}
 	if (!register_isr())
 	{
-		unregister_drivers();
+		isicom_unregister_tty_driver();
 		/*  ioports already uregistered in register_isr */
 		free_page((unsigned long)tmp_buf);
 		return 0;
@@ -1936,14 +1931,6 @@ static int __devinit isicom_init(void)
 static int io[4];
 static int irq[4];
 
-MODULE_AUTHOR("MultiTech");
-MODULE_DESCRIPTION("Driver for the ISI series of cards by MultiTech");
-MODULE_LICENSE("GPL");
-module_param_array(io, int, NULL, 0);
-MODULE_PARM_DESC(io, "I/O ports for the cards");
-module_param_array(irq, int, NULL, 0);
-MODULE_PARM_DESC(irq, "Interrupts for the cards");
-
 static int __devinit isicom_setup(void)
 {
 	struct pci_dev *dev = NULL;
@@ -2047,11 +2034,15 @@ static int __devinit isicom_setup(void)
 
 static void __exit isicom_exit(void)
 {
+	unsigned int index = 0;
+
 	re_schedule = 0;
-	/* FIXME */
-	msleep(1000);
+
+	while (re_schedule != 2 && index++ < 100)
+		msleep(10);
+
 	unregister_isr();
-	unregister_drivers();
+	isicom_unregister_tty_driver();
 	unregister_ioregion();
 	if (tmp_buf)
 		free_page((unsigned long)tmp_buf);
@@ -2061,3 +2052,11 @@ static void __exit isicom_exit(void)
 
 module_init(isicom_setup);
 module_exit(isicom_exit);
+
+MODULE_AUTHOR("MultiTech");
+MODULE_DESCRIPTION("Driver for the ISI series of cards by MultiTech");
+MODULE_LICENSE("GPL");
+module_param_array(io, int, NULL, 0);
+MODULE_PARM_DESC(io, "I/O ports for the cards");
+module_param_array(irq, int, NULL, 0);
+MODULE_PARM_DESC(irq, "Interrupts for the cards");
diff --git a/include/linux/isicom.h b/include/linux/isicom.h
index 7c6eae7f6ed7..06cb7baa6db8 100644
--- a/include/linux/isicom.h
+++ b/include/linux/isicom.h
@@ -98,9 +98,6 @@ typedef	struct	{
 #define		ISICOM_INITIATE_XONXOFF	0x04
 #define		ISICOM_RESPOND_XONXOFF	0x08
 
-#define InterruptTheCard(base) (outw(0,(base)+0xc)) 
-#define ClearInterrupt(base) (inw((base)+0x0a))	
-
 #define	BOARD(line)  (((line) >> 4) & 0x3)
 
 	/*	isi kill queue bitmap	*/
-- 
cgit v1.2.3-71-gd317


From e65c1db19fe8177fa2da53e3e0bddffe585b2d47 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <xslaby@fi.muni.cz>
Date: Mon, 9 Jan 2006 20:54:25 -0800
Subject: [PATCH] char/isicom: Firmware loading

Firmware loading via hotplug added.
Cleanup firmware old-way fields in header file.

Signed-off-by: Jiri Slaby <xslaby@fi.muni.cz>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/isicom.c  | 410 +++++++++++++++++++++----------------------------
 include/linux/isicom.h |  35 -----
 2 files changed, 174 insertions(+), 271 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c
index 9ef8ab301768..55a47b33ff34 100644
--- a/drivers/char/isicom.c
+++ b/drivers/char/isicom.c
@@ -112,6 +112,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/firmware.h>
 #include <linux/kernel.h>
 #include <linux/tty.h>
 #include <linux/tty_flip.h>
@@ -120,7 +121,6 @@
 #include <linux/sched.h>
 #include <linux/serial.h>
 #include <linux/mm.h>
-#include <linux/miscdevice.h>
 #include <linux/interrupt.h>
 #include <linux/timer.h>
 #include <linux/delay.h>
@@ -175,8 +175,6 @@ static struct tty_driver *isicom_normal;
 static struct timer_list tx;
 static char re_schedule = 1;
 
-static int ISILoad_ioctl(struct inode *inode, struct file *filp, unsigned  int cmd, unsigned long arg);
-
 static void isicom_tx(unsigned long _data);
 static void isicom_start(struct tty_struct *tty);
 
@@ -384,233 +382,6 @@ static inline void kill_queue(struct isi_port *port, short queue)
 	unlock_card(card);
 }
 
-
-/*
- *  Firmware loader driver specific routines. This needs to mostly die
- *  and be replaced with request_firmware.
- */
-
-static struct file_operations ISILoad_fops = {
-	.owner		= THIS_MODULE,
-	.ioctl		= ISILoad_ioctl,
-};
-
-static struct miscdevice isiloader_device = {
-	ISILOAD_MISC_MINOR, "isictl", &ISILoad_fops
-};
-
-
-static inline int WaitTillCardIsFree(unsigned long base)
-{
-	unsigned long count=0;
-	while( (!(inw(base+0xe) & 0x1)) && (count++ < 6000000));
-	if (inw(base+0xe)&0x1)
-		return 0;
-	else
-		return 1;
-}
-
-static int ISILoad_ioctl(struct inode *inode, struct file *filp,
-	unsigned int cmd, unsigned long arg)
-{
-	unsigned int card, i, j, signature, status, portcount = 0;
-	unsigned long t, base;
-	u16 word_count;
-	bin_frame frame;
-	void __user *argp = (void __user *)arg;
-	/* exec_record exec_rec; */
-
-	if (get_user(card, (int __user *)argp))
-		return -EFAULT;
-
-	if (card < 0 || card >= BOARD_COUNT)
-		return -ENXIO;
-
-	base=isi_card[card].base;
-
-	if (base==0)
-		return -ENXIO;	/* disabled or not used */
-
-	switch(cmd) {
-	case MIOCTL_RESET_CARD:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-		printk(KERN_DEBUG "ISILoad:Resetting Card%d at 0x%lx ",card+1,base);
-
-		inw(base+0x8);
-
-		for (t=jiffies+HZ/100;time_before(jiffies, t););
-
-		outw(0,base+0x8); /* Reset */
-
-		for (j=1;j<=3;j++) {
-			for (t=jiffies+HZ;time_before(jiffies, t););
-			printk(".");
-		}
-		signature=(inw(base+0x4)) & 0xff;
-		if (isi_card[card].isa) {
-
-			if (!(inw(base+0xe) & 0x1) || (inw(base+0x2))) {
-#ifdef ISICOM_DEBUG
-				printk("\nbase+0x2=0x%x , base+0xe=0x%x",inw(base+0x2),inw(base+0xe));
-#endif
-				printk("\nISILoad:ISA Card%d reset failure (Possible bad I/O Port Address 0x%lx).\n",card+1,base);
-				return -EIO;
-			}
-		}
-		else {
-			portcount = inw(base+0x2);
-			if (!(inw(base+0xe) & 0x1) || ((portcount!=0) && (portcount!=4) && (portcount!=8))) {
-#ifdef ISICOM_DEBUG
-				printk("\nbase+0x2=0x%x , base+0xe=0x%x",inw(base+0x2),inw(base+0xe));
-#endif
-				printk("\nISILoad:PCI Card%d reset failure (Possible bad I/O Port Address 0x%lx).\n",card+1,base);
-				return -EIO;
-			}
-		}
-		switch(signature) {
-		case	0xa5:
-		case	0xbb:
-		case	0xdd:
-				if (isi_card[card].isa)
-					isi_card[card].port_count = 8;
-				else {
-					if (portcount == 4)
-						isi_card[card].port_count = 4;
-					else
-						isi_card[card].port_count = 8;
-				}
-				isi_card[card].shift_count = 12;
-				break;
-
-		case	0xcc:	isi_card[card].port_count = 16;
-				isi_card[card].shift_count = 11;
-				break;
-
-		default: printk("ISILoad:Card%d reset failure (Possible bad I/O Port Address 0x%lx).\n",card+1,base);
-#ifdef ISICOM_DEBUG
-			printk("Sig=0x%x\n",signature);
-#endif
-			return -EIO;
-		}
-		printk("-Done\n");
-		return put_user(signature,(unsigned __user *)argp);
-
-	case	MIOCTL_LOAD_FIRMWARE:
-			if (!capable(CAP_SYS_ADMIN))
-				return -EPERM;
-
-			if (copy_from_user(&frame, argp, sizeof(bin_frame)))
-				return -EFAULT;
-
-			if (WaitTillCardIsFree(base))
-				return -EIO;
-
-			outw(0xf0,base); /* start upload sequence */
-			outw(0x00,base);
-			outw((frame.addr), base); /* lsb of adderess */
-
-			word_count=(frame.count >> 1) + frame.count % 2;
-			outw(word_count, base);
-			InterruptTheCard(base);
-
-			for (i=0;i<=0x2f;i++);	/* a wee bit of delay */
-
-			if (WaitTillCardIsFree(base))
-				return -EIO;
-
-			if ((status=inw(base+0x4))!=0) {
-				printk(KERN_WARNING "ISILoad:Card%d rejected load header:\nAddress:0x%x \nCount:0x%x \nStatus:0x%x \n",
-				card+1, frame.addr, frame.count, status);
-				return -EIO;
-			}
-			outsw(base, (void *) frame.bin_data, word_count);
-
-			InterruptTheCard(base);
-
-			for (i=0;i<=0x0f;i++);	/* another wee bit of delay */
-
-			if (WaitTillCardIsFree(base))
-				return -EIO;
-
-			if ((status=inw(base+0x4))!=0) {
-				printk(KERN_ERR "ISILoad:Card%d got out of sync.Card Status:0x%x\n",card+1, status);
-				return -EIO;
-			}
-			return 0;
-
-	case	MIOCTL_READ_FIRMWARE:
-			if (!capable(CAP_SYS_ADMIN))
-				return -EPERM;
-
-			if (copy_from_user(&frame, argp, sizeof(bin_header)))
-				return -EFAULT;
-
-			if (WaitTillCardIsFree(base))
-				return -EIO;
-
-			outw(0xf1,base); /* start download sequence */
-			outw(0x00,base);
-			outw((frame.addr), base); /* lsb of adderess */
-
-			word_count=(frame.count >> 1) + frame.count % 2;
-			outw(word_count+1, base);
-			InterruptTheCard(base);
-
-			for (i=0;i<=0xf;i++);	/* a wee bit of delay */
-
-			if (WaitTillCardIsFree(base))
-				return -EIO;
-
-			if ((status=inw(base+0x4))!=0) {
-				printk(KERN_WARNING "ISILoad:Card%d rejected verify header:\nAddress:0x%x \nCount:0x%x \nStatus:0x%x \n",
-				card+1, frame.addr, frame.count, status);
-				return -EIO;
-			}
-
-			inw(base);
-			insw(base, frame.bin_data, word_count);
-			InterruptTheCard(base);
-
-			for (i=0;i<=0x0f;i++);	/* another wee bit of delay */
-
-			if (WaitTillCardIsFree(base))
-				return -EIO;
-
-			if ((status=inw(base+0x4))!=0) {
-				printk(KERN_ERR "ISILoad:Card%d verify got out of sync.Card Status:0x%x\n",card+1, status);
-				return -EIO;
-			}
-
-			if (copy_to_user(argp, &frame, sizeof(bin_frame)))
-				return -EFAULT;
-			return 0;
-
-	case	MIOCTL_XFER_CTRL:
-			if (!capable(CAP_SYS_ADMIN))
-				return -EPERM;
-			if (WaitTillCardIsFree(base))
-				return -EIO;
-
-			outw(0xf2, base);
-			outw(0x800, base);
-			outw(0x0, base);
-			outw(0x0, base);
-			InterruptTheCard(base);
-			outw(0x0, base+0x4); /* for ISI4608 cards */
-
-			isi_card[card].status |= FIRMWARE_LOADED;
-			return 0;
-
-	default:
-#ifdef ISICOM_DEBUG
-		printk(KERN_DEBUG "ISILoad: Received Ioctl cmd 0x%x.\n", cmd);
-#endif
-		return -ENOIOCTLCMD;
-	}
-}
-
-
 /*
  *	ISICOM Driver specific routines ...
  *
@@ -1927,6 +1698,175 @@ end:
 	return retval;
 }
 
+static inline int WaitTillCardIsFree(u16 base)
+{
+	unsigned long count = 0;
+
+	while (!(inw(base + 0xe) & 0x1) && count++ < 100)
+		msleep(5);
+
+	return !(inw(base + 0xe) & 0x1);
+}
+
+static int __devinit load_firmware(struct pci_dev *pdev,
+	const unsigned int index, const unsigned int signature)
+{
+	struct isi_board *board = pci_get_drvdata(pdev);
+	const struct firmware *fw;
+	unsigned long base = board->base;
+	unsigned int a;
+	u16 word_count, status;
+	int retval = -EIO;
+	char *name;
+	u8 *data;
+
+	struct stframe {
+		u16	addr;
+		u16	count;
+		u8	data[0];
+	} *frame;
+
+	switch (signature) {
+	case 0xa5:
+		name = "isi608.bin";
+		break;
+	case 0xbb:
+		name = "isi608em.bin";
+		break;
+	case 0xcc:
+		name = "isi616em.bin";
+		break;
+	case 0xdd:
+		name = "isi4608.bin";
+		break;
+	case 0xee:
+		name = "isi4616.bin";
+		break;
+	default:
+		dev_err(&pdev->dev, "Unknown signature.\n");
+		goto end;
+ 	}
+
+	retval = request_firmware(&fw, name, &pdev->dev);
+	if (retval)
+		goto end;
+
+	for (frame = (struct stframe *)fw->data;
+			frame < (struct stframe *)(fw->data + fw->size);
+			frame++) {
+		if (WaitTillCardIsFree(base))
+			goto errrelfw;
+
+		outw(0xf0, base);	/* start upload sequence */
+		outw(0x00, base);
+		outw(frame->addr, base); /* lsb of address */
+
+		word_count = frame->count / 2 + frame->count % 2;
+		outw(word_count, base);
+		InterruptTheCard(base);
+
+		udelay(100); /* 0x2f */
+
+		if (WaitTillCardIsFree(base))
+			goto errrelfw;
+
+		if ((status = inw(base + 0x4)) != 0) {
+			dev_warn(&pdev->dev, "Card%d rejected load header:\n"
+				"Address:0x%x\nCount:0x%x\nStatus:0x%x\n",
+				index + 1, frame->addr, frame->count, status);
+			goto errrelfw;
+		}
+		outsw(base, frame->data, word_count);
+
+		InterruptTheCard(base);
+
+		udelay(50); /* 0x0f */
+
+		if (WaitTillCardIsFree(base))
+			goto errrelfw;
+
+		if ((status = inw(base + 0x4)) != 0) {
+			dev_err(&pdev->dev, "Card%d got out of sync.Card "
+				"Status:0x%x\n", index + 1, status);
+			goto errrelfw;
+		}
+ 	}
+
+	retval = -EIO;
+
+	if (WaitTillCardIsFree(base))
+		goto errrelfw;
+
+	outw(0xf2, base);
+	outw(0x800, base);
+	outw(0x0, base);
+	outw(0x0, base);
+	InterruptTheCard(base);
+	outw(0x0, base + 0x4); /* for ISI4608 cards */
+
+/* XXX: should we test it by reading it back and comparing with original like
+ * in load firmware package? */
+	for (frame = (struct stframe*)fw->data;
+			frame < (struct stframe*)(fw->data + fw->size);
+			frame++) {
+		if (WaitTillCardIsFree(base))
+			goto errrelfw;
+
+		outw(0xf1, base); /* start download sequence */
+		outw(0x00, base);
+		outw(frame->addr, base); /* lsb of address */
+
+		word_count = (frame->count >> 1) + frame->count % 2;
+		outw(word_count + 1, base);
+		InterruptTheCard(base);
+
+		udelay(50); /* 0xf */
+
+		if (WaitTillCardIsFree(base))
+			goto errrelfw;
+
+		if ((status = inw(base + 0x4)) != 0) {
+			dev_warn(&pdev->dev, "Card%d rejected verify header:\n"
+				"Address:0x%x\nCount:0x%x\nStatus: 0x%x\n",
+				index + 1, frame->addr, frame->count, status);
+			goto errrelfw;
+		}
+
+		data = kmalloc(word_count * 2, GFP_KERNEL);
+		inw(base);
+		insw(base, data, word_count);
+		InterruptTheCard(base);
+
+		for (a = 0; a < frame->count; a++)
+			if (data[a] != frame->data[a]) {
+				kfree(data);
+				dev_err(&pdev->dev, "Card%d, firmware upload "
+					"failed\n", index + 1);
+				goto errrelfw;
+			}
+		kfree(data);
+
+		udelay(50); /* 0xf */
+
+		if (WaitTillCardIsFree(base))
+			goto errrelfw;
+
+		if ((status = inw(base + 0x4)) != 0) {
+			dev_err(&pdev->dev, "Card%d verify got out of sync. "
+				"Card Status:0x%x\n", index + 1, status);
+			goto errrelfw;
+		}
+	}
+
+	board->status |= FIRMWARE_LOADED;
+	retval = 0;
+
+errrelfw:
+	release_firmware(fw);
+end:
+	return retval;
+}
+
 /*
  *	Insmod can set static symbols so keep these static
  */
@@ -1976,6 +1916,10 @@ static int __devinit isicom_probe(struct pci_dev *pdev,
 	if (retval < 0)
 		goto errunri;
 
+	retval = load_firmware(pdev, index, signature);
+	if (retval < 0)
+		goto errunri;
+
 	return 0;
 
 errunri:
@@ -2048,10 +1992,6 @@ static int __devinit isicom_setup(void)
 		goto errtty;
 	}
 
-	retval = misc_register(&isiloader_device);
-	if (retval < 0)
-		goto errpci;
-
 	init_timer(&tx);
 	tx.expires = jiffies + 1;
 	tx.data = 0;
@@ -2060,8 +2000,6 @@ static int __devinit isicom_setup(void)
 	add_timer(&tx);
 
 	return 0;
-errpci:
-	pci_unregister_driver(&isicom_driver);
 errtty:
 	isicom_unregister_tty_driver();
 error:
diff --git a/include/linux/isicom.h b/include/linux/isicom.h
index 06cb7baa6db8..ba5291a9f993 100644
--- a/include/linux/isicom.h
+++ b/include/linux/isicom.h
@@ -4,46 +4,11 @@
 /*#define		ISICOM_DEBUG*/
 /*#define		ISICOM_DEBUG_DTR_RTS*/
 
-
-/*
- *	Firmware Loader definitions ...
- */
- 
-#define		__MultiTech		('M'<<8)
-#define		MIOCTL_LOAD_FIRMWARE	(__MultiTech | 0x01)
-#define         MIOCTL_READ_FIRMWARE    (__MultiTech | 0x02)
-#define         MIOCTL_XFER_CTRL	(__MultiTech | 0x03)
-#define         MIOCTL_RESET_CARD	(__MultiTech | 0x04)
-
-#define		DATA_SIZE	16
-
-typedef	struct	{
-		unsigned short	exec_segment;
-		unsigned short	exec_addr;
-}	exec_record;
-
-typedef	struct	{
-		int		board;		/* Board to load */
-		unsigned short	addr;
-		unsigned short	count;
-}	bin_header;
-
-typedef	struct	{
-		int		board;		/* Board to load */
-		unsigned short	addr;
-		unsigned short	count;
-		unsigned short	segment;
-		unsigned char	bin_data[DATA_SIZE];
-}	bin_frame;
-
 #ifdef __KERNEL__
 
 #define		YES	1
 #define		NO	0
 
-#define		ISILOAD_MISC_MINOR	155	/* /dev/isctl */
-#define		ISILOAD_NAME		"ISILoad"
-
 /*	
  *  ISICOM Driver definitions ...
  *
-- 
cgit v1.2.3-71-gd317


From a547dfe9563c49fd0f9743640e01d1d652119ec7 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <xslaby@fi.muni.cz>
Date: Mon, 9 Jan 2006 20:54:26 -0800
Subject: [PATCH] char/isicom: More whitespaces and coding style

Wrap all the code to 80 chars on a line.
`}\nelse' changed to `} else'.
Clean whitespaces in header file.

Signed-off-by: Jiri Slaby <xslaby@fi.muni.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/isicom.c  | 112 ++++++++++++++++++++++++++-----------------------
 include/linux/isicom.h |  21 +++++-----
 2 files changed, 69 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c
index 55a47b33ff34..e9ebabaf8cb0 100644
--- a/drivers/char/isicom.c
+++ b/drivers/char/isicom.c
@@ -189,7 +189,7 @@ struct	isi_board {
 	unsigned char		irq;
 	unsigned char		port_count;
 	unsigned short		status;
-	unsigned short		port_status; /* each bit represents a single port */
+	unsigned short		port_status; /* each bit for each port */
 	unsigned short		shift_count;
 	struct isi_port		* ports;
 	signed char		count;
@@ -242,7 +242,9 @@ static int lock_card(struct isi_board *card)
 			udelay(1000);   /* 1ms */
 		}
 	}
-	printk(KERN_WARNING "ISICOM: Failed to lock Card (0x%lx)\n", card->base);
+	printk(KERN_WARNING "ISICOM: Failed to lock Card (0x%lx)\n",
+		card->base);
+
 	return 0;	/* Failed to aquire the card! */
 }
 
@@ -466,33 +468,36 @@ static void isicom_tx(unsigned long _data)
 		residue = NO;
 		wrd = 0;
 		while (1) {
-			cnt = min_t(int, txcount, (SERIAL_XMIT_SIZE - port->xmit_tail));
+			cnt = min_t(int, txcount, (SERIAL_XMIT_SIZE
+					- port->xmit_tail));
 			if (residue == YES) {
 				residue = NO;
 				if (cnt > 0) {
-					wrd |= (port->xmit_buf[port->xmit_tail] << 8);
-					port->xmit_tail = (port->xmit_tail + 1) & (SERIAL_XMIT_SIZE - 1);
+					wrd |= (port->xmit_buf[port->xmit_tail]
+									<< 8);
+					port->xmit_tail = (port->xmit_tail + 1)
+						& (SERIAL_XMIT_SIZE - 1);
 					port->xmit_cnt--;
 					txcount--;
 					cnt--;
 					outw(wrd, base);
-				}
-				else {
+				} else {
 					outw(wrd, base);
 					break;
 				}
 			}
 			if (cnt <= 0) break;
 			word_count = cnt >> 1;
-			outsw(base, port->xmit_buf+port->xmit_tail, word_count);
-			port->xmit_tail = (port->xmit_tail + (word_count << 1)) &
-						(SERIAL_XMIT_SIZE - 1);
+			outsw(base, port->xmit_buf+port->xmit_tail,word_count);
+			port->xmit_tail = (port->xmit_tail
+				+ (word_count << 1)) & (SERIAL_XMIT_SIZE - 1);
 			txcount -= (word_count << 1);
 			port->xmit_cnt -= (word_count << 1);
 			if (cnt & 0x0001) {
 				residue = YES;
 				wrd = port->xmit_buf[port->xmit_tail];
-				port->xmit_tail = (port->xmit_tail + 1) & (SERIAL_XMIT_SIZE - 1);
+				port->xmit_tail = (port->xmit_tail + 1)
+					& (SERIAL_XMIT_SIZE - 1);
 				port->xmit_cnt--;
 				txcount--;
 			}
@@ -572,8 +577,8 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 	byte_count = header & 0xff;
 
 	if (channel + 1 > card->port_count) {
-		printk(KERN_WARNING "ISICOM: isicom_interrupt(0x%lx): %d(channel) > port_count.\n",
-				base, channel+1);
+		printk(KERN_WARNING "ISICOM: isicom_interrupt(0x%lx): "
+			"%d(channel) > port_count.\n", base, channel+1);
 		if (card->isa)
 			ClearInterrupt(base);
 		else
@@ -611,26 +616,22 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 		header = inw(base);
 		switch(header & 0xff) {
 		case 0:	/* Change in EIA signals */
-
 			if (port->flags & ASYNC_CHECK_CD) {
 				if (port->status & ISI_DCD) {
 					if (!(header & ISI_DCD)) {
 					/* Carrier has been lost  */
-						pr_dbg("interrupt: DCD->low.\n");
+						pr_dbg("interrupt: DCD->low.\n"
+							);
 						port->status &= ~ISI_DCD;
 						schedule_work(&port->hangup_tq);
 					}
+				} else if (header & ISI_DCD) {
+				/* Carrier has been detected */
+					pr_dbg("interrupt: DCD->high.\n");
+					port->status |= ISI_DCD;
+					wake_up_interruptible(&port->open_wait);
 				}
-				else {
-					if (header & ISI_DCD) {
-					/* Carrier has been detected */
-						pr_dbg("interrupt: DCD->high.\n");
-						port->status |= ISI_DCD;
-						wake_up_interruptible(&port->open_wait);
-					}
-				}
-			}
-			else {
+			} else {
 				if (header & ISI_DCD)
 					port->status |= ISI_DCD;
 				else
@@ -642,19 +643,16 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 					if (header & ISI_CTS) {
 						port->tty->hw_stopped = 0;
 						/* start tx ing */
-						port->status |= (ISI_TXOK | ISI_CTS);
+						port->status |= (ISI_TXOK
+							| ISI_CTS);
 						schedule_work(&port->bh_tqueue);
 					}
+				} else if (!(header & ISI_CTS)) {
+					port->tty->hw_stopped = 1;
+					/* stop tx ing */
+					port->status &= ~(ISI_TXOK | ISI_CTS);
 				}
-				else {
-					if (!(header & ISI_CTS)) {
-						port->tty->hw_stopped = 1;
-						/* stop tx ing */
-						port->status &= ~(ISI_TXOK | ISI_CTS);
-					}
-				}
-			}
-			else {
+			} else {
 				if (header & ISI_CTS)
 					port->status |= ISI_CTS;
 				else
@@ -673,7 +671,7 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 
 			break;
 
-		case 1:	/* Received Break !!!	 */
+		case 1:	/* Received Break !!! */
 			tty_insert_flip_char(tty, 0, TTY_BREAK);
 			if (port->flags & ASYNC_SAK)
 				do_SAK(tty);
@@ -688,8 +686,7 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 			pr_dbg("Intr: Unknown code in status packet.\n");
 			break;
 		}
-	}
-	else {				/* Data   Packet */
+	} else {				/* Data   Packet */
 
 		count = tty_prepare_flip_string(tty, &rp, byte_count & ~1);
 		pr_dbg("Intr: Can rx %d of %d bytes.\n", count, byte_count);
@@ -697,7 +694,8 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 		insw(base, rp, word_count);
 		byte_count -= (word_count << 1);
 		if (count & 0x0001) {
-			tty_insert_flip_char(tty,  inw(base) & 0xff, TTY_NORMAL);
+			tty_insert_flip_char(tty,  inw(base) & 0xff,
+				TTY_NORMAL);
 			byte_count -= 2;
 		}
 		if (byte_count > 0) {
@@ -714,6 +712,7 @@ static irqreturn_t isicom_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 		ClearInterrupt(base);
 	else
 		outw(0x0000, base+0x04); /* enable interrupts */
+
 	return IRQ_HANDLED;
 }
 
@@ -885,7 +884,8 @@ static int isicom_setup_port(struct isi_port *port)
 	return 0;
 }
 
-static int block_til_ready(struct tty_struct *tty, struct file *filp, struct isi_port *port)
+static int block_til_ready(struct tty_struct *tty, struct file *filp,
+	struct isi_port *port)
 {
 	struct isi_board *card = port->card;
 	int do_clocal = 0, retval;
@@ -905,7 +905,8 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp, struct isi
 
 	/* if non-blocking mode is set ... */
 
-	if ((filp->f_flags & O_NONBLOCK) || (tty->flags & (1 << TTY_IO_ERROR))) {
+	if ((filp->f_flags & O_NONBLOCK) ||
+			(tty->flags & (1 << TTY_IO_ERROR))) {
 		pr_dbg("block_til_ready: non-block mode.\n");
 		port->flags |= ASYNC_NORMAL_ACTIVE;
 		return 0;
@@ -1051,7 +1052,7 @@ static void isicom_shutdown_port(struct isi_port *port)
 		card->count = 0;
 	}
 
-	/* last port was closed , shutdown that boad too */
+	/* last port was closed, shutdown that boad too */
 	if (C_HUPCL(tty)) {
 		if (!card->count)
 			isicom_shutdown_board(card);
@@ -1078,14 +1079,14 @@ static void isicom_close(struct tty_struct *tty, struct file *filp)
 	}
 
 	if (tty->count == 1 && port->count != 1) {
-		printk(KERN_WARNING "ISICOM:(0x%lx) isicom_close: bad port count"
-			"tty->count = 1	port count = %d.\n",
+		printk(KERN_WARNING "ISICOM:(0x%lx) isicom_close: bad port "
+			"count tty->count = 1 port count = %d.\n",
 			card->base, port->count);
 		port->count = 1;
 	}
 	if (--port->count < 0) {
-		printk(KERN_WARNING "ISICOM:(0x%lx) isicom_close: bad port count for"
-			"channel%d = %d", card->base, port->channel,
+		printk(KERN_WARNING "ISICOM:(0x%lx) isicom_close: bad port "
+			"count for channel%d = %d", card->base, port->channel,
 			port->count);
 		port->count = 0;
 	}
@@ -1121,7 +1122,8 @@ static void isicom_close(struct tty_struct *tty, struct file *filp)
 		spin_unlock_irqrestore(&card->card_lock, flags);
 		if (port->close_delay) {
 			pr_dbg("scheduling until time out.\n");
-			msleep_interruptible(jiffies_to_msecs(port->close_delay));
+			msleep_interruptible(
+				jiffies_to_msecs(port->close_delay));
 		}
 		spin_lock_irqsave(&card->card_lock, flags);
 		wake_up_interruptible(&port->open_wait);
@@ -1149,13 +1151,14 @@ static int isicom_write(struct tty_struct *tty,	const unsigned char *buf,
 	spin_lock_irqsave(&card->card_lock, flags);
 
 	while(1) {
-		cnt = min_t(int, count, min(SERIAL_XMIT_SIZE - port->xmit_cnt - 1,
-					SERIAL_XMIT_SIZE - port->xmit_head));
+		cnt = min_t(int, count, min(SERIAL_XMIT_SIZE - port->xmit_cnt
+				- 1, SERIAL_XMIT_SIZE - port->xmit_head));
 		if (cnt <= 0)
 			break;
 
 		memcpy(port->xmit_buf + port->xmit_head, buf, cnt);
-		port->xmit_head = (port->xmit_head + cnt) & (SERIAL_XMIT_SIZE - 1);
+		port->xmit_head = (port->xmit_head + cnt) & (SERIAL_XMIT_SIZE
+			- 1);
 		port->xmit_cnt += cnt;
 		buf += cnt;
 		count -= cnt;
@@ -1200,7 +1203,8 @@ static void isicom_flush_chars(struct tty_struct *tty)
 	if (isicom_paranoia_check(port, tty->name, "isicom_flush_chars"))
 		return;
 
-	if (port->xmit_cnt <= 0 || tty->stopped || tty->hw_stopped || !port->xmit_buf)
+	if (port->xmit_cnt <= 0 || tty->stopped || tty->hw_stopped ||
+			!port->xmit_buf)
 		return;
 
 	/* this tells the transmitter to consider this port for
@@ -1233,7 +1237,8 @@ static int isicom_chars_in_buffer(struct tty_struct *tty)
 }
 
 /* ioctl et all */
-static inline void isicom_send_break(struct isi_port *port, unsigned long length)
+static inline void isicom_send_break(struct isi_port *port,
+	unsigned long length)
 {
 	struct isi_board *card = port->card;
 	unsigned long base = card->base;
@@ -1368,7 +1373,8 @@ static int isicom_ioctl(struct tty_struct *tty, struct file *filp,
 		return 0;
 
 	case TIOCGSOFTCAR:
-		return put_user(C_CLOCAL(tty) ? 1 : 0, (unsigned long __user *)argp);
+		return put_user(C_CLOCAL(tty) ? 1 : 0,
+				(unsigned long __user *)argp);
 
 	case TIOCSSOFTCAR:
 		if (get_user(arg, (unsigned long __user *) argp))
diff --git a/include/linux/isicom.h b/include/linux/isicom.h
index ba5291a9f993..45b3d48f0978 100644
--- a/include/linux/isicom.h
+++ b/include/linux/isicom.h
@@ -9,7 +9,7 @@
 #define		YES	1
 #define		NO	0
 
-/*	
+/*
  *  ISICOM Driver definitions ...
  *
  */
@@ -20,8 +20,8 @@
  *      PCI definitions
  */
 
- #define        DEVID_COUNT     9
- #define        VENDOR_ID       0x10b5
+#define		DEVID_COUNT	9
+#define		VENDOR_ID	0x10b5
 
 /*
  *	These are now officially allocated numbers
@@ -31,9 +31,9 @@
 #define		ISICOM_CMAJOR	113	/* callout */
 #define		ISICOM_MAGIC	(('M' << 8) | 'T')
 
-#define		WAKEUP_CHARS	256	/* hard coded for now	*/ 
-#define		TX_SIZE		254 
- 
+#define		WAKEUP_CHARS	256	/* hard coded for now	*/
+#define		TX_SIZE		254
+
 #define		BOARD_COUNT	4
 #define		PORT_COUNT	(BOARD_COUNT*16)
 
@@ -66,12 +66,12 @@
 #define	BOARD(line)  (((line) >> 4) & 0x3)
 
 	/*	isi kill queue bitmap	*/
-	
+
 #define		ISICOM_KILLTX		0x01
 #define		ISICOM_KILLRX		0x02
 
 	/* isi_board status bitmap */
-	
+
 #define		FIRMWARE_LOADED		0x0001
 #define		BOARD_ACTIVE		0x0002
 
@@ -85,9 +85,8 @@
 #define		ISI_RTS			0x0200
 
 
-#define		ISI_TXOK		0x0001 
- 
+#define		ISI_TXOK		0x0001
+
 #endif	/*	__KERNEL__	*/
 
 #endif	/*	ISICOM_H	*/
-
-- 
cgit v1.2.3-71-gd317


From 4c29c4c5f28616f2a87f0e6499aa9776d9be58ad Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 9 Jan 2006 20:54:50 -0800
Subject: [PATCH] include/linux/sched.h: no need to guard the
 normalize_rt_tasks() prototype

There's no need to guard the normalize_rt_tasks() prototype with an #ifdef
CONFIG_MAGIC_SYSRQ.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ee4677ad204e..c4ee35dd18ae 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1390,12 +1390,8 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm)
 extern long sched_setaffinity(pid_t pid, cpumask_t new_mask);
 extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
 
-#ifdef CONFIG_MAGIC_SYSRQ
-
 extern void normalize_rt_tasks(void);
 
-#endif
-
 #ifdef CONFIG_PM
 /*
  * Check if a process has been frozen
-- 
cgit v1.2.3-71-gd317


From c8d52465f95c4187871f8e65666c07806ca06d41 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 10 Jan 2006 18:21:20 +1100
Subject: [PATCH] Work around ppc64 compiler bug

In the process of optimising our per cpu data code, I found a ppc64
compiler bug that has been around forever. Basically the current
RELOC_HIDE can end up trashing r30. Details of the bug can be found at

  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25572

This bug is present in all compilers before 4.1. It is masked by the
fact that our current per cpu data code is inefficient and causes
other loads that end up marking r30 as used.

A workaround identified by Alan Modra is to use the =r asm constraint
instead of =g.

Signed-off-by: Anton Blanchard <anton@samba.org>
[ Verified that this makes no real difference on x86[-64] */
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/compiler-gcc.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 2e05e1e6b0e6..6e1c44a935d4 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -11,9 +11,15 @@
 
 /* This macro obfuscates arithmetic on a variable address so that gcc
    shouldn't recognize the original var, and make assumptions about it */
+/*
+ * Versions of the ppc64 compiler before 4.1 had a bug where use of
+ * RELOC_HIDE could trash r30. The bug can be worked around by changing
+ * the inline assembly constraint from =g to =r, in this particular
+ * case either is valid.
+ */
 #define RELOC_HIDE(ptr, off)					\
   ({ unsigned long __ptr;					\
-    __asm__ ("" : "=g"(__ptr) : "0"(ptr));		\
+    __asm__ ("" : "=r"(__ptr) : "0"(ptr));		\
     (typeof(ptr)) (__ptr + (off)); })
 
 
-- 
cgit v1.2.3-71-gd317


From 69a0b3157983925f14fe0bdc49622d5389538d8d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 10 Jan 2006 16:48:02 +0300
Subject: [PATCH] rcu: join rcu_ctrlblk and rcu_state

This patch moves rcu_state into the rcu_ctrlblk. I think there
are no reasons why we should have 2 different variables to control
rcu state. Every user of rcu_state has also "rcu_ctrlblk *rcp" in
the parameter list.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/rcupdate.h |  4 +++
 kernel/rcupdate.c        | 82 ++++++++++++++++++++++--------------------------
 2 files changed, 42 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index a1d26cb28925..981f9aa43353 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -65,6 +65,10 @@ struct rcu_ctrlblk {
 	long	cur;		/* Current batch number.                      */
 	long	completed;	/* Number of the last completed batch         */
 	int	next_pending;	/* Is the next batch already waiting?         */
+
+	spinlock_t	lock	____cacheline_internodealigned_in_smp;
+	cpumask_t	cpumask; /* CPUs that need to switch in order    */
+	                         /* for current batch to proceed.        */
 } ____cacheline_internodealigned_in_smp;
 
 /* Is batch a before batch b ? */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 05ee48316f70..e18f9190eafa 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -49,22 +49,18 @@
 #include <linux/cpu.h>
 
 /* Definition for rcupdate control block. */
-struct rcu_ctrlblk rcu_ctrlblk = 
-	{ .cur = -300, .completed = -300 };
-struct rcu_ctrlblk rcu_bh_ctrlblk =
-	{ .cur = -300, .completed = -300 };
-
-/* Bookkeeping of the progress of the grace period */
-struct rcu_state {
-	spinlock_t	lock; /* Guard this struct and writes to rcu_ctrlblk */
-	cpumask_t	cpumask; /* CPUs that need to switch in order    */
-	                              /* for current batch to proceed.        */
+struct rcu_ctrlblk rcu_ctrlblk = {
+	.cur = -300,
+	.completed = -300,
+	.lock = SPIN_LOCK_UNLOCKED,
+	.cpumask = CPU_MASK_NONE,
+};
+struct rcu_ctrlblk rcu_bh_ctrlblk = {
+	.cur = -300,
+	.completed = -300,
+	.lock = SPIN_LOCK_UNLOCKED,
+	.cpumask = CPU_MASK_NONE,
 };
-
-static struct rcu_state rcu_state ____cacheline_internodealigned_in_smp =
-	  {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
-static struct rcu_state rcu_bh_state ____cacheline_internodealigned_in_smp =
-	  {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
 
 DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
@@ -220,13 +216,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
  *   This is done by rcu_start_batch. The start is not broadcasted to
  *   all cpus, they must pick this up by comparing rcp->cur with
  *   rdp->quiescbatch. All cpus are recorded  in the
- *   rcu_state.cpumask bitmap.
+ *   rcu_ctrlblk.cpumask bitmap.
  * - All cpus must go through a quiescent state.
  *   Since the start of the grace period is not broadcasted, at least two
  *   calls to rcu_check_quiescent_state are required:
  *   The first call just notices that a new grace period is running. The
  *   following calls check if there was a quiescent state since the beginning
- *   of the grace period. If so, it updates rcu_state.cpumask. If
+ *   of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
  *   the bitmap is empty, then the grace period is completed.
  *   rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
  *   period (if necessary).
@@ -234,9 +230,9 @@ static void rcu_do_batch(struct rcu_data *rdp)
 /*
  * Register a new batch of callbacks, and start it up if there is currently no
  * active batch and the batch to be registered has not already occurred.
- * Caller must hold rcu_state.lock.
+ * Caller must hold rcu_ctrlblk.lock.
  */
-static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp)
+static void rcu_start_batch(struct rcu_ctrlblk *rcp)
 {
 	if (rcp->next_pending &&
 			rcp->completed == rcp->cur) {
@@ -251,11 +247,11 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp)
 		/*
 		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
 		 * Barrier  Otherwise it can cause tickless idle CPUs to be
-		 * included in rsp->cpumask, which will extend graceperiods
+		 * included in rcp->cpumask, which will extend graceperiods
 		 * unnecessarily.
 		 */
 		smp_mb();
-		cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask);
+		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
 
 	}
 }
@@ -265,13 +261,13 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp)
  * Clear it from the cpu mask and complete the grace period if it was the last
  * cpu. Start another grace period if someone has further entries pending
  */
-static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp)
+static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
 {
-	cpu_clear(cpu, rsp->cpumask);
-	if (cpus_empty(rsp->cpumask)) {
+	cpu_clear(cpu, rcp->cpumask);
+	if (cpus_empty(rcp->cpumask)) {
 		/* batch completed ! */
 		rcp->completed = rcp->cur;
-		rcu_start_batch(rcp, rsp);
+		rcu_start_batch(rcp);
 	}
 }
 
@@ -281,7 +277,7 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp)
  * quiescent cycle, then indicate that it has done so.
  */
 static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
-			struct rcu_state *rsp, struct rcu_data *rdp)
+					struct rcu_data *rdp)
 {
 	if (rdp->quiescbatch != rcp->cur) {
 		/* start new grace period: */
@@ -306,15 +302,15 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
 		return;
 	rdp->qs_pending = 0;
 
-	spin_lock(&rsp->lock);
+	spin_lock(&rcp->lock);
 	/*
 	 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
 	 * during cpu startup. Ignore the quiescent state.
 	 */
 	if (likely(rdp->quiescbatch == rcp->cur))
-		cpu_quiet(rdp->cpu, rcp, rsp);
+		cpu_quiet(rdp->cpu, rcp);
 
-	spin_unlock(&rsp->lock);
+	spin_unlock(&rcp->lock);
 }
 
 
@@ -335,16 +331,16 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
 }
 
 static void __rcu_offline_cpu(struct rcu_data *this_rdp,
-	struct rcu_ctrlblk *rcp, struct rcu_state *rsp, struct rcu_data *rdp)
+				struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
 	/* if the cpu going offline owns the grace period
 	 * we can block indefinitely waiting for it, so flush
 	 * it here
 	 */
-	spin_lock_bh(&rsp->lock);
+	spin_lock_bh(&rcp->lock);
 	if (rcp->cur != rcp->completed)
-		cpu_quiet(rdp->cpu, rcp, rsp);
-	spin_unlock_bh(&rsp->lock);
+		cpu_quiet(rdp->cpu, rcp);
+	spin_unlock_bh(&rcp->lock);
 	rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
 	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
 
@@ -354,9 +350,9 @@ static void rcu_offline_cpu(int cpu)
 	struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
 	struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
 
-	__rcu_offline_cpu(this_rdp, &rcu_ctrlblk, &rcu_state,
+	__rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
 					&per_cpu(rcu_data, cpu));
-	__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, &rcu_bh_state,
+	__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
 					&per_cpu(rcu_bh_data, cpu));
 	put_cpu_var(rcu_data);
 	put_cpu_var(rcu_bh_data);
@@ -375,7 +371,7 @@ static void rcu_offline_cpu(int cpu)
  * This does the RCU processing work from tasklet context. 
  */
 static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
-			struct rcu_state *rsp, struct rcu_data *rdp)
+					struct rcu_data *rdp)
 {
 	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
 		*rdp->donetail = rdp->curlist;
@@ -405,25 +401,23 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 
 		if (!rcp->next_pending) {
 			/* and start it/schedule start if it's a new batch */
-			spin_lock(&rsp->lock);
+			spin_lock(&rcp->lock);
 			rcp->next_pending = 1;
-			rcu_start_batch(rcp, rsp);
-			spin_unlock(&rsp->lock);
+			rcu_start_batch(rcp);
+			spin_unlock(&rcp->lock);
 		}
 	} else {
 		local_irq_enable();
 	}
-	rcu_check_quiescent_state(rcp, rsp, rdp);
+	rcu_check_quiescent_state(rcp, rdp);
 	if (rdp->donelist)
 		rcu_do_batch(rdp);
 }
 
 static void rcu_process_callbacks(unsigned long unused)
 {
-	__rcu_process_callbacks(&rcu_ctrlblk, &rcu_state,
-				&__get_cpu_var(rcu_data));
-	__rcu_process_callbacks(&rcu_bh_ctrlblk, &rcu_bh_state,
-				&__get_cpu_var(rcu_bh_data));
+	__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
+	__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
 }
 
 static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-- 
cgit v1.2.3-71-gd317


From 1e6c9c2878c9c1f301449c78551e0b7c5f3e3ae5 Mon Sep 17 00:00:00 2001
From: Andrew Victor <andrew@sanpeople.com>
Date: Tue, 10 Jan 2006 16:59:27 +0000
Subject: [ARM] 3242/2: AT91RM9200 support for 2.6 (Serial)

Patch from Andrew Victor

This patch adds support to the 2.6 kernel series for the Atmel
AT91RM9200 processor.

This patch is the Serial driver.

This version uses the newly re-written GPL'ed hardware headers.

Signed-off-by: Andrew Victor <andrew@sanpeople.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/serial/Kconfig                             |  34 +
 drivers/serial/Makefile                            |   1 +
 drivers/serial/at91_serial.c                       | 894 +++++++++++++++++++++
 include/asm-arm/arch-at91rm9200/at91rm9200_pdc.h   |  36 +
 include/asm-arm/arch-at91rm9200/at91rm9200_usart.h | 123 +++
 include/asm-arm/mach/serial_at91rm9200.h           |  36 +
 include/linux/serial_core.h                        |   3 +
 7 files changed, 1127 insertions(+)
 create mode 100644 drivers/serial/at91_serial.c
 create mode 100644 include/asm-arm/arch-at91rm9200/at91rm9200_pdc.h
 create mode 100644 include/asm-arm/arch-at91rm9200/at91rm9200_usart.h
 create mode 100644 include/asm-arm/mach/serial_at91rm9200.h

(limited to 'include/linux')

diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index 1bae26a8a503..a256a020b15e 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -280,6 +280,40 @@ config SERIAL_AMBA_PL011_CONSOLE
 	  your boot loader (lilo or loadlin) about how to pass options to the
 	  kernel at boot time.)
 
+config SERIAL_AT91
+	bool "AT91RM9200 serial port support"
+	depends on ARM && ARCH_AT91RM9200
+	select SERIAL_CORE
+	help
+	  This enables the driver for the on-chip UARTs of the AT91RM9200
+	  processor.
+
+config SERIAL_AT91_CONSOLE
+	bool "Support for console on AT91RM9200 serial port"
+	depends on SERIAL_AT91=y
+	select SERIAL_CORE_CONSOLE
+	help
+	  Say Y here if you wish to use a UART on the AT91RM9200 as the system
+	  console (the system console is the device which receives all kernel
+	  messages and warnings and which allows logins in single user mode).
+
+config SERIAL_AT91_TTYAT
+	bool "Install as device ttyAT0-4 instead of ttyS0-4"
+	depends on SERIAL_AT91=y
+	help
+	  Say Y here if you wish to have the five internal AT91RM9200 UARTs
+	  appear as /dev/ttyAT0-4 (major 240, minor 0-4) instead of the
+	  normal /dev/ttyS0-4 (major 4, minor 64-68). This is necessary if
+	  you also want other UARTs, such as external 8250/16C550 compatible
+	  UARTs.
+	  The ttySn nodes are legally reserved for the 8250 serial driver
+	  but are often misused by other serial drivers.
+
+	  To use this, you should create suitable ttyATn device nodes in
+	  /dev/, and pass "console=ttyATn" to the kernel.
+
+	  Say Y if you have an external 8250/16C550 UART.  If unsure, say N.
+
 config SERIAL_CLPS711X
 	tristate "CLPS711X serial port support"
 	depends on ARM && ARCH_CLPS711X
diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile
index 137148bba4fa..24a583e482bb 100644
--- a/drivers/serial/Makefile
+++ b/drivers/serial/Makefile
@@ -56,3 +56,4 @@ obj-$(CONFIG_SERIAL_JSM) += jsm/
 obj-$(CONFIG_SERIAL_TXX9) += serial_txx9.o
 obj-$(CONFIG_SERIAL_VR41XX) += vr41xx_siu.o
 obj-$(CONFIG_SERIAL_SGI_IOC4) += ioc4_serial.o
+obj-$(CONFIG_SERIAL_AT91) += at91_serial.o
diff --git a/drivers/serial/at91_serial.c b/drivers/serial/at91_serial.c
new file mode 100644
index 000000000000..0e206063d685
--- /dev/null
+++ b/drivers/serial/at91_serial.c
@@ -0,0 +1,894 @@
+/*
+ *  linux/drivers/char/at91_serial.c
+ *
+ *  Driver for Atmel AT91RM9200 Serial ports
+ *
+ *  Copyright (C) 2003 Rick Bronson
+ *
+ *  Based on drivers/char/serial_sa1100.c, by Deep Blue Solutions Ltd.
+ *  Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/tty.h>
+#include <linux/ioport.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/serial.h>
+#include <linux/console.h>
+#include <linux/sysrq.h>
+#include <linux/tty_flip.h>
+
+#include <asm/io.h>
+
+#include <asm/arch/at91rm9200_usart.h>
+#include <asm/mach/serial_at91rm9200.h>
+#include <asm/arch/board.h>
+#include <asm/arch/pio.h>
+
+
+#if defined(CONFIG_SERIAL_AT91_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ)
+#define SUPPORT_SYSRQ
+#endif
+
+#include <linux/serial_core.h>
+
+#ifdef CONFIG_SERIAL_AT91_TTYAT
+
+/* Use device name ttyAT, major 204 and minor 154-169.  This is necessary if we
+ * should coexist with the 8250 driver, such as if we have an external 16C550
+ * UART. */
+#define SERIAL_AT91_MAJOR	204
+#define MINOR_START		154
+#define AT91_DEVICENAME		"ttyAT"
+
+#else
+
+/* Use device name ttyS, major 4, minor 64-68.  This is the usual serial port
+ * name, but it is legally reserved for the 8250 driver. */
+#define SERIAL_AT91_MAJOR	TTY_MAJOR
+#define MINOR_START		64
+#define AT91_DEVICENAME		"ttyS"
+
+#endif
+
+#define AT91_VA_BASE_DBGU	((unsigned long) AT91_VA_BASE_SYS + AT91_DBGU)
+#define AT91_ISR_PASS_LIMIT	256
+
+#define UART_PUT_CR(port,v)	writel(v, (port)->membase + AT91_US_CR)
+#define UART_GET_MR(port)	readl((port)->membase + AT91_US_MR)
+#define UART_PUT_MR(port,v)	writel(v, (port)->membase + AT91_US_MR)
+#define UART_PUT_IER(port,v)	writel(v, (port)->membase + AT91_US_IER)
+#define UART_PUT_IDR(port,v)	writel(v, (port)->membase + AT91_US_IDR)
+#define UART_GET_IMR(port)	readl((port)->membase + AT91_US_IMR)
+#define UART_GET_CSR(port)	readl((port)->membase + AT91_US_CSR)
+#define UART_GET_CHAR(port)	readl((port)->membase + AT91_US_RHR)
+#define UART_PUT_CHAR(port,v)	writel(v, (port)->membase + AT91_US_THR)
+#define UART_GET_BRGR(port)	readl((port)->membase + AT91_US_BRGR)
+#define UART_PUT_BRGR(port,v)	writel(v, (port)->membase + AT91_US_BRGR)
+#define UART_PUT_RTOR(port,v)	writel(v, (port)->membase + AT91_US_RTOR)
+
+// #define UART_GET_CR(port)	readl((port)->membase + AT91_US_CR)		// is write-only
+
+ /* PDC registers */
+#define UART_PUT_PTCR(port,v)	writel(v, (port)->membase + AT91_PDC_PTCR)
+#define UART_PUT_RPR(port,v)	writel(v, (port)->membase + AT91_PDC_RPR)
+#define UART_PUT_RCR(port,v)	writel(v, (port)->membase + AT91_PDC_RCR)
+#define UART_GET_RCR(port)	readl((port)->membase + AT91_PDC_RCR)
+#define UART_PUT_RNPR(port,v)	writel(v, (port)->membase + AT91_PDC_RNPR)
+#define UART_PUT_RNCR(port,v)	writel(v, (port)->membase + AT91_PDC_RNCR)
+
+
+static int (*at91_open)(struct uart_port *);
+static void (*at91_close)(struct uart_port *);
+
+#ifdef SUPPORT_SYSRQ
+static struct console at91_console;
+#endif
+
+/*
+ * Return TIOCSER_TEMT when transmitter FIFO and Shift register is empty.
+ */
+static u_int at91_tx_empty(struct uart_port *port)
+{
+	return (UART_GET_CSR(port) & AT91_US_TXEMPTY) ? TIOCSER_TEMT : 0;
+}
+
+/*
+ * Set state of the modem control output lines
+ */
+static void at91_set_mctrl(struct uart_port *port, u_int mctrl)
+{
+	unsigned int control = 0;
+
+	/*
+	 * Errata #39: RTS0 is not internally connected to PA21.  We need to drive
+	 *  the pin manually.
+	 */
+	if (port->mapbase == AT91_VA_BASE_US0) {
+		if (mctrl & TIOCM_RTS)
+			at91_sys_write(AT91_PIOA + PIO_CODR, AT91_PA21_RTS0);
+		else
+			at91_sys_write(AT91_PIOA + PIO_SODR, AT91_PA21_RTS0);
+	}
+
+	if (mctrl & TIOCM_RTS)
+		control |= AT91_US_RTSEN;
+	else
+		control |= AT91_US_RTSDIS;
+
+	if (mctrl & TIOCM_DTR)
+		control |= AT91_US_DTREN;
+	else
+		control |= AT91_US_DTRDIS;
+
+	UART_PUT_CR(port,control);
+}
+
+/*
+ * Get state of the modem control input lines
+ */
+static u_int at91_get_mctrl(struct uart_port *port)
+{
+	unsigned int status, ret = 0;
+
+	status = UART_GET_CSR(port);
+
+	/*
+	 * The control signals are active low.
+	 */
+	if (!(status & AT91_US_DCD))
+		ret |= TIOCM_CD;
+	if (!(status & AT91_US_CTS))
+		ret |= TIOCM_CTS;
+	if (!(status & AT91_US_DSR))
+		ret |= TIOCM_DSR;
+	if (!(status & AT91_US_RI))
+		ret |= TIOCM_RI;
+
+	return ret;
+}
+
+/*
+ * Stop transmitting.
+ */
+static void at91_stop_tx(struct uart_port *port)
+{
+	UART_PUT_IDR(port, AT91_US_TXRDY);
+	port->read_status_mask &= ~AT91_US_TXRDY;
+}
+
+/*
+ * Start transmitting.
+ */
+static void at91_start_tx(struct uart_port *port)
+{
+	port->read_status_mask |= AT91_US_TXRDY;
+	UART_PUT_IER(port, AT91_US_TXRDY);
+}
+
+/*
+ * Stop receiving - port is in process of being closed.
+ */
+static void at91_stop_rx(struct uart_port *port)
+{
+	UART_PUT_IDR(port, AT91_US_RXRDY);
+}
+
+/*
+ * Enable modem status interrupts
+ */
+static void at91_enable_ms(struct uart_port *port)
+{
+	port->read_status_mask |= (AT91_US_RIIC | AT91_US_DSRIC | AT91_US_DCDIC | AT91_US_CTSIC);
+	UART_PUT_IER(port, AT91_US_RIIC | AT91_US_DSRIC | AT91_US_DCDIC | AT91_US_CTSIC);
+}
+
+/*
+ * Control the transmission of a break signal
+ */
+static void at91_break_ctl(struct uart_port *port, int break_state)
+{
+	if (break_state != 0)
+		UART_PUT_CR(port, AT91_US_STTBRK);	/* start break */
+	else
+		UART_PUT_CR(port, AT91_US_STPBRK);	/* stop break */
+}
+
+/*
+ * Characters received (called from interrupt handler)
+ */
+static void at91_rx_chars(struct uart_port *port, struct pt_regs *regs)
+{
+	struct tty_struct *tty = port->info->tty;
+	unsigned int status, ch, flg;
+
+	status = UART_GET_CSR(port) & port->read_status_mask;
+	while (status & (AT91_US_RXRDY)) {
+		ch = UART_GET_CHAR(port);
+
+		if (tty->flip.count >= TTY_FLIPBUF_SIZE)
+			goto ignore_char;
+		port->icount.rx++;
+
+		flg = TTY_NORMAL;
+
+		/*
+		 * note that the error handling code is
+		 * out of the main execution path
+		 */
+		if (unlikely(status & (AT91_US_PARE | AT91_US_FRAME | AT91_US_OVRE))) {
+			UART_PUT_CR(port, AT91_US_RSTSTA);	/* clear error */
+			if (status & (AT91_US_PARE))
+				port->icount.parity++;
+			if (status & (AT91_US_FRAME))
+				port->icount.frame++;
+			if (status & (AT91_US_OVRE))
+				port->icount.overrun++;
+
+			if (status & AT91_US_PARE)
+				flg = TTY_PARITY;
+			else if (status & AT91_US_FRAME)
+				flg = TTY_FRAME;
+			if (status & AT91_US_OVRE) {
+				/*
+				 * overrun does *not* affect the character
+				 * we read from the FIFO
+				 */
+				tty_insert_flip_char(tty, ch, flg);
+				ch = 0;
+				flg = TTY_OVERRUN;
+			}
+#ifdef SUPPORT_SYSRQ
+			port->sysrq = 0;
+#endif
+		}
+
+		if (uart_handle_sysrq_char(port, ch, regs))
+			goto ignore_char;
+
+		tty_insert_flip_char(tty, ch, flg);
+
+	ignore_char:
+		status = UART_GET_CSR(port) & port->read_status_mask;
+	}
+
+	tty_flip_buffer_push(tty);
+}
+
+/*
+ * Transmit characters (called from interrupt handler)
+ */
+static void at91_tx_chars(struct uart_port *port)
+{
+	struct circ_buf *xmit = &port->info->xmit;
+
+	if (port->x_char) {
+		UART_PUT_CHAR(port, port->x_char);
+		port->icount.tx++;
+		port->x_char = 0;
+		return;
+	}
+	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+		at91_stop_tx(port);
+		return;
+	}
+
+	while (UART_GET_CSR(port) & AT91_US_TXRDY) {
+		UART_PUT_CHAR(port, xmit->buf[xmit->tail]);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+		port->icount.tx++;
+		if (uart_circ_empty(xmit))
+			break;
+	}
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(port);
+
+	if (uart_circ_empty(xmit))
+		at91_stop_tx(port);
+}
+
+/*
+ * Interrupt handler
+ */
+static irqreturn_t at91_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+	struct uart_port *port = dev_id;
+	unsigned int status, pending, pass_counter = 0;
+
+	status = UART_GET_CSR(port);
+	pending = status & port->read_status_mask;
+	if (pending) {
+		do {
+			if (pending & AT91_US_RXRDY)
+				at91_rx_chars(port, regs);
+
+			/* Clear the relevent break bits */
+			if (pending & AT91_US_RXBRK) {
+				UART_PUT_CR(port, AT91_US_RSTSTA);
+				port->icount.brk++;
+				uart_handle_break(port);
+			}
+
+			// TODO: All reads to CSR will clear these interrupts!
+			if (pending & AT91_US_RIIC) port->icount.rng++;
+			if (pending & AT91_US_DSRIC) port->icount.dsr++;
+			if (pending & AT91_US_DCDIC)
+				uart_handle_dcd_change(port, !(status & AT91_US_DCD));
+			if (pending & AT91_US_CTSIC)
+				uart_handle_cts_change(port, !(status & AT91_US_CTS));
+			if (pending & (AT91_US_RIIC | AT91_US_DSRIC | AT91_US_DCDIC | AT91_US_CTSIC))
+				wake_up_interruptible(&port->info->delta_msr_wait);
+
+			if (pending & AT91_US_TXRDY)
+				at91_tx_chars(port);
+			if (pass_counter++ > AT91_ISR_PASS_LIMIT)
+				break;
+
+			status = UART_GET_CSR(port);
+			pending = status & port->read_status_mask;
+		} while (pending);
+	}
+	return IRQ_HANDLED;
+}
+
+/*
+ * Perform initialization and enable port for reception
+ */
+static int at91_startup(struct uart_port *port)
+{
+	int retval;
+
+	/*
+	 * Ensure that no interrupts are enabled otherwise when
+	 * request_irq() is called we could get stuck trying to
+	 * handle an unexpected interrupt
+	 */
+	UART_PUT_IDR(port, -1);
+
+	/*
+	 * Allocate the IRQ
+	 */
+	retval = request_irq(port->irq, at91_interrupt, SA_SHIRQ, "at91_serial", port);
+	if (retval) {
+		printk("at91_serial: at91_startup - Can't get irq\n");
+		return retval;
+	}
+
+	/*
+	 * If there is a specific "open" function (to register
+	 * control line interrupts)
+	 */
+	if (at91_open) {
+		retval = at91_open(port);
+		if (retval) {
+			free_irq(port->irq, port);
+			return retval;
+		}
+	}
+
+	port->read_status_mask = AT91_US_RXRDY | AT91_US_TXRDY | AT91_US_OVRE
+			| AT91_US_FRAME | AT91_US_PARE | AT91_US_RXBRK;
+	/*
+	 * Finally, enable the serial port
+	 */
+	UART_PUT_CR(port, AT91_US_RSTSTA | AT91_US_RSTRX);
+	UART_PUT_CR(port, AT91_US_TXEN | AT91_US_RXEN);		/* enable xmit & rcvr */
+	UART_PUT_IER(port, AT91_US_RXRDY);			/* do receive only */
+	return 0;
+}
+
+/*
+ * Disable the port
+ */
+static void at91_shutdown(struct uart_port *port)
+{
+	/*
+	 * Disable all interrupts, port and break condition.
+	 */
+	UART_PUT_CR(port, AT91_US_RSTSTA);
+	UART_PUT_IDR(port, -1);
+
+	/*
+	 * Free the interrupt
+	 */
+	free_irq(port->irq, port);
+
+	/*
+	 * If there is a specific "close" function (to unregister
+	 * control line interrupts)
+	 */
+	if (at91_close)
+		at91_close(port);
+}
+
+/*
+ * Power / Clock management.
+ */
+static void at91_serial_pm(struct uart_port *port, unsigned int state, unsigned int oldstate)
+{
+	switch (state) {
+		case 0:
+			/*
+			 * Enable the peripheral clock for this serial port.
+			 * This is called on uart_open() or a resume event.
+			 */
+			at91_sys_write(AT91_PMC_PCER, 1 << port->irq);
+			break;
+		case 3:
+			/*
+			 * Disable the peripheral clock for this serial port.
+			 * This is called on uart_close() or a suspend event.
+			 */
+			if (port->irq != AT91_ID_SYS)			/* is this a shared clock? */
+				at91_sys_write(AT91_PMC_PCDR, 1 << port->irq);
+			break;
+		default:
+			printk(KERN_ERR "at91_serial: unknown pm %d\n", state);
+	}
+}
+
+/*
+ * Change the port parameters
+ */
+static void at91_set_termios(struct uart_port *port, struct termios * termios, struct termios * old)
+{
+	unsigned long flags;
+	unsigned int mode, imr, quot, baud;
+
+	baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk/16);
+	quot = uart_get_divisor(port, baud);
+
+	/* Get current mode register */
+	mode = UART_GET_MR(port) & ~(AT91_US_CHRL | AT91_US_NBSTOP | AT91_US_PAR);
+
+	/* byte size */
+	switch (termios->c_cflag & CSIZE) {
+	case CS5:
+		mode |= AT91_US_CHRL_5;
+		break;
+	case CS6:
+		mode |= AT91_US_CHRL_6;
+		break;
+	case CS7:
+		mode |= AT91_US_CHRL_7;
+		break;
+	default:
+		mode |= AT91_US_CHRL_8;
+		break;
+	}
+
+	/* stop bits */
+	if (termios->c_cflag & CSTOPB)
+		mode |= AT91_US_NBSTOP_2;
+
+	/* parity */
+	if (termios->c_cflag & PARENB) {
+		if (termios->c_cflag & CMSPAR) {			/* Mark or Space parity */
+			if (termios->c_cflag & PARODD)
+				mode |= AT91_US_PAR_MARK;
+			else
+				mode |= AT91_US_PAR_SPACE;
+		}
+		else if (termios->c_cflag & PARODD)
+			mode |= AT91_US_PAR_ODD;
+		else
+			mode |= AT91_US_PAR_EVEN;
+	}
+	else
+		mode |= AT91_US_PAR_NONE;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	port->read_status_mask |= AT91_US_OVRE;
+	if (termios->c_iflag & INPCK)
+		port->read_status_mask |= AT91_US_FRAME | AT91_US_PARE;
+	if (termios->c_iflag & (BRKINT | PARMRK))
+		port->read_status_mask |= AT91_US_RXBRK;
+
+	/*
+	 * Characters to ignore
+	 */
+	port->ignore_status_mask = 0;
+	if (termios->c_iflag & IGNPAR)
+		port->ignore_status_mask |= (AT91_US_FRAME | AT91_US_PARE);
+	if (termios->c_iflag & IGNBRK) {
+		port->ignore_status_mask |= AT91_US_RXBRK;
+		/*
+		 * If we're ignoring parity and break indicators,
+		 * ignore overruns too (for real raw support).
+		 */
+		if (termios->c_iflag & IGNPAR)
+			port->ignore_status_mask |= AT91_US_OVRE;
+	}
+
+	// TODO: Ignore all characters if CREAD is set.
+
+	/* update the per-port timeout */
+	uart_update_timeout(port, termios->c_cflag, baud);
+
+	/* disable interrupts and drain transmitter */
+	imr = UART_GET_IMR(port);	/* get interrupt mask */
+	UART_PUT_IDR(port, -1);		/* disable all interrupts */
+	while (!(UART_GET_CSR(port) & AT91_US_TXEMPTY)) { barrier(); }
+
+	/* disable receiver and transmitter */
+	UART_PUT_CR(port, AT91_US_TXDIS | AT91_US_RXDIS);
+
+	/* set the parity, stop bits and data size */
+	UART_PUT_MR(port, mode);
+
+	/* set the baud rate */
+	UART_PUT_BRGR(port, quot);
+	UART_PUT_CR(port, AT91_US_RSTSTA | AT91_US_RSTRX);
+	UART_PUT_CR(port, AT91_US_TXEN | AT91_US_RXEN);
+
+	/* restore interrupts */
+	UART_PUT_IER(port, imr);
+
+	/* CTS flow-control and modem-status interrupts */
+	if (UART_ENABLE_MS(port, termios->c_cflag))
+		port->ops->enable_ms(port);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+/*
+ * Return string describing the specified port
+ */
+static const char *at91_type(struct uart_port *port)
+{
+	return (port->type == PORT_AT91RM9200) ? "AT91_SERIAL" : NULL;
+}
+
+/*
+ * Release the memory region(s) being used by 'port'.
+ */
+static void at91_release_port(struct uart_port *port)
+{
+	release_mem_region(port->mapbase,
+		(port->mapbase == AT91_VA_BASE_DBGU) ? 512 : SZ_16K);
+}
+
+/*
+ * Request the memory region(s) being used by 'port'.
+ */
+static int at91_request_port(struct uart_port *port)
+{
+	return request_mem_region(port->mapbase,
+		(port->mapbase == AT91_VA_BASE_DBGU) ? 512 : SZ_16K,
+		"at91_serial") != NULL ? 0 : -EBUSY;
+
+}
+
+/*
+ * Configure/autoconfigure the port.
+ */
+static void at91_config_port(struct uart_port *port, int flags)
+{
+	if (flags & UART_CONFIG_TYPE) {
+		port->type = PORT_AT91RM9200;
+		at91_request_port(port);
+	}
+}
+
+/*
+ * Verify the new serial_struct (for TIOCSSERIAL).
+ */
+static int at91_verify_port(struct uart_port *port, struct serial_struct *ser)
+{
+	int ret = 0;
+	if (ser->type != PORT_UNKNOWN && ser->type != PORT_AT91RM9200)
+		ret = -EINVAL;
+	if (port->irq != ser->irq)
+		ret = -EINVAL;
+	if (ser->io_type != SERIAL_IO_MEM)
+		ret = -EINVAL;
+	if (port->uartclk / 16 != ser->baud_base)
+		ret = -EINVAL;
+	if ((void *)port->mapbase != ser->iomem_base)
+		ret = -EINVAL;
+	if (port->iobase != ser->port)
+		ret = -EINVAL;
+	if (ser->hub6 != 0)
+		ret = -EINVAL;
+	return ret;
+}
+
+static struct uart_ops at91_pops = {
+	.tx_empty	= at91_tx_empty,
+	.set_mctrl	= at91_set_mctrl,
+	.get_mctrl	= at91_get_mctrl,
+	.stop_tx	= at91_stop_tx,
+	.start_tx	= at91_start_tx,
+	.stop_rx	= at91_stop_rx,
+	.enable_ms	= at91_enable_ms,
+	.break_ctl	= at91_break_ctl,
+	.startup	= at91_startup,
+	.shutdown	= at91_shutdown,
+	.set_termios	= at91_set_termios,
+	.type		= at91_type,
+	.release_port	= at91_release_port,
+	.request_port	= at91_request_port,
+	.config_port 	= at91_config_port,
+	.verify_port 	= at91_verify_port,
+	.pm		= at91_serial_pm,
+};
+
+static struct uart_port at91_ports[AT91_NR_UART];
+
+void __init at91_init_ports(void)
+{
+	static int first = 1;
+	int i;
+
+	if (!first)
+		return;
+	first = 0;
+
+	for (i = 0; i < AT91_NR_UART; i++) {
+		at91_ports[i].iotype	= UPIO_MEM;
+		at91_ports[i].flags     = UPF_BOOT_AUTOCONF;
+		at91_ports[i].uartclk   = at91_master_clock;
+		at91_ports[i].ops	= &at91_pops;
+		at91_ports[i].fifosize  = 1;
+		at91_ports[i].line	= i;
+ 	}
+}
+
+void __init at91_register_uart_fns(struct at91rm9200_port_fns *fns)
+{
+	if (fns->enable_ms)
+		at91_pops.enable_ms = fns->enable_ms;
+	if (fns->get_mctrl)
+		at91_pops.get_mctrl = fns->get_mctrl;
+	if (fns->set_mctrl)
+		at91_pops.set_mctrl = fns->set_mctrl;
+	at91_open          = fns->open;
+	at91_close         = fns->close;
+	at91_pops.pm       = fns->pm;
+	at91_pops.set_wake = fns->set_wake;
+}
+
+/*
+ * Setup ports.
+ */
+void __init at91_register_uart(int idx, int port)
+{
+	if ((idx < 0) || (idx >= AT91_NR_UART)) {
+		printk(KERN_ERR "%s: bad index number %d\n", __FUNCTION__, idx);
+		return;
+	}
+
+	switch (port) {
+	case 0:
+		at91_ports[idx].membase = (void __iomem *) AT91_VA_BASE_US0;
+		at91_ports[idx].mapbase = AT91_VA_BASE_US0;
+		at91_ports[idx].irq     = AT91_ID_US0;
+		AT91_CfgPIO_USART0();
+		break;
+	case 1:
+		at91_ports[idx].membase = (void __iomem *) AT91_VA_BASE_US1;
+		at91_ports[idx].mapbase = AT91_VA_BASE_US1;
+		at91_ports[idx].irq     = AT91_ID_US1;
+		AT91_CfgPIO_USART1();
+		break;
+	case 2:
+		at91_ports[idx].membase = (void __iomem *) AT91_VA_BASE_US2;
+		at91_ports[idx].mapbase = AT91_VA_BASE_US2;
+		at91_ports[idx].irq     = AT91_ID_US2;
+		AT91_CfgPIO_USART2();
+		break;
+	case 3:
+		at91_ports[idx].membase = (void __iomem *) AT91_VA_BASE_US3;
+		at91_ports[idx].mapbase = AT91_VA_BASE_US3;
+		at91_ports[idx].irq     = AT91_ID_US3;
+		AT91_CfgPIO_USART3();
+		break;
+	case 4:
+		at91_ports[idx].membase = (void __iomem *) AT91_VA_BASE_DBGU;
+		at91_ports[idx].mapbase = AT91_VA_BASE_DBGU;
+		at91_ports[idx].irq     = AT91_ID_SYS;
+		AT91_CfgPIO_DBGU();
+		break;
+	default:
+		printk(KERN_ERR  "%s : bad port number %d\n", __FUNCTION__, port);
+	}
+}
+
+#ifdef CONFIG_SERIAL_AT91_CONSOLE
+
+/*
+ * Interrupts are disabled on entering
+ */
+static void at91_console_write(struct console *co, const char *s, u_int count)
+{
+	struct uart_port *port = at91_ports + co->index;
+	unsigned int status, i, imr;
+
+	/*
+	 *	First, save IMR and then disable interrupts
+	 */
+	imr = UART_GET_IMR(port);	/* get interrupt mask */
+	UART_PUT_IDR(port, AT91_US_RXRDY | AT91_US_TXRDY);
+
+	/*
+	 *	Now, do each character
+	 */
+	for (i = 0; i < count; i++) {
+		do {
+			status = UART_GET_CSR(port);
+		} while (!(status & AT91_US_TXRDY));
+		UART_PUT_CHAR(port, s[i]);
+		if (s[i] == '\n') {
+			do {
+				status = UART_GET_CSR(port);
+			} while (!(status & AT91_US_TXRDY));
+			UART_PUT_CHAR(port, '\r');
+		}
+	}
+
+	/*
+	 *	Finally, wait for transmitter to become empty
+	 *	and restore IMR
+	 */
+	do {
+		status = UART_GET_CSR(port);
+	} while (!(status & AT91_US_TXRDY));
+	UART_PUT_IER(port, imr);	/* set interrupts back the way they were */
+}
+
+/*
+ * If the port was already initialised (eg, by a boot loader), try to determine
+ * the current setup.
+ */
+static void __init at91_console_get_options(struct uart_port *port, int *baud, int *parity, int *bits)
+{
+	unsigned int mr, quot;
+
+// TODO: CR is a write-only register
+//	unsigned int cr;
+//
+//	cr = UART_GET_CR(port) & (AT91_US_RXEN | AT91_US_TXEN);
+//	if (cr == (AT91_US_RXEN | AT91_US_TXEN)) {
+//		/* ok, the port was enabled */
+//	}
+
+	mr = UART_GET_MR(port) & AT91_US_CHRL;
+	if (mr == AT91_US_CHRL_8)
+		*bits = 8;
+	else
+		*bits = 7;
+
+	mr = UART_GET_MR(port) & AT91_US_PAR;
+	if (mr == AT91_US_PAR_EVEN)
+		*parity = 'e';
+	else if (mr == AT91_US_PAR_ODD)
+		*parity = 'o';
+
+	quot = UART_GET_BRGR(port);
+	*baud = port->uartclk / (16 * (quot));
+}
+
+static int __init at91_console_setup(struct console *co, char *options)
+{
+	struct uart_port *port;
+	int baud = 115200;
+	int bits = 8;
+	int parity = 'n';
+	int flow = 'n';
+
+	/*
+	 * Check whether an invalid uart number has been specified, and
+	 * if so, search for the first available port that does have
+	 * console support.
+	 */
+	port = uart_get_console(at91_ports, AT91_NR_UART, co);
+
+	/*
+	 * Enable the serial console, in-case bootloader did not do it.
+	 */
+	at91_sys_write(AT91_PMC_PCER, 1 << port->irq);	/* enable clock */
+	UART_PUT_IDR(port, -1);				/* disable interrupts */
+	UART_PUT_CR(port, AT91_US_RSTSTA | AT91_US_RSTRX);
+	UART_PUT_CR(port, AT91_US_TXEN | AT91_US_RXEN);
+
+	if (options)
+		uart_parse_options(options, &baud, &parity, &bits, &flow);
+	else
+		at91_console_get_options(port, &baud, &parity, &bits);
+
+	return uart_set_options(port, co, baud, parity, bits, flow);
+}
+
+static struct uart_driver at91_uart;
+
+static struct console at91_console = {
+	.name		= AT91_DEVICENAME,
+	.write		= at91_console_write,
+	.device		= uart_console_device,
+	.setup		= at91_console_setup,
+	.flags		= CON_PRINTBUFFER,
+	.index		= -1,
+	.data		= &at91_uart,
+};
+
+#define AT91_CONSOLE_DEVICE	&at91_console
+
+static int  __init at91_console_init(void)
+{
+	at91_init_ports();
+
+	at91_console.index = at91_console_port;
+	register_console(&at91_console);
+	return 0;
+}
+console_initcall(at91_console_init);
+
+#else
+#define AT91_CONSOLE_DEVICE	NULL
+#endif
+
+static struct uart_driver at91_uart = {
+	.owner			= THIS_MODULE,
+	.driver_name		= AT91_DEVICENAME,
+	.dev_name		= AT91_DEVICENAME,
+	.devfs_name		= AT91_DEVICENAME,
+	.major			= SERIAL_AT91_MAJOR,
+	.minor			= MINOR_START,
+	.nr			= AT91_NR_UART,
+	.cons			= AT91_CONSOLE_DEVICE,
+};
+
+static int __init at91_serial_init(void)
+{
+	int ret, i;
+
+	at91_init_ports();
+
+	ret = uart_register_driver(&at91_uart);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < AT91_NR_UART; i++) {
+		if (at91_serial_map[i] >= 0)
+			uart_add_one_port(&at91_uart, &at91_ports[i]);
+	}
+
+	return 0;
+}
+
+static void __exit at91_serial_exit(void)
+{
+	int i;
+
+	for (i = 0; i < AT91_NR_UART; i++) {
+ 		if (at91_serial_map[i] >= 0)
+			uart_remove_one_port(&at91_uart, &at91_ports[i]);
+  	}
+
+	uart_unregister_driver(&at91_uart);
+}
+
+module_init(at91_serial_init);
+module_exit(at91_serial_exit);
+
+MODULE_AUTHOR("Rick Bronson");
+MODULE_DESCRIPTION("AT91 generic serial port driver");
+MODULE_LICENSE("GPL");
diff --git a/include/asm-arm/arch-at91rm9200/at91rm9200_pdc.h b/include/asm-arm/arch-at91rm9200/at91rm9200_pdc.h
new file mode 100644
index 000000000000..ce1150d4438d
--- /dev/null
+++ b/include/asm-arm/arch-at91rm9200/at91rm9200_pdc.h
@@ -0,0 +1,36 @@
+/*
+ * include/asm-arm/arch-at91rm9200/at91rm9200_pdc.h
+ *
+ * Copyright (C) 2005 Ivan Kokshaysky
+ * Copyright (C) SAN People
+ *
+ * Peripheral Data Controller (PDC) registers.
+ * Based on AT91RM9200 datasheet revision E.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef AT91RM9200_PDC_H
+#define AT91RM9200_PDC_H
+
+#define AT91_PDC_RPR		0x100	/* Receive Pointer Register */
+#define AT91_PDC_RCR		0x104	/* Receive Counter Register */
+#define AT91_PDC_TPR		0x108	/* Transmit Pointer Register */
+#define AT91_PDC_TCR		0x10c	/* Transmit Counter Register */
+#define AT91_PDC_RNPR		0x110	/* Receive Next Pointer Register */
+#define AT91_PDC_RNCR		0x114	/* Receive Next Counter Register */
+#define AT91_PDC_TNPR		0x118	/* Transmit Next Pointer Register */
+#define AT91_PDC_TNCR		0x11c	/* Transmit Next Counter Register */
+
+#define AT91_PDC_PTCR		0x120	/* Transfer Control Register */
+#define		AT91_PDC_RXTEN		(1 << 0)	/* Receiver Transfer Enable */
+#define		AT91_PDC_RXTDIS		(1 << 1)	/* Receiver Transfer Disable */
+#define		AT91_PDC_TXTEN		(1 << 8)	/* Transmitter Transfer Enable */
+#define		AT91_PDC_TXTDIS		(1 << 9)	/* Transmitter Transfer Disable */
+
+#define AT91_PDC_PTSR		0x124	/* Transfer Status Register */
+
+#endif
diff --git a/include/asm-arm/arch-at91rm9200/at91rm9200_usart.h b/include/asm-arm/arch-at91rm9200/at91rm9200_usart.h
new file mode 100644
index 000000000000..79f851e31b9c
--- /dev/null
+++ b/include/asm-arm/arch-at91rm9200/at91rm9200_usart.h
@@ -0,0 +1,123 @@
+/*
+ * include/asm-arm/arch-at91rm9200/at91rm9200_usart.h
+ *
+ * Copyright (C) 2005 Ivan Kokshaysky
+ * Copyright (C) SAN People
+ *
+ * USART registers.
+ * Based on AT91RM9200 datasheet revision E.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef AT91RM9200_USART_H
+#define AT91RM9200_USART_H
+
+#define AT91_US_CR		0x00			/* Control Register */
+#define		AT91_US_RSTRX		(1 <<  2)		/* Reset Receiver */
+#define		AT91_US_RSTTX		(1 <<  3)		/* Reset Transmitter */
+#define		AT91_US_RXEN		(1 <<  4)		/* Receiver Enable */
+#define		AT91_US_RXDIS		(1 <<  5)		/* Receiver Disable */
+#define		AT91_US_TXEN		(1 <<  6)		/* Transmitter Enable */
+#define		AT91_US_TXDIS		(1 <<  7)		/* Transmitter Disable */
+#define		AT91_US_RSTSTA		(1 <<  8)		/* Reset Status Bits */
+#define		AT91_US_STTBRK		(1 <<  9)		/* Start Break */
+#define		AT91_US_STPBRK		(1 << 10)		/* Stop Break */
+#define		AT91_US_STTTO		(1 << 11)		/* Start Time-out */
+#define		AT91_US_SENDA		(1 << 12)		/* Send Address */
+#define		AT91_US_RSTIT		(1 << 13)		/* Reset Iterations */
+#define		AT91_US_RSTNACK		(1 << 14)		/* Reset Non Acknowledge */
+#define		AT91_US_RETTO		(1 << 15)		/* Rearm Time-out */
+#define		AT91_US_DTREN		(1 << 16)		/* Data Terminal Ready Enable */
+#define		AT91_US_DTRDIS		(1 << 17)		/* Data Terminal Ready Disable */
+#define		AT91_US_RTSEN		(1 << 18)		/* Request To Send Enable */
+#define		AT91_US_RTSDIS		(1 << 19)		/* Request To Send Disable */
+
+#define AT91_US_MR		0x04			/* Mode Register */
+#define		AT91_US_USMODE		(0xf <<  0)		/* Mode of the USART */
+#define			AT91_US_USMODE_NORMAL		0
+#define			AT91_US_USMODE_RS485		1
+#define			AT91_US_USMODE_HWHS		2
+#define			AT91_US_USMODE_MODEM		3
+#define			AT91_US_USMODE_ISO7816_T0	4
+#define			AT91_US_USMODE_ISO7816_T1	6
+#define			AT91_US_USMODE_IRDA		8
+#define		AT91_US_USCLKS		(3   <<  4)		/* Clock Selection */
+#define		AT91_US_CHRL		(3   <<  6)		/* Character Length */
+#define			AT91_US_CHRL_5			(0 <<  6)
+#define			AT91_US_CHRL_6			(1 <<  6)
+#define			AT91_US_CHRL_7			(2 <<  6)
+#define			AT91_US_CHRL_8			(3 <<  6)
+#define		AT91_US_SYNC		(1 <<  8)		/* Synchronous Mode Select */
+#define		AT91_US_PAR		(7 <<  9)		/* Parity Type */
+#define			AT91_US_PAR_EVEN		(0 <<  9)
+#define			AT91_US_PAR_ODD			(1 <<  9)
+#define			AT91_US_PAR_SPACE		(2 <<  9)
+#define			AT91_US_PAR_MARK		(3 <<  9)
+#define			AT91_US_PAR_NONE		(4 <<  9)
+#define			AT91_US_PAR_MULTI_DROP		(6 <<  9)
+#define		AT91_US_NBSTOP		(3 << 12)		/* Number of Stop Bits */
+#define			AT91_US_NBSTOP_1		(0 << 12)
+#define			AT91_US_NBSTOP_1_5		(1 << 12)
+#define			AT91_US_NBSTOP_2		(2 << 12)
+#define		AT91_US_CHMODE		(3 << 14)		/* Channel Mode */
+#define			AT91_US_CHMODE_NORMAL		(0 << 14)
+#define			AT91_US_CHMODE_ECHO		(1 << 14)
+#define			AT91_US_CHMODE_LOC_LOOP		(2 << 14)
+#define			AT91_US_CHMODE_REM_LOOP		(3 << 14)
+#define		AT91_US_MSBF		(1 << 16)		/* Bit Order */
+#define		AT91_US_MODE9		(1 << 17)		/* 9-bit Character Length */
+#define		AT91_US_CLKO		(1 << 18)		/* Clock Output Select */
+#define		AT91_US_OVER		(1 << 19)		/* Oversampling Mode */
+#define		AT91_US_INACK		(1 << 20)		/* Inhibit Non Acknowledge */
+#define		AT91_US_DSNACK		(1 << 21)		/* Disable Successive NACK */
+#define		AT91_US_MAX_ITER	(7 << 24)		/* Max Iterations */
+#define		AT91_US_FILTER		(1 << 28)		/* Infrared Receive Line Filter */
+
+#define AT91_US_IER		0x08			/* Interrupt Enable Register */
+#define		AT91_US_RXRDY		(1 <<  0)		/* Receiver Ready */
+#define		AT91_US_TXRDY		(1 <<  1)		/* Transmitter Ready */
+#define		AT91_US_RXBRK		(1 <<  2)		/* Break Received / End of Break */
+#define		AT91_US_ENDRX		(1 <<  3)		/* End of Receiver Transfer */
+#define		AT91_US_ENDTX		(1 <<  4)		/* End of Transmitter Transfer */
+#define		AT91_US_OVRE		(1 <<  5)		/* Overrun Error */
+#define		AT91_US_FRAME		(1 <<  6)		/* Framing Error */
+#define		AT91_US_PARE		(1 <<  7)		/* Parity Error */
+#define		AT91_US_TIMEOUT		(1 <<  8)		/* Receiver Time-out */
+#define		AT91_US_TXEMPTY		(1 <<  9)		/* Transmitter Empty */
+#define		AT91_US_ITERATION	(1 << 10)		/* Max number of Repetitions Reached */
+#define		AT91_US_TXBUFE		(1 << 11)		/* Transmission Buffer Empty */
+#define		AT91_US_RXBUFF		(1 << 12)		/* Reception Buffer Full */
+#define		AT91_US_NACK		(1 << 13)		/* Non Acknowledge */
+#define		AT91_US_RIIC		(1 << 16)		/* Ring Indicator Input Change */
+#define		AT91_US_DSRIC		(1 << 17)		/* Data Set Ready Input Change */
+#define		AT91_US_DCDIC		(1 << 18)		/* Data Carrier Detect Input Change */
+#define		AT91_US_CTSIC		(1 << 19)		/* Clear to Send Input Change */
+#define		AT91_US_RI		(1 << 20)		/* RI */
+#define		AT91_US_DSR		(1 << 21)		/* DSR */
+#define		AT91_US_DCD		(1 << 22)		/* DCD */
+#define		AT91_US_CTS		(1 << 23)		/* CTS */
+
+#define AT91_US_IDR		0x0c			/* Interrupt Disable Register */
+#define AT91_US_IMR		0x10			/* Interrupt Mask Register */
+#define AT91_US_CSR		0x14			/* Channel Status Register */
+#define AT91_US_RHR		0x18			/* Receiver Holding Register */
+#define AT91_US_THR		0x1c			/* Transmitter Holding Register */
+
+#define AT91_US_BRGR		0x20			/* Baud Rate Generator Register */
+#define		AT91_US_CD		(0xffff << 0)		/* Clock Divider */
+
+#define AT91_US_RTOR		0x24			/* Receiver Time-out Register */
+#define		AT91_US_TO		(0xffff << 0)		/* Time-out Value */
+
+#define AT91_US_TTGR		0x28			/* Transmitter Timeguard Register */
+#define		AT91_US_TG		(0xff << 0)		/* Timeguard Value */
+
+#define AT91_US_FIDI		0x40			/* FI DI Ratio Register */
+#define AT91_US_NER		0x44			/* Number of Errors Register */
+#define AT91_US_IF		0x4c			/* IrDA Filter Register */
+
+#endif
diff --git a/include/asm-arm/mach/serial_at91rm9200.h b/include/asm-arm/mach/serial_at91rm9200.h
new file mode 100644
index 000000000000..98f4b0cb883c
--- /dev/null
+++ b/include/asm-arm/mach/serial_at91rm9200.h
@@ -0,0 +1,36 @@
+/*
+ *  linux/include/asm-arm/mach/serial_at91rm9200.h
+ *
+ *  Based on serial_sa1100.h  by Nicolas Pitre
+ *
+ *  Copyright (C) 2002 ATMEL Rousset
+ *
+ *  Low level machine dependent UART functions.
+ */
+#include <linux/config.h>
+
+struct uart_port;
+
+/*
+ * This is a temporary structure for registering these
+ * functions; it is intended to be discarded after boot.
+ */
+struct at91rm9200_port_fns {
+	void	(*set_mctrl)(struct uart_port *, u_int);
+	u_int	(*get_mctrl)(struct uart_port *);
+	void	(*enable_ms)(struct uart_port *);
+	void	(*pm)(struct uart_port *, u_int, u_int);
+	int	(*set_wake)(struct uart_port *, u_int);
+	int	(*open)(struct uart_port *);
+	void	(*close)(struct uart_port *);
+};
+
+#if defined(CONFIG_SERIAL_AT91)
+void at91_register_uart_fns(struct at91rm9200_port_fns *fns);
+void at91_register_uart(int idx, int port);
+#else
+#define at91_register_uart_fns(fns) do { } while (0)
+#define at91_register_uart(idx,port) do { } while (0)
+#endif
+
+
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index e3710d7e260a..a8187c3c8a7b 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -67,6 +67,9 @@
 /* Parisc type numbers. */
 #define PORT_MUX	48
 
+/* Atmel AT91RM9200 SoC */
+#define PORT_AT91RM9200 49
+
 /* Macintosh Zilog type numbers */
 #define PORT_MAC_ZILOG	50	/* m68k : not yet implemented */
 #define PORT_PMAC_ZILOG	51
-- 
cgit v1.2.3-71-gd317


From bb94aa169eaa6e713a429370d37388722f08666f Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 9 Jan 2006 16:43:13 -0800
Subject: [NETFILTER]: net/ipv[46]/netfilter.c cleanups

Don't wrap entire file in #ifdef CONFIG_NETFILTER, remove a few
unneccessary includes.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv6.h |  5 +++++
 net/ipv4/Makefile              |  4 ++--
 net/ipv4/netfilter.c           | 10 ----------
 net/ipv6/Makefile              |  5 ++---
 net/ipv6/netfilter.c           | 19 ++-----------------
 5 files changed, 11 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 53b2983f6278..14f2bd010884 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -72,7 +72,12 @@ enum nf_ip6_hook_priorities {
 	NF_IP6_PRI_LAST = INT_MAX,
 };
 
+#ifdef CONFIG_NETFILTER
 extern int ipv6_netfilter_init(void);
 extern void ipv6_netfilter_fini(void);
+#else /* CONFIG_NETFILTER */
+static inline int ipv6_netfilter_init(void) { return 0; }
+static inline void ipv6_netfilter_fini(void) { return; }
+#endif /* CONFIG_NETFILTER */
 
 #endif /*__LINUX_IP6_NETFILTER_H*/
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index c54edd76de09..35e5f5999092 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -9,7 +9,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
 	     tcp_minisocks.o tcp_cong.o \
 	     datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
-	     sysctl_net_ipv4.o fib_frontend.o fib_semantics.o netfilter.o
+	     sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
 
 obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
 obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
@@ -28,7 +28,7 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o
-obj-$(CONFIG_NETFILTER)	+= netfilter/
+obj-$(CONFIG_NETFILTER)	+= netfilter.o netfilter/
 obj-$(CONFIG_IP_VS) += ipvs/
 obj-$(CONFIG_INET_DIAG) += inet_diag.o 
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 3321092b0914..52a3d7c57907 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -1,16 +1,8 @@
 /* IPv4 specific functions of netfilter core */
-
-#include <linux/config.h>
-#ifdef CONFIG_NETFILTER
-
 #include <linux/kernel.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
-
 #include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/udp.h>
-#include <linux/icmp.h>
 #include <net/route.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
@@ -146,5 +138,3 @@ static void fini(void)
 
 module_init(init);
 module_exit(fini);
-
-#endif /* CONFIG_NETFILTER */
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 9601fd7f9d66..bf18cff13120 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -8,8 +8,7 @@ ipv6-objs :=	af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \
 		route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \
 		protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \
 		exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \
-		ip6_flowlabel.o ipv6_syms.o netfilter.o \
-		inet6_connection_sock.o
+		ip6_flowlabel.o ipv6_syms.o inet6_connection_sock.o
 
 ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
 	xfrm6_output.o
@@ -19,7 +18,7 @@ obj-$(CONFIG_INET6_AH) += ah6.o
 obj-$(CONFIG_INET6_ESP) += esp6.o
 obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
 obj-$(CONFIG_INET6_TUNNEL) += xfrm6_tunnel.o 
-obj-$(CONFIG_NETFILTER)	+= netfilter/
+obj-$(CONFIG_NETFILTER)	+= netfilter.o netfilter/
 
 obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
 
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index b63678328a3b..1ab62f033664 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -1,9 +1,5 @@
-#include <linux/config.h>
-#include <linux/init.h>
-
-#ifdef CONFIG_NETFILTER
-
 #include <linux/kernel.h>
+#include <linux/init.h>
 #include <linux/ipv6.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
@@ -94,18 +90,7 @@ int __init ipv6_netfilter_init(void)
 	return nf_register_queue_rerouter(PF_INET6, &ip6_reroute);
 }
 
-void ipv6_netfilter_fini(void)
+void __exit ipv6_netfilter_fini(void)
 {
 	nf_unregister_queue_rerouter(PF_INET6);
 }
-
-#else /* CONFIG_NETFILTER */
-int __init ipv6_netfilter_init(void)
-{
-	return 0;
-}
-
-void ipv6_netfilter_fini(void)
-{
-}
-#endif /* CONFIG_NETFILTER */
-- 
cgit v1.2.3-71-gd317


From 9d28026b7ec0f3e2a407d5c03fcb37d0b59d1add Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 9 Jan 2006 16:44:36 -0800
Subject: [NETFILTER]: Remove unused function from NAT protocol helpers

->print and ->print_range are not used (and apparently never were).

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_nat_protocol.h |  7 -----
 net/ipv4/netfilter/ip_nat_proto_gre.c          | 38 --------------------------
 net/ipv4/netfilter/ip_nat_proto_icmp.c         | 34 -----------------------
 net/ipv4/netfilter/ip_nat_proto_tcp.c          | 36 ------------------------
 net/ipv4/netfilter/ip_nat_proto_udp.c          | 36 ------------------------
 net/ipv4/netfilter/ip_nat_proto_unknown.c      | 16 -----------
 6 files changed, 167 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_nat_protocol.h b/include/linux/netfilter_ipv4/ip_nat_protocol.h
index ef63aa991a06..612a43614e7b 100644
--- a/include/linux/netfilter_ipv4/ip_nat_protocol.h
+++ b/include/linux/netfilter_ipv4/ip_nat_protocol.h
@@ -42,13 +42,6 @@ struct ip_nat_protocol
 			    enum ip_nat_manip_type maniptype,
 			    const struct ip_conntrack *conntrack);
 
-	unsigned int (*print)(char *buffer,
-			      const struct ip_conntrack_tuple *match,
-			      const struct ip_conntrack_tuple *mask);
-
-	unsigned int (*print_range)(char *buffer,
-				    const struct ip_nat_range *range);
-
 	int (*range_to_nfattr)(struct sk_buff *skb,
 			       const struct ip_nat_range *range);
 
diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c
index f7cad7cf1aec..6c4899d8046a 100644
--- a/net/ipv4/netfilter/ip_nat_proto_gre.c
+++ b/net/ipv4/netfilter/ip_nat_proto_gre.c
@@ -151,42 +151,6 @@ gre_manip_pkt(struct sk_buff **pskb,
 	return 1;
 }
 
-/* print out a nat tuple */
-static unsigned int 
-gre_print(char *buffer, 
-	  const struct ip_conntrack_tuple *match,
-	  const struct ip_conntrack_tuple *mask)
-{
-	unsigned int len = 0;
-
-	if (mask->src.u.gre.key)
-		len += sprintf(buffer + len, "srckey=0x%x ", 
-				ntohl(match->src.u.gre.key));
-
-	if (mask->dst.u.gre.key)
-		len += sprintf(buffer + len, "dstkey=0x%x ",
-				ntohl(match->src.u.gre.key));
-
-	return len;
-}
-
-/* print a range of keys */
-static unsigned int 
-gre_print_range(char *buffer, const struct ip_nat_range *range)
-{
-	if (range->min.gre.key != 0 
-	    || range->max.gre.key != 0xFFFF) {
-		if (range->min.gre.key == range->max.gre.key)
-			return sprintf(buffer, "key 0x%x ",
-					ntohl(range->min.gre.key));
-		else
-			return sprintf(buffer, "keys 0x%u-0x%u ",
-					ntohl(range->min.gre.key),
-					ntohl(range->max.gre.key));
-	} else
-		return 0;
-}
-
 /* nat helper struct */
 static struct ip_nat_protocol gre = { 
 	.name		= "GRE", 
@@ -194,8 +158,6 @@ static struct ip_nat_protocol gre = {
 	.manip_pkt	= gre_manip_pkt,
 	.in_range	= gre_in_range,
 	.unique_tuple	= gre_unique_tuple,
-	.print		= gre_print,
-	.print_range	= gre_print_range,
 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
 	.range_to_nfattr	= ip_nat_port_range_to_nfattr,
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
index 938719043999..31a3f4ccb99c 100644
--- a/net/ipv4/netfilter/ip_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -74,38 +74,6 @@ icmp_manip_pkt(struct sk_buff **pskb,
 	return 1;
 }
 
-static unsigned int
-icmp_print(char *buffer,
-	   const struct ip_conntrack_tuple *match,
-	   const struct ip_conntrack_tuple *mask)
-{
-	unsigned int len = 0;
-
-	if (mask->src.u.icmp.id)
-		len += sprintf(buffer + len, "id=%u ",
-			       ntohs(match->src.u.icmp.id));
-
-	if (mask->dst.u.icmp.type)
-		len += sprintf(buffer + len, "type=%u ",
-			       ntohs(match->dst.u.icmp.type));
-
-	if (mask->dst.u.icmp.code)
-		len += sprintf(buffer + len, "code=%u ",
-			       ntohs(match->dst.u.icmp.code));
-
-	return len;
-}
-
-static unsigned int
-icmp_print_range(char *buffer, const struct ip_nat_range *range)
-{
-	if (range->min.icmp.id != 0 || range->max.icmp.id != 0xFFFF)
-		return sprintf(buffer, "id %u-%u ",
-			       ntohs(range->min.icmp.id),
-			       ntohs(range->max.icmp.id));
-	else return 0;
-}
-
 struct ip_nat_protocol ip_nat_protocol_icmp = {
 	.name			= "ICMP",
 	.protonum		= IPPROTO_ICMP,
@@ -113,8 +81,6 @@ struct ip_nat_protocol ip_nat_protocol_icmp = {
 	.manip_pkt		= icmp_manip_pkt,
 	.in_range		= icmp_in_range,
 	.unique_tuple		= icmp_unique_tuple,
-	.print			= icmp_print,
-	.print_range		= icmp_print_range,
 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
 	.range_to_nfattr	= ip_nat_port_range_to_nfattr,
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
index 1d381bf68574..a3d14079eba6 100644
--- a/net/ipv4/netfilter/ip_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -136,40 +136,6 @@ tcp_manip_pkt(struct sk_buff **pskb,
 	return 1;
 }
 
-static unsigned int
-tcp_print(char *buffer,
-	  const struct ip_conntrack_tuple *match,
-	  const struct ip_conntrack_tuple *mask)
-{
-	unsigned int len = 0;
-
-	if (mask->src.u.tcp.port)
-		len += sprintf(buffer + len, "srcpt=%u ",
-			       ntohs(match->src.u.tcp.port));
-
-
-	if (mask->dst.u.tcp.port)
-		len += sprintf(buffer + len, "dstpt=%u ",
-			       ntohs(match->dst.u.tcp.port));
-
-	return len;
-}
-
-static unsigned int
-tcp_print_range(char *buffer, const struct ip_nat_range *range)
-{
-	if (range->min.tcp.port != 0 || range->max.tcp.port != 0xFFFF) {
-		if (range->min.tcp.port == range->max.tcp.port)
-			return sprintf(buffer, "port %u ",
-				       ntohs(range->min.tcp.port));
-		else
-			return sprintf(buffer, "ports %u-%u ",
-				       ntohs(range->min.tcp.port),
-				       ntohs(range->max.tcp.port));
-	}
-	else return 0;
-}
-
 struct ip_nat_protocol ip_nat_protocol_tcp = {
 	.name			= "TCP",
 	.protonum		= IPPROTO_TCP,
@@ -177,8 +143,6 @@ struct ip_nat_protocol ip_nat_protocol_tcp = {
 	.manip_pkt		= tcp_manip_pkt,
 	.in_range		= tcp_in_range,
 	.unique_tuple		= tcp_unique_tuple,
-	.print			= tcp_print,
-	.print_range		= tcp_print_range,
 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
 	.range_to_nfattr	= ip_nat_port_range_to_nfattr,
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
index c4906e1aa24a..ec6053fdc867 100644
--- a/net/ipv4/netfilter/ip_nat_proto_udp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -122,40 +122,6 @@ udp_manip_pkt(struct sk_buff **pskb,
 	return 1;
 }
 
-static unsigned int
-udp_print(char *buffer,
-	  const struct ip_conntrack_tuple *match,
-	  const struct ip_conntrack_tuple *mask)
-{
-	unsigned int len = 0;
-
-	if (mask->src.u.udp.port)
-		len += sprintf(buffer + len, "srcpt=%u ",
-			       ntohs(match->src.u.udp.port));
-
-
-	if (mask->dst.u.udp.port)
-		len += sprintf(buffer + len, "dstpt=%u ",
-			       ntohs(match->dst.u.udp.port));
-
-	return len;
-}
-
-static unsigned int
-udp_print_range(char *buffer, const struct ip_nat_range *range)
-{
-	if (range->min.udp.port != 0 || range->max.udp.port != 0xFFFF) {
-		if (range->min.udp.port == range->max.udp.port)
-			return sprintf(buffer, "port %u ",
-				       ntohs(range->min.udp.port));
-		else
-			return sprintf(buffer, "ports %u-%u ",
-				       ntohs(range->min.udp.port),
-				       ntohs(range->max.udp.port));
-	}
-	else return 0;
-}
-
 struct ip_nat_protocol ip_nat_protocol_udp = {
 	.name			= "UDP",
 	.protonum		= IPPROTO_UDP,
@@ -163,8 +129,6 @@ struct ip_nat_protocol ip_nat_protocol_udp = {
 	.manip_pkt		= udp_manip_pkt,
 	.in_range		= udp_in_range,
 	.unique_tuple		= udp_unique_tuple,
-	.print			= udp_print,
-	.print_range		= udp_print_range,
 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
 	.range_to_nfattr	= ip_nat_port_range_to_nfattr,
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
index f0099a646a0b..3bf049517246 100644
--- a/net/ipv4/netfilter/ip_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c
@@ -46,26 +46,10 @@ unknown_manip_pkt(struct sk_buff **pskb,
 	return 1;
 }
 
-static unsigned int
-unknown_print(char *buffer,
-	      const struct ip_conntrack_tuple *match,
-	      const struct ip_conntrack_tuple *mask)
-{
-	return 0;
-}
-
-static unsigned int
-unknown_print_range(char *buffer, const struct ip_nat_range *range)
-{
-	return 0;
-}
-
 struct ip_nat_protocol ip_nat_unknown_protocol = {
 	.name			= "unknown",
 	/* .me isn't set: getting a ref to this cannot fail. */
 	.manip_pkt		= unknown_manip_pkt,
 	.in_range		= unknown_in_range,
 	.unique_tuple		= unknown_unique_tuple,
-	.print			= unknown_print,
-	.print_range		= unknown_print_range
 };
-- 
cgit v1.2.3-71-gd317


From 8039de10aae3cd4cf0ef0ccebd58aff0e8810df2 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@parisc-linux.org>
Date: Tue, 10 Jan 2006 20:35:03 -0500
Subject: [PARISC] Add __read_mostly section for parisc

Flag a whole bunch of things as __read_mostly on parisc. Also flag a few
branches as unlikely() and cleanup a bit of code.

Signed-off-by: Helge Deller <deller@parisc-linux.org>
Signed-off-by: Kyle McMartin <kyle@parisc-linux.org>
---
 arch/parisc/kernel/cache.c       | 12 ++++++------
 arch/parisc/kernel/drivers.c     |  2 +-
 arch/parisc/kernel/firmware.c    |  2 +-
 arch/parisc/kernel/inventory.c   |  6 +++---
 arch/parisc/kernel/pci-dma.c     |  6 +++---
 arch/parisc/kernel/pdc_chassis.c | 13 +++++++------
 arch/parisc/kernel/perf.c        |  6 +++---
 arch/parisc/kernel/process.c     |  2 +-
 arch/parisc/kernel/processor.c   |  8 ++++----
 arch/parisc/kernel/setup.c       | 10 +++++-----
 arch/parisc/kernel/smp.c         |  8 ++++----
 arch/parisc/kernel/time.c        |  4 ++--
 arch/parisc/kernel/topology.c    |  3 ++-
 arch/parisc/kernel/unaligned.c   |  2 +-
 arch/parisc/kernel/unwind.c      |  2 +-
 arch/parisc/kernel/vmlinux.lds.S |  4 ++++
 arch/parisc/mm/init.c            | 20 ++++++++++----------
 drivers/parisc/eisa.c            |  4 ++--
 drivers/parisc/lasi.c            |  2 +-
 drivers/parisc/lba_pci.c         |  2 +-
 drivers/parisc/led.c             | 18 +++++++++---------
 drivers/parisc/pdc_stable.c      |  2 +-
 drivers/parisc/power.c           | 12 ++++++------
 include/linux/cache.h            |  2 +-
 24 files changed, 79 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index a065349aee37..63047c6d2d04 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -29,9 +29,9 @@
 #include <asm/processor.h>
 #include <asm/sections.h>
 
-int split_tlb;
-int dcache_stride;
-int icache_stride;
+int split_tlb __read_mostly;
+int dcache_stride __read_mostly;
+int icache_stride __read_mostly;
 EXPORT_SYMBOL(dcache_stride);
 
 
@@ -45,9 +45,9 @@ DEFINE_SPINLOCK(pa_tlb_lock);
 EXPORT_SYMBOL(pa_tlb_lock);
 #endif
 
-struct pdc_cache_info cache_info;
+struct pdc_cache_info cache_info __read_mostly;
 #ifndef CONFIG_PA20
-static struct pdc_btlb_info btlb_info;
+static struct pdc_btlb_info btlb_info __read_mostly;
 #endif
 
 #ifdef CONFIG_SMP
@@ -332,7 +332,7 @@ void clear_user_page_asm(void *page, unsigned long vaddr)
 }
 
 #define FLUSH_THRESHOLD 0x80000 /* 0.5MB */
-int parisc_cache_flush_threshold = FLUSH_THRESHOLD;
+int parisc_cache_flush_threshold __read_mostly = FLUSH_THRESHOLD;
 
 void parisc_setup_cache_timing(void)
 {
diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c
index d016d672ec2b..041524d24ef1 100644
--- a/arch/parisc/kernel/drivers.c
+++ b/arch/parisc/kernel/drivers.c
@@ -39,7 +39,7 @@
 #include <asm/parisc-device.h>
 
 /* See comments in include/asm-parisc/pci.h */
-struct hppa_dma_ops *hppa_dma_ops;
+struct hppa_dma_ops *hppa_dma_ops __read_mostly;
 EXPORT_SYMBOL(hppa_dma_ops);
 
 static struct device root = {
diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c
index 553f8fe03224..2dc06b8e1817 100644
--- a/arch/parisc/kernel/firmware.c
+++ b/arch/parisc/kernel/firmware.c
@@ -80,7 +80,7 @@ static unsigned long pdc_result2[32] __attribute__ ((aligned (8)));
 
 /* Firmware needs to be initially set to narrow to determine the 
  * actual firmware width. */
-int parisc_narrow_firmware = 1;
+int parisc_narrow_firmware __read_mostly = 1;
 #endif
 
 /* On most currently-supported platforms, IODC I/O calls are 32-bit calls
diff --git a/arch/parisc/kernel/inventory.c b/arch/parisc/kernel/inventory.c
index 8f563871e83c..4e847ba53180 100644
--- a/arch/parisc/kernel/inventory.c
+++ b/arch/parisc/kernel/inventory.c
@@ -38,7 +38,7 @@
 */
 #undef DEBUG_PAT
 
-int pdc_type = PDC_TYPE_ILLEGAL;
+int pdc_type __read_mostly = PDC_TYPE_ILLEGAL;
 
 void __init setup_pdc(void)
 {
@@ -120,8 +120,8 @@ set_pmem_entry(physmem_range_t *pmem_ptr, unsigned long start,
 	 * pdc info is bad in this case).
 	 */
 
-	if (   ((start & (PAGE_SIZE - 1)) != 0)
-	    || ((pages4k & ((1UL << PDC_PAGE_ADJ_SHIFT) - 1)) != 0) ) {
+	if (unlikely( ((start & (PAGE_SIZE - 1)) != 0)
+	    || ((pages4k & ((1UL << PDC_PAGE_ADJ_SHIFT) - 1)) != 0) )) {
 
 		panic("Memory range doesn't align with page size!\n");
 	}
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index f94a02ef3d95..a6caf1073085 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@@ -33,10 +33,10 @@
 #include <asm/uaccess.h>
 #include <asm/tlbflush.h>	/* for purge_tlb_*() macros */
 
-static struct proc_dir_entry * proc_gsc_root = NULL;
+static struct proc_dir_entry * proc_gsc_root __read_mostly = NULL;
 static int pcxl_proc_info(char *buffer, char **start, off_t offset, int length);
-static unsigned long pcxl_used_bytes = 0;
-static unsigned long pcxl_used_pages = 0;
+static unsigned long pcxl_used_bytes __read_mostly = 0;
+static unsigned long pcxl_used_pages __read_mostly = 0;
 
 extern unsigned long pcxl_dma_start; /* Start of pcxl dma mapping area */
 static spinlock_t   pcxl_res_lock;
diff --git a/arch/parisc/kernel/pdc_chassis.c b/arch/parisc/kernel/pdc_chassis.c
index 52004ae28d20..2a01fe1bdc98 100644
--- a/arch/parisc/kernel/pdc_chassis.c
+++ b/arch/parisc/kernel/pdc_chassis.c
@@ -30,6 +30,7 @@
 #include <linux/kernel.h>
 #include <linux/reboot.h>
 #include <linux/notifier.h>
+#include <linux/cache.h>
 
 #include <asm/pdc_chassis.h>
 #include <asm/processor.h>
@@ -38,8 +39,8 @@
 
 
 #ifdef CONFIG_PDC_CHASSIS
-static int pdc_chassis_old = 0;	
-static unsigned int pdc_chassis_enabled = 1;
+static int pdc_chassis_old __read_mostly = 0;	
+static unsigned int pdc_chassis_enabled __read_mostly = 1;
 
 
 /**
@@ -132,7 +133,7 @@ void __init parisc_pdc_chassis_init(void)
 {
 #ifdef CONFIG_PDC_CHASSIS
 	int handle = 0;
-	if (pdc_chassis_enabled) {
+	if (likely(pdc_chassis_enabled)) {
 		DPRINTK(KERN_DEBUG "%s: parisc_pdc_chassis_init()\n", __FILE__);
 
 		/* Let see if we have something to handle... */
@@ -142,7 +143,7 @@ void __init parisc_pdc_chassis_init(void)
 			printk(KERN_INFO "Enabling PDC_PAT chassis codes support.\n");
 			handle = 1;
 		}
-		else if (pdc_chassis_old) {
+		else if (unlikely(pdc_chassis_old)) {
 			printk(KERN_INFO "Enabling old style chassis LED panel support.\n");
 			handle = 1;
 		}
@@ -178,7 +179,7 @@ int pdc_chassis_send_status(int message)
 	/* Maybe we should do that in an other way ? */
 	int retval = 0;
 #ifdef CONFIG_PDC_CHASSIS
-	if (pdc_chassis_enabled) {
+	if (likely(pdc_chassis_enabled)) {
 
 		DPRINTK(KERN_DEBUG "%s: pdc_chassis_send_status(%d)\n", __FILE__, message);
 
@@ -214,7 +215,7 @@ int pdc_chassis_send_status(int message)
 			}
 		} else retval = -1;
 #else
-		if (pdc_chassis_old) {
+		if (unlikely(pdc_chassis_old)) {
 			switch (message) {
 				case PDC_CHASSIS_DIRECT_BSTART:
 				case PDC_CHASSIS_DIRECT_BCOMPLETE:
diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c
index f6fec62b6a2f..79dcbcccecb8 100644
--- a/arch/parisc/kernel/perf.c
+++ b/arch/parisc/kernel/perf.c
@@ -66,10 +66,10 @@ struct rdr_tbl_ent {
 	uint8_t		write_control;
 };
 
-static int perf_processor_interface = UNKNOWN_INTF;
-static int perf_enabled = 0;
+static int perf_processor_interface __read_mostly = UNKNOWN_INTF;
+static int perf_enabled __read_mostly = 0;
 static spinlock_t perf_lock;
-struct parisc_device *cpu_device = NULL;
+struct parisc_device *cpu_device __read_mostly = NULL;
 
 /* RDRs to write for PCX-W */
 static int perf_rdrs_W[] = 
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index fee4f1f09adc..4eb70a40ec7e 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -54,7 +54,7 @@
 #include <asm/uaccess.h>
 #include <asm/unwind.h>
 
-static int hlt_counter;
+static int hlt_counter __read_mostly;
 
 /*
  * Power off function, if any
diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c
index 4f5bbcf1f5a4..6df9f62cecb5 100644
--- a/arch/parisc/kernel/processor.c
+++ b/arch/parisc/kernel/processor.c
@@ -44,10 +44,10 @@
 #include <asm/irq.h>		/* for struct irq_region */
 #include <asm/parisc-device.h>
 
-struct system_cpuinfo_parisc boot_cpu_data;
+struct system_cpuinfo_parisc boot_cpu_data __read_mostly;
 EXPORT_SYMBOL(boot_cpu_data);
 
-struct cpuinfo_parisc cpu_data[NR_CPUS];
+struct cpuinfo_parisc cpu_data[NR_CPUS] __read_mostly;
 
 /*
 **  	PARISC CPU driver - claim "device" and initialize CPU data structures.
@@ -378,12 +378,12 @@ show_cpuinfo (struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct parisc_device_id processor_tbl[] = {
+static struct parisc_device_id processor_tbl[] __read_mostly = {
 	{ HPHW_NPROC, HVERSION_REV_ANY_ID, HVERSION_ANY_ID, SVERSION_ANY_ID },
 	{ 0, }
 };
 
-static struct parisc_driver cpu_driver = {
+static struct parisc_driver cpu_driver __read_mostly = {
 	.name		= "CPU",
 	.id_table	= processor_tbl,
 	.probe		= processor_probe
diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c
index 73e9c34b0948..4a36ec3f6ac1 100644
--- a/arch/parisc/kernel/setup.c
+++ b/arch/parisc/kernel/setup.c
@@ -46,15 +46,15 @@
 #include <asm/io.h>
 #include <asm/setup.h>
 
-char	command_line[COMMAND_LINE_SIZE];
+char	command_line[COMMAND_LINE_SIZE] __read_mostly;
 
 /* Intended for ccio/sba/cpu statistics under /proc/bus/{runway|gsc} */
-struct proc_dir_entry * proc_runway_root = NULL;
-struct proc_dir_entry * proc_gsc_root = NULL;
-struct proc_dir_entry * proc_mckinley_root = NULL;
+struct proc_dir_entry * proc_runway_root __read_mostly = NULL;
+struct proc_dir_entry * proc_gsc_root __read_mostly = NULL;
+struct proc_dir_entry * proc_mckinley_root __read_mostly = NULL;
 
 #if !defined(CONFIG_PA20) && (defined(CONFIG_IOMMU_CCIO) || defined(CONFIG_IOMMU_SBA))
-int parisc_bus_is_phys = 1;	/* Assume no IOMMU is present */
+int parisc_bus_is_phys __read_mostly = 1;	/* Assume no IOMMU is present */
 EXPORT_SYMBOL(parisc_bus_is_phys);
 #endif
 
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index ce89da0f654d..fb3ca84f1b97 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -58,9 +58,9 @@ DEFINE_SPINLOCK(smp_lock);
 
 volatile struct task_struct *smp_init_current_idle_task;
 
-static volatile int cpu_now_booting = 0;	/* track which CPU is booting */
+static volatile int cpu_now_booting __read_mostly = 0;	/* track which CPU is booting */
 
-static int parisc_max_cpus = 1;
+static int parisc_max_cpus __read_mostly = 1;
 
 /* online cpus are ones that we've managed to bring up completely
  * possible cpus are all valid cpu 
@@ -71,8 +71,8 @@ static int parisc_max_cpus = 1;
  * empty in the beginning.
  */
 
-cpumask_t cpu_online_map = CPU_MASK_NONE;	/* Bitmap of online CPUs */
-cpumask_t cpu_possible_map = CPU_MASK_ALL;	/* Bitmap of Present CPUs */
+cpumask_t cpu_online_map   __read_mostly = CPU_MASK_NONE;	/* Bitmap of online CPUs */
+cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;	/* Bitmap of Present CPUs */
 
 EXPORT_SYMBOL(cpu_online_map);
 EXPORT_SYMBOL(cpu_possible_map);
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index cded25680787..594930bc4bcf 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -36,8 +36,8 @@
 /* xtime and wall_jiffies keep wall-clock time */
 extern unsigned long wall_jiffies;
 
-static long clocktick;	/* timer cycles per tick */
-static long halftick;
+static long clocktick __read_mostly;	/* timer cycles per tick */
+static long halftick __read_mostly;
 
 #ifdef CONFIG_SMP
 extern void smp_do_timer(struct pt_regs *regs);
diff --git a/arch/parisc/kernel/topology.c b/arch/parisc/kernel/topology.c
index ac2a40681414..3ba040050e4c 100644
--- a/arch/parisc/kernel/topology.c
+++ b/arch/parisc/kernel/topology.c
@@ -20,8 +20,9 @@
 #include <linux/init.h>
 #include <linux/smp.h>
 #include <linux/cpu.h>
+#include <linux/cache.h>
 
-static struct cpu cpu_devices[NR_CPUS];
+static struct cpu cpu_devices[NR_CPUS] __read_mostly;
 
 static int __init topology_init(void)
 {
diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c
index eaae8a021f9f..de0a1b21cb40 100644
--- a/arch/parisc/kernel/unaligned.c
+++ b/arch/parisc/kernel/unaligned.c
@@ -122,7 +122,7 @@
 #define ERR_NOTHANDLED	-1
 #define ERR_PAGEFAULT	-2
 
-int unaligned_enabled = 1;
+int unaligned_enabled __read_mostly = 1;
 
 void die_if_kernel (char *str, struct pt_regs *regs, long err);
 
diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c
index db141108412e..cc1c1afc3187 100644
--- a/arch/parisc/kernel/unwind.c
+++ b/arch/parisc/kernel/unwind.c
@@ -35,7 +35,7 @@ static spinlock_t unwind_lock;
  * we can call unwind_init as early in the bootup process as 
  * possible (before the slab allocator is initialized)
  */
-static struct unwind_table kernel_unwind_table;
+static struct unwind_table kernel_unwind_table __read_mostly;
 static LIST_HEAD(unwind_tables);
 
 static inline const struct unwind_table_entry *
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index e5fac3e08c7a..b8b9174f6425 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -105,6 +105,10 @@ SECTIONS
   . = ALIGN(16);
   .data.lock_aligned : { *(.data.lock_aligned) }
 
+  /* rarely changed data like cpu maps */
+  . = ALIGN(16);
+  .data.read_mostly : { *(.data.read_mostly) }
+
   _edata = .;			/* End of data section */
 
   . = ALIGN(16384); 		/* init_task */
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 29b998e430e6..a992cb8cfe61 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -36,9 +36,9 @@ extern char _end;	/* end of BSS, defined by linker */
 extern char __init_begin, __init_end;
 
 #ifdef CONFIG_DISCONTIGMEM
-struct node_map_data node_data[MAX_NUMNODES];
-bootmem_data_t bmem_data[MAX_NUMNODES];
-unsigned char pfnnid_map[PFNNID_MAP_MAX];
+struct node_map_data node_data[MAX_NUMNODES] __read_mostly;
+bootmem_data_t bmem_data[MAX_NUMNODES] __read_mostly;
+unsigned char pfnnid_map[PFNNID_MAP_MAX] __read_mostly;
 #endif
 
 static struct resource data_resource = {
@@ -58,14 +58,14 @@ static struct resource pdcdata_resource = {
 	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM,
 };
 
-static struct resource sysram_resources[MAX_PHYSMEM_RANGES];
+static struct resource sysram_resources[MAX_PHYSMEM_RANGES] __read_mostly;
 
 /* The following array is initialized from the firmware specific
  * information retrieved in kernel/inventory.c.
  */
 
-physmem_range_t pmem_ranges[MAX_PHYSMEM_RANGES];
-int npmem_ranges;
+physmem_range_t pmem_ranges[MAX_PHYSMEM_RANGES] __read_mostly;
+int npmem_ranges __read_mostly;
 
 #ifdef __LP64__
 #define MAX_MEM         (~0UL)
@@ -73,7 +73,7 @@ int npmem_ranges;
 #define MAX_MEM         (3584U*1024U*1024U)
 #endif /* !__LP64__ */
 
-static unsigned long mem_limit = MAX_MEM;
+static unsigned long mem_limit __read_mostly = MAX_MEM;
 
 static void __init mem_limit_func(void)
 {
@@ -431,11 +431,11 @@ void free_initmem(void)
 #define SET_MAP_OFFSET(x) ((void *)(((unsigned long)(x) + VM_MAP_OFFSET) \
 				     & ~(VM_MAP_OFFSET-1)))
 
-void *vmalloc_start;
+void *vmalloc_start __read_mostly;
 EXPORT_SYMBOL(vmalloc_start);
 
 #ifdef CONFIG_PA11
-unsigned long pcxl_dma_start;
+unsigned long pcxl_dma_start __read_mostly;
 #endif
 
 void __init mem_init(void)
@@ -475,7 +475,7 @@ int do_check_pgt_cache(int low, int high)
 	return 0;
 }
 
-unsigned long *empty_zero_page;
+unsigned long *empty_zero_page __read_mostly;
 
 void show_mem(void)
 {
diff --git a/drivers/parisc/eisa.c b/drivers/parisc/eisa.c
index 6362bf99eff6..3d94d86c1c9f 100644
--- a/drivers/parisc/eisa.c
+++ b/drivers/parisc/eisa.c
@@ -57,7 +57,7 @@
 
 static DEFINE_SPINLOCK(eisa_irq_lock);
 
-void __iomem *eisa_eeprom_addr;
+void __iomem *eisa_eeprom_addr __read_mostly;
 
 /* We can only have one EISA adapter in the system because neither
  * implementation can be flexed.
@@ -141,7 +141,7 @@ static int slave_mask;
  * in the furure. 
  */
 /* irq 13,8,2,1,0 must be edge */
-static unsigned int eisa_irq_level; /* default to edge triggered */
+static unsigned int eisa_irq_level __read_mostly; /* default to edge triggered */
 
 
 /* called by free irq */
diff --git a/drivers/parisc/lasi.c b/drivers/parisc/lasi.c
index a8c20396ffbe..2b3ba1dcf332 100644
--- a/drivers/parisc/lasi.c
+++ b/drivers/parisc/lasi.c
@@ -150,7 +150,7 @@ void __init lasi_led_init(unsigned long lasi_hpa)
  * 
  */
 
-static unsigned long lasi_power_off_hpa;
+static unsigned long lasi_power_off_hpa __read_mostly;
 
 static void lasi_power_off(void)
 {
diff --git a/drivers/parisc/lba_pci.c b/drivers/parisc/lba_pci.c
index 5e495dcbc58a..4f6bdf0881b5 100644
--- a/drivers/parisc/lba_pci.c
+++ b/drivers/parisc/lba_pci.c
@@ -167,7 +167,7 @@
 
 /* non-postable I/O port space, densely packed */
 #define LBA_PORT_BASE	(PCI_F_EXTEND | 0xfee00000UL)
-static void __iomem *astro_iop_base;
+static void __iomem *astro_iop_base __read_mostly;
 
 #define ELROY_HVERS	0x782
 #define MERCURY_HVERS	0x783
diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c
index 315be4770d3e..f357d3f60360 100644
--- a/drivers/parisc/led.c
+++ b/drivers/parisc/led.c
@@ -3,7 +3,7 @@
  *
  *      (c) Copyright 2000 Red Hat Software
  *      (c) Copyright 2000 Helge Deller <hdeller@redhat.com>
- *      (c) Copyright 2001-2004 Helge Deller <deller@gmx.de>
+ *      (c) Copyright 2001-2005 Helge Deller <deller@gmx.de>
  *      (c) Copyright 2001 Randolph Chung <tausq@debian.org>
  *
  *      This program is free software; you can redistribute it and/or modify
@@ -56,13 +56,13 @@
    relatively large amount of CPU time, some of the calculations can be 
    turned off with the following variables (controlled via procfs) */
 
-static int led_type = -1;
+static int led_type __read_mostly = -1;
 static unsigned char lastleds;	/* LED state from most recent update */
-static unsigned int led_heartbeat = 1;
-static unsigned int led_diskio = 1;
-static unsigned int led_lanrxtx = 1;
-static char lcd_text[32];
-static char lcd_text_default[32];
+static unsigned int led_heartbeat __read_mostly = 1;
+static unsigned int led_diskio    __read_mostly = 1;
+static unsigned int led_lanrxtx   __read_mostly = 1;
+static char lcd_text[32]          __read_mostly;
+static char lcd_text_default[32]  __read_mostly;
 
 
 static struct workqueue_struct *led_wq;
@@ -108,7 +108,7 @@ struct pdc_chassis_lcd_info_ret_block {
 /* lcd_info is pre-initialized to the values needed to program KittyHawk LCD's 
  * HP seems to have used Sharp/Hitachi HD44780 LCDs most of the time. */
 static struct pdc_chassis_lcd_info_ret_block
-lcd_info __attribute__((aligned(8))) =
+lcd_info __attribute__((aligned(8))) __read_mostly =
 {
 	.model =		DISPLAY_MODEL_LCD,
 	.lcd_width =		16,
@@ -144,7 +144,7 @@ static int start_task(void)
 device_initcall(start_task);
 
 /* ptr to LCD/LED-specific function */
-static void (*led_func_ptr) (unsigned char);
+static void (*led_func_ptr) (unsigned char) __read_mostly;
 
 #ifdef CONFIG_PROC_FS
 static int led_proc_read(char *page, char **start, off_t off, int count, 
diff --git a/drivers/parisc/pdc_stable.c b/drivers/parisc/pdc_stable.c
index 273a74179720..11750cbb05c6 100644
--- a/drivers/parisc/pdc_stable.c
+++ b/drivers/parisc/pdc_stable.c
@@ -70,7 +70,7 @@ MODULE_DESCRIPTION("sysfs interface to HP PDC Stable Storage data");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(PDCS_VERSION);
 
-static unsigned long pdcs_size = 0;
+static unsigned long pdcs_size __read_mostly;
 
 /* This struct defines what we need to deal with a parisc pdc path entry */
 struct pdcspath_entry {
diff --git a/drivers/parisc/power.c b/drivers/parisc/power.c
index ff75e9296df9..54b2b7f20b96 100644
--- a/drivers/parisc/power.c
+++ b/drivers/parisc/power.c
@@ -2,7 +2,7 @@
  * linux/arch/parisc/kernel/power.c
  * HP PARISC soft power switch support driver
  *
- * Copyright (c) 2001-2002 Helge Deller <deller@gmx.de>
+ * Copyright (c) 2001-2005 Helge Deller <deller@gmx.de>
  * All rights reserved.
  *
  *
@@ -102,7 +102,7 @@ static DECLARE_WORK(poweroff_work, deferred_poweroff, NULL);
 
 static void poweroff(void)
 {
-	static int powering_off;
+	static int powering_off __read_mostly;
 
 	if (powering_off)
 		return;
@@ -113,7 +113,7 @@ static void poweroff(void)
 
 
 /* local time-counter for shutdown */
-static int shutdown_timer;
+static int shutdown_timer __read_mostly;
 
 /* check, give feedback and start shutdown after one second */
 static void process_shutdown(void)
@@ -139,7 +139,7 @@ static void process_shutdown(void)
 DECLARE_TASKLET_DISABLED(power_tasklet, NULL, 0);
 
 /* soft power switch enabled/disabled */
-int pwrsw_enabled = 1;
+int pwrsw_enabled __read_mostly = 1;
 
 /*
  * On gecko style machines (e.g. 712/xx and 715/xx) 
@@ -149,7 +149,7 @@ int pwrsw_enabled = 1;
  */
 static void gecko_tasklet_func(unsigned long unused)
 {
-	if (!pwrsw_enabled)
+	if (unlikely(!pwrsw_enabled))
 		return;
 
 	if (__getDIAG(25) & 0x80000000) {
@@ -173,7 +173,7 @@ static void polling_tasklet_func(unsigned long soft_power_reg)
 {
         unsigned long current_status;
 	
-	if (!pwrsw_enabled)
+	if (unlikely(!pwrsw_enabled))
 		return;
 
 	current_status = gsc_readl(soft_power_reg);
diff --git a/include/linux/cache.h b/include/linux/cache.h
index ffe52210fc4f..d22e632f41fb 100644
--- a/include/linux/cache.h
+++ b/include/linux/cache.h
@@ -13,7 +13,7 @@
 #define SMP_CACHE_BYTES L1_CACHE_BYTES
 #endif
 
-#if defined(CONFIG_X86) || defined(CONFIG_SPARC64) || defined(CONFIG_IA64)
+#if defined(CONFIG_X86) || defined(CONFIG_SPARC64) || defined(CONFIG_IA64) || defined(CONFIG_PARISC)
 #define __read_mostly __attribute__((__section__(".data.read_mostly")))
 #else
 #define __read_mostly
-- 
cgit v1.2.3-71-gd317


From a8b9ee7396ccc8db3bdb4108993556acbe2d3527 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Wed, 11 Jan 2006 00:15:16 -0800
Subject: [MUTEX]: linux/mutex.h needs linux/linkage.h too

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mutex.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 9bce0fee68d4..f1c84b1252f5 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -12,6 +12,7 @@
 
 #include <linux/list.h>
 #include <linux/spinlock_types.h>
+#include <linux/linkage.h>
 
 #include <asm/atomic.h>
 
-- 
cgit v1.2.3-71-gd317


From a4fc7ab1d065a9dd89ed0e74439ef87d4a16e980 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 11 Jan 2006 14:41:26 +0000
Subject: [PATCH] fix/simplify mutex debugging code

Let's switch mutex_debug_check_no_locks_freed() to take (addr, len) as
arguments instead, since all its callers were just calculating the 'to'
address for themselves anyway... (and sometimes doing so badly).

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/pageattr.c     | 2 +-
 include/linux/mm.h          | 2 +-
 include/linux/mutex-debug.h | 2 +-
 include/linux/mutex.h       | 2 +-
 kernel/mutex-debug.c        | 5 +++--
 mm/page_alloc.c             | 2 +-
 mm/slab.c                   | 2 +-
 7 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index e8a53552b13d..d0cadb33b54c 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -224,7 +224,7 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 		return;
 	if (!enable)
 		mutex_debug_check_no_locks_freed(page_address(page),
-						 page_address(page+numpages));
+						 numpages * PAGE_SIZE);
 
 	/* the return value is ignored - the calls cannot fail,
 	 * large pages are disabled at boot time.
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3f1fafc0245e..e53d2c6fd5f4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1027,7 +1027,7 @@ kernel_map_pages(struct page *page, int numpages, int enable)
 {
 	if (!PageHighMem(page) && !enable)
 		mutex_debug_check_no_locks_freed(page_address(page),
-						 page_address(page + numpages));
+						 numpages * PAGE_SIZE);
 }
 #endif
 
diff --git a/include/linux/mutex-debug.h b/include/linux/mutex-debug.h
index 8138d9eb58ec..8b5769f00467 100644
--- a/include/linux/mutex-debug.h
+++ b/include/linux/mutex-debug.h
@@ -18,6 +18,6 @@ extern void FASTCALL(mutex_destroy(struct mutex *lock));
 extern void mutex_debug_show_all_locks(void);
 extern void mutex_debug_show_held_locks(struct task_struct *filter);
 extern void mutex_debug_check_no_locks_held(struct task_struct *task);
-extern void mutex_debug_check_no_locks_freed(const void *from, const void *to);
+extern void mutex_debug_check_no_locks_freed(const void *from, unsigned long len);
 
 #endif
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index f1c84b1252f5..f1ac507fa20d 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -79,7 +79,7 @@ struct mutex_waiter {
 # define mutex_debug_show_all_locks()			do { } while (0)
 # define mutex_debug_show_held_locks(p)			do { } while (0)
 # define mutex_debug_check_no_locks_held(task)		do { } while (0)
-# define mutex_debug_check_no_locks_freed(from, to)	do { } while (0)
+# define mutex_debug_check_no_locks_freed(from, len)	do { } while (0)
 #endif
 
 #define __MUTEX_INITIALIZER(lockname) \
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 6f829058ae4a..f4913c376950 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -333,9 +333,10 @@ void mutex_debug_check_no_locks_held(struct task_struct *task)
  * is destroyed or reinitialized - this code checks whether there is
  * any held lock in the memory range of <from> to <to>:
  */
-void mutex_debug_check_no_locks_freed(const void *from, const void *to)
+void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
 {
 	struct list_head *curr, *next;
+	const void *to = from + len;
 	unsigned long flags;
 	struct mutex *lock;
 	void *lock_addr;
@@ -437,7 +438,7 @@ void debug_mutex_init(struct mutex *lock, const char *name)
 	/*
 	 * Make sure we are not reinitializing a held lock:
 	 */
-	mutex_debug_check_no_locks_freed((void *)lock, (void *)(lock + 1));
+	mutex_debug_check_no_locks_freed((void *)lock, sizeof(*lock));
 	lock->owner = NULL;
 	INIT_LIST_HEAD(&lock->held_list);
 	lock->name = name;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a5e6891f7bb6..8e363536e2da 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -417,7 +417,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	arch_free_page(page, order);
 	if (!PageHighMem(page))
 		mutex_debug_check_no_locks_freed(page_address(page),
-			page_address(page+(1<<order)));
+						 PAGE_SIZE<<order);
 
 #ifndef CONFIG_MMU
 	for (i = 1 ; i < (1 << order) ; ++i)
diff --git a/mm/slab.c b/mm/slab.c
index 33aab345cd4a..9374293a3012 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3071,7 +3071,7 @@ void kfree(const void *objp)
 	local_irq_save(flags);
 	kfree_debugcheck(objp);
 	c = page_get_cache(virt_to_page(objp));
-	mutex_debug_check_no_locks_freed(objp, objp+obj_reallen(c));
+	mutex_debug_check_no_locks_freed(objp, obj_reallen(c));
 	__cache_free(c, (void *)objp);
 	local_irq_restore(flags);
 }
-- 
cgit v1.2.3-71-gd317


From 4eac915d02453e81a32595cd7423492c81337a26 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 11 Jan 2006 12:17:19 -0800
Subject: [PATCH] mm: gfp_atomic comments

Clarify in comments that GFP_ATOMIC means both "don't sleep" and "use
emergency pools", hence both ALLOC_HARDER and ALLOC_HIGH.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/gfp.h | 1 +
 mm/page_alloc.c     | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 8b2eab90abb6..da7ce8730e97 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -57,6 +57,7 @@ struct vm_area_struct;
 			__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
 			__GFP_NOMEMALLOC|__GFP_HARDWALL)
 
+/* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */
 #define GFP_ATOMIC	(__GFP_HIGH)
 #define GFP_NOIO	(__GFP_WAIT)
 #define GFP_NOFS	(__GFP_WAIT | __GFP_IO)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ce991b173aa9..d41a0662d4da 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -931,7 +931,8 @@ restart:
 	 *
 	 * The caller may dip into page reserves a bit more if the caller
 	 * cannot run direct reclaim, or if the caller has realtime scheduling
-	 * policy.
+	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
+	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
 	 */
 	alloc_flags = ALLOC_WMARK_MIN;
 	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
-- 
cgit v1.2.3-71-gd317


From df019b1d8b893d0f0ee5a9b0f71486f0892561ae Mon Sep 17 00:00:00 2001
From: Keshavamurthy Anil S <anil.s.keshavamurthy@intel.com>
Date: Wed, 11 Jan 2006 12:17:41 -0800
Subject: [PATCH] kprobes: fix unloading of self probed module

When a kprobes modules is written in such a way that probes are inserted on
itself, then unload of that moudle was not possible due to reference
couning on the same module.

The below patch makes a check and incrementes the module refcount only if
it is not a self probed module.

We need to allow modules to probe themself for kprobes performance
measurements

This patch has been tested on several x86_64, ppc64 and IA64 architectures.

Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kprobes.h |  3 +++
 kernel/kprobes.c        | 42 ++++++++++++++++++++++++++++++++----------
 2 files changed, 35 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 10005bc92a31..669756bc20a2 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -68,6 +68,9 @@ struct kprobe {
 	/* list of kprobes for multi-handler support */
 	struct list_head list;
 
+	/* Indicates that the corresponding module has been ref counted */
+	unsigned int mod_refcounted;
+
 	/*count the number of times this probe was temporarily disarmed */
 	unsigned long nmissed;
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 34a885bb82e0..3ea6325228da 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -449,19 +449,32 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
 	return 0;
 }
 
-int __kprobes register_kprobe(struct kprobe *p)
+static int __kprobes __register_kprobe(struct kprobe *p,
+	unsigned long called_from)
 {
 	int ret = 0;
 	struct kprobe *old_p;
-	struct module *mod;
+	struct module *probed_mod;
 
 	if ((!kernel_text_address((unsigned long) p->addr)) ||
 		in_kprobes_functions((unsigned long) p->addr))
 		return -EINVAL;
 
-	if ((mod = module_text_address((unsigned long) p->addr)) &&
-			(unlikely(!try_module_get(mod))))
-		return -EINVAL;
+	p->mod_refcounted = 0;
+	/* Check are we probing a module */
+	if ((probed_mod = module_text_address((unsigned long) p->addr))) {
+		struct module *calling_mod = module_text_address(called_from);
+		/* We must allow modules to probe themself and
+		 * in this case avoid incrementing the module refcount,
+		 * so as to allow unloading of self probing modules.
+		 */
+		if (calling_mod && (calling_mod != probed_mod)) {
+			if (unlikely(!try_module_get(probed_mod)))
+				return -EINVAL;
+			p->mod_refcounted = 1;
+		} else
+			probed_mod = NULL;
+	}
 
 	p->nmissed = 0;
 	down(&kprobe_mutex);
@@ -483,11 +496,17 @@ int __kprobes register_kprobe(struct kprobe *p)
 out:
 	up(&kprobe_mutex);
 
-	if (ret && mod)
-		module_put(mod);
+	if (ret && probed_mod)
+		module_put(probed_mod);
 	return ret;
 }
 
+int __kprobes register_kprobe(struct kprobe *p)
+{
+	return __register_kprobe(p,
+		(unsigned long)__builtin_return_address(0));
+}
+
 void __kprobes unregister_kprobe(struct kprobe *p)
 {
 	struct module *mod;
@@ -524,7 +543,8 @@ valid_p:
 	up(&kprobe_mutex);
 
 	synchronize_sched();
-	if ((mod = module_text_address((unsigned long)p->addr)))
+	if (p->mod_refcounted &&
+	    (mod = module_text_address((unsigned long)p->addr)))
 		module_put(mod);
 
 	if (cleanup_p) {
@@ -547,7 +567,8 @@ int __kprobes register_jprobe(struct jprobe *jp)
 	jp->kp.pre_handler = setjmp_pre_handler;
 	jp->kp.break_handler = longjmp_break_handler;
 
-	return register_kprobe(&jp->kp);
+	return __register_kprobe(&jp->kp,
+		(unsigned long)__builtin_return_address(0));
 }
 
 void __kprobes unregister_jprobe(struct jprobe *jp)
@@ -587,7 +608,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
 
 	rp->nmissed = 0;
 	/* Establish function entry probe point */
-	if ((ret = register_kprobe(&rp->kp)) != 0)
+	if ((ret = __register_kprobe(&rp->kp,
+		(unsigned long)__builtin_return_address(0))) != 0)
 		free_rp_inst(rp);
 	return ret;
 }
-- 
cgit v1.2.3-71-gd317


From e16885c5ad624a6efe1b1bf764e075d75f65a788 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 11 Jan 2006 12:17:45 -0800
Subject: [PATCH] uninline capable()

Uninline capable().  Saves 2K of kernel text on a generic .config, and 1K on a
tiny config.  In addition it makes the use of capable more consistent between
CONFIG_SECURITY and !CONFIG_SECURITY

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 13 +------------
 kernel/sys.c          | 12 ++++++++++++
 2 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c4ee35dd18ae..2ae8711bfba1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1113,19 +1113,8 @@ static inline int sas_ss_flags(unsigned long sp)
 }
 
 
-#ifdef CONFIG_SECURITY
-/* code is in security.c */
+/* code is in security.c or kernel/sys.c if !SECURITY */
 extern int capable(int cap);
-#else
-static inline int capable(int cap)
-{
-	if (cap_raised(current->cap_effective, cap)) {
-		current->flags |= PF_SUPERPRIV;
-		return 1;
-	}
-	return 0;
-}
-#endif
 
 /*
  * Routines for handling mm_structs
diff --git a/kernel/sys.c b/kernel/sys.c
index b6941e06d5d5..9ccf713491f9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -223,6 +223,18 @@ int unregister_reboot_notifier(struct notifier_block * nb)
 
 EXPORT_SYMBOL(unregister_reboot_notifier);
 
+#ifndef CONFIG_SECURITY
+int capable(int cap)
+{
+        if (cap_raised(current->cap_effective, cap)) {
+	       current->flags |= PF_SUPERPRIV;
+	       return 1;
+        }
+        return 0;
+}
+EXPORT_SYMBOL(capable);
+#endif
+
 static int set_one_prio(struct task_struct *p, int niceval, int error)
 {
 	int no_nice;
-- 
cgit v1.2.3-71-gd317


From c59ede7b78db329949d9cdcd7064e22d357560ef Mon Sep 17 00:00:00 2001
From: "Randy.Dunlap" <rdunlap@xenotime.net>
Date: Wed, 11 Jan 2006 12:17:46 -0800
Subject: [PATCH] move capable() to capability.h

- Move capable() from sched.h to capability.h;

- Use <linux/capability.h> where capable() is used
	(in include/, block/, ipc/, kernel/, a few drivers/,
	mm/, security/, & sound/;
	many more drivers/ to go)

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 block/ioctl.c                   | 2 +-
 block/scsi_ioctl.c              | 1 +
 drivers/acorn/char/i2c.c        | 1 +
 drivers/base/firmware_class.c   | 1 +
 drivers/base/memory.c           | 2 +-
 drivers/firmware/efivars.c      | 2 +-
 drivers/oprofile/event_buffer.c | 1 +
 drivers/parisc/led.c            | 1 +
 drivers/parisc/pdc_stable.c     | 2 +-
 fs/xfs/linux-2.6/xfs_cred.h     | 4 +++-
 include/linux/capability.h      | 3 +++
 include/linux/mm.h              | 1 +
 include/linux/sched.h           | 4 ----
 ipc/mqueue.c                    | 1 +
 ipc/msg.c                       | 1 +
 ipc/sem.c                       | 1 +
 ipc/shm.c                       | 1 +
 ipc/util.c                      | 1 +
 kernel/acct.c                   | 1 +
 kernel/capability.c             | 1 +
 kernel/exit.c                   | 1 +
 kernel/fork.c                   | 1 +
 kernel/kexec.c                  | 1 +
 kernel/module.c                 | 1 +
 kernel/ptrace.c                 | 1 +
 kernel/sched.c                  | 1 +
 kernel/signal.c                 | 1 +
 kernel/sys.c                    | 1 +
 kernel/sysctl.c                 | 1 +
 kernel/time.c                   | 1 +
 kernel/uid16.c                  | 1 +
 mm/filemap.c                    | 1 +
 mm/mlock.c                      | 1 +
 mm/mmap.c                       | 1 +
 mm/mremap.c                     | 1 +
 mm/swapfile.c                   | 1 +
 security/commoncap.c            | 1 +
 security/dummy.c                | 1 +
 security/keys/keyctl.c          | 1 +
 security/security.c             | 1 +
 sound/pci/emu10k1/emufx.c       | 1 +
 41 files changed, 44 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/block/ioctl.c b/block/ioctl.c
index 82030e1dfd63..e1109491c234 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -1,4 +1,4 @@
-#include <linux/sched.h>		/* for capable() */
+#include <linux/capability.h>
 #include <linux/blkdev.h>
 #include <linux/blkpg.h>
 #include <linux/hdreg.h>
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 18de84c8ccd8..cc72210687eb 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -21,6 +21,7 @@
 #include <linux/string.h>
 #include <linux/module.h>
 #include <linux/blkdev.h>
+#include <linux/capability.h>
 #include <linux/completion.h>
 #include <linux/cdrom.h>
 #include <linux/slab.h>
diff --git a/drivers/acorn/char/i2c.c b/drivers/acorn/char/i2c.c
index c22bb9dca1ec..c26c08b36829 100644
--- a/drivers/acorn/char/i2c.c
+++ b/drivers/acorn/char/i2c.c
@@ -12,6 +12,7 @@
  *  On Acorn machines, the following i2c devices are on the bus:
  *	- PCF8583 real time clock & static RAM
  */
+#include <linux/capability.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/time.h>
diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index 5b3d5e9ddcb6..3d384e3d34de 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -7,6 +7,7 @@
  *
  */
 
+#include <linux/capability.h>
 #include <linux/device.h>
 #include <linux/module.h>
 #include <linux/init.h>
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 58801d718cc2..d1a05224627e 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -13,8 +13,8 @@
 #include <linux/sysdev.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/sched.h>	/* capable() */
 #include <linux/topology.h>
+#include <linux/capability.h>
 #include <linux/device.h>
 #include <linux/memory.h>
 #include <linux/kobject.h>
diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
index bda5bce681b6..343379f23a53 100644
--- a/drivers/firmware/efivars.c
+++ b/drivers/firmware/efivars.c
@@ -65,11 +65,11 @@
  *   v0.01 release to linux-ia64@linuxia64.org
  */
 
+#include <linux/capability.h>
 #include <linux/config.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/init.h>
-#include <linux/sched.h>		/* for capable() */
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/string.h>
diff --git a/drivers/oprofile/event_buffer.c b/drivers/oprofile/event_buffer.c
index 166bca790133..b80318f03420 100644
--- a/drivers/oprofile/event_buffer.c
+++ b/drivers/oprofile/event_buffer.c
@@ -15,6 +15,7 @@
 #include <linux/vmalloc.h>
 #include <linux/oprofile.h>
 #include <linux/sched.h>
+#include <linux/capability.h>
 #include <linux/dcookies.h>
 #include <linux/fs.h>
 #include <asm/uaccess.h>
diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c
index f357d3f60360..3627a2d7f79f 100644
--- a/drivers/parisc/led.c
+++ b/drivers/parisc/led.c
@@ -30,6 +30,7 @@
 #include <linux/types.h>
 #include <linux/ioport.h>
 #include <linux/utsname.h>
+#include <linux/capability.h>
 #include <linux/delay.h>
 #include <linux/netdevice.h>
 #include <linux/inetdevice.h>
diff --git a/drivers/parisc/pdc_stable.c b/drivers/parisc/pdc_stable.c
index 38bdca2fac6b..42a3c54e8e6c 100644
--- a/drivers/parisc/pdc_stable.c
+++ b/drivers/parisc/pdc_stable.c
@@ -42,9 +42,9 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/sched.h>		/* for capable() */
 #include <linux/kernel.h>
 #include <linux/string.h>
+#include <linux/capability.h>
 #include <linux/ctype.h>
 #include <linux/sysfs.h>
 #include <linux/kobject.h>
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
index 4af491024727..e7f3da61c6c3 100644
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ b/fs/xfs/linux-2.6/xfs_cred.h
@@ -18,6 +18,8 @@
 #ifndef __XFS_CRED_H__
 #define __XFS_CRED_H__
 
+#include <linux/capability.h>
+
 /*
  * Credentials
  */
@@ -27,7 +29,7 @@ typedef struct cred {
 
 extern struct cred *sys_cred;
 
-/* this is a hack.. (assums sys_cred is the only cred_t in the system) */
+/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */
 static __inline int capable_cred(cred_t *cr, int cid)
 {
 	return (cr == sys_cred) ? 1 : capable(cid);
diff --git a/include/linux/capability.h b/include/linux/capability.h
index 6b4618902d3d..5a23ce752629 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -43,6 +43,7 @@ typedef struct __user_cap_data_struct {
 #ifdef __KERNEL__
 
 #include <linux/spinlock.h>
+#include <asm/current.h>
 
 /* #define STRICT_CAP_T_TYPECHECKS */
 
@@ -356,6 +357,8 @@ static inline kernel_cap_t cap_invert(kernel_cap_t c)
 
 #define cap_is_fs_cap(c)     (CAP_TO_MASK(c) & CAP_FS_MASK)
 
+extern int capable(int cap);
+
 #endif /* __KERNEL__ */
 
 #endif /* !_LINUX_CAPABILITY_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e53d2c6fd5f4..c643016499a1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3,6 +3,7 @@
 
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/capability.h>
 
 #ifdef __KERNEL__
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2ae8711bfba1..3b74c4bf2934 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1112,10 +1112,6 @@ static inline int sas_ss_flags(unsigned long sp)
 		: on_sig_stack(sp) ? SS_ONSTACK : 0);
 }
 
-
-/* code is in security.c or kernel/sys.c if !SECURITY */
-extern int capable(int cap);
-
 /*
  * Routines for handling mm_structs
  */
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index a8aa6152eea6..4e776f9c80e7 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -11,6 +11,7 @@
  * This file is released under the GPL.
  */
 
+#include <linux/capability.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
 #include <linux/file.h>
diff --git a/ipc/msg.c b/ipc/msg.c
index d035bd2aba96..a91b64763b86 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -15,6 +15,7 @@
  * (c) 1999 Manfred Spraul <manfreds@colorfullife.com>
  */
 
+#include <linux/capability.h>
 #include <linux/config.h>
 #include <linux/slab.h>
 #include <linux/msg.h>
diff --git a/ipc/sem.c b/ipc/sem.c
index cb5bb2a5df96..46bb8a678dec 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -73,6 +73,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/audit.h>
+#include <linux/capability.h>
 #include <linux/seq_file.h>
 #include <asm/uaccess.h>
 #include "util.h"
diff --git a/ipc/shm.c b/ipc/shm.c
index 0b92e874fc06..4c28d2d8e305 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -27,6 +27,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/audit.h>
+#include <linux/capability.h>
 #include <linux/ptrace.h>
 #include <linux/seq_file.h>
 
diff --git a/ipc/util.c b/ipc/util.c
index 23f1cec150c1..38b9a0af3bd8 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -20,6 +20,7 @@
 #include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
+#include <linux/capability.h>
 #include <linux/highuid.h>
 #include <linux/security.h>
 #include <linux/rcupdate.h>
diff --git a/kernel/acct.c b/kernel/acct.c
index 38d57fa6b78f..065d8b4e51ef 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -47,6 +47,7 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/acct.h>
+#include <linux/capability.h>
 #include <linux/file.h>
 #include <linux/tty.h>
 #include <linux/security.h>
diff --git a/kernel/capability.c b/kernel/capability.c
index 8986a37a67ea..bfa3c92e16f2 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,6 +7,7 @@
  * 30 May 2002:	Cleanup, Robert M. Love <rml@tech9.net>
  */ 
 
+#include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/security.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index 802722814925..f8e609ff1893 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -10,6 +10,7 @@
 #include <linux/interrupt.h>
 #include <linux/smp_lock.h>
 #include <linux/module.h>
+#include <linux/capability.h>
 #include <linux/completion.h>
 #include <linux/personality.h>
 #include <linux/tty.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 3bdcab49998d..16a776ec2c0b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -28,6 +28,7 @@
 #include <linux/binfmts.h>
 #include <linux/mman.h>
 #include <linux/fs.h>
+#include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/security.h>
diff --git a/kernel/kexec.c b/kernel/kexec.c
index de1441656efd..bf39d28e4c0e 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -6,6 +6,7 @@
  * Version 2.  See the file COPYING for more details.
  */
 
+#include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/file.h>
 #include <linux/slab.h>
diff --git a/kernel/module.c b/kernel/module.c
index e4276046a1b6..618ed6e23ecc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -28,6 +28,7 @@
 #include <linux/syscalls.h>
 #include <linux/fcntl.h>
 #include <linux/rcupdate.h>
+#include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/moduleparam.h>
 #include <linux/errno.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index cceaf09ac413..5f33cdb6fff5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -7,6 +7,7 @@
  * to continually duplicate across every architecture.
  */
 
+#include <linux/capability.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/errno.h>
diff --git a/kernel/sched.c b/kernel/sched.c
index 34a945bcc022..d129e560cc0d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -27,6 +27,7 @@
 #include <linux/smp_lock.h>
 #include <asm/mmu_context.h>
 #include <linux/interrupt.h>
+#include <linux/capability.h>
 #include <linux/completion.h>
 #include <linux/kernel_stat.h>
 #include <linux/security.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index 08aa5b263f36..1da2e74beb97 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -25,6 +25,7 @@
 #include <linux/posix-timers.h>
 #include <linux/signal.h>
 #include <linux/audit.h>
+#include <linux/capability.h>
 #include <asm/param.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index 9ccf713491f9..d09cac23fdfd 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -19,6 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/kexec.h>
 #include <linux/workqueue.h>
+#include <linux/capability.h>
 #include <linux/device.h>
 #include <linux/key.h>
 #include <linux/times.h>
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 03b0598f2369..62d4d9566876 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -25,6 +25,7 @@
 #include <linux/slab.h>
 #include <linux/sysctl.h>
 #include <linux/proc_fs.h>
+#include <linux/capability.h>
 #include <linux/ctype.h>
 #include <linux/utsname.h>
 #include <linux/capability.h>
diff --git a/kernel/time.c b/kernel/time.c
index 169e8329e0b6..7477b1d2079e 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -29,6 +29,7 @@
 
 #include <linux/module.h>
 #include <linux/timex.h>
+#include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/smp_lock.h>
 #include <linux/syscalls.h>
diff --git a/kernel/uid16.c b/kernel/uid16.c
index f669941e8b26..aa25605027c8 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -10,6 +10,7 @@
 #include <linux/notifier.h>
 #include <linux/reboot.h>
 #include <linux/prctl.h>
+#include <linux/capability.h>
 #include <linux/init.h>
 #include <linux/highuid.h>
 #include <linux/security.h>
diff --git a/mm/filemap.c b/mm/filemap.c
index 96de772be487..a965b6b35f26 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -15,6 +15,7 @@
 #include <linux/compiler.h>
 #include <linux/fs.h>
 #include <linux/aio.h>
+#include <linux/capability.h>
 #include <linux/kernel_stat.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
diff --git a/mm/mlock.c b/mm/mlock.c
index 4ae3a46ff768..b90c59573abf 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -5,6 +5,7 @@
  *  (C) Copyright 2002 Christoph Hellwig
  */
 
+#include <linux/capability.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
 #include <linux/mempolicy.h>
diff --git a/mm/mmap.c b/mm/mmap.c
index 64ba4dbcb7de..47556d2b3e90 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -13,6 +13,7 @@
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
+#include <linux/capability.h>
 #include <linux/init.h>
 #include <linux/file.h>
 #include <linux/fs.h>
diff --git a/mm/mremap.c b/mm/mremap.c
index ddaeee9a0b69..1903bdf65e42 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -13,6 +13,7 @@
 #include <linux/shm.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
+#include <linux/capability.h>
 #include <linux/fs.h>
 #include <linux/highmem.h>
 #include <linux/security.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d8a5afc8b2a3..957fef43fa60 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -25,6 +25,7 @@
 #include <linux/rmap.h>
 #include <linux/security.h>
 #include <linux/backing-dev.h>
+#include <linux/capability.h>
 #include <linux/syscalls.h>
 
 #include <asm/pgtable.h>
diff --git a/security/commoncap.c b/security/commoncap.c
index 04c12f58d656..8a6e097f99ea 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -7,6 +7,7 @@
  *
  */
 
+#include <linux/capability.h>
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
diff --git a/security/dummy.c b/security/dummy.c
index a15c54709fde..f1a5bd98bf10 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -14,6 +14,7 @@
 
 #undef DEBUG
 
+#include <linux/capability.h>
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 3d2ebae029c1..90db5c76cf6e 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -16,6 +16,7 @@
 #include <linux/syscalls.h>
 #include <linux/keyctl.h>
 #include <linux/fs.h>
+#include <linux/capability.h>
 #include <linux/err.h>
 #include <asm/uaccess.h>
 #include "internal.h"
diff --git a/security/security.c b/security/security.c
index ed5fb80769c3..f693e1f66b98 100644
--- a/security/security.c
+++ b/security/security.c
@@ -11,6 +11,7 @@
  *	(at your option) any later version.
  */
 
+#include <linux/capability.h>
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
diff --git a/sound/pci/emu10k1/emufx.c b/sound/pci/emu10k1/emufx.c
index 1a903390ad6d..509837252735 100644
--- a/sound/pci/emu10k1/emufx.c
+++ b/sound/pci/emu10k1/emufx.c
@@ -27,6 +27,7 @@
 
 #include <sound/driver.h>
 #include <linux/pci.h>
+#include <linux/capability.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-- 
cgit v1.2.3-71-gd317


From 1f6818b90dbb887261c616a318733703ed526f0a Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 11 Jan 2006 22:42:26 +0100
Subject: [PATCH] x86_64: Minor GFP_DMA32 comment fix

Pretty obvious

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7e4ae6ab1977..34cbefd2ebde 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -98,7 +98,7 @@ struct per_cpu_pageset {
 
 /*
  * On machines where it is needed (eg PCs) we divide physical memory
- * into multiple physical zones. On a PC we have 4 zones:
+ * into multiple physical zones. On a 32bit PC we have 4 zones:
  *
  * ZONE_DMA	  < 16 MB	ISA DMA capable memory
  * ZONE_DMA32	     0 MB 	Empty
-- 
cgit v1.2.3-71-gd317


From e99286744599a66195de4cd975d7ef4d643c2789 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 11 Jan 2006 22:43:33 +0100
Subject: [PATCH] x86_64: Generalize DMI and enable for x86-64

Some people need it now on 64bit so reuse the i386 code for
x86-64. This will be also useful for future bug workarounds.

It is a bit simplified there because there is no need
to do it very early on x86-64. This means it doesn't need
early ioremap et.al. We run it as a core initcall right now.

I hope it's not needed for early setup.

I added a general CONFIG_DMI symbol in case IA64 or someone
else wants to reuse the code later too.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig           |  4 ++++
 arch/i386/kernel/dmi_scan.c | 16 ++++++++--------
 arch/i386/kernel/setup.c    |  2 +-
 arch/x86_64/Kconfig         |  4 ++++
 arch/x86_64/kernel/Makefile |  5 ++++-
 arch/x86_64/kernel/setup.c  |  9 +++++++++
 include/asm-i386/io.h       |  5 +++++
 include/asm-x86_64/io.h     |  5 +++++
 include/linux/dmi.h         |  5 ++++-
 9 files changed, 44 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 815878ebd30f..81ae9627701d 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -41,6 +41,10 @@ config ARCH_MAY_HAVE_PC_FDC
 	bool
 	default y
 
+config DMI
+	bool
+	default y
+
 source "init/Kconfig"
 
 menu "Processor type and features"
diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c
index 58516e2ac172..6a93d75db431 100644
--- a/arch/i386/kernel/dmi_scan.c
+++ b/arch/i386/kernel/dmi_scan.c
@@ -4,7 +4,7 @@
 #include <linux/module.h>
 #include <linux/dmi.h>
 #include <linux/bootmem.h>
-
+#include <linux/slab.h>
 
 static char * __init dmi_string(struct dmi_header *dm, u8 s)
 {
@@ -19,7 +19,7 @@ static char * __init dmi_string(struct dmi_header *dm, u8 s)
 		}
 
 		if (*bp != 0) {
-			str = alloc_bootmem(strlen(bp) + 1);
+			str = dmi_alloc(strlen(bp) + 1);
 			if (str != NULL)
 				strcpy(str, bp);
 			else
@@ -40,7 +40,7 @@ static int __init dmi_table(u32 base, int len, int num,
 	u8 *buf, *data;
 	int i = 0;
 		
-	buf = bt_ioremap(base, len);
+	buf = dmi_ioremap(base, len);
 	if (buf == NULL)
 		return -1;
 
@@ -65,7 +65,7 @@ static int __init dmi_table(u32 base, int len, int num,
 		data += 2;
 		i++;
 	}
-	bt_iounmap(buf, len);
+	dmi_iounmap(buf, len);
 	return 0;
 }
 
@@ -112,7 +112,7 @@ static void __init dmi_save_devices(struct dmi_header *dm)
 		if ((*d & 0x80) == 0)
 			continue;
 
-		dev = alloc_bootmem(sizeof(*dev));
+		dev = dmi_alloc(sizeof(*dev));
 		if (!dev) {
 			printk(KERN_ERR "dmi_save_devices: out of memory.\n");
 			break;
@@ -131,7 +131,7 @@ static void __init dmi_save_ipmi_device(struct dmi_header *dm)
 	struct dmi_device *dev;
 	void * data;
 
-	data = alloc_bootmem(dm->length);
+	data = dmi_alloc(dm->length);
 	if (data == NULL) {
 		printk(KERN_ERR "dmi_save_ipmi_device: out of memory.\n");
 		return;
@@ -139,7 +139,7 @@ static void __init dmi_save_ipmi_device(struct dmi_header *dm)
 
 	memcpy(data, dm, dm->length);
 
-	dev = alloc_bootmem(sizeof(*dev));
+	dev = dmi_alloc(sizeof(*dev));
 	if (!dev) {
 		printk(KERN_ERR "dmi_save_ipmi_device: out of memory.\n");
 		return;
@@ -221,7 +221,7 @@ void __init dmi_scan_machine(void)
 		}
 	}
 
-out:	printk(KERN_INFO "DMI not present.\n");
+out:	printk(KERN_INFO "DMI not present or invalid.\n");
 }
 
 
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index f685637a100d..a720f743ea6e 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -45,6 +45,7 @@
 #include <linux/nodemask.h>
 #include <linux/kexec.h>
 #include <linux/crash_dump.h>
+#include <linux/dmi.h>
 
 #include <video/edid.h>
 
@@ -146,7 +147,6 @@ EXPORT_SYMBOL(ist_info);
 struct e820map e820;
 
 extern void early_cpu_init(void);
-extern void dmi_scan_machine(void);
 extern void generic_apic_probe(char *);
 extern int root_mountflags;
 
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 348b4a0d0d6f..150e1ac0bfed 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -69,6 +69,10 @@ config ARCH_MAY_HAVE_PC_FDC
 	bool
 	default y
 
+config DMI
+	bool
+	default y
+
 source "init/Kconfig"
 
 
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 12bc54005e2f..4601fdafb7cb 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -7,7 +7,8 @@ EXTRA_AFLAGS	:= -traditional
 obj-y	:= process.o signal.o entry.o traps.o irq.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
 		x8664_ksyms.o i387.o syscall.o vsyscall.o \
-		setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o
+		setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
+		dmi_scan.o
 
 obj-$(CONFIG_X86_MCE)         += mce.o
 obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel.o
@@ -47,3 +48,5 @@ intel_cacheinfo-y		+= ../../i386/kernel/cpu/intel_cacheinfo.o
 quirks-y			+= ../../i386/kernel/quirks.o
 i8237-y				+= ../../i386/kernel/i8237.o
 msr-$(subst m,y,$(CONFIG_X86_MSR))  += ../../i386/kernel/msr.o
+dmi_scan-y			+= ../../i386/kernel/dmi_scan.o
+
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 33e77ec8da22..6810690d6045 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -44,6 +44,7 @@
 #include <linux/mmzone.h>
 #include <linux/kexec.h>
 #include <linux/cpufreq.h>
+#include <linux/dmi.h>
 
 #include <asm/mtrr.h>
 #include <asm/uaccess.h>
@@ -1392,3 +1393,11 @@ struct seq_operations cpuinfo_op = {
 	.stop =	c_stop,
 	.show =	show_cpuinfo,
 };
+
+static int __init run_dmi_scan(void)
+{
+	dmi_scan_machine();
+	return 0;
+}
+core_initcall(run_dmi_scan);
+
diff --git a/include/asm-i386/io.h b/include/asm-i386/io.h
index 7babb97a02eb..03233c2ab820 100644
--- a/include/asm-i386/io.h
+++ b/include/asm-i386/io.h
@@ -131,6 +131,11 @@ extern void iounmap(volatile void __iomem *addr);
 extern void *bt_ioremap(unsigned long offset, unsigned long size);
 extern void bt_iounmap(void *addr, unsigned long size);
 
+/* Use early IO mappings for DMI because it's initialized early */
+#define dmi_ioremap bt_ioremap
+#define dmi_iounmap bt_iounmap
+#define dmi_alloc alloc_bootmem
+
 /*
  * ISA I/O bus memory addresses are 1:1 with the physical address.
  */
diff --git a/include/asm-x86_64/io.h b/include/asm-x86_64/io.h
index 52ff269fe054..9dac18db8291 100644
--- a/include/asm-x86_64/io.h
+++ b/include/asm-x86_64/io.h
@@ -143,6 +143,11 @@ static inline void __iomem * ioremap (unsigned long offset, unsigned long size)
 extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
 extern void iounmap(volatile void __iomem *addr);
 
+/* Use normal IO mappings for DMI */
+#define dmi_ioremap ioremap
+#define dmi_iounmap(x,l) iounmap(x)
+#define dmi_alloc(l) kmalloc(l, GFP_ATOMIC)
+
 /*
  * ISA I/O bus memory addresses are 1:1 with the physical address.
  */
diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index 05f4132622fc..2e6bbe014157 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -2,6 +2,7 @@
 #define __DMI_H__
 
 #include <linux/list.h>
+#include <linux/config.h>
 
 enum dmi_field {
 	DMI_NONE,
@@ -60,12 +61,14 @@ struct dmi_device {
 	void *device_data;	/* Type specific data */
 };
 
-#if defined(CONFIG_X86_32)
+#ifdef CONFIG_DMI
 
 extern int dmi_check_system(struct dmi_system_id *list);
 extern char * dmi_get_system_info(int field);
 extern struct dmi_device * dmi_find_device(int type, const char *name,
 	struct dmi_device *from);
+extern void dmi_scan_machine(void);
+
 #else
 
 static inline int dmi_check_system(struct dmi_system_id *list) { return 0; }
-- 
cgit v1.2.3-71-gd317


From 819a692804a8d2d42b7bb033d2650dba47622149 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 11 Jan 2006 22:43:45 +0100
Subject: [PATCH] x86_64: Handle unknown node (-1) in alloc_pages_node

Following kmalloc_node.

Needed for another patch to return -1 for unknown nodes in x86-64.

Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: kiran@scalex86.org
Signed-off-by: Andi Kleen <ak@suse.de>
[ Changed 0 to numa_node_id() on suggestion by Christoph Lameter ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/gfp.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index da7ce8730e97..20f9148e38d9 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -110,6 +110,10 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 	if (unlikely(order >= MAX_ORDER))
 		return NULL;
 
+	/* Unknown node is current node */
+	if (nid < 0)
+		nid = numa_node_id();
+
 	return __alloc_pages(gfp_mask, order,
 		NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask));
 }
-- 
cgit v1.2.3-71-gd317


From 79f12614a6537cc3ac9ca4d1ea26f6e4f4a34aee Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Wed, 11 Jan 2006 22:46:18 +0100
Subject: [PATCH] x86_64: Inclusion of ScaleMP vSMP architecture patches -
 vsmp_arch

Introduce vSMP arch to the kernel.

This patch:
1. Adds CONFIG_X86_VSMP
2. Adds machine specific macros for local_irq_disabled, local_irq_enabled
   and irqs_disabled
3. Writes to the vSMP CTL device to indicate kernel compiled with CONFIG_VSMP

Signed-off-by: Ravikiran Thirumalai <kiran@scalemp.com>
Signed-off-by: Shai Fultheim <shai@scalemp.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/Kconfig         | 18 ++++++++++++++++++
 arch/x86_64/kernel/Makefile |  1 +
 arch/x86_64/kernel/vsmp.c   | 45 +++++++++++++++++++++++++++++++++++++++++++++
 include/asm-x86_64/system.h | 18 ++++++++++++++++++
 include/linux/pci_ids.h     |  3 +++
 5 files changed, 85 insertions(+)
 create mode 100644 arch/x86_64/kernel/vsmp.c

(limited to 'include/linux')

diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index ee053e32a721..2efc4be22709 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -78,6 +78,24 @@ source "init/Kconfig"
 
 menu "Processor type and features"
 
+choice
+	prompt "Subarchitecture Type"
+	default X86_PC
+
+config X86_PC
+	bool "PC-compatible"
+	help
+	  Choose this option if your computer is a standard PC or compatible.
+
+config X86_VSMP
+	bool "Support for ScaleMP vSMP"
+	 help
+	  Support for ScaleMP vSMP systems.  Say 'Y' here if this kernel is
+	  supposed to run on these EM64T-based machines.  Only choose this option
+	  if you have one of these machines.
+
+endchoice
+
 choice
 	prompt "Processor family"
 	default MK8
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 7df09edb36a1..72fe60c20d39 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_GART_IOMMU)	+= pci-gart.o aperture.o
 obj-$(CONFIG_SWIOTLB)		+= pci-swiotlb.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer.o
+obj-$(CONFIG_X86_VSMP)		+= vsmp.o
 
 obj-$(CONFIG_MODULES)		+= module.o
 
diff --git a/arch/x86_64/kernel/vsmp.c b/arch/x86_64/kernel/vsmp.c
new file mode 100644
index 000000000000..92f70c74965f
--- /dev/null
+++ b/arch/x86_64/kernel/vsmp.c
@@ -0,0 +1,45 @@
+/*
+ * vSMPowered(tm) systems specific initialization
+ * Copyright (C) 2005 ScaleMP Inc.
+ *
+ * Use of this code is subject to the terms and conditions of the
+ * GNU general public license version 2. See "COPYING" or
+ * http://www.gnu.org/licenses/gpl.html
+ *
+ * Ravikiran Thirumalai <kiran@scalemp.com>,
+ * Shai Fultheim <shai@scalemp.com>
+ */
+
+#include <linux/init.h>
+#include <linux/pci_ids.h>
+#include <linux/pci_regs.h>
+#include <asm/pci-direct.h>
+
+static int __init vsmp_init(void)
+{
+	void *address;
+	unsigned int cap, ctl;
+
+	/* Check if we are running on a ScaleMP vSMP box */
+	if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) ||
+	    (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
+		return 0;
+
+	/* set vSMP magic bits to indicate vSMP capable kernel */
+	address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8);
+	cap = readl(address);
+	ctl = readl(address + 4);
+	printk("vSMP CTL: capabilities:0x%08x  control:0x%08x\n", cap, ctl);
+	if (cap & ctl & (1 << 4)) {
+		/* Turn on vSMP IRQ fastpath handling (see system.h) */
+		ctl &= ~(1 << 4);
+		writel(ctl, address + 4);
+		ctl = readl(address + 4);
+		printk("vSMP CTL: control set to:0x%08x\n", ctl);
+	}
+
+	iounmap(address);
+	return 0;
+}
+
+core_initcall(vsmp_init);
diff --git a/include/asm-x86_64/system.h b/include/asm-x86_64/system.h
index 80272190570e..38c1e8a69c9c 100644
--- a/include/asm-x86_64/system.h
+++ b/include/asm-x86_64/system.h
@@ -326,8 +326,25 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
 /* interrupt control.. */
 #define local_save_flags(x)	do { warn_if_not_ulong(x); __asm__ __volatile__("# save_flags \n\t pushfq ; popq %q0":"=g" (x): /* no input */ :"memory"); } while (0)
 #define local_irq_restore(x) 	__asm__ __volatile__("# restore_flags \n\t pushq %0 ; popfq": /* no output */ :"g" (x):"memory", "cc")
+
+#ifdef CONFIG_X86_VSMP
+/* Interrupt control for VSMP  architecture */
+#define local_irq_disable()	do { unsigned long flags; local_save_flags(flags); local_irq_restore((flags & ~(1 << 9)) | (1 << 18)); } while (0)
+#define local_irq_enable()	do { unsigned long flags; local_save_flags(flags); local_irq_restore((flags | (1 << 9)) & ~(1 << 18)); } while (0)
+
+#define irqs_disabled()					\
+({							\
+	unsigned long flags;				\
+	local_save_flags(flags);			\
+	(flags & (1<<18)) || !(flags & (1<<9));		\
+})
+
+/* For spinlocks etc */
+#define local_irq_save(x)	do { local_save_flags(x); local_irq_restore((x & ~(1 << 9)) | (1 << 18)); } while (0)
+#else  /* CONFIG_X86_VSMP */
 #define local_irq_disable() 	__asm__ __volatile__("cli": : :"memory")
 #define local_irq_enable()	__asm__ __volatile__("sti": : :"memory")
+
 /* used in the idle loop; sti takes one instruction cycle to complete */
 #define safe_halt()		__asm__ __volatile__("sti; hlt": : :"memory")
 /* used when interrupts are already enabled or to shutdown the processor */
@@ -342,6 +359,7 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
 
 /* For spinlocks etc */
 #define local_irq_save(x) 	do { warn_if_not_ulong(x); __asm__ __volatile__("# local_irq_save \n\t pushfq ; popq %0 ; cli":"=g" (x): /* no input */ :"memory"); } while (0)
+#endif
 
 void cpu_idle_wait(void);
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index f55c98a68aa9..7fb397e3f2d3 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2152,6 +2152,9 @@
 #define PCI_DEVICE_ID_INTEL_IXP2800	0x9004
 #define PCI_DEVICE_ID_INTEL_S21152BB	0xb152
 
+#define PCI_VENDOR_ID_SCALEMP		0x8686
+#define PCI_DEVICE_ID_SCALEMP_VSMP_CTL	0x1010
+
 #define PCI_VENDOR_ID_COMPUTONE		0x8e0e
 #define PCI_DEVICE_ID_COMPUTONE_IP2EX	0x0291
 #define PCI_DEVICE_ID_COMPUTONE_PG	0x0302
-- 
cgit v1.2.3-71-gd317


From 288867ec5c377db82933b64460ce050e5c998ee9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 12 Jan 2006 11:25:54 +0100
Subject: [hrtimer] Remove listhead from hrtimer struct

The list_head in the hrtimer structure was introduced for easy access
to the first timer with the further extensions of real high resolution
timers in mind, but it turned out in the course of development that
it is not necessary for the standard use case. Remove the list head
and access the first expiry timer by a datafield in the timer base.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h |  7 ++-----
 kernel/hrtimer.c        | 26 ++++++++++++++------------
 2 files changed, 16 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index cf5cfdf8d613..abb674c9b764 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -49,8 +49,6 @@ struct hrtimer_base;
  * struct hrtimer - the basic hrtimer structure
  *
  * @node:	red black tree node for time ordered insertion
- * @list:	list head for easier access to the time ordered list,
- *		without walking the red black tree.
  * @expires:	the absolute expiry time in the hrtimers internal
  *		representation. The time is related to the clock on
  *		which the timer is based.
@@ -63,7 +61,6 @@ struct hrtimer_base;
  */
 struct hrtimer {
 	struct rb_node		node;
-	struct list_head	list;
 	ktime_t			expires;
 	enum hrtimer_state	state;
 	int			(*function)(void *);
@@ -78,7 +75,7 @@ struct hrtimer {
  *		to a base on another cpu.
  * @lock:	lock protecting the base and associated timers
  * @active:	red black tree root node for the active timers
- * @pending:	list of pending timers for simple time ordered access
+ * @first:	pointer to the timer node which expires first
  * @resolution:	the resolution of the clock, in nanoseconds
  * @get_time:	function to retrieve the current time of the clock
  * @curr_timer:	the timer which is executing a callback right now
@@ -87,7 +84,7 @@ struct hrtimer_base {
 	clockid_t		index;
 	spinlock_t		lock;
 	struct rb_root		active;
-	struct list_head	pending;
+	struct rb_node		*first;
 	unsigned long		resolution;
 	ktime_t			(*get_time)(void);
 	struct hrtimer		*curr_timer;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f073a2461faa..e6e8278bcb18 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -314,7 +314,6 @@ hrtimer_forward(struct hrtimer *timer, const ktime_t interval)
 static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 {
 	struct rb_node **link = &base->active.rb_node;
-	struct list_head *prev = &base->pending;
 	struct rb_node *parent = NULL;
 	struct hrtimer *entry;
 
@@ -330,22 +329,23 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 		 */
 		if (timer->expires.tv64 < entry->expires.tv64)
 			link = &(*link)->rb_left;
-		else {
+		else
 			link = &(*link)->rb_right;
-			prev = &entry->list;
-		}
 	}
 
 	/*
-	 * Insert the timer to the rbtree and to the sorted list:
+	 * Insert the timer to the rbtree and check whether it
+	 * replaces the first pending timer
 	 */
 	rb_link_node(&timer->node, parent, link);
 	rb_insert_color(&timer->node, &base->active);
-	list_add(&timer->list, prev);
 
 	timer->state = HRTIMER_PENDING;
-}
 
+	if (!base->first || timer->expires.tv64 <
+	    rb_entry(base->first, struct hrtimer, node)->expires.tv64)
+		base->first = &timer->node;
+}
 
 /*
  * __remove_hrtimer - internal function to remove a timer
@@ -355,9 +355,11 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 {
 	/*
-	 * Remove the timer from the sorted list and from the rbtree:
+	 * Remove the timer from the rbtree and replace the
+	 * first entry pointer if necessary.
 	 */
-	list_del(&timer->list);
+	if (base->first == &timer->node)
+		base->first = rb_next(&timer->node);
 	rb_erase(&timer->node, &base->active);
 }
 
@@ -529,16 +531,17 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
 static inline void run_hrtimer_queue(struct hrtimer_base *base)
 {
 	ktime_t now = base->get_time();
+	struct rb_node *node;
 
 	spin_lock_irq(&base->lock);
 
-	while (!list_empty(&base->pending)) {
+	while ((node = base->first)) {
 		struct hrtimer *timer;
 		int (*fn)(void *);
 		int restart;
 		void *data;
 
-		timer = list_entry(base->pending.next, struct hrtimer, list);
+		timer = rb_entry(node, struct hrtimer, node);
 		if (now.tv64 <= timer->expires.tv64)
 			break;
 
@@ -732,7 +735,6 @@ static void __devinit init_hrtimers_cpu(int cpu)
 
 	for (i = 0; i < MAX_HRTIMER_BASES; i++) {
 		spin_lock_init(&base->lock);
-		INIT_LIST_HEAD(&base->pending);
 		base++;
 	}
 }
-- 
cgit v1.2.3-71-gd317


From e2787630c1abb075c935cf47e91beb7c656f48c4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 12 Jan 2006 11:36:14 +0100
Subject: [hrtimer] Change resolution storage to ktime_t format

Change the storage format of the per base resolution to ktime_t to
make it easier accessible in the hrtimers code.

Change the resolution from (NSEC_PER_SEC/HZ) to TICK_NSEC as Roman
pointed out. TICK_NSEC is closer to the real resolution.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 2 +-
 include/linux/ktime.h   | 4 ++--
 kernel/hrtimer.c        | 3 +--
 3 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index abb674c9b764..98c5c1537b5d 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -85,7 +85,7 @@ struct hrtimer_base {
 	spinlock_t		lock;
 	struct rb_root		active;
 	struct rb_node		*first;
-	unsigned long		resolution;
+	ktime_t			resolution;
 	ktime_t			(*get_time)(void);
 	struct hrtimer		*curr_timer;
 };
diff --git a/include/linux/ktime.h b/include/linux/ktime.h
index 222a047cc145..1bd6552cc341 100644
--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@@ -272,8 +272,8 @@ static inline u64 ktime_to_ns(const ktime_t kt)
  * idea of the (in)accuracy of timers. Timer values are rounded up to
  * this resolution values.
  */
-#define KTIME_REALTIME_RES	(NSEC_PER_SEC/HZ)
-#define KTIME_MONOTONIC_RES	(NSEC_PER_SEC/HZ)
+#define KTIME_REALTIME_RES	(ktime_t){ .tv64 = TICK_NSEC }
+#define KTIME_MONOTONIC_RES	(ktime_t){ .tv64 = TICK_NSEC }
 
 /* Get the monotonic time in timespec format: */
 extern void ktime_get_ts(struct timespec *ts);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index e6e8278bcb18..76d759ce6231 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -518,9 +518,8 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
 {
 	struct hrtimer_base *bases;
 
-	tp->tv_sec = 0;
 	bases = per_cpu(hrtimer_bases, raw_smp_processor_id());
-	tp->tv_nsec = bases[which_clock].resolution;
+	*tp = ktime_to_timespec(bases[which_clock].resolution);
 
 	return 0;
 }
-- 
cgit v1.2.3-71-gd317


From c9db4fa11526affde83603fe52595bd1260c1354 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 12 Jan 2006 11:47:34 +0100
Subject: [hrtimer] Enforce resolution as lower limit of intervals

Roman Zippel pointed out that the missing lower limit of intervals
leads to an accounting error in the overrun count. Enforce the lower
limit of intervals to resolution in the timer forwarding code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 3 +--
 kernel/hrtimer.c        | 5 ++++-
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 98c5c1537b5d..089bfb1fa01a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -122,8 +122,7 @@ static inline int hrtimer_active(const struct hrtimer *timer)
 }
 
 /* Forward a hrtimer so it expires after now: */
-extern unsigned long hrtimer_forward(struct hrtimer *timer,
-				     const ktime_t interval);
+extern unsigned long hrtimer_forward(struct hrtimer *timer, ktime_t interval);
 
 /* Precise sleep: */
 extern long hrtimer_nanosleep(struct timespec *rqtp,
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 76d759ce6231..04ccab099e84 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -275,7 +275,7 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
  * The number of overruns is added to the overrun field.
  */
 unsigned long
-hrtimer_forward(struct hrtimer *timer, const ktime_t interval)
+hrtimer_forward(struct hrtimer *timer, ktime_t interval)
 {
 	unsigned long orun = 1;
 	ktime_t delta, now;
@@ -287,6 +287,9 @@ hrtimer_forward(struct hrtimer *timer, const ktime_t interval)
 	if (delta.tv64 < 0)
 		return 0;
 
+	if (interval.tv64 < timer->base->resolution.tv64)
+		interval.tv64 = timer->base->resolution.tv64;
+
 	if (unlikely(delta.tv64 >= interval.tv64)) {
 		nsec_t incr = ktime_to_ns(interval);
 
-- 
cgit v1.2.3-71-gd317


From 198e2f181163233b379dc7ce8a6d7516b84042e7 Mon Sep 17 00:00:00 2001
From: "akpm@osdl.org" <akpm@osdl.org>
Date: Thu, 12 Jan 2006 01:05:30 -0800
Subject: [PATCH] scheduler cache-hot-autodetect

)

From: Ingo Molnar <mingo@elte.hu>

This is the latest version of the scheduler cache-hot-auto-tune patch.

The first problem was that detection time scaled with O(N^2), which is
unacceptable on larger SMP and NUMA systems. To solve this:

- I've added a 'domain distance' function, which is used to cache
  measurement results. Each distance is only measured once. This means
  that e.g. on NUMA distances of 0, 1 and 2 might be measured, on HT
  distances 0 and 1, and on SMP distance 0 is measured. The code walks
  the domain tree to determine the distance, so it automatically follows
  whatever hierarchy an architecture sets up. This cuts down on the boot
  time significantly and removes the O(N^2) limit. The only assumption
  is that migration costs can be expressed as a function of domain
  distance - this covers the overwhelming majority of existing systems,
  and is a good guess even for more assymetric systems.

  [ People hacking systems that have assymetries that break this
    assumption (e.g. different CPU speeds) should experiment a bit with
    the cpu_distance() function. Adding a ->migration_distance factor to
    the domain structure would be one possible solution - but lets first
    see the problem systems, if they exist at all. Lets not overdesign. ]

Another problem was that only a single cache-size was used for measuring
the cost of migration, and most architectures didnt set that variable
up. Furthermore, a single cache-size does not fit NUMA hierarchies with
L3 caches and does not fit HT setups, where different CPUs will often
have different 'effective cache sizes'. To solve this problem:

- Instead of relying on a single cache-size provided by the platform and
  sticking to it, the code now auto-detects the 'effective migration
  cost' between two measured CPUs, via iterating through a wide range of
  cachesizes. The code searches for the maximum migration cost, which
  occurs when the working set of the test-workload falls just below the
  'effective cache size'. I.e. real-life optimized search is done for
  the maximum migration cost, between two real CPUs.

  This, amongst other things, has the positive effect hat if e.g. two
  CPUs share a L2/L3 cache, a different (and accurate) migration cost
  will be found than between two CPUs on the same system that dont share
  any caches.

(The reliable measurement of migration costs is tricky - see the source
for details.)

Furthermore i've added various boot-time options to override/tune
migration behavior.

Firstly, there's a blanket override for autodetection:

	migration_cost=1000,2000,3000

will override the depth 0/1/2 values with 1msec/2msec/3msec values.

Secondly, there's a global factor that can be used to increase (or
decrease) the autodetected values:

	migration_factor=120

will increase the autodetected values by 20%. This option is useful to
tune things in a workload-dependent way - e.g. if a workload is
cache-insensitive then CPU utilization can be maximized by specifying
migration_factor=0.

I've tested the autodetection code quite extensively on x86, on 3
P3/Xeon/2MB, and the autodetected values look pretty good:

Dual Celeron (128K L2 cache):

 ---------------------
 migration cost matrix (max_cache_size: 131072, cpu: 467 MHz):
 ---------------------
           [00]    [01]
 [00]:     -     1.7(1)
 [01]:   1.7(1)    -
 ---------------------
 cacheflush times [2]: 0.0 (0) 1.7 (1784008)
 ---------------------

Here the slow memory subsystem dominates system performance, and even
though caches are small, the migration cost is 1.7 msecs.

Dual HT P4 (512K L2 cache):

 ---------------------
 migration cost matrix (max_cache_size: 524288, cpu: 2379 MHz):
 ---------------------
           [00]    [01]    [02]    [03]
 [00]:     -     0.4(1)  0.0(0)  0.4(1)
 [01]:   0.4(1)    -     0.4(1)  0.0(0)
 [02]:   0.0(0)  0.4(1)    -     0.4(1)
 [03]:   0.4(1)  0.0(0)  0.4(1)    -
 ---------------------
 cacheflush times [2]: 0.0 (33900) 0.4 (448514)
 ---------------------

Here it can be seen that there is no migration cost between two HT
siblings (CPU#0/2 and CPU#1/3 are separate physical CPUs). A fast memory
system makes inter-physical-CPU migration pretty cheap: 0.4 msecs.

8-way P3/Xeon [2MB L2 cache]:

 ---------------------
 migration cost matrix (max_cache_size: 2097152, cpu: 700 MHz):
 ---------------------
           [00]    [01]    [02]    [03]    [04]    [05]    [06]    [07]
 [00]:     -    19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
 [01]:  19.2(1)    -    19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
 [02]:  19.2(1) 19.2(1)    -    19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
 [03]:  19.2(1) 19.2(1) 19.2(1)    -    19.2(1) 19.2(1) 19.2(1) 19.2(1)
 [04]:  19.2(1) 19.2(1) 19.2(1) 19.2(1)    -    19.2(1) 19.2(1) 19.2(1)
 [05]:  19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)    -    19.2(1) 19.2(1)
 [06]:  19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)    -    19.2(1)
 [07]:  19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)    -
 ---------------------
 cacheflush times [2]: 0.0 (0) 19.2 (19281756)
 ---------------------

This one has huge caches and a relatively slow memory subsystem - so the
migration cost is 19 msecs.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Cc: <wilder@us.ibm.com>
Signed-off-by: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/kernel-parameters.txt   |  43 ++++
 arch/i386/kernel/smpboot.c            |   1 +
 arch/ia64/kernel/setup.c              |   6 +
 arch/ppc/xmon/xmon.c                  |   2 +-
 include/asm-i386/topology.h           |   1 -
 include/asm-ia64/topology.h           |   2 -
 include/asm-mips/mach-ip27/topology.h |   1 -
 include/asm-powerpc/topology.h        |   1 -
 include/asm-x86_64/topology.h         |   1 -
 include/linux/sched.h                 |   9 +-
 include/linux/topology.h              |   2 -
 kernel/sched.c                        | 468 ++++++++++++++++++++++++++++++++++
 12 files changed, 527 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index dd0bfc291a68..fe11fccf7e41 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -856,6 +856,49 @@ running once the system is up.
 
 	mga=		[HW,DRM]
 
+	migration_cost=
+			[KNL,SMP] debug: override scheduler migration costs
+			Format: <level-1-usecs>,<level-2-usecs>,...
+			This debugging option can be used to override the
+			default scheduler migration cost matrix. The numbers
+			are indexed by 'CPU domain distance'.
+			E.g. migration_cost=1000,2000,3000 on an SMT NUMA
+			box will set up an intra-core migration cost of
+			1 msec, an inter-core migration cost of 2 msecs,
+			and an inter-node migration cost of 3 msecs.
+
+			WARNING: using the wrong values here can break
+			scheduler performance, so it's only for scheduler
+			development purposes, not production environments.
+
+	migration_debug=
+			[KNL,SMP] migration cost auto-detect verbosity
+			Format=<0|1|2>
+			If a system's migration matrix reported at bootup
+			seems erroneous then this option can be used to
+			increase verbosity of the detection process.
+			We default to 0 (no extra messages), 1 will print
+			some more information, and 2 will be really
+			verbose (probably only useful if you also have a
+			serial console attached to the system).
+
+	migration_factor=
+			[KNL,SMP] multiply/divide migration costs by a factor
+			Format=<percent>
+			This debug option can be used to proportionally
+			increase or decrease the auto-detected migration
+			costs for all entries of the migration matrix.
+			E.g. migration_factor=150 will increase migration
+			costs by 50%. (and thus the scheduler will be less
+			eager migrating cache-hot tasks)
+			migration_factor=80 will decrease migration costs
+			by 20%. (thus the scheduler will be more eager to
+			migrate tasks)
+
+			WARNING: using the wrong values here can break
+			scheduler performance, so it's only for scheduler
+			development purposes, not production environments.
+
 	mousedev.tap_time=
 			[MOUSE] Maximum time between finger touching and
 			leaving touchpad surface for touch to be considered
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index b3c2e2c26743..a9bf5f222e47 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -1096,6 +1096,7 @@ static void smp_tune_scheduling (void)
 			cachesize = 16; /* Pentiums, 2x8kB cache */
 			bandwidth = 100;
 		}
+		max_cache_size = cachesize * 1024;
 	}
 }
 
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index d91c8ff2c0d7..0daa8fa9ef32 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -696,6 +696,7 @@ static void
 get_max_cacheline_size (void)
 {
 	unsigned long line_size, max = 1;
+	unsigned int cache_size = 0;
 	u64 l, levels, unique_caches;
         pal_cache_config_info_t cci;
         s64 status;
@@ -725,6 +726,8 @@ get_max_cacheline_size (void)
 		line_size = 1 << cci.pcci_line_size;
 		if (line_size > max)
 			max = line_size;
+		if (cache_size < cci.pcci_cache_size)
+			cache_size = cci.pcci_cache_size;
 		if (!cci.pcci_unified) {
 			status = ia64_pal_cache_config_info(l,
 						    /* cache_type (instruction)= */ 1,
@@ -741,6 +744,9 @@ get_max_cacheline_size (void)
 			ia64_i_cache_stride_shift = cci.pcci_stride;
 	}
   out:
+#ifdef CONFIG_SMP
+	max_cache_size = max(max_cache_size, cache_size);
+#endif
 	if (max > ia64_max_cacheline_size)
 		ia64_max_cacheline_size = max;
 }
diff --git a/arch/ppc/xmon/xmon.c b/arch/ppc/xmon/xmon.c
index 2b483b4f1602..9075a7538e26 100644
--- a/arch/ppc/xmon/xmon.c
+++ b/arch/ppc/xmon/xmon.c
@@ -99,7 +99,7 @@ static void remove_bpts(void);
 static void insert_bpts(void);
 static struct bpt *at_breakpoint(unsigned pc);
 static void bpt_cmds(void);
-static void cacheflush(void);
+void cacheflush(void);
 #ifdef CONFIG_SMP
 static void cpu_cmd(void);
 #endif /* CONFIG_SMP */
diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index 0ec27c9e8e45..d7e19eb344b7 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -72,7 +72,6 @@ static inline int node_to_first_cpu(int node)
 	.max_interval		= 32,			\
 	.busy_factor		= 32,			\
 	.imbalance_pct		= 125,			\
-	.cache_hot_time		= (10*1000000),		\
 	.cache_nice_tries	= 1,			\
 	.busy_idx		= 3,			\
 	.idle_idx		= 1,			\
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index f7c330467e7e..d8aae4da3978 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -55,7 +55,6 @@ void build_cpu_to_node_map(void);
 	.max_interval		= 4,			\
 	.busy_factor		= 64,			\
 	.imbalance_pct		= 125,			\
-	.cache_hot_time		= (10*1000000),		\
 	.per_cpu_gain		= 100,			\
 	.cache_nice_tries	= 2,			\
 	.busy_idx		= 2,			\
@@ -81,7 +80,6 @@ void build_cpu_to_node_map(void);
 	.max_interval		= 8*(min(num_online_cpus(), 32)), \
 	.busy_factor		= 64,			\
 	.imbalance_pct		= 125,			\
-	.cache_hot_time		= (10*1000000),		\
 	.cache_nice_tries	= 2,			\
 	.busy_idx		= 3,			\
 	.idle_idx		= 2,			\
diff --git a/include/asm-mips/mach-ip27/topology.h b/include/asm-mips/mach-ip27/topology.h
index 82141c711c33..59d26b52ba32 100644
--- a/include/asm-mips/mach-ip27/topology.h
+++ b/include/asm-mips/mach-ip27/topology.h
@@ -27,7 +27,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
 	.max_interval		= 32,			\
 	.busy_factor		= 32,			\
 	.imbalance_pct		= 125,			\
-	.cache_hot_time		= (10*1000),		\
 	.cache_nice_tries	= 1,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h
index 9f3d4da261c4..1e19cd00af25 100644
--- a/include/asm-powerpc/topology.h
+++ b/include/asm-powerpc/topology.h
@@ -39,7 +39,6 @@ static inline int node_to_first_cpu(int node)
 	.max_interval		= 32,			\
 	.busy_factor		= 32,			\
 	.imbalance_pct		= 125,			\
-	.cache_hot_time		= (10*1000000),		\
 	.cache_nice_tries	= 1,			\
 	.per_cpu_gain		= 100,			\
 	.busy_idx		= 3,			\
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index 7d82bc56b9fa..2fa7f27381b4 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -39,7 +39,6 @@ extern int __node_distance(int, int);
 	.max_interval		= 32,			\
 	.busy_factor		= 32,			\
 	.imbalance_pct		= 125,			\
-	.cache_hot_time		= (10*1000000),		\
 	.cache_nice_tries	= 2,			\
 	.busy_idx		= 3,			\
 	.idle_idx		= 2,			\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3b74c4bf2934..5d6b9228bba9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -631,7 +631,14 @@ struct sched_domain {
 
 extern void partition_sched_domains(cpumask_t *partition1,
 				    cpumask_t *partition2);
-#endif /* CONFIG_SMP */
+
+/*
+ * Maximum cache size the migration-costs auto-tuning code will
+ * search from:
+ */
+extern unsigned int max_cache_size;
+
+#endif	/* CONFIG_SMP */
 
 
 struct io_context;			/* See blkdev.h */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 3df1d474e5c5..315a5163d6a0 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -86,7 +86,6 @@
 	.max_interval		= 2,			\
 	.busy_factor		= 8,			\
 	.imbalance_pct		= 110,			\
-	.cache_hot_time		= 0,			\
 	.cache_nice_tries	= 0,			\
 	.per_cpu_gain		= 25,			\
 	.busy_idx		= 0,			\
@@ -117,7 +116,6 @@
 	.max_interval		= 4,			\
 	.busy_factor		= 64,			\
 	.imbalance_pct		= 125,			\
-	.cache_hot_time		= (5*1000000/2),	\
 	.cache_nice_tries	= 1,			\
 	.per_cpu_gain		= 100,			\
 	.busy_idx		= 2,			\
diff --git a/kernel/sched.c b/kernel/sched.c
index c0c60c926d5e..98461de1ab65 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -34,6 +34,7 @@
 #include <linux/notifier.h>
 #include <linux/profile.h>
 #include <linux/suspend.h>
+#include <linux/vmalloc.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/smp.h>
@@ -5082,7 +5083,470 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
 
 #define SD_NODES_PER_DOMAIN 16
 
+/*
+ * Self-tuning task migration cost measurement between source and target CPUs.
+ *
+ * This is done by measuring the cost of manipulating buffers of varying
+ * sizes. For a given buffer-size here are the steps that are taken:
+ *
+ * 1) the source CPU reads+dirties a shared buffer
+ * 2) the target CPU reads+dirties the same shared buffer
+ *
+ * We measure how long they take, in the following 4 scenarios:
+ *
+ *  - source: CPU1, target: CPU2 | cost1
+ *  - source: CPU2, target: CPU1 | cost2
+ *  - source: CPU1, target: CPU1 | cost3
+ *  - source: CPU2, target: CPU2 | cost4
+ *
+ * We then calculate the cost3+cost4-cost1-cost2 difference - this is
+ * the cost of migration.
+ *
+ * We then start off from a small buffer-size and iterate up to larger
+ * buffer sizes, in 5% steps - measuring each buffer-size separately, and
+ * doing a maximum search for the cost. (The maximum cost for a migration
+ * normally occurs when the working set size is around the effective cache
+ * size.)
+ */
+#define SEARCH_SCOPE		2
+#define MIN_CACHE_SIZE		(64*1024U)
+#define DEFAULT_CACHE_SIZE	(5*1024*1024U)
+#define ITERATIONS		2
+#define SIZE_THRESH		130
+#define COST_THRESH		130
+
+/*
+ * The migration cost is a function of 'domain distance'. Domain
+ * distance is the number of steps a CPU has to iterate down its
+ * domain tree to share a domain with the other CPU. The farther
+ * two CPUs are from each other, the larger the distance gets.
+ *
+ * Note that we use the distance only to cache measurement results,
+ * the distance value is not used numerically otherwise. When two
+ * CPUs have the same distance it is assumed that the migration
+ * cost is the same. (this is a simplification but quite practical)
+ */
+#define MAX_DOMAIN_DISTANCE 32
+
+static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
+		{ [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL };
+
+/*
+ * Allow override of migration cost - in units of microseconds.
+ * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
+ * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
+ */
+static int __init migration_cost_setup(char *str)
+{
+	int ints[MAX_DOMAIN_DISTANCE+1], i;
+
+	str = get_options(str, ARRAY_SIZE(ints), ints);
+
+	printk("#ints: %d\n", ints[0]);
+	for (i = 1; i <= ints[0]; i++) {
+		migration_cost[i-1] = (unsigned long long)ints[i]*1000;
+		printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
+	}
+	return 1;
+}
+
+__setup ("migration_cost=", migration_cost_setup);
+
+/*
+ * Global multiplier (divisor) for migration-cutoff values,
+ * in percentiles. E.g. use a value of 150 to get 1.5 times
+ * longer cache-hot cutoff times.
+ *
+ * (We scale it from 100 to 128 to long long handling easier.)
+ */
+
+#define MIGRATION_FACTOR_SCALE 128
+
+static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
+
+static int __init setup_migration_factor(char *str)
+{
+	get_option(&str, &migration_factor);
+	migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
+	return 1;
+}
+
+__setup("migration_factor=", setup_migration_factor);
+
+/*
+ * Estimated distance of two CPUs, measured via the number of domains
+ * we have to pass for the two CPUs to be in the same span:
+ */
+static unsigned long domain_distance(int cpu1, int cpu2)
+{
+	unsigned long distance = 0;
+	struct sched_domain *sd;
+
+	for_each_domain(cpu1, sd) {
+		WARN_ON(!cpu_isset(cpu1, sd->span));
+		if (cpu_isset(cpu2, sd->span))
+			return distance;
+		distance++;
+	}
+	if (distance >= MAX_DOMAIN_DISTANCE) {
+		WARN_ON(1);
+		distance = MAX_DOMAIN_DISTANCE-1;
+	}
+
+	return distance;
+}
+
+static unsigned int migration_debug;
+
+static int __init setup_migration_debug(char *str)
+{
+	get_option(&str, &migration_debug);
+	return 1;
+}
+
+__setup("migration_debug=", setup_migration_debug);
+
+/*
+ * Maximum cache-size that the scheduler should try to measure.
+ * Architectures with larger caches should tune this up during
+ * bootup. Gets used in the domain-setup code (i.e. during SMP
+ * bootup).
+ */
+unsigned int max_cache_size;
+
+static int __init setup_max_cache_size(char *str)
+{
+	get_option(&str, &max_cache_size);
+	return 1;
+}
+
+__setup("max_cache_size=", setup_max_cache_size);
+
+/*
+ * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
+ * is the operation that is timed, so we try to generate unpredictable
+ * cachemisses that still end up filling the L2 cache:
+ */
+static void touch_cache(void *__cache, unsigned long __size)
+{
+	unsigned long size = __size/sizeof(long), chunk1 = size/3,
+			chunk2 = 2*size/3;
+	unsigned long *cache = __cache;
+	int i;
+
+	for (i = 0; i < size/6; i += 8) {
+		switch (i % 6) {
+			case 0: cache[i]++;
+			case 1: cache[size-1-i]++;
+			case 2: cache[chunk1-i]++;
+			case 3: cache[chunk1+i]++;
+			case 4: cache[chunk2-i]++;
+			case 5: cache[chunk2+i]++;
+		}
+	}
+}
+
+/*
+ * Measure the cache-cost of one task migration. Returns in units of nsec.
+ */
+static unsigned long long measure_one(void *cache, unsigned long size,
+				      int source, int target)
+{
+	cpumask_t mask, saved_mask;
+	unsigned long long t0, t1, t2, t3, cost;
+
+	saved_mask = current->cpus_allowed;
+
+	/*
+	 * Flush source caches to RAM and invalidate them:
+	 */
+	sched_cacheflush();
+
+	/*
+	 * Migrate to the source CPU:
+	 */
+	mask = cpumask_of_cpu(source);
+	set_cpus_allowed(current, mask);
+	WARN_ON(smp_processor_id() != source);
+
+	/*
+	 * Dirty the working set:
+	 */
+	t0 = sched_clock();
+	touch_cache(cache, size);
+	t1 = sched_clock();
+
+	/*
+	 * Migrate to the target CPU, dirty the L2 cache and access
+	 * the shared buffer. (which represents the working set
+	 * of a migrated task.)
+	 */
+	mask = cpumask_of_cpu(target);
+	set_cpus_allowed(current, mask);
+	WARN_ON(smp_processor_id() != target);
+
+	t2 = sched_clock();
+	touch_cache(cache, size);
+	t3 = sched_clock();
+
+	cost = t1-t0 + t3-t2;
+
+	if (migration_debug >= 2)
+		printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
+			source, target, t1-t0, t1-t0, t3-t2, cost);
+	/*
+	 * Flush target caches to RAM and invalidate them:
+	 */
+	sched_cacheflush();
+
+	set_cpus_allowed(current, saved_mask);
+
+	return cost;
+}
+
+/*
+ * Measure a series of task migrations and return the average
+ * result. Since this code runs early during bootup the system
+ * is 'undisturbed' and the average latency makes sense.
+ *
+ * The algorithm in essence auto-detects the relevant cache-size,
+ * so it will properly detect different cachesizes for different
+ * cache-hierarchies, depending on how the CPUs are connected.
+ *
+ * Architectures can prime the upper limit of the search range via
+ * max_cache_size, otherwise the search range defaults to 20MB...64K.
+ */
+static unsigned long long
+measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
+{
+	unsigned long long cost1, cost2;
+	int i;
+
+	/*
+	 * Measure the migration cost of 'size' bytes, over an
+	 * average of 10 runs:
+	 *
+	 * (We perturb the cache size by a small (0..4k)
+	 *  value to compensate size/alignment related artifacts.
+	 *  We also subtract the cost of the operation done on
+	 *  the same CPU.)
+	 */
+	cost1 = 0;
+
+	/*
+	 * dry run, to make sure we start off cache-cold on cpu1,
+	 * and to get any vmalloc pagefaults in advance:
+	 */
+	measure_one(cache, size, cpu1, cpu2);
+	for (i = 0; i < ITERATIONS; i++)
+		cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
+
+	measure_one(cache, size, cpu2, cpu1);
+	for (i = 0; i < ITERATIONS; i++)
+		cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
+
+	/*
+	 * (We measure the non-migrating [cached] cost on both
+	 *  cpu1 and cpu2, to handle CPUs with different speeds)
+	 */
+	cost2 = 0;
+
+	measure_one(cache, size, cpu1, cpu1);
+	for (i = 0; i < ITERATIONS; i++)
+		cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
+
+	measure_one(cache, size, cpu2, cpu2);
+	for (i = 0; i < ITERATIONS; i++)
+		cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
+
+	/*
+	 * Get the per-iteration migration cost:
+	 */
+	do_div(cost1, 2*ITERATIONS);
+	do_div(cost2, 2*ITERATIONS);
+
+	return cost1 - cost2;
+}
+
+static unsigned long long measure_migration_cost(int cpu1, int cpu2)
+{
+	unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
+	unsigned int max_size, size, size_found = 0;
+	long long cost = 0, prev_cost;
+	void *cache;
+
+	/*
+	 * Search from max_cache_size*5 down to 64K - the real relevant
+	 * cachesize has to lie somewhere inbetween.
+	 */
+	if (max_cache_size) {
+		max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
+		size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
+	} else {
+		/*
+		 * Since we have no estimation about the relevant
+		 * search range
+		 */
+		max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
+		size = MIN_CACHE_SIZE;
+	}
+
+	if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
+		printk("cpu %d and %d not both online!\n", cpu1, cpu2);
+		return 0;
+	}
+
+	/*
+	 * Allocate the working set:
+	 */
+	cache = vmalloc(max_size);
+	if (!cache) {
+		printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
+		return 1000000; // return 1 msec on very small boxen
+	}
+
+	while (size <= max_size) {
+		prev_cost = cost;
+		cost = measure_cost(cpu1, cpu2, cache, size);
+
+		/*
+		 * Update the max:
+		 */
+		if (cost > 0) {
+			if (max_cost < cost) {
+				max_cost = cost;
+				size_found = size;
+			}
+		}
+		/*
+		 * Calculate average fluctuation, we use this to prevent
+		 * noise from triggering an early break out of the loop:
+		 */
+		fluct = abs(cost - prev_cost);
+		avg_fluct = (avg_fluct + fluct)/2;
+
+		if (migration_debug)
+			printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
+				cpu1, cpu2, size,
+				(long)cost / 1000000,
+				((long)cost / 100000) % 10,
+				(long)max_cost / 1000000,
+				((long)max_cost / 100000) % 10,
+				domain_distance(cpu1, cpu2),
+				cost, avg_fluct);
+
+		/*
+		 * If we iterated at least 20% past the previous maximum,
+		 * and the cost has dropped by more than 20% already,
+		 * (taking fluctuations into account) then we assume to
+		 * have found the maximum and break out of the loop early:
+		 */
+		if (size_found && (size*100 > size_found*SIZE_THRESH))
+			if (cost+avg_fluct <= 0 ||
+				max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
+
+				if (migration_debug)
+					printk("-> found max.\n");
+				break;
+			}
+		/*
+		 * Increase the cachesize in 5% steps:
+		 */
+		size = size * 20 / 19;
+	}
+
+	if (migration_debug)
+		printk("[%d][%d] working set size found: %d, cost: %Ld\n",
+			cpu1, cpu2, size_found, max_cost);
+
+	vfree(cache);
+
+	/*
+	 * A task is considered 'cache cold' if at least 2 times
+	 * the worst-case cost of migration has passed.
+	 *
+	 * (this limit is only listened to if the load-balancing
+	 * situation is 'nice' - if there is a large imbalance we
+	 * ignore it for the sake of CPU utilization and
+	 * processing fairness.)
+	 */
+	return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
+}
+
+static void calibrate_migration_costs(const cpumask_t *cpu_map)
+{
+	int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
+	unsigned long j0, j1, distance, max_distance = 0;
+	struct sched_domain *sd;
+
+	j0 = jiffies;
+
+	/*
+	 * First pass - calculate the cacheflush times:
+	 */
+	for_each_cpu_mask(cpu1, *cpu_map) {
+		for_each_cpu_mask(cpu2, *cpu_map) {
+			if (cpu1 == cpu2)
+				continue;
+			distance = domain_distance(cpu1, cpu2);
+			max_distance = max(max_distance, distance);
+			/*
+			 * No result cached yet?
+			 */
+			if (migration_cost[distance] == -1LL)
+				migration_cost[distance] =
+					measure_migration_cost(cpu1, cpu2);
+		}
+	}
+	/*
+	 * Second pass - update the sched domain hierarchy with
+	 * the new cache-hot-time estimations:
+	 */
+	for_each_cpu_mask(cpu, *cpu_map) {
+		distance = 0;
+		for_each_domain(cpu, sd) {
+			sd->cache_hot_time = migration_cost[distance];
+			distance++;
+		}
+	}
+	/*
+	 * Print the matrix:
+	 */
+	if (migration_debug)
+		printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
+			max_cache_size,
+#ifdef CONFIG_X86
+			cpu_khz/1000
+#else
+			-1
+#endif
+		);
+	printk("migration_cost=");
+	for (distance = 0; distance <= max_distance; distance++) {
+		if (distance)
+			printk(",");
+		printk("%ld", (long)migration_cost[distance] / 1000);
+	}
+	printk("\n");
+	j1 = jiffies;
+	if (migration_debug)
+		printk("migration: %ld seconds\n", (j1-j0)/HZ);
+
+	/*
+	 * Move back to the original CPU. NUMA-Q gets confused
+	 * if we migrate to another quad during bootup.
+	 */
+	if (raw_smp_processor_id() != orig_cpu) {
+		cpumask_t mask = cpumask_of_cpu(orig_cpu),
+			saved_mask = current->cpus_allowed;
+
+		set_cpus_allowed(current, mask);
+		set_cpus_allowed(current, saved_mask);
+	}
+}
+
 #ifdef CONFIG_NUMA
+
 /**
  * find_next_best_node - find the next node to include in a sched_domain
  * @node: node whose sched_domain we're building
@@ -5448,6 +5912,10 @@ next_sg:
 #endif
 		cpu_attach_domain(sd, i);
 	}
+	/*
+	 * Tune cache-hot values:
+	 */
+	calibrate_migration_costs(cpu_map);
 }
 /*
  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
-- 
cgit v1.2.3-71-gd317


From d7102e95b7b9c00277562c29aad421d2d521c5f6 Mon Sep 17 00:00:00 2001
From: "akpm@osdl.org" <akpm@osdl.org>
Date: Thu, 12 Jan 2006 01:05:32 -0800
Subject: [PATCH] sched: filter affine wakeups

)

From: Nick Piggin <nickpiggin@yahoo.com.au>

Track the last waker CPU, and only consider wakeup-balancing if there's a
match between current waker CPU and the previous waker CPU.  This ensures
that there is some correlation between two subsequent wakeup events before
we move the task.  Should help random-wakeup workloads on large SMP
systems, by reducing the migration attempts by a factor of nr_cpus.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  5 ++++-
 kernel/sched.c        | 10 +++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5d6b9228bba9..b5ef92a043a6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -696,8 +696,11 @@ struct task_struct {
 
 	int lock_depth;		/* BKL lock depth */
 
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#if defined(CONFIG_SMP)
+	int last_waker_cpu;	/* CPU that last woke this task up */
+#if defined(__ARCH_WANT_UNLOCKED_CTXSW)
 	int oncpu;
+#endif
 #endif
 	int prio, static_prio;
 	struct list_head run_list;
diff --git a/kernel/sched.c b/kernel/sched.c
index 98461de1ab65..c9dec2aa1976 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1290,6 +1290,9 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
 		}
 	}
 
+	if (p->last_waker_cpu != this_cpu)
+		goto out_set_cpu;
+
 	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
 		goto out_set_cpu;
 
@@ -1360,6 +1363,8 @@ out_set_cpu:
 		cpu = task_cpu(p);
 	}
 
+	p->last_waker_cpu = this_cpu;
+
 out_activate:
 #endif /* CONFIG_SMP */
 	if (old_state == TASK_UNINTERRUPTIBLE) {
@@ -1441,9 +1446,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#if defined(CONFIG_SMP)
+	p->last_waker_cpu = cpu;
+#if defined(__ARCH_WANT_UNLOCKED_CTXSW)
 	p->oncpu = 0;
 #endif
+#endif
 #ifdef CONFIG_PREEMPT
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
-- 
cgit v1.2.3-71-gd317


From 9fc658763bf992e778243ebe898b03746151ab88 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Thu, 12 Jan 2006 01:05:34 -0800
Subject: [PATCH] missing helper - task_stack_page()

Patchset annotates arch/* uses of ->thread_info.  Ones that really are about
access of thread_info of given process are simply switched to
task_thread_info(task); ones that deal with access to objects on stack are
switched to new helper - task_stack_page().  A _lot_ of the latter are
actually open-coded instances of "find where pt_regs are"; those are
consolidated into task_pt_regs(task) (many architectures actually have such
helper already).

Note that these annotations are not mandatory - any code not converted to
these helpers still works.  However, they clean up a lot of places and have
actually caught a number of bugs, so converting out of tree ports would be a
good idea...

As an example of breakage caught by that stuff, see i386 pt_regs mess - we
used to have it open-coded in a bunch of places and when back in April Stas
had fixed a bug in copy_thread(), the rest had been left out of sync.  That
required two followup patches (the latest - just before 2.6.15) _and_ still
had left /proc/*/stat eip field broken.  Try ps -eo eip on i386 and watch the
junk...

This patch:

new helper - task_stack_page(task).  Returns pointer to the memory object
containing task stack; usually thread_info of task sits in the beginning
of that object.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-m68k/thread_info.h | 1 +
 include/linux/sched.h          | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/asm-m68k/thread_info.h b/include/asm-m68k/thread_info.h
index 9532ca3c45cb..c4d622a57dfb 100644
--- a/include/asm-m68k/thread_info.h
+++ b/include/asm-m68k/thread_info.h
@@ -37,6 +37,7 @@ struct thread_info {
 #define init_stack		(init_thread_union.stack)
 
 #define task_thread_info(tsk)	(&(tsk)->thread.info)
+#define task_stack_page(tsk)	((void *)(tsk)->thread_info)
 #define current_thread_info()	task_thread_info(current)
 
 #define __HAVE_THREAD_FUNCTIONS
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b5ef92a043a6..a72e17135421 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1240,6 +1240,7 @@ static inline void task_unlock(struct task_struct *p)
 #ifndef __HAVE_THREAD_FUNCTIONS
 
 #define task_thread_info(task) (task)->thread_info
+#define task_stack_page(task) ((void*)((task)->thread_info))
 
 static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
 {
-- 
cgit v1.2.3-71-gd317


From 8e32ca49ef2eb5bfec1444b5e731cc2d35111519 Mon Sep 17 00:00:00 2001
From: "Moore, Eric" <Eric.Moore@lsil.com>
Date: Wed, 4 Jan 2006 14:58:43 -0700
Subject: [SCSI] raid_class.c - adding RAID10 and RAID10 defines

Adding defines for RAID10 and RAID50 levels, in preparation
of adding RAID Transport support in the mpt fusion drivers.
(BTW: IME is RAID10, and IM is RAID1).

Signed-off-by: Eric Moore <Eric.Moore@lsil.com>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 drivers/scsi/raid_class.c  | 2 ++
 include/linux/raid_class.h | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/raid_class.c b/drivers/scsi/raid_class.c
index 5ec5f44602ac..50c398aab557 100644
--- a/drivers/scsi/raid_class.c
+++ b/drivers/scsi/raid_class.c
@@ -148,9 +148,11 @@ static struct {
 	{ RAID_LEVEL_LINEAR, "linear" },
 	{ RAID_LEVEL_0, "raid0" },
 	{ RAID_LEVEL_1, "raid1" },
+	{ RAID_LEVEL_10, "raid10" },
 	{ RAID_LEVEL_3, "raid3" },
 	{ RAID_LEVEL_4, "raid4" },
 	{ RAID_LEVEL_5, "raid5" },
+	{ RAID_LEVEL_50, "raid50" },
 	{ RAID_LEVEL_6, "raid6" },
 };
 
diff --git a/include/linux/raid_class.h b/include/linux/raid_class.h
index 48831eac2910..d0dd38b3a2fd 100644
--- a/include/linux/raid_class.h
+++ b/include/linux/raid_class.h
@@ -31,9 +31,11 @@ enum raid_level {
 	RAID_LEVEL_LINEAR,
 	RAID_LEVEL_0,
 	RAID_LEVEL_1,
+	RAID_LEVEL_10,
 	RAID_LEVEL_3,
 	RAID_LEVEL_4,
 	RAID_LEVEL_5,
+	RAID_LEVEL_50,
 	RAID_LEVEL_6,
 };
 
-- 
cgit v1.2.3-71-gd317


From ba027def7be0d6494b72603d5758acc0fb1c7514 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 12 Jan 2006 20:44:12 +0100
Subject: [PATCH] Revert ide softirq handling

There's a problem with the REQ_BLOCK_PC handling as well (bad ->data_len
handling) where it could actually complete a request ahead of time.  I
suggest we just back this out for now, I will resubmit it later when I'm
fully confident in it.

This reverts commit 8672d57138b34447719cd7749f3d21070e1175a1

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/ide/ide-io.c    | 34 +++-------------------------------
 drivers/ide/ide-probe.c |  2 --
 include/linux/ide.h     |  1 -
 3 files changed, 3 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index bcbaeb50bb93..8d50df4526a4 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -55,22 +55,9 @@
 #include <asm/io.h>
 #include <asm/bitops.h>
 
-void ide_softirq_done(struct request *rq)
-{
-	request_queue_t *q = rq->q;
-
-	add_disk_randomness(rq->rq_disk);
-	end_that_request_chunk(rq, 1, rq->data_len);
-
-	spin_lock_irq(q->queue_lock);
-	end_that_request_last(rq, 1);
-	spin_unlock_irq(q->queue_lock);
-}
-
 int __ide_end_request(ide_drive_t *drive, struct request *rq, int uptodate,
 		      int nr_sectors)
 {
-	unsigned int nbytes;
 	int ret = 1;
 
 	BUG_ON(!(rq->flags & REQ_STARTED));
@@ -94,27 +81,12 @@ int __ide_end_request(ide_drive_t *drive, struct request *rq, int uptodate,
 		HWGROUP(drive)->hwif->ide_dma_on(drive);
 	}
 
-	/*
-	 * For partial completions (or non fs/pc requests), use the regular
-	 * direct completion path. Same thing for requests that failed, to
-	 * preserve the ->errors value we use the normal completion path
-	 * for those
-	 */
-	nbytes = nr_sectors << 9;
-	if (!rq->errors && rq_all_done(rq, nbytes)) {
-		rq->data_len = nbytes;
+	if (!end_that_request_first(rq, uptodate, nr_sectors)) {
+		add_disk_randomness(rq->rq_disk);
 		blkdev_dequeue_request(rq);
 		HWGROUP(drive)->rq = NULL;
-		blk_complete_request(rq);
+		end_that_request_last(rq, uptodate);
 		ret = 0;
-	} else {
-		if (!end_that_request_first(rq, uptodate, nr_sectors)) {
-			add_disk_randomness(rq->rq_disk);
-			blkdev_dequeue_request(rq);
-			HWGROUP(drive)->rq = NULL;
-			end_that_request_last(rq, uptodate);
-			ret = 0;
-		}
 	}
 
 	return ret;
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 7cb2d86601db..e7425546b4b1 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -1011,8 +1011,6 @@ static int ide_init_queue(ide_drive_t *drive)
 	blk_queue_max_hw_segments(q, max_sg_entries);
 	blk_queue_max_phys_segments(q, max_sg_entries);
 
-	blk_queue_softirq_done(q, ide_softirq_done);
-
 	/* assign drive queue */
 	drive->queue = q;
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 9a8c05dbe4f3..f2e1b5b22898 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1002,7 +1002,6 @@ extern int noautodma;
 
 extern int ide_end_request (ide_drive_t *drive, int uptodate, int nrsecs);
 extern int __ide_end_request (ide_drive_t *drive, struct request *rq, int uptodate, int nrsecs);
-extern void ide_softirq_done(struct request *rq);
 
 /*
  * This is used on exit from the driver to designate the next irq handler
-- 
cgit v1.2.3-71-gd317


From 1d5326774c9245fef77334a9e0f11cd4f8aa7b4e Mon Sep 17 00:00:00 2001
From: Kumar Gala <galak@gate.crashing.org>
Date: Wed, 11 Jan 2006 11:27:32 -0800
Subject: [PATCH] gianfar mii: Use proper resource for MII memory region

We can now have the gianfar mii platform device have a proper resource for the
IO memory region for its registers.  Previously we passed this information
that the platform_data structure because we couldn't handle overlapping memory
regions for platform devices.

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/net/gianfar_mii.c   | 5 ++++-
 include/linux/fsl_devices.h | 3 ---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/gianfar_mii.c b/drivers/net/gianfar_mii.c
index 04a462c2a5b7..74e52fcbf806 100644
--- a/drivers/net/gianfar_mii.c
+++ b/drivers/net/gianfar_mii.c
@@ -128,6 +128,7 @@ int gfar_mdio_probe(struct device *dev)
 	struct gianfar_mdio_data *pdata;
 	struct gfar_mii *regs;
 	struct mii_bus *new_bus;
+	struct resource *r;
 	int err = 0;
 
 	if (NULL == dev)
@@ -151,8 +152,10 @@ int gfar_mdio_probe(struct device *dev)
 		return -ENODEV;
 	}
 
+	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+
 	/* Set the PHY base address */
-	regs = (struct gfar_mii *) ioremap(pdata->paddr, 
+	regs = (struct gfar_mii *) ioremap(r->start,
 			sizeof (struct gfar_mii));
 
 	if (NULL == regs) {
diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index 934aa9bda481..a7a2b855ba72 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -55,9 +55,6 @@ struct gianfar_platform_data {
 };
 
 struct gianfar_mdio_data {
-	/* device specific information */
-	u32 paddr;
-
 	/* board specific information */
 	int irq[32];
 };
-- 
cgit v1.2.3-71-gd317


From a4d00f179fcec7065fe5742e9cebd6500886070f Mon Sep 17 00:00:00 2001
From: Kumar Gala <galak@gate.crashing.org>
Date: Wed, 11 Jan 2006 11:27:33 -0800
Subject: [PATCH] phy: Added a macro to represent the string format used to
 match a phy device

Add the PHY_ID_FMT macro to ensure that the format of the id string used by a
driver to match to its specific phy is consistent between the mdio_bus and the
driver.

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/net/phy/mdio_bus.c | 2 +-
 drivers/net/phy/phy.c      | 2 +-
 include/linux/phy.h        | 3 +++
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index 02940c0fef68..459443b572ce 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -81,7 +81,7 @@ int mdiobus_register(struct mii_bus *bus)
 
 			phydev->dev.parent = bus->dev;
 			phydev->dev.bus = &mdio_bus_type;
-			sprintf(phydev->dev.bus_id, "phy%d:%d", bus->id, i);
+			snprintf(phydev->dev.bus_id, BUS_ID_SIZE, PHY_ID_FMT, bus->id, i);
 
 			phydev->bus = bus;
 
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index b8686e47f899..1474b7c5ac0b 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -42,7 +42,7 @@
  */
 void phy_print_status(struct phy_device *phydev)
 {
-	pr_info("%s: Link is %s", phydev->dev.bus_id,
+	pr_info("PHY: %s - Link is %s", phydev->dev.bus_id,
 			phydev->link ? "Up" : "Down");
 	if (phydev->link)
 		printk(" - %d/%s", phydev->speed,
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 92a9696fdebe..331521a10a2d 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -53,6 +53,9 @@
 
 #define PHY_MAX_ADDR 32
 
+/* Used when trying to connect to a specific phy (mii bus id:phy device id) */
+#define PHY_ID_FMT "%x:%02x"
+
 /* The Bus class for PHYs.  Devices which provide access to
  * PHYs should register using this structure */
 struct mii_bus {
-- 
cgit v1.2.3-71-gd317


From 4d3248a29cb78b31bb0520eb99b4be620e810a40 Mon Sep 17 00:00:00 2001
From: Kumar Gala <galak@gate.crashing.org>
Date: Wed, 11 Jan 2006 11:27:33 -0800
Subject: [PATCH] gianfar: Use new PHY_ID_FMT macro

Make the driver produce the string used by phy_connect and have board specific
code pass the integer mii bus id and phy device id for the specific controller
instance.

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/net/gianfar.c       | 5 ++++-
 include/linux/fsl_devices.h | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 637b73ad5e90..0c18dbd67d3b 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -399,12 +399,15 @@ static int init_phy(struct net_device *dev)
 		priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_GIGABIT ?
 		SUPPORTED_1000baseT_Full : 0;
 	struct phy_device *phydev;
+	char phy_id[BUS_ID_SIZE];
 
 	priv->oldlink = 0;
 	priv->oldspeed = 0;
 	priv->oldduplex = -1;
 
-	phydev = phy_connect(dev, priv->einfo->bus_id, &adjust_link, 0);
+	snprintf(phy_id, BUS_ID_SIZE, PHY_ID_FMT, priv->einfo->bus_id, priv->einfo->phy_id);
+
+	phydev = phy_connect(dev, phy_id, &adjust_link, 0);
 
 	if (IS_ERR(phydev)) {
 		printk(KERN_ERR "%s: Could not attach to PHY\n", dev->name);
diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index a7a2b855ba72..a9f1cfd096ff 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -50,7 +50,8 @@ struct gianfar_platform_data {
 
 	/* board specific information */
 	u32 board_flags;
-	const char *bus_id;
+	u32 bus_id;
+	u32 phy_id;
 	u8 mac_addr[6];
 };
 
-- 
cgit v1.2.3-71-gd317


From b97bf3fd8f6a16966d4f18983b2c40993ff937d4 Mon Sep 17 00:00:00 2001
From: Per Liden <per.liden@nospam.ericsson.com>
Date: Mon, 2 Jan 2006 19:04:38 +0100
Subject: [TIPC] Initial merge

TIPC (Transparent Inter Process Communication) is a protocol designed for
intra cluster communication. For more information see
http://tipc.sourceforge.net

Signed-off-by: Per Liden <per.liden@nospam.ericsson.com>
---
 include/linux/socket.h         |    3 +
 include/linux/tipc.h           |  598 ++++++++
 include/net/tipc/tipc.h        |  255 ++++
 include/net/tipc/tipc_bearer.h |   92 ++
 include/net/tipc/tipc_msg.h    |  220 +++
 include/net/tipc/tipc_port.h   |  105 ++
 net/Kconfig                    |    1 +
 net/Makefile                   |    1 +
 net/tipc/Kconfig               |  112 ++
 net/tipc/Makefile              |   13 +
 net/tipc/addr.c                |   91 ++
 net/tipc/addr.h                |  125 ++
 net/tipc/bcast.c               |  803 ++++++++++
 net/tipc/bcast.h               |  220 +++
 net/tipc/bearer.c              |  689 +++++++++
 net/tipc/bearer.h              |  169 +++
 net/tipc/cluster.c             |  573 ++++++++
 net/tipc/cluster.h             |   89 ++
 net/tipc/config.c              |  715 +++++++++
 net/tipc/config.h              |   76 +
 net/tipc/core.c                |  282 ++++
 net/tipc/core.h                |  313 ++++
 net/tipc/dbg.c                 |  392 +++++
 net/tipc/dbg.h                 |   56 +
 net/tipc/discover.c            |  339 +++++
 net/tipc/discover.h            |   55 +
 net/tipc/eth_media.c           |  296 ++++
 net/tipc/handler.c             |  129 ++
 net/tipc/link.c                | 3164 ++++++++++++++++++++++++++++++++++++++++
 net/tipc/link.h                |  293 ++++
 net/tipc/msg.c                 |  331 +++++
 net/tipc/msg.h                 |  815 +++++++++++
 net/tipc/name_distr.c          |  306 ++++
 net/tipc/name_distr.h          |   45 +
 net/tipc/name_table.c          | 1076 ++++++++++++++
 net/tipc/name_table.h          |  105 ++
 net/tipc/net.c                 |  308 ++++
 net/tipc/net.h                 |   63 +
 net/tipc/netlink.c             |  107 ++
 net/tipc/node.c                |  676 +++++++++
 net/tipc/node.h                |  141 ++
 net/tipc/node_subscr.c         |   76 +
 net/tipc/node_subscr.h         |   60 +
 net/tipc/port.c                | 1704 ++++++++++++++++++++++
 net/tipc/port.h                |  206 +++
 net/tipc/ref.c                 |  186 +++
 net/tipc/ref.h                 |  128 ++
 net/tipc/socket.c              | 1722 ++++++++++++++++++++++
 net/tipc/subscr.c              |  522 +++++++
 net/tipc/subscr.h              |   77 +
 net/tipc/user_reg.c            |  262 ++++
 net/tipc/user_reg.h            |   45 +
 net/tipc/zone.c                |  166 +++
 net/tipc/zone.h                |   68 +
 54 files changed, 19464 insertions(+)
 create mode 100644 include/linux/tipc.h
 create mode 100644 include/net/tipc/tipc.h
 create mode 100644 include/net/tipc/tipc_bearer.h
 create mode 100644 include/net/tipc/tipc_msg.h
 create mode 100644 include/net/tipc/tipc_port.h
 create mode 100644 net/tipc/Kconfig
 create mode 100644 net/tipc/Makefile
 create mode 100644 net/tipc/addr.c
 create mode 100644 net/tipc/addr.h
 create mode 100644 net/tipc/bcast.c
 create mode 100644 net/tipc/bcast.h
 create mode 100644 net/tipc/bearer.c
 create mode 100644 net/tipc/bearer.h
 create mode 100644 net/tipc/cluster.c
 create mode 100644 net/tipc/cluster.h
 create mode 100644 net/tipc/config.c
 create mode 100644 net/tipc/config.h
 create mode 100644 net/tipc/core.c
 create mode 100644 net/tipc/core.h
 create mode 100644 net/tipc/dbg.c
 create mode 100644 net/tipc/dbg.h
 create mode 100644 net/tipc/discover.c
 create mode 100644 net/tipc/discover.h
 create mode 100644 net/tipc/eth_media.c
 create mode 100644 net/tipc/handler.c
 create mode 100644 net/tipc/link.c
 create mode 100644 net/tipc/link.h
 create mode 100644 net/tipc/msg.c
 create mode 100644 net/tipc/msg.h
 create mode 100644 net/tipc/name_distr.c
 create mode 100644 net/tipc/name_distr.h
 create mode 100644 net/tipc/name_table.c
 create mode 100644 net/tipc/name_table.h
 create mode 100644 net/tipc/net.c
 create mode 100644 net/tipc/net.h
 create mode 100644 net/tipc/netlink.c
 create mode 100644 net/tipc/node.c
 create mode 100644 net/tipc/node.h
 create mode 100644 net/tipc/node_subscr.c
 create mode 100644 net/tipc/node_subscr.h
 create mode 100644 net/tipc/port.c
 create mode 100644 net/tipc/port.h
 create mode 100644 net/tipc/ref.c
 create mode 100644 net/tipc/ref.h
 create mode 100644 net/tipc/socket.c
 create mode 100644 net/tipc/subscr.c
 create mode 100644 net/tipc/subscr.h
 create mode 100644 net/tipc/user_reg.c
 create mode 100644 net/tipc/user_reg.h
 create mode 100644 net/tipc/zone.c
 create mode 100644 net/tipc/zone.h

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 9f4019156fd8..b02dda4ee83d 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -186,6 +186,7 @@ struct ucred {
 #define AF_PPPOX	24	/* PPPoX sockets		*/
 #define AF_WANPIPE	25	/* Wanpipe API Sockets */
 #define AF_LLC		26	/* Linux LLC			*/
+#define AF_TIPC		30	/* TIPC sockets			*/
 #define AF_BLUETOOTH	31	/* Bluetooth sockets 		*/
 #define AF_MAX		32	/* For now.. */
 
@@ -218,6 +219,7 @@ struct ucred {
 #define PF_PPPOX	AF_PPPOX
 #define PF_WANPIPE	AF_WANPIPE
 #define PF_LLC		AF_LLC
+#define PF_TIPC		AF_TIPC
 #define PF_BLUETOOTH	AF_BLUETOOTH
 #define PF_MAX		AF_MAX
 
@@ -279,6 +281,7 @@ struct ucred {
 #define SOL_LLC		268
 #define SOL_DCCP	269
 #define SOL_NETLINK	270
+#define SOL_TIPC	271
 
 /* IPX options */
 #define IPX_TYPE	1
diff --git a/include/linux/tipc.h b/include/linux/tipc.h
new file mode 100644
index 000000000000..ca754f3317f2
--- /dev/null
+++ b/include/linux/tipc.h
@@ -0,0 +1,598 @@
+/*
+ * include/linux/tipc.h: Include file for TIPC users
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_TIPC_H_
+#define _LINUX_TIPC_H_
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <asm/byteorder.h>
+
+/*
+ * TIPC addressing primitives
+ */
+ 
+struct tipc_portid {
+	__u32 ref;
+	__u32 node;
+};
+
+struct tipc_name {
+	__u32 type;
+	__u32 instance;
+};
+
+struct tipc_name_seq {
+	__u32 type;
+	__u32 lower;
+	__u32 upper;
+};
+
+static inline __u32 tipc_addr(unsigned int zone,
+			      unsigned int cluster,
+			      unsigned int node)
+{
+	return(zone << 24) | (cluster << 12) | node;
+}
+
+static inline unsigned int tipc_zone(__u32 addr)
+{
+	return  addr >> 24;
+}
+
+static inline unsigned int tipc_cluster(__u32 addr)
+{
+	return(addr >> 12) & 0xfff;
+}
+
+static inline unsigned int tipc_node(__u32 addr)
+{
+	return  addr & 0xfff;
+}
+
+/*
+ * Application-accessible port name types
+ */
+
+#define TIPC_NET_EVENTS		0	/* network event subscription name type */
+#define TIPC_TOP_SRV         	1  	/* topology service name type */
+#define TIPC_RESERVED_TYPES	64	/* lowest user-publishable name type */
+
+/* 
+ * Publication scopes when binding port names and port name sequences
+ */
+
+#define TIPC_ZONE_SCOPE         1
+#define TIPC_CLUSTER_SCOPE      2     
+#define TIPC_NODE_SCOPE         3
+
+/*
+ * Limiting values for messages
+ */
+
+#define TIPC_MAX_USER_MSG_SIZE 66000
+
+/* 
+ * Message importance levels
+ */
+
+#define TIPC_LOW_IMPORTANCE     	0  /* default */
+#define TIPC_MEDIUM_IMPORTANCE		1
+#define TIPC_HIGH_IMPORTANCE		2
+#define TIPC_CRITICAL_IMPORTANCE	3
+
+/* 
+ * Msg rejection/connection shutdown reasons
+ */
+
+#define TIPC_OK			0
+#define TIPC_ERR_NO_NAME	1
+#define TIPC_ERR_NO_PORT	2
+#define TIPC_ERR_NO_NODE	3
+#define TIPC_ERR_OVERLOAD	4
+#define TIPC_CONN_SHUTDOWN	5
+
+/*
+ * TIPC topology subscription service definitions
+ */
+
+#define TIPC_SUB_PORTS     	0x01  	/* filter for port availability */
+#define TIPC_SUB_SERVICE     	0x02  	/* filter for service availability */
+#if 0
+/* The following filter options are not currently implemented */
+#define TIPC_SUB_NO_BIND_EVTS	0x04	/* filter out "publish" events */
+#define TIPC_SUB_NO_UNBIND_EVTS	0x08	/* filter out "withdraw" events */
+#define TIPC_SUB_SINGLE_EVT	0x10	/* expire after first event */
+#endif
+
+#define TIPC_WAIT_FOREVER   	~0	/* timeout for permanent subscription */
+
+struct tipc_subscr {
+	struct tipc_name_seq seq;	/* name sequence of interest */
+	__u32 timeout;			/* subscription duration (in ms) */
+        __u32 filter;   		/* bitmask of filter options */
+	char usr_handle[8];		/* available for subscriber use */
+};
+
+
+#define TIPC_PUBLISHED          1	/* publication event */
+#define TIPC_WITHDRAWN          2	/* withdraw event */
+#define TIPC_SUBSCR_TIMEOUT     3	/* subscription timeout event */
+
+struct tipc_event {
+	__u32 event;			/* event type */
+	__u32 found_lower;		/* matching name seq instances */
+	__u32 found_upper;		/*    "      "    "     "      */
+        struct tipc_portid port;	/* associated port */
+	struct tipc_subscr s;		/* associated subscription */
+};
+
+/*
+ * Socket API
+ */
+
+#ifndef AF_TIPC
+#define AF_TIPC		30
+#endif
+
+#ifndef PF_TIPC
+#define PF_TIPC		AF_TIPC
+#endif
+
+#ifndef SOL_TIPC
+#define SOL_TIPC	271
+#endif
+
+#define TIPC_ADDR_NAMESEQ   	1
+#define TIPC_ADDR_MCAST         1
+#define TIPC_ADDR_NAME      	2
+#define TIPC_ADDR_ID        	3
+
+struct sockaddr_tipc {
+	unsigned short family;
+	unsigned char  addrtype;
+	signed   char  scope;
+	union {
+		struct tipc_portid id;
+		struct tipc_name_seq nameseq;
+		struct {
+			struct tipc_name name;
+			__u32 domain; /* 0: own zone */
+		} name;
+	} addr;
+};
+
+/*
+ * Ancillary data objects supported by recvmsg()
+ */
+
+#define TIPC_ERRINFO	1	/* error info */
+#define TIPC_RETDATA	2	/* returned data */
+#define TIPC_DESTNAME	3	/* destination name */
+
+/*
+ * TIPC-specific socket option values
+ */
+
+#define TIPC_IMPORTANCE		127	/* Default: TIPC_LOW_IMPORTANCE */
+#define TIPC_SRC_DROPPABLE	128	/* Default: 0 (resend congested msg) */
+#define TIPC_DEST_DROPPABLE	129	/* Default: based on socket type */
+#define TIPC_CONN_TIMEOUT     	130	/* Default: 8000 (ms)  */
+
+/*
+ * Bearer
+ */
+
+/* Identifiers of supported TIPC media types */
+
+#define TIPC_MEDIA_TYPE_ETH	1
+
+/* Maximum sizes of TIPC bearer-related names (including terminating NUL) */ 
+
+#define TIPC_MAX_MEDIA_NAME	16	/* format = media */
+#define TIPC_MAX_IF_NAME	16	/* format = interface */
+#define TIPC_MAX_BEARER_NAME	32	/* format = media:interface */
+#define TIPC_MAX_LINK_NAME	60	/* format = Z.C.N:interface-Z.C.N:interface */
+
+struct tipc_media_addr {
+	__u32  type;
+	union {
+		__u8   eth_addr[6];	/* Ethernet bearer */ 
+#if 0
+		/* Prototypes for other possible bearer types */
+
+		struct {
+			__u16 sin_family;
+			__u16 sin_port;
+			struct {
+				__u32 s_addr;
+			} sin_addr;
+			char pad[4];
+		} addr_in;		/* IP-based bearer */
+		__u16  sock_descr;	/* generic socket bearer */
+#endif
+	} dev_addr;
+};
+
+
+/* Link priority limits (range from 0 to # priorities - 1) */
+
+#define TIPC_NUM_LINK_PRI 32
+
+/* Link tolerance limits (min, default, max), in ms */
+
+#define TIPC_MIN_LINK_TOL 50
+#define TIPC_DEF_LINK_TOL 1500
+#define TIPC_MAX_LINK_TOL 30000
+
+/* Link window limits (min, default, max), in packets */
+
+#define TIPC_MIN_LINK_WIN 16
+#define TIPC_DEF_LINK_WIN 50
+#define TIPC_MAX_LINK_WIN 150
+
+/*
+ * Configuration
+ *
+ * All configuration management messaging involves sending a request message
+ * to the TIPC configuration service on a node, which sends a reply message
+ * back.  (In the future multi-message replies may be supported.)
+ *
+ * Both request and reply messages consist of a transport header and payload.
+ * The transport header contains info about the desired operation;
+ * the payload consists of zero or more type/length/value (TLV) items
+ * which specify parameters or results for the operation.
+ *
+ * For many operations, the request and reply messages have a fixed number
+ * of TLVs (usually zero or one); however, some reply messages may return 
+ * a variable number of TLVs.  A failed request is denoted by the presence
+ * of an "error string" TLV in the reply message instead of the TLV(s) the
+ * reply should contain if the request succeeds.
+ */
+ 
+#define TIPC_CFG_SRV	0		/* configuration service name type */
+
+/* 
+ * Public commands:
+ * May be issued by any process.
+ * Accepted by own node, or by remote node only if remote management enabled.                       
+ */
+ 
+#define  TIPC_CMD_NOOP   	    0x0000    /* tx none, rx none */
+#define  TIPC_CMD_GET_NODES         0x0001    /* tx net_addr, rx node_info(s) */
+#define  TIPC_CMD_GET_MEDIA_NAMES   0x0002    /* tx none, rx media_name(s) */
+#define  TIPC_CMD_GET_BEARER_NAMES  0x0003    /* tx none, rx bearer_name(s) */
+#define  TIPC_CMD_GET_LINKS         0x0004    /* tx net_addr, rx link_info(s) */
+#define  TIPC_CMD_SHOW_NAME_TABLE   0x0005    /* tx name_tbl_query, rx ultra_string */
+#define  TIPC_CMD_SHOW_PORTS        0x0006    /* tx none, rx ultra_string */
+#define  TIPC_CMD_SHOW_LINK_STATS   0x000B    /* tx link_name, rx ultra_string */
+
+#if 0
+#define  TIPC_CMD_SHOW_PORT_STATS   0x0008    /* tx port_ref, rx ultra_string */
+#define  TIPC_CMD_RESET_PORT_STATS  0x0009    /* tx port_ref, rx none */
+#define  TIPC_CMD_GET_ROUTES        0x000A    /* tx ?, rx ? */
+#define  TIPC_CMD_GET_LINK_PEER     0x000D    /* tx link_name, rx ? */
+#endif
+
+/* 
+ * Protected commands:
+ * May only be issued by "network administration capable" process.
+ * Accepted by own node, or by remote node only if remote management enabled
+ * and this node is zone manager.                       
+ */
+
+#define  TIPC_CMD_GET_REMOTE_MNG    0x4003    /* tx none, rx unsigned */
+#define  TIPC_CMD_GET_MAX_PORTS     0x4004    /* tx none, rx unsigned */
+#define  TIPC_CMD_GET_MAX_PUBL      0x4005    /* tx none, rx unsigned */
+#define  TIPC_CMD_GET_MAX_SUBSCR    0x4006    /* tx none, rx unsigned */
+#define  TIPC_CMD_GET_MAX_ZONES     0x4007    /* tx none, rx unsigned */
+#define  TIPC_CMD_GET_MAX_CLUSTERS  0x4008    /* tx none, rx unsigned */
+#define  TIPC_CMD_GET_MAX_NODES     0x4009    /* tx none, rx unsigned */
+#define  TIPC_CMD_GET_MAX_SLAVES    0x400A    /* tx none, rx unsigned */
+#define  TIPC_CMD_GET_NETID         0x400B    /* tx none, rx unsigned */
+
+#define  TIPC_CMD_ENABLE_BEARER     0x4101    /* tx bearer_config, rx none */
+#define  TIPC_CMD_DISABLE_BEARER    0x4102    /* tx bearer_name, rx none */
+#define  TIPC_CMD_SET_LINK_TOL      0x4107    /* tx link_config, rx none */
+#define  TIPC_CMD_SET_LINK_PRI      0x4108    /* tx link_config, rx none */
+#define  TIPC_CMD_SET_LINK_WINDOW   0x4109    /* tx link_config, rx none */
+#define  TIPC_CMD_SET_LOG_SIZE      0x410A    /* tx unsigned, rx none */
+#define  TIPC_CMD_DUMP_LOG          0x410B    /* tx none, rx ultra_string */
+#define  TIPC_CMD_RESET_LINK_STATS  0x410C    /* tx link_name, rx none */
+
+#if 0
+#define  TIPC_CMD_CREATE_LINK       0x4103    /* tx link_create, rx none */
+#define  TIPC_CMD_REMOVE_LINK       0x4104    /* tx link_name, rx none */
+#define  TIPC_CMD_BLOCK_LINK        0x4105    /* tx link_name, rx none */
+#define  TIPC_CMD_UNBLOCK_LINK      0x4106    /* tx link_name, rx none */
+#endif
+
+/* 
+ * Private commands:
+ * May only be issued by "network administration capable" process.
+ * Accepted by own node only; cannot be used on a remote node.                       
+ */
+
+#define  TIPC_CMD_SET_NODE_ADDR     0x8001    /* tx net_addr, rx none */
+#if 0
+#define  TIPC_CMD_SET_ZONE_MASTER   0x8002    /* tx none, rx none */
+#endif
+#define  TIPC_CMD_SET_REMOTE_MNG    0x8003    /* tx unsigned, rx none */
+#define  TIPC_CMD_SET_MAX_PORTS     0x8004    /* tx unsigned, rx none */
+#define  TIPC_CMD_SET_MAX_PUBL      0x8005    /* tx unsigned, rx none */
+#define  TIPC_CMD_SET_MAX_SUBSCR    0x8006    /* tx unsigned, rx none */
+#define  TIPC_CMD_SET_MAX_ZONES     0x8007    /* tx unsigned, rx none */
+#define  TIPC_CMD_SET_MAX_CLUSTERS  0x8008    /* tx unsigned, rx none */
+#define  TIPC_CMD_SET_MAX_NODES     0x8009    /* tx unsigned, rx none */
+#define  TIPC_CMD_SET_MAX_SLAVES    0x800A    /* tx unsigned, rx none */
+#define  TIPC_CMD_SET_NETID         0x800B    /* tx unsigned, rx none */
+
+/*
+ * TLV types defined for TIPC
+ */
+
+#define TIPC_TLV_NONE		0	/* no TLV present */
+#define TIPC_TLV_VOID		1	/* empty TLV (0 data bytes)*/
+#define TIPC_TLV_UNSIGNED	2	/* 32-bit integer */
+#define TIPC_TLV_STRING		3	/* char[128] (max) */
+#define TIPC_TLV_LARGE_STRING	4	/* char[2048] (max) */
+#define TIPC_TLV_ULTRA_STRING	5	/* char[32768] (max) */
+
+#define TIPC_TLV_ERROR_STRING	16	/* char[128] containing "error code" */
+#define TIPC_TLV_NET_ADDR   	17	/* 32-bit integer denoting <Z.C.N> */
+#define TIPC_TLV_MEDIA_NAME	18	/* char[MAX_MEDIA_NAME] */
+#define TIPC_TLV_BEARER_NAME	19	/* char[MAX_BEARER_NAME] */
+#define TIPC_TLV_LINK_NAME	20	/* char[MAX_LINK_NAME] */
+#define TIPC_TLV_NODE_INFO	21	/* struct tipc_node_info */
+#define TIPC_TLV_LINK_INFO	22	/* struct tipc_link_info */
+#define TIPC_TLV_BEARER_CONFIG  23	/* struct tipc_bearer_config */
+#define TIPC_TLV_LINK_CONFIG    24	/* struct tipc_link_config */
+#define TIPC_TLV_NAME_TBL_QUERY	25	/* struct tipc_name_table_query */
+#define TIPC_TLV_PORT_REF   	26	/* 32-bit port reference */
+
+struct tipc_node_info {
+	__u32 addr;			/* network address of node */
+	__u32 up;			/* 0=down, 1= up */
+};
+
+struct tipc_link_info {
+	__u32 dest;			/* network address of peer node */
+	__u32 up;			/* 0=down, 1=up */
+	char str[TIPC_MAX_LINK_NAME];	/* link name */
+};
+
+struct tipc_bearer_config {
+	__u32 priority;			/* Range [1,31]. Override per link  */
+	__u32 detect_scope;     
+	char name[TIPC_MAX_BEARER_NAME];
+};
+
+struct tipc_link_config {
+	__u32 value;
+	char name[TIPC_MAX_LINK_NAME];
+};
+
+#define TIPC_NTQ_ALLTYPES 0x80000000
+
+struct tipc_name_table_query {
+	__u32 depth;	/* 1:type, 2:+name info, 3:+port info, 4+:+debug info */
+	__u32 type;	/* {t,l,u} info ignored if high bit of "depth" is set */
+	__u32 lowbound; /* (i.e. displays all entries of name table) */
+	__u32 upbound;
+};
+
+/*
+ * The error string TLV is a null-terminated string describing the cause 
+ * of the request failure.  To simplify error processing (and to save space)
+ * the first character of the string can be a special error code character
+ * (lying by the range 0x80 to 0xFF) which represents a pre-defined reason.
+ */
+
+#define TIPC_CFG_TLV_ERROR      "\x80"  /* request contains incorrect TLV(s) */
+#define TIPC_CFG_NOT_NET_ADMIN  "\x81"	/* must be network administrator */
+#define TIPC_CFG_NOT_ZONE_MSTR	"\x82"	/* must be zone master */
+#define TIPC_CFG_NO_REMOTE	"\x83"	/* remote management not enabled */
+#define TIPC_CFG_NOT_SUPPORTED  "\x84"	/* request is not supported by TIPC */
+#define TIPC_CFG_INVALID_VALUE  "\x85"  /* request has invalid argument value */
+
+#if 0
+/* prototypes TLV structures for proposed commands */
+struct tipc_link_create {
+	__u32   domain;
+	struct tipc_media_addr peer_addr;
+	char bearer_name[MAX_BEARER_NAME];
+};
+
+struct tipc_route_info {
+	__u32 dest;
+	__u32 router;
+};
+#endif
+
+/*
+ * A TLV consists of a descriptor, followed by the TLV value.
+ * TLV descriptor fields are stored in network byte order; 
+ * TLV values must also be stored in network byte order (where applicable).
+ * TLV descriptors must be aligned to addresses which are multiple of 4,
+ * so up to 3 bytes of padding may exist at the end of the TLV value area.
+ * There must not be any padding between the TLV descriptor and its value.
+ */
+
+struct tlv_desc {
+	__u16 tlv_len;		/* TLV length (descriptor + value) */
+	__u16 tlv_type;		/* TLV identifier */
+};
+
+#define TLV_ALIGNTO 4
+
+#define TLV_ALIGN(datalen) (((datalen)+(TLV_ALIGNTO-1)) & ~(TLV_ALIGNTO-1))
+#define TLV_LENGTH(datalen) (sizeof(struct tlv_desc) + (datalen))
+#define TLV_SPACE(datalen) (TLV_ALIGN(TLV_LENGTH(datalen)))
+#define TLV_DATA(tlv) ((void *)((char *)(tlv) + TLV_LENGTH(0)))
+
+static inline int TLV_OK(const void *tlv, __u16 space)
+{
+	/*
+	 * Would also like to check that "tlv" is a multiple of 4,
+	 * but don't know how to do this in a portable way.
+	 * - Tried doing (!(tlv & (TLV_ALIGNTO-1))), but GCC compiler
+	 *   won't allow binary "&" with a pointer.
+	 * - Tried casting "tlv" to integer type, but causes warning about size
+	 *   mismatch when pointer is bigger than chosen type (int, long, ...).
+	 */
+
+	return (space >= TLV_SPACE(0)) &&
+		(ntohs(((struct tlv_desc *)tlv)->tlv_len) <= space);
+}
+
+static inline int TLV_CHECK(const void *tlv, __u16 space, __u16 exp_type)
+{
+	return TLV_OK(tlv, space) && 
+		(ntohs(((struct tlv_desc *)tlv)->tlv_type) == exp_type);
+}
+
+static inline int TLV_SET(void *tlv, __u16 type, void *data, __u16 len)
+{
+	struct tlv_desc *tlv_ptr;
+	int tlv_len;
+
+	tlv_len = TLV_LENGTH(len);
+	tlv_ptr = (struct tlv_desc *)tlv;
+	tlv_ptr->tlv_type = htons(type);
+	tlv_ptr->tlv_len  = htons(tlv_len);
+	if (len && data)
+		memcpy(TLV_DATA(tlv_ptr), data, tlv_len);
+	return TLV_SPACE(len);
+}
+
+/*
+ * A TLV list descriptor simplifies processing of messages 
+ * containing multiple TLVs.
+ */
+
+struct tlv_list_desc {
+	struct tlv_desc *tlv_ptr;	/* ptr to current TLV */
+	__u32 tlv_space;		/* # bytes from curr TLV to list end */
+};
+
+static inline void TLV_LIST_INIT(struct tlv_list_desc *list, 
+				 void *data, __u32 space)
+{
+	list->tlv_ptr = (struct tlv_desc *)data;
+	list->tlv_space = space;
+}
+	     
+static inline int TLV_LIST_EMPTY(struct tlv_list_desc *list)
+{ 
+	return (list->tlv_space == 0);
+}
+
+static inline int TLV_LIST_CHECK(struct tlv_list_desc *list, __u16 exp_type)
+{
+	return TLV_CHECK(list->tlv_ptr, list->tlv_space, exp_type);
+}
+
+static inline void *TLV_LIST_DATA(struct tlv_list_desc *list)
+{
+	return TLV_DATA(list->tlv_ptr);
+}
+
+static inline void TLV_LIST_STEP(struct tlv_list_desc *list)
+{
+	__u16 tlv_space = TLV_ALIGN(ntohs(list->tlv_ptr->tlv_len));
+
+        list->tlv_ptr = (struct tlv_desc *)((char *)list->tlv_ptr + tlv_space);
+	list->tlv_space -= tlv_space;
+}
+
+/*
+ * Configuration messages exchanged via NETLINK_GENERIC use the following
+ * family id, name, version and command.
+ */
+#define TIPC_GENL_FAMILY	0x222
+#define TIPC_GENL_NAME		"TIPC"
+#define TIPC_GENL_VERSION	0x1
+#define TIPC_GENL_CMD		0x1
+
+/*
+ * TIPC specific header used in NETLINK_GENERIC requests.
+ */
+struct tipc_genlmsghdr {
+	__u32 dest;		/* Destination address */
+	__u16 cmd;		/* Command */
+	__u16 reserved;		/* Unused */
+};
+
+#define TIPC_GENL_HDRLEN	NLMSG_ALIGN(sizeof(struct tipc_genlmsghdr))
+
+/*
+ * Configuration messages exchanged via TIPC sockets use the TIPC configuration 
+ * message header, which is defined below.  This structure is analogous 
+ * to the Netlink message header, but fields are stored in network byte order 
+ * and no padding is permitted between the header and the message data 
+ * that follows.
+ */
+
+struct tipc_cfg_msg_hdr
+{
+	__u32 tcm_len;		/* Message length (including header) */
+	__u16 tcm_type;		/* Command type */
+	__u16 tcm_flags;	/* Additional flags */
+	char  tcm_reserved[8];	/* Unused */
+};
+
+#define TCM_F_REQUEST	0x1	/* Flag: Request message */
+#define TCM_F_MORE	0x2	/* Flag: Message to be continued */
+
+#define TCM_ALIGN(datalen)  (((datalen)+3) & ~3)
+#define TCM_LENGTH(datalen) (sizeof(struct tipc_cfg_msg_hdr) + datalen)
+#define TCM_SPACE(datalen)  (TCM_ALIGN(TCM_LENGTH(datalen)))
+#define TCM_DATA(tcm_hdr)   ((void *)((char *)(tcm_hdr) + TCM_LENGTH(0)))
+
+static inline int TCM_SET(void *msg, __u16 cmd, __u16 flags,
+			  void *data, __u16 data_len)
+{
+	struct tipc_cfg_msg_hdr *tcm_hdr;
+	int msg_len;
+
+	msg_len = TCM_LENGTH(data_len);
+	tcm_hdr = (struct tipc_cfg_msg_hdr *)msg;
+	tcm_hdr->tcm_len   = htonl(msg_len);
+	tcm_hdr->tcm_type  = htons(cmd);
+	tcm_hdr->tcm_flags = htons(flags);
+	if (data_len && data)
+		memcpy(TCM_DATA(msg), data, data_len);
+	return TCM_SPACE(data_len);
+}
+
+#endif
diff --git a/include/net/tipc/tipc.h b/include/net/tipc/tipc.h
new file mode 100644
index 000000000000..1d4d8d0701e3
--- /dev/null
+++ b/include/net/tipc/tipc.h
@@ -0,0 +1,255 @@
+/*
+ * include/net/tipc/tipc.h: Main include file for TIPC users
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_TIPC_H_
+#define _NET_TIPC_H_
+
+#ifdef __KERNEL__
+
+#include <linux/tipc.h>
+#include <linux/skbuff.h>
+
+/* 
+ * Native API
+ * ----------
+ */
+
+/*
+ * TIPC operating mode routines
+ */
+
+u32 tipc_get_addr(void);
+
+#define TIPC_NOT_RUNNING  0
+#define TIPC_NODE_MODE    1
+#define TIPC_NET_MODE     2
+
+typedef void (*tipc_mode_event)(void *usr_handle, int mode, u32 addr);
+
+int tipc_attach(unsigned int *userref, tipc_mode_event, void *usr_handle);
+
+void tipc_detach(unsigned int userref);
+
+int tipc_get_mode(void);
+
+/*
+ * TIPC port manipulation routines
+ */
+
+typedef void (*tipc_msg_err_event) (void *usr_handle,
+				    u32 portref,
+				    struct sk_buff **buf,
+				    unsigned char const *data,
+				    unsigned int size,
+				    int reason, 
+				    struct tipc_portid const *attmpt_destid);
+
+typedef void (*tipc_named_msg_err_event) (void *usr_handle,
+					  u32 portref,
+					  struct sk_buff **buf,
+					  unsigned char const *data,
+					  unsigned int size,
+					  int reason, 
+					  struct tipc_name_seq const *attmpt_dest);
+
+typedef void (*tipc_conn_shutdown_event) (void *usr_handle,
+					  u32 portref,
+					  struct sk_buff **buf,
+					  unsigned char const *data,
+					  unsigned int size,
+					  int reason);
+
+typedef void (*tipc_msg_event) (void *usr_handle,
+				u32 portref,
+				struct sk_buff **buf,
+				unsigned char const *data,
+				unsigned int size,
+				unsigned int importance, 
+				struct tipc_portid const *origin);
+
+typedef void (*tipc_named_msg_event) (void *usr_handle,
+				      u32 portref,
+				      struct sk_buff **buf,
+				      unsigned char const *data,
+				      unsigned int size,
+				      unsigned int importance, 
+				      struct tipc_portid const *orig,
+				      struct tipc_name_seq const *dest);
+
+typedef void (*tipc_conn_msg_event) (void *usr_handle,
+				     u32 portref,
+				     struct sk_buff **buf,
+				     unsigned char const *data,
+				     unsigned int size);
+
+typedef void (*tipc_continue_event) (void *usr_handle, 
+				     u32 portref);
+
+int tipc_createport(unsigned int tipc_user, 
+		    void *usr_handle, 
+		    unsigned int importance, 
+		    tipc_msg_err_event error_cb, 
+		    tipc_named_msg_err_event named_error_cb, 
+		    tipc_conn_shutdown_event conn_error_cb, 
+		    tipc_msg_event message_cb, 
+		    tipc_named_msg_event named_message_cb, 
+		    tipc_conn_msg_event conn_message_cb, 
+		    tipc_continue_event continue_event_cb,/* May be zero */
+		    u32 *portref);
+
+int tipc_deleteport(u32 portref);
+
+int tipc_ownidentity(u32 portref, struct tipc_portid *port);
+
+int tipc_portimportance(u32 portref, unsigned int *importance);
+int tipc_set_portimportance(u32 portref, unsigned int importance);
+
+int tipc_portunreliable(u32 portref, unsigned int *isunreliable);
+int tipc_set_portunreliable(u32 portref, unsigned int isunreliable);
+
+int tipc_portunreturnable(u32 portref, unsigned int *isunreturnable);
+int tipc_set_portunreturnable(u32 portref, unsigned int isunreturnable);
+
+int tipc_publish(u32 portref, unsigned int scope, 
+		 struct tipc_name_seq const *name_seq);
+int tipc_withdraw(u32 portref, unsigned int scope,
+		  struct tipc_name_seq const *name_seq); /* 0: all */
+
+int tipc_connect2port(u32 portref, struct tipc_portid const *port);
+
+int tipc_disconnect(u32 portref);
+
+int tipc_shutdown(u32 ref); /* Sends SHUTDOWN msg */
+
+int tipc_isconnected(u32 portref, int *isconnected);
+
+int tipc_peer(u32 portref, struct tipc_portid *peer);
+
+int tipc_ref_valid(u32 portref); 
+
+/*
+ * TIPC messaging routines
+ */
+
+#define TIPC_PORT_IMPORTANCE 100	/* send using current port setting */
+
+
+int tipc_send(u32 portref,
+	      unsigned int num_sect,
+	      struct iovec const *msg_sect);
+
+int tipc_send_buf(u32 portref,
+		  struct sk_buff *buf,
+		  unsigned int dsz);
+
+int tipc_send2name(u32 portref, 
+		   struct tipc_name const *name, 
+		   u32 domain,	/* 0:own zone */
+		   unsigned int num_sect,
+		   struct iovec const *msg_sect);
+
+int tipc_send_buf2name(u32 portref,
+		       struct tipc_name const *name,
+		       u32 domain,
+		       struct sk_buff *buf,
+		       unsigned int dsz);
+
+int tipc_forward2name(u32 portref, 
+		      struct tipc_name const *name, 
+		      u32 domain,   /*0: own zone */
+		      unsigned int section_count,
+		      struct iovec const *msg_sect,
+		      struct tipc_portid const *origin,
+		      unsigned int importance);
+
+int tipc_forward_buf2name(u32 portref,
+			  struct tipc_name const *name,
+			  u32 domain,
+			  struct sk_buff *buf,
+			  unsigned int dsz,
+			  struct tipc_portid const *orig,
+			  unsigned int importance);
+
+int tipc_send2port(u32 portref,
+		   struct tipc_portid const *dest,
+		   unsigned int num_sect,
+		   struct iovec const *msg_sect);
+
+int tipc_send_buf2port(u32 portref,
+		       struct tipc_portid const *dest,
+		       struct sk_buff *buf,
+		       unsigned int dsz);
+
+int tipc_forward2port(u32 portref,
+		      struct tipc_portid const *dest,
+		      unsigned int num_sect,
+		      struct iovec const *msg_sect,
+		      struct tipc_portid const *origin,
+		      unsigned int importance);
+
+int tipc_forward_buf2port(u32 portref,
+			  struct tipc_portid const *dest,
+			  struct sk_buff *buf,
+			  unsigned int dsz,
+			  struct tipc_portid const *orig,
+			  unsigned int importance);
+
+int tipc_multicast(u32 portref, 
+		   struct tipc_name_seq const *seq, 
+		   u32 domain,	/* 0:own zone */
+		   unsigned int section_count,
+		   struct iovec const *msg);
+
+#if 0
+int tipc_multicast_buf(u32 portref, 
+		       struct tipc_name_seq const *seq, 
+		       u32 domain,	/* 0:own zone */
+		       void *buf,
+		       unsigned int size);
+#endif
+
+/*
+ * TIPC subscription routines
+ */
+
+int tipc_ispublished(struct tipc_name const *name);
+
+/*
+ * Get number of available nodes within specified domain (excluding own node)
+ */
+
+unsigned int tipc_available_nodes(const u32 domain);
+
+#endif
+
+#endif
diff --git a/include/net/tipc/tipc_bearer.h b/include/net/tipc/tipc_bearer.h
new file mode 100644
index 000000000000..a3daf697b6b6
--- /dev/null
+++ b/include/net/tipc/tipc_bearer.h
@@ -0,0 +1,92 @@
+/*
+ * include/net/tipc/tipc_bearer.h: Include file for privileged access to TIPC bearers
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_TIPC_BEARER_H_
+#define _NET_TIPC_BEARER_H_
+
+#ifdef __KERNEL__
+
+#include <linux/tipc.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+
+/**
+ * struct tipc_bearer - TIPC bearer info available to privileged users
+ * @usr_handle: pointer to additional user-defined information about bearer
+ * @mtu: max packet size bearer can support
+ * @blocked: non-zero if bearer is blocked
+ * @lock: spinlock for controlling access to bearer
+ * @addr: media-specific address associated with bearer
+ * @name: bearer name (format = media:interface)
+ * 
+ * Note: TIPC initializes "name" and "lock" fields; user is responsible for
+ * initialization all other fields when a bearer is enabled.
+ */
+
+struct tipc_bearer {
+	void *usr_handle;
+	u32 mtu;
+	int blocked;
+	spinlock_t lock;
+	struct tipc_media_addr addr;
+	char name[TIPC_MAX_BEARER_NAME];
+};
+
+
+int  tipc_register_media(u32 media_type,
+			 char *media_name, 
+			 int (*enable)(struct tipc_bearer *), 
+			 void (*disable)(struct tipc_bearer *), 
+			 int (*send_msg)(struct sk_buff *, 
+					 struct tipc_bearer *,
+					 struct tipc_media_addr *), 
+			 char *(*addr2str)(struct tipc_media_addr *a,
+					   char *str_buf,
+					   int str_size),
+			 struct tipc_media_addr *bcast_addr,
+			 const u32 bearer_priority,
+			 const u32 link_tolerance,  /* [ms] */
+			 const u32 send_window_limit); 
+
+void tipc_recv_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr);
+
+int  tipc_block_bearer(const char *name);
+void tipc_continue(struct tipc_bearer *tb_ptr); 
+
+int tipc_enable_bearer(const char *bearer_name, u32 bcast_scope, u32 priority);
+int tipc_disable_bearer(const char *name);
+
+
+#endif
+
+#endif
diff --git a/include/net/tipc/tipc_msg.h b/include/net/tipc/tipc_msg.h
new file mode 100644
index 000000000000..78513f5236bb
--- /dev/null
+++ b/include/net/tipc/tipc_msg.h
@@ -0,0 +1,220 @@
+/*
+ * include/net/tipc/tipc_msg.h: Include file for privileged access to TIPC message headers
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_TIPC_MSG_H_
+#define _NET_TIPC_MSG_H_
+
+#ifdef __KERNEL__
+
+struct tipc_msg {
+	u32 hdr[15];
+};
+
+
+/*
+		TIPC user data message header format, version 2:
+
+
+       1 0 9 8 7 6 5 4|3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8|7 6 5 4 3 2 1 0 
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w0:|vers | user  |hdr sz |n|d|s|-|          message size           |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w1:|mstyp| error |rer cnt|lsc|opt p|      broadcast ack no         |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w2:|        link level ack no      |   broadcast/link level seq no |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w3:|                       previous node                           |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w4:|                      originating port                         |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w5:|                      destination port                         |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+    
+   w6:|                      originating node                         |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w7:|                      destination node                         |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w8:|            name type / transport sequence number              |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w9:|              name instance/multicast lower bound              |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+    
+   wA:|                    multicast upper bound                      |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+    
+      /                                                               /
+      \                           options                             \
+      /                                                               /
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+*/
+
+#define TIPC_CONN_MSG	0
+#define TIPC_MCAST_MSG	1
+#define TIPC_NAMED_MSG	2
+#define TIPC_DIRECT_MSG	3
+
+
+static inline u32 msg_word(struct tipc_msg *m, u32 pos)
+{
+	return ntohl(m->hdr[pos]);
+}
+
+static inline u32 msg_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask)
+{
+	return (msg_word(m, w) >> pos) & mask;
+}
+
+static inline u32 msg_importance(struct tipc_msg *m)
+{
+	return msg_bits(m, 0, 25, 0xf);
+}
+
+static inline u32 msg_hdr_sz(struct tipc_msg *m)
+{
+	return msg_bits(m, 0, 21, 0xf) << 2;
+}
+
+static inline int msg_short(struct tipc_msg *m)
+{
+	return (msg_hdr_sz(m) == 24);
+}
+
+static inline u32 msg_size(struct tipc_msg *m)
+{
+	return msg_bits(m, 0, 0, 0x1ffff);
+}
+
+static inline u32 msg_data_sz(struct tipc_msg *m)
+{
+	return (msg_size(m) - msg_hdr_sz(m));
+}
+
+static inline unchar *msg_data(struct tipc_msg *m)
+{
+	return ((unchar *)m) + msg_hdr_sz(m);
+}
+
+static inline u32 msg_type(struct tipc_msg *m)
+{
+	return msg_bits(m, 1, 29, 0x7);
+}
+
+static inline u32 msg_direct(struct tipc_msg *m)
+{
+	return (msg_type(m) == TIPC_DIRECT_MSG);
+}
+
+static inline u32 msg_named(struct tipc_msg *m)
+{
+	return (msg_type(m) == TIPC_NAMED_MSG);
+}
+
+static inline u32 msg_mcast(struct tipc_msg *m)
+{
+	return (msg_type(m) == TIPC_MCAST_MSG);
+}
+
+static inline u32 msg_connected(struct tipc_msg *m)
+{
+	return (msg_type(m) == TIPC_CONN_MSG);
+}
+
+static inline u32 msg_errcode(struct tipc_msg *m)
+{
+	return msg_bits(m, 1, 25, 0xf);
+}
+
+static inline u32 msg_prevnode(struct tipc_msg *m)
+{
+	return msg_word(m, 3);
+}
+
+static inline u32 msg_origport(struct tipc_msg *m)
+{
+	return msg_word(m, 4);
+}
+
+static inline u32 msg_destport(struct tipc_msg *m)
+{
+	return msg_word(m, 5);
+}
+
+static inline u32 msg_mc_netid(struct tipc_msg *m)
+{
+	return msg_word(m, 5);
+}
+
+static inline u32 msg_orignode(struct tipc_msg *m)
+{
+	if (likely(msg_short(m)))
+		return msg_prevnode(m);
+	return msg_word(m, 6);
+}
+
+static inline u32 msg_destnode(struct tipc_msg *m)
+{
+	return msg_word(m, 7);
+}
+
+static inline u32 msg_nametype(struct tipc_msg *m)
+{
+	return msg_word(m, 8);
+}
+
+static inline u32 msg_nameinst(struct tipc_msg *m)
+{
+	return msg_word(m, 9);
+}
+
+static inline u32 msg_namelower(struct tipc_msg *m)
+{
+	return msg_nameinst(m);
+}
+
+static inline u32 msg_nameupper(struct tipc_msg *m)
+{
+	return msg_word(m, 10);
+}
+
+static inline char *msg_options(struct tipc_msg *m, u32 *len)
+{
+	u32 pos = msg_bits(m, 1, 16, 0x7);
+
+	if (!pos)
+		return 0;
+	pos = (pos * 4) + 28;
+	*len = msg_hdr_sz(m) - pos;
+	return (char *)&m->hdr[pos/4];
+}
+
+#endif
+
+#endif
diff --git a/include/net/tipc/tipc_port.h b/include/net/tipc/tipc_port.h
new file mode 100644
index 000000000000..ec0f0de7f0e2
--- /dev/null
+++ b/include/net/tipc/tipc_port.h
@@ -0,0 +1,105 @@
+/*
+ * include/net/tipc/tipc_port.h: Include file for privileged access to TIPC ports
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_TIPC_PORT_H_
+#define _NET_TIPC_PORT_H_
+
+#ifdef __KERNEL__
+
+#include <linux/tipc.h>
+#include <linux/skbuff.h>
+#include <net/tipc/tipc_msg.h>
+
+#define TIPC_FLOW_CONTROL_WIN 512
+
+/**
+ * struct tipc_port - native TIPC port info available to privileged users
+ * @usr_handle: pointer to additional user-defined information about port
+ * @lock: pointer to spinlock for controlling access to port
+ * @connected: non-zero if port is currently connected to a peer port
+ * @conn_type: TIPC type used when connection was established
+ * @conn_instance: TIPC instance used when connection was established
+ * @conn_unacked: number of unacknowledged messages received from peer port
+ * @published: non-zero if port has one or more associated names
+ * @congested: non-zero if cannot send because of link or port congestion
+ * @ref: unique reference to port in TIPC object registry
+ * @phdr: preformatted message header used when sending messages
+ */
+
+struct tipc_port {
+        void *usr_handle;
+        spinlock_t *lock;
+	int connected;
+        u32 conn_type;
+        u32 conn_instance;
+	u32 conn_unacked;
+	int published;
+	u32 congested;
+	u32 ref;
+	struct tipc_msg phdr;
+};
+
+
+/**
+ * tipc_createport_raw - create a native TIPC port and return it's reference
+ *
+ * Note: 'dispatcher' and 'wakeup' deliver a locked port.
+ */
+
+u32 tipc_createport_raw(void *usr_handle,
+			u32 (*dispatcher)(struct tipc_port *, struct sk_buff *),
+			void (*wakeup)(struct tipc_port *),
+			const u32 importance);
+
+/*
+ * tipc_set_msg_option(): port must be locked.
+ */
+int tipc_set_msg_option(struct tipc_port *tp_ptr,
+			const char *opt,
+			const u32 len);
+
+int tipc_reject_msg(struct sk_buff *buf, u32 err);
+
+int tipc_send_buf_fast(struct sk_buff *buf, u32 destnode);
+
+void tipc_acknowledge(u32 port_ref,u32 ack);
+
+struct tipc_port *tipc_get_port(const u32 ref);
+
+void *tipc_get_handle(const u32 ref);
+
+
+#endif
+
+#endif
+
diff --git a/net/Kconfig b/net/Kconfig
index 60f6f321bd76..9296b269d675 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -159,6 +159,7 @@ source "net/ipx/Kconfig"
 source "drivers/net/appletalk/Kconfig"
 source "net/x25/Kconfig"
 source "net/lapb/Kconfig"
+source "net/tipc/Kconfig"
 
 config NET_DIVERT
 	bool "Frame Diverter (EXPERIMENTAL)"
diff --git a/net/Makefile b/net/Makefile
index f5141b9d4f38..065796f5fb17 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -45,6 +45,7 @@ obj-$(CONFIG_VLAN_8021Q)	+= 8021q/
 obj-$(CONFIG_IP_DCCP)		+= dccp/
 obj-$(CONFIG_IP_SCTP)		+= sctp/
 obj-$(CONFIG_IEEE80211)		+= ieee80211/
+obj-$(CONFIG_TIPC)		+= tipc/
 
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_SYSCTL)		+= sysctl_net.o
diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig
new file mode 100644
index 000000000000..05ab18e62dee
--- /dev/null
+++ b/net/tipc/Kconfig
@@ -0,0 +1,112 @@
+#
+# TIPC configuration
+#
+
+menu "TIPC Configuration (EXPERIMENTAL)"
+	depends on INET && EXPERIMENTAL
+
+config TIPC
+	tristate "The TIPC Protocol (EXPERIMENTAL)"
+	---help---
+	  TBD.
+
+	  This protocol support is also available as a module ( = code which
+	  can be inserted in and removed from the running kernel whenever you
+	  want). The module will be called tipc. If you want to compile it
+	  as a module, say M here and read <file:Documentation/modules.txt>.
+
+	  If in doubt, say N.
+
+config TIPC_ADVANCED
+	bool "TIPC: Advanced configuration"
+	depends on TIPC
+	default n
+	help
+	  Saying Y here will open some advanced configuration
+          for TIPC. Most users do not need to bother, so if
+          unsure, just say N.
+
+config TIPC_ZONES
+	int "Maximum number of zones in network"
+	depends on TIPC && TIPC_ADVANCED
+	default "3"
+	help
+	 Max number of zones inside TIPC network. Max supported value 
+         is 255 zones, minimum is 1
+
+	 Default is 3 zones in a network; setting this to higher
+	 allows more zones but might use more memory.
+
+config TIPC_CLUSTERS
+	int "Maximum number of clusters in a zone"
+	depends on TIPC && TIPC_ADVANCED
+	default "1"
+	help
+          ***Only 1 (one cluster in a zone) is supported by current code.
+          Any value set here will be overridden.***
+
+          (Max number of clusters inside TIPC zone. Max supported 
+          value is 4095 clusters, minimum is 1.
+
+	  Default is 1; setting this to smaller value might save 
+          some memory, setting it to higher
+	  allows more clusters and might consume more memory.)
+
+config TIPC_NODES
+	int "Maximum number of nodes in cluster"
+	depends on TIPC && TIPC_ADVANCED
+	default "255"
+	help
+	  Maximum number of nodes inside a TIPC cluster. Maximum 
+          supported value is 2047 nodes, minimum is 8. 
+
+	  Setting this to a smaller value saves some memory, 
+	  setting it to higher allows more nodes.
+
+config TIPC_SLAVE_NODES
+	int "Maximum number of slave nodes in cluster"
+	depends on TIPC && TIPC_ADVANCED
+	default "0"
+	help
+          ***This capability is not supported by current code.***
+	  
+	  Maximum number of slave nodes inside a TIPC cluster. Maximum 
+          supported value is 2047 nodes, minimum is 0. 
+
+	  Setting this to a smaller value saves some memory, 
+	  setting it to higher allows more nodes.
+
+config TIPC_PORTS
+	int "Maximum number of ports in a node"
+	depends on TIPC && TIPC_ADVANCED
+	default "8191"
+	help
+	  Maximum number of ports within a node. Maximum 
+          supported value is 64535 nodes, minimum is 127. 
+
+	  Setting this to a smaller value saves some memory, 
+	  setting it to higher allows more ports.
+
+config TIPC_LOG
+	int "Size of log buffer"
+	depends on TIPC && TIPC_ADVANCED
+	default 0
+	help
+ 	  Size (in bytes) of TIPC's internal log buffer, which records the
+	  occurrence of significant events.  Maximum supported value
+	  is 32768 bytes, minimum is 0.
+
+	  There is no need to enable the log buffer unless the node will be
+	  managed remotely via TIPC.
+
+config TIPC_DEBUG
+	bool "Enable debugging support"
+	depends on TIPC
+	default n
+	help
+ 	  This will enable debugging of TIPC.
+
+	  Only say Y here if you are having trouble with TIPC.  It will
+	  enable the display of detailed information about what is going on.
+
+endmenu
diff --git a/net/tipc/Makefile b/net/tipc/Makefile
new file mode 100644
index 000000000000..dceb7027946c
--- /dev/null
+++ b/net/tipc/Makefile
@@ -0,0 +1,13 @@
+#
+# Makefile for the Linux TIPC layer
+#
+
+obj-$(CONFIG_TIPC) := tipc.o
+
+tipc-y	+= addr.o bcast.o bearer.o config.o cluster.o \
+	   core.o handler.o link.o discover.o msg.o  \
+	   name_distr.o  subscr.o name_table.o net.o  \
+	   netlink.o node.o node_subscr.o port.o ref.o  \
+	   socket.o user_reg.o zone.o dbg.o eth_media.o
+
+# End of file
diff --git a/net/tipc/addr.c b/net/tipc/addr.c
new file mode 100644
index 000000000000..bc353639562a
--- /dev/null
+++ b/net/tipc/addr.c
@@ -0,0 +1,91 @@
+/*
+ * net/tipc/addr.c: TIPC address utility routines
+ *     
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2004-2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "dbg.h"
+#include "addr.h"
+#include "zone.h"
+#include "cluster.h"
+#include "net.h"
+
+u32 tipc_get_addr(void)
+{
+	return tipc_own_addr;
+}
+
+/**
+ * addr_domain_valid - validates a network domain address
+ * 
+ * Accepts <Z.C.N>, <Z.C.0>, <Z.0.0>, and <0.0.0>, 
+ * where Z, C, and N are non-zero and do not exceed the configured limits.
+ * 
+ * Returns 1 if domain address is valid, otherwise 0
+ */
+
+int addr_domain_valid(u32 addr)
+{
+	u32 n = tipc_node(addr);
+	u32 c = tipc_cluster(addr);
+	u32 z = tipc_zone(addr);
+	u32 max_nodes = tipc_max_nodes;
+
+	if (is_slave(addr))
+		max_nodes = LOWEST_SLAVE + tipc_max_slaves;
+	if (n > max_nodes)
+		return 0;
+	if (c > tipc_max_clusters)
+		return 0;
+	if (z > tipc_max_zones)
+		return 0;
+
+	if (n && (!z || !c))
+		return 0;
+	if (c && !z)
+		return 0;
+	return 1;
+}
+
+/**
+ * addr_node_valid - validates a proposed network address for this node
+ * 
+ * Accepts <Z.C.N>, where Z, C, and N are non-zero and do not exceed 
+ * the configured limits.
+ * 
+ * Returns 1 if address can be used, otherwise 0
+ */
+
+int addr_node_valid(u32 addr)
+{
+	return (addr_domain_valid(addr) && tipc_node(addr));
+}
+
diff --git a/net/tipc/addr.h b/net/tipc/addr.h
new file mode 100644
index 000000000000..9dabebcba6de
--- /dev/null
+++ b/net/tipc/addr.h
@@ -0,0 +1,125 @@
+/*
+ * net/tipc/addr.h: Include file for TIPC address utility routines
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2004-2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_ADDR_H
+#define _TIPC_ADDR_H
+
+static inline u32 own_node(void)
+{
+	return tipc_node(tipc_own_addr);
+}
+
+static inline u32 own_cluster(void)
+{
+	return tipc_cluster(tipc_own_addr);
+}
+
+static inline u32 own_zone(void)
+{
+	return tipc_zone(tipc_own_addr);
+}
+
+static inline int in_own_cluster(u32 addr)
+{
+	return !((addr ^ tipc_own_addr) >> 12);
+}
+
+static inline int in_own_zone(u32 addr)
+{
+	return !((addr ^ tipc_own_addr) >> 24);
+}
+
+static inline int is_slave(u32 addr)
+{
+	return addr & 0x800;
+}
+
+static inline int may_route(u32 addr)
+{
+	return(addr ^ tipc_own_addr) >> 11;
+}
+
+static inline int in_scope(u32 domain, u32 addr)
+{
+	if (!domain || (domain == addr))
+		return 1;
+	if (domain == (addr & 0xfffff000u)) /* domain <Z.C.0> */
+		return 1;
+	if (domain == (addr & 0xff000000u)) /* domain <Z.0.0> */
+		return 1;
+	return 0;
+}
+
+/**
+ * addr_scope - convert message lookup domain to equivalent 2-bit scope value
+ */
+
+static inline int addr_scope(u32 domain)
+{
+	if (likely(!domain))
+		return TIPC_ZONE_SCOPE;
+	if (tipc_node(domain))
+		return TIPC_NODE_SCOPE;
+	if (tipc_cluster(domain))
+		return TIPC_CLUSTER_SCOPE;
+	return TIPC_ZONE_SCOPE;
+}
+
+/**
+ * addr_domain - convert 2-bit scope value to equivalent message lookup domain
+ *  
+ * Needed when address of a named message must be looked up a second time 
+ * after a network hop.
+ */
+
+static inline int addr_domain(int sc)
+{
+	if (likely(sc == TIPC_NODE_SCOPE))
+		return tipc_own_addr;
+	if (sc == TIPC_CLUSTER_SCOPE)
+		return tipc_addr(tipc_zone(tipc_own_addr),
+				 tipc_cluster(tipc_own_addr), 0);
+	return tipc_addr(tipc_zone(tipc_own_addr), 0, 0);
+}
+
+static inline char *addr_string_fill(char *string, u32 addr)
+{
+	snprintf(string, 16, "<%u.%u.%u>",
+		 tipc_zone(addr), tipc_cluster(addr), tipc_node(addr));
+	return string;
+}
+
+int addr_domain_valid(u32);
+int addr_node_valid(u32 addr);
+
+#endif
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
new file mode 100644
index 000000000000..35ca90667a59
--- /dev/null
+++ b/net/tipc/bcast.c
@@ -0,0 +1,803 @@
+/*
+ * net/tipc/bcast.c: TIPC broadcast code
+ *     
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2004, Intel Corporation.
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "msg.h"
+#include "dbg.h"
+#include "link.h"
+#include "net.h"
+#include "node.h"
+#include "port.h"
+#include "addr.h"
+#include "node_subscr.h"
+#include "name_distr.h"
+#include "bearer.h"
+#include "name_table.h"
+#include "bcast.h"
+
+
+#define MAX_PKT_DEFAULT_MCAST 1500	/* bcast link max packet size (fixed) */
+
+#define BCLINK_WIN_DEFAULT 20		/* bcast link window size (default) */
+
+#define BCLINK_LOG_BUF_SIZE 0
+
+/**
+ * struct bcbearer_pair - a pair of bearers used by broadcast link
+ * @primary: pointer to primary bearer
+ * @secondary: pointer to secondary bearer
+ * 
+ * Bearers must have same priority and same set of reachable destinations 
+ * to be paired.
+ */
+
+struct bcbearer_pair {
+	struct bearer *primary;
+	struct bearer *secondary;
+};
+
+/**
+ * struct bcbearer - bearer used by broadcast link
+ * @bearer: (non-standard) broadcast bearer structure
+ * @media: (non-standard) broadcast media structure
+ * @bpairs: array of bearer pairs
+ * @bpairs_temp: array of bearer pairs used during creation of "bpairs"
+ */
+
+struct bcbearer {
+	struct bearer bearer;
+	struct media media;
+	struct bcbearer_pair bpairs[MAX_BEARERS];
+	struct bcbearer_pair bpairs_temp[TIPC_NUM_LINK_PRI];
+};
+
+/**
+ * struct bclink - link used for broadcast messages
+ * @link: (non-standard) broadcast link structure
+ * @node: (non-standard) node structure representing b'cast link's peer node
+ * 
+ * Handles sequence numbering, fragmentation, bundling, etc.
+ */
+
+struct bclink {
+	struct link link;
+	struct node node;
+};
+
+
+static struct bcbearer *bcbearer = NULL;
+static struct bclink *bclink = NULL;
+static struct link *bcl = NULL;
+static spinlock_t bc_lock = SPIN_LOCK_UNLOCKED;
+
+char bc_link_name[] = "multicast-link";
+
+
+static inline u32 buf_seqno(struct sk_buff *buf)
+{
+	return msg_seqno(buf_msg(buf));
+} 
+
+static inline u32 bcbuf_acks(struct sk_buff *buf)
+{
+	return (u32)TIPC_SKB_CB(buf)->handle;
+}
+
+static inline void bcbuf_set_acks(struct sk_buff *buf, u32 acks)
+{
+	TIPC_SKB_CB(buf)->handle = (void *)acks;
+}
+
+static inline void bcbuf_decr_acks(struct sk_buff *buf)
+{
+	bcbuf_set_acks(buf, bcbuf_acks(buf) - 1);
+}
+
+
+/** 
+ * bclink_set_gap - set gap according to contents of current deferred pkt queue
+ * 
+ * Called with 'node' locked, bc_lock unlocked
+ */
+
+static inline void bclink_set_gap(struct node *n_ptr)
+{
+	struct sk_buff *buf = n_ptr->bclink.deferred_head;
+
+	n_ptr->bclink.gap_after = n_ptr->bclink.gap_to =
+		mod(n_ptr->bclink.last_in);
+	if (unlikely(buf != NULL))
+		n_ptr->bclink.gap_to = mod(buf_seqno(buf) - 1);
+}
+
+/** 
+ * bclink_ack_allowed - test if ACK or NACK message can be sent at this moment
+ * 
+ * This mechanism endeavours to prevent all nodes in network from trying
+ * to ACK or NACK at the same time.
+ * 
+ * Note: TIPC uses a different trigger to distribute ACKs than it does to
+ *       distribute NACKs, but tries to use the same spacing (divide by 16). 
+ */
+
+static inline int bclink_ack_allowed(u32 n)
+{
+	return((n % TIPC_MIN_LINK_WIN) == tipc_own_tag);
+}
+
+
+/** 
+ * bclink_retransmit_pkt - retransmit broadcast packets
+ * @after: sequence number of last packet to *not* retransmit
+ * @to: sequence number of last packet to retransmit
+ * 
+ * Called with 'node' locked, bc_lock unlocked
+ */
+
+static void bclink_retransmit_pkt(u32 after, u32 to)
+{
+	struct sk_buff *buf;
+
+	spin_lock_bh(&bc_lock);
+	buf = bcl->first_out;
+	while (buf && less_eq(buf_seqno(buf), after)) {
+		buf = buf->next;                
+	}
+	if (buf != NULL)
+		link_retransmit(bcl, buf, mod(to - after));
+	spin_unlock_bh(&bc_lock);              
+}
+
+/** 
+ * bclink_acknowledge - handle acknowledgement of broadcast packets
+ * @n_ptr: node that sent acknowledgement info
+ * @acked: broadcast sequence # that has been acknowledged
+ * 
+ * Node is locked, bc_lock unlocked.
+ */
+
+void bclink_acknowledge(struct node *n_ptr, u32 acked)
+{
+	struct sk_buff *crs;
+	struct sk_buff *next;
+	unsigned int released = 0;
+
+	if (less_eq(acked, n_ptr->bclink.acked))
+		return;
+
+	spin_lock_bh(&bc_lock);
+
+	/* Skip over packets that node has previously acknowledged */
+
+	crs = bcl->first_out;
+	while (crs && less_eq(buf_seqno(crs), n_ptr->bclink.acked)) {
+		crs = crs->next;
+	}
+
+	/* Update packets that node is now acknowledging */
+
+	while (crs && less_eq(buf_seqno(crs), acked)) {
+		next = crs->next;
+		bcbuf_decr_acks(crs);
+		if (bcbuf_acks(crs) == 0) {
+			bcl->first_out = next;
+			bcl->out_queue_size--;
+			buf_discard(crs);
+			released = 1;
+		}
+		crs = next;
+	}
+	n_ptr->bclink.acked = acked;
+
+	/* Try resolving broadcast link congestion, if necessary */
+
+	if (unlikely(bcl->next_out))
+		link_push_queue(bcl);
+	if (unlikely(released && !list_empty(&bcl->waiting_ports)))
+		link_wakeup_ports(bcl, 0);
+	spin_unlock_bh(&bc_lock);
+}
+
+/** 
+ * bclink_send_ack - unicast an ACK msg
+ * 
+ * net_lock and node lock set
+ */
+
+static void bclink_send_ack(struct node *n_ptr)
+{
+	struct link *l_ptr = n_ptr->active_links[n_ptr->addr & 1];
+
+	if (l_ptr != NULL)
+		link_send_proto_msg(l_ptr, STATE_MSG, 0, 0, 0, 0, 0);
+}
+
+/** 
+ * bclink_send_nack- broadcast a NACK msg
+ * 
+ * net_lock and node lock set
+ */
+
+static void bclink_send_nack(struct node *n_ptr)
+{
+	struct sk_buff *buf;
+	struct tipc_msg *msg;
+
+	if (!less(n_ptr->bclink.gap_after, n_ptr->bclink.gap_to))
+		return;
+
+	buf = buf_acquire(INT_H_SIZE);
+	if (buf) {
+		msg = buf_msg(buf);
+		msg_init(msg, BCAST_PROTOCOL, STATE_MSG,
+			 TIPC_OK, INT_H_SIZE, n_ptr->addr);
+		msg_set_mc_netid(msg, tipc_net_id);
+		msg_set_bcast_ack(msg, mod(n_ptr->bclink.last_in)); 
+		msg_set_bcgap_after(msg, n_ptr->bclink.gap_after);
+		msg_set_bcgap_to(msg, n_ptr->bclink.gap_to);
+		msg_set_bcast_tag(msg, tipc_own_tag);
+
+		if (bearer_send(&bcbearer->bearer, buf, 0)) {
+			bcl->stats.sent_nacks++;
+			buf_discard(buf);
+		} else {
+			bearer_schedule(bcl->b_ptr, bcl);
+			bcl->proto_msg_queue = buf;
+			bcl->stats.bearer_congs++;
+		}
+
+		/* 
+		 * Ensure we doesn't send another NACK msg to the node
+		 * until 16 more deferred messages arrive from it
+		 * (i.e. helps prevent all nodes from NACK'ing at same time)
+		 */
+		
+		n_ptr->bclink.nack_sync = tipc_own_tag;
+	}
+}
+
+/** 
+ * bclink_check_gap - send a NACK if a sequence gap exists
+ *
+ * net_lock and node lock set
+ */
+
+void bclink_check_gap(struct node *n_ptr, u32 last_sent)
+{
+	if (!n_ptr->bclink.supported ||
+	    less_eq(last_sent, mod(n_ptr->bclink.last_in)))
+		return;
+
+	bclink_set_gap(n_ptr);
+	if (n_ptr->bclink.gap_after == n_ptr->bclink.gap_to)
+		n_ptr->bclink.gap_to = last_sent;
+	bclink_send_nack(n_ptr);
+}
+
+/** 
+ * bclink_peek_nack - process a NACK msg meant for another node
+ * 
+ * Only net_lock set.
+ */
+
+void bclink_peek_nack(u32 dest, u32 sender_tag, u32 gap_after, u32 gap_to)
+{
+	struct node *n_ptr = node_find(dest);
+	u32 my_after, my_to;
+
+	if (unlikely(!n_ptr || !node_is_up(n_ptr)))
+		return;
+	node_lock(n_ptr);
+	/*
+	 * Modify gap to suppress unnecessary NACKs from this node
+	 */
+	my_after = n_ptr->bclink.gap_after;
+	my_to = n_ptr->bclink.gap_to;
+
+	if (less_eq(gap_after, my_after)) {
+		if (less(my_after, gap_to) && less(gap_to, my_to))
+			n_ptr->bclink.gap_after = gap_to;
+		else if (less_eq(my_to, gap_to))
+			n_ptr->bclink.gap_to = n_ptr->bclink.gap_after;
+	} else if (less_eq(gap_after, my_to)) {
+		if (less_eq(my_to, gap_to))
+			n_ptr->bclink.gap_to = gap_after;
+	} else {
+		/* 
+		 * Expand gap if missing bufs not in deferred queue:
+		 */
+		struct sk_buff *buf = n_ptr->bclink.deferred_head;
+		u32 prev = n_ptr->bclink.gap_to;
+
+		for (; buf; buf = buf->next) {
+			u32 seqno = buf_seqno(buf);
+
+			if (mod(seqno - prev) != 1)
+				buf = NULL;
+			if (seqno == gap_after)
+				break;
+			prev = seqno;
+		}
+		if (buf == NULL)
+			n_ptr->bclink.gap_to = gap_after;
+	}
+	/*
+	 * Some nodes may send a complementary NACK now:
+	 */ 
+	if (bclink_ack_allowed(sender_tag + 1)) {
+		if (n_ptr->bclink.gap_to != n_ptr->bclink.gap_after) {
+			bclink_send_nack(n_ptr);
+			bclink_set_gap(n_ptr);
+		}
+	}
+	node_unlock(n_ptr);
+}
+
+/**
+ * bclink_send_msg - broadcast a packet to all nodes in cluster
+ */
+
+int bclink_send_msg(struct sk_buff *buf)
+{
+	int res;
+
+	spin_lock_bh(&bc_lock);
+
+	res = link_send_buf(bcl, buf);
+	if (unlikely(res == -ELINKCONG))
+		buf_discard(buf);
+	else
+		bcl->stats.sent_info++;
+
+	if (bcl->out_queue_size > bcl->stats.max_queue_sz)
+		bcl->stats.max_queue_sz = bcl->out_queue_size;
+	bcl->stats.queue_sz_counts++;
+	bcl->stats.accu_queue_sz += bcl->out_queue_size;
+
+	spin_unlock_bh(&bc_lock);
+	return res;
+}
+
+/**
+ * bclink_recv_pkt - receive a broadcast packet, and deliver upwards
+ * 
+ * net_lock is read_locked, no other locks set
+ */
+
+void bclink_recv_pkt(struct sk_buff *buf)
+{        
+	struct tipc_msg *msg = buf_msg(buf);
+	struct node* node = node_find(msg_prevnode(msg));
+	u32 next_in;
+	u32 seqno;
+	struct sk_buff *deferred;
+
+	msg_dbg(msg, "<BC<<<");
+
+	if (unlikely(!node || !node_is_up(node) || !node->bclink.supported || 
+		     (msg_mc_netid(msg) != tipc_net_id))) {
+		buf_discard(buf);
+		return;
+	}
+
+	if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) {
+		msg_dbg(msg, "<BCNACK<<<");
+		if (msg_destnode(msg) == tipc_own_addr) {
+			node_lock(node);
+			bclink_acknowledge(node, msg_bcast_ack(msg));
+			node_unlock(node);
+			bcl->stats.recv_nacks++;
+			bclink_retransmit_pkt(msg_bcgap_after(msg),
+					      msg_bcgap_to(msg));
+		} else {
+			bclink_peek_nack(msg_destnode(msg),
+					 msg_bcast_tag(msg),
+					 msg_bcgap_after(msg),
+					 msg_bcgap_to(msg));
+		}
+		buf_discard(buf);
+		return;
+	}
+
+	node_lock(node);
+receive:
+	deferred = node->bclink.deferred_head;
+	next_in = mod(node->bclink.last_in + 1);
+	seqno = msg_seqno(msg);
+
+	if (likely(seqno == next_in)) {
+		bcl->stats.recv_info++;
+		node->bclink.last_in++;
+		bclink_set_gap(node);
+		if (unlikely(bclink_ack_allowed(seqno))) {
+			bclink_send_ack(node);
+			bcl->stats.sent_acks++;
+		}
+		if (likely(msg_isdata(msg))) {
+			node_unlock(node);
+			port_recv_mcast(buf, NULL);
+		} else if (msg_user(msg) == MSG_BUNDLER) {
+			bcl->stats.recv_bundles++;
+			bcl->stats.recv_bundled += msg_msgcnt(msg);
+			node_unlock(node);
+			link_recv_bundle(buf);
+		} else if (msg_user(msg) == MSG_FRAGMENTER) {
+			bcl->stats.recv_fragments++;
+			if (link_recv_fragment(&node->bclink.defragm,
+					       &buf, &msg))
+				bcl->stats.recv_fragmented++;
+			node_unlock(node);
+			net_route_msg(buf);
+		} else {
+			node_unlock(node);
+			net_route_msg(buf);
+		}
+		if (deferred && (buf_seqno(deferred) == mod(next_in + 1))) {
+			node_lock(node);
+			buf = deferred;
+			msg = buf_msg(buf);
+			node->bclink.deferred_head = deferred->next;
+			goto receive;
+		}
+		return;
+	} else if (less(next_in, seqno)) {
+		u32 gap_after = node->bclink.gap_after;
+		u32 gap_to = node->bclink.gap_to;
+
+		if (link_defer_pkt(&node->bclink.deferred_head,
+				   &node->bclink.deferred_tail,
+				   buf)) {
+			node->bclink.nack_sync++;
+			bcl->stats.deferred_recv++;
+			if (seqno == mod(gap_after + 1))
+				node->bclink.gap_after = seqno;
+			else if (less(gap_after, seqno) && less(seqno, gap_to))
+				node->bclink.gap_to = seqno;
+		}
+		if (bclink_ack_allowed(node->bclink.nack_sync)) {
+			if (gap_to != gap_after)
+				bclink_send_nack(node);
+			bclink_set_gap(node);
+		}
+	} else {
+		bcl->stats.duplicates++;
+		buf_discard(buf);
+	}
+	node_unlock(node);
+}
+
+u32 bclink_get_last_sent(void)
+{
+	u32 last_sent = mod(bcl->next_out_no - 1);
+
+	if (bcl->next_out)
+		last_sent = mod(buf_seqno(bcl->next_out) - 1);
+	return last_sent;
+}
+
+u32 bclink_acks_missing(struct node *n_ptr)
+{
+	return (n_ptr->bclink.supported &&
+		(bclink_get_last_sent() != n_ptr->bclink.acked));
+}
+
+
+/**
+ * bcbearer_send - send a packet through the broadcast pseudo-bearer
+ * 
+ * Send through as many bearers as necessary to reach all nodes
+ * that support TIPC multicasting.
+ * 
+ * Returns 0 if packet sent successfully, non-zero if not
+ */
+
+int bcbearer_send(struct sk_buff *buf,
+		  struct tipc_bearer *unused1,
+		  struct tipc_media_addr *unused2)
+{
+	static int send_count = 0;
+
+	struct node_map remains;
+	struct node_map remains_new;
+	int bp_index;
+	int swap_time;
+
+	/* Prepare buffer for broadcasting (if first time trying to send it) */
+
+	if (likely(!msg_non_seq(buf_msg(buf)))) {
+		struct tipc_msg *msg;
+
+		assert(cluster_bcast_nodes.count != 0);
+		bcbuf_set_acks(buf, cluster_bcast_nodes.count);
+		msg = buf_msg(buf);
+		msg_set_non_seq(msg);
+		msg_set_mc_netid(msg, tipc_net_id);
+	}
+
+	/* Determine if bearer pairs should be swapped following this attempt */
+
+	if ((swap_time = (++send_count >= 10)))
+		send_count = 0;
+
+	/* Send buffer over bearers until all targets reached */
+	
+	remains = cluster_bcast_nodes;
+
+	for (bp_index = 0; bp_index < MAX_BEARERS; bp_index++) {
+		struct bearer *p = bcbearer->bpairs[bp_index].primary;
+		struct bearer *s = bcbearer->bpairs[bp_index].secondary;
+
+		if (!p)
+			break;	/* no more bearers to try */
+
+		nmap_diff(&remains, &p->nodes, &remains_new);
+		if (remains_new.count == remains.count)
+			continue;	/* bearer pair doesn't add anything */
+
+		if (!p->publ.blocked &&
+		    !p->media->send_msg(buf, &p->publ, &p->media->bcast_addr)) {
+			if (swap_time && s && !s->publ.blocked)
+				goto swap;
+			else
+				goto update;
+		}
+
+		if (!s || s->publ.blocked ||
+		    s->media->send_msg(buf, &s->publ, &s->media->bcast_addr))
+			continue;	/* unable to send using bearer pair */
+swap:
+		bcbearer->bpairs[bp_index].primary = s;
+		bcbearer->bpairs[bp_index].secondary = p;
+update:
+		if (remains_new.count == 0)
+			return TIPC_OK;
+
+		remains = remains_new;
+	}
+	
+	/* Unable to reach all targets */
+
+	bcbearer->bearer.publ.blocked = 1;
+	bcl->stats.bearer_congs++;
+	return ~TIPC_OK;
+}
+
+/**
+ * bcbearer_sort - create sets of bearer pairs used by broadcast bearer
+ */
+
+void bcbearer_sort(void)
+{
+	struct bcbearer_pair *bp_temp = bcbearer->bpairs_temp;
+	struct bcbearer_pair *bp_curr;
+	int b_index;
+	int pri;
+
+	spin_lock_bh(&bc_lock);
+
+	/* Group bearers by priority (can assume max of two per priority) */
+
+	memset(bp_temp, 0, sizeof(bcbearer->bpairs_temp));
+
+	for (b_index = 0; b_index < MAX_BEARERS; b_index++) {
+		struct bearer *b = &bearers[b_index];
+
+		if (!b->active || !b->nodes.count)
+			continue;
+
+		if (!bp_temp[b->priority].primary)
+			bp_temp[b->priority].primary = b;
+		else
+			bp_temp[b->priority].secondary = b;
+	}
+
+	/* Create array of bearer pairs for broadcasting */
+
+	bp_curr = bcbearer->bpairs;
+	memset(bcbearer->bpairs, 0, sizeof(bcbearer->bpairs));
+
+	for (pri = (TIPC_NUM_LINK_PRI - 1); pri >= 0; pri--) {
+
+		if (!bp_temp[pri].primary)
+			continue;
+
+		bp_curr->primary = bp_temp[pri].primary;
+
+		if (bp_temp[pri].secondary) {
+			if (nmap_equal(&bp_temp[pri].primary->nodes,
+				       &bp_temp[pri].secondary->nodes)) {
+				bp_curr->secondary = bp_temp[pri].secondary;
+			} else {
+				bp_curr++;
+				bp_curr->primary = bp_temp[pri].secondary;
+			}
+		}
+
+		bp_curr++;
+	}
+
+	spin_unlock_bh(&bc_lock);
+}
+
+/**
+ * bcbearer_push - resolve bearer congestion
+ * 
+ * Forces bclink to push out any unsent packets, until all packets are gone
+ * or congestion reoccurs.
+ * No locks set when function called
+ */
+
+void bcbearer_push(void)
+{
+	struct bearer *b_ptr;
+
+	spin_lock_bh(&bc_lock);
+	b_ptr = &bcbearer->bearer;
+	if (b_ptr->publ.blocked) {
+		b_ptr->publ.blocked = 0;
+		bearer_lock_push(b_ptr);
+	}
+	spin_unlock_bh(&bc_lock);
+}
+
+
+int bclink_stats(char *buf, const u32 buf_size)
+{
+	struct print_buf pb;
+
+	if (!bcl)
+		return 0;
+
+	printbuf_init(&pb, buf, buf_size);
+
+	spin_lock_bh(&bc_lock);
+
+	tipc_printf(&pb, "Link <%s>\n"
+		         "  Window:%u packets\n", 
+		    bcl->name, bcl->queue_limit[0]);
+	tipc_printf(&pb, "  RX packets:%u fragments:%u/%u bundles:%u/%u\n", 
+		    bcl->stats.recv_info,
+		    bcl->stats.recv_fragments,
+		    bcl->stats.recv_fragmented,
+		    bcl->stats.recv_bundles,
+		    bcl->stats.recv_bundled);
+	tipc_printf(&pb, "  TX packets:%u fragments:%u/%u bundles:%u/%u\n", 
+		    bcl->stats.sent_info,
+		    bcl->stats.sent_fragments,
+		    bcl->stats.sent_fragmented, 
+		    bcl->stats.sent_bundles,
+		    bcl->stats.sent_bundled);
+	tipc_printf(&pb, "  RX naks:%u defs:%u dups:%u\n", 
+		    bcl->stats.recv_nacks,
+		    bcl->stats.deferred_recv, 
+		    bcl->stats.duplicates);
+	tipc_printf(&pb, "  TX naks:%u acks:%u dups:%u\n", 
+		    bcl->stats.sent_nacks, 
+		    bcl->stats.sent_acks, 
+		    bcl->stats.retransmitted);
+	tipc_printf(&pb, "  Congestion bearer:%u link:%u  Send queue max:%u avg:%u\n",
+		    bcl->stats.bearer_congs,
+		    bcl->stats.link_congs,
+		    bcl->stats.max_queue_sz,
+		    bcl->stats.queue_sz_counts
+		    ? (bcl->stats.accu_queue_sz / bcl->stats.queue_sz_counts)
+		    : 0);
+
+	spin_unlock_bh(&bc_lock);
+	return printbuf_validate(&pb);
+}
+
+int bclink_reset_stats(void)
+{
+	if (!bcl)
+		return -ENOPROTOOPT;
+
+	spin_lock_bh(&bc_lock);
+	memset(&bcl->stats, 0, sizeof(bcl->stats));
+	spin_unlock_bh(&bc_lock);
+	return TIPC_OK;
+}
+
+int bclink_set_queue_limits(u32 limit)
+{
+	if (!bcl)
+		return -ENOPROTOOPT;
+	if ((limit < TIPC_MIN_LINK_WIN) || (limit > TIPC_MAX_LINK_WIN))
+		return -EINVAL;
+
+	spin_lock_bh(&bc_lock);
+	link_set_queue_limits(bcl, limit);
+	spin_unlock_bh(&bc_lock);
+	return TIPC_OK;
+}
+
+int bclink_init(void)
+{
+	bcbearer = kmalloc(sizeof(*bcbearer), GFP_ATOMIC);
+	bclink = kmalloc(sizeof(*bclink), GFP_ATOMIC);
+	if (!bcbearer || !bclink) {
+ nomem:
+	 	warn("Memory squeeze; Failed to create multicast link\n");
+		kfree(bcbearer);
+		bcbearer = NULL;
+		kfree(bclink);
+		bclink = NULL;
+		return -ENOMEM;
+	}
+
+	memset(bcbearer, 0, sizeof(struct bcbearer));
+	INIT_LIST_HEAD(&bcbearer->bearer.cong_links);
+	bcbearer->bearer.media = &bcbearer->media;
+	bcbearer->media.send_msg = bcbearer_send;
+	sprintf(bcbearer->media.name, "tipc-multicast");
+
+	bcl = &bclink->link;
+	memset(bclink, 0, sizeof(struct bclink));
+	INIT_LIST_HEAD(&bcl->waiting_ports);
+	bcl->next_out_no = 1;
+	bclink->node.lock =  SPIN_LOCK_UNLOCKED;        
+	bcl->owner = &bclink->node;
+        bcl->max_pkt = MAX_PKT_DEFAULT_MCAST;
+	link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT);
+	bcl->b_ptr = &bcbearer->bearer;
+	bcl->state = WORKING_WORKING;
+	sprintf(bcl->name, bc_link_name);
+
+	if (BCLINK_LOG_BUF_SIZE) {
+		char *pb = kmalloc(BCLINK_LOG_BUF_SIZE, GFP_ATOMIC);
+
+		if (!pb)
+			goto nomem;
+		printbuf_init(&bcl->print_buf, pb, BCLINK_LOG_BUF_SIZE);
+	}
+
+	return TIPC_OK;
+}
+
+void bclink_stop(void)
+{
+	spin_lock_bh(&bc_lock);
+	if (bcbearer) {
+		link_stop(bcl);
+		if (BCLINK_LOG_BUF_SIZE)
+			kfree(bcl->print_buf.buf);
+		bcl = NULL;
+		kfree(bclink);
+		bclink = NULL;
+		kfree(bcbearer);
+		bcbearer = NULL;
+	}
+	spin_unlock_bh(&bc_lock);
+}
+
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
new file mode 100644
index 000000000000..cc2ede15c4fd
--- /dev/null
+++ b/net/tipc/bcast.h
@@ -0,0 +1,220 @@
+/*
+ * net/tipc/bcast.h: Include file for TIPC broadcast code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_BCAST_H
+#define _TIPC_BCAST_H
+
+#define MAX_NODES 4096
+#define WSIZE 32
+
+/**
+ * struct node_map - set of node identifiers
+ * @count: # of nodes in set
+ * @map: bitmap of node identifiers that are in the set
+ */
+
+struct node_map {
+	u32 count;
+	u32 map[MAX_NODES / WSIZE];
+};
+
+
+#define PLSIZE 32
+
+/**
+ * struct port_list - set of node local destination ports
+ * @count: # of ports in set (only valid for first entry in list)
+ * @next: pointer to next entry in list
+ * @ports: array of port references
+ */
+
+struct port_list {
+	int count;
+	struct port_list *next;
+	u32 ports[PLSIZE];
+};
+
+
+struct node;
+
+extern char bc_link_name[];
+
+
+/**
+ * nmap_get - determine if node exists in a node map
+ */
+
+static inline int nmap_get(struct node_map *nm_ptr, u32 node)
+{
+	int n = tipc_node(node);
+	int w = n / WSIZE;
+	int b = n % WSIZE;
+
+	return nm_ptr->map[w] & (1 << b);
+}
+
+/**
+ * nmap_add - add a node to a node map
+ */
+
+static inline void nmap_add(struct node_map *nm_ptr, u32 node)
+{
+	int n = tipc_node(node);
+	int w = n / WSIZE;
+	u32 mask = (1 << (n % WSIZE));
+
+	if ((nm_ptr->map[w] & mask) == 0) {
+		nm_ptr->count++;
+		nm_ptr->map[w] |= mask;
+	}
+}
+
+/** 
+ * nmap_remove - remove a node from a node map
+ */
+
+static inline void nmap_remove(struct node_map *nm_ptr, u32 node)
+{
+	int n = tipc_node(node);
+	int w = n / WSIZE;
+	u32 mask = (1 << (n % WSIZE));
+
+	if ((nm_ptr->map[w] & mask) != 0) {
+		nm_ptr->map[w] &= ~mask;
+		nm_ptr->count--;
+	}
+}
+
+/**
+ * nmap_equal - test for equality of node maps
+ */
+
+static inline int nmap_equal(struct node_map *nm_a, struct node_map *nm_b)
+{
+	return !memcmp(nm_a, nm_b, sizeof(*nm_a));
+}
+
+/**
+ * nmap_diff - find differences between node maps
+ * @nm_a: input node map A
+ * @nm_b: input node map B
+ * @nm_diff: output node map A-B (i.e. nodes of A that are not in B)
+ */
+
+static inline void nmap_diff(struct node_map *nm_a, struct node_map *nm_b,
+			     struct node_map *nm_diff)
+{
+	int stop = sizeof(nm_a->map) / sizeof(u32);
+	int w;
+	int b;
+	u32 map;
+
+	memset(nm_diff, 0, sizeof(*nm_diff));
+	for (w = 0; w < stop; w++) {
+		map = nm_a->map[w] ^ (nm_a->map[w] & nm_b->map[w]);
+		nm_diff->map[w] = map;
+		if (map != 0) {
+			for (b = 0 ; b < WSIZE; b++) {
+				if (map & (1 << b))
+					nm_diff->count++;
+			}
+		}
+	}
+}
+
+/**
+ * port_list_add - add a port to a port list, ensuring no duplicates
+ */
+
+static inline void port_list_add(struct port_list *pl_ptr, u32 port)
+{
+	struct port_list *item = pl_ptr;
+	int i;
+	int item_sz = PLSIZE;
+	int cnt = pl_ptr->count;
+
+	for (; ; cnt -= item_sz, item = item->next) {
+		if (cnt < PLSIZE)
+			item_sz = cnt;
+		for (i = 0; i < item_sz; i++)
+			if (item->ports[i] == port)
+				return;
+		if (i < PLSIZE) {
+			item->ports[i] = port;
+			pl_ptr->count++;
+			return;
+		}
+		if (!item->next) {
+			item->next = kmalloc(sizeof(*item), GFP_ATOMIC);
+			if (!item->next) {
+				warn("Memory squeeze: multicast destination port list is incomplete\n");
+				return;
+			}
+			item->next->next = NULL;
+		}
+	}
+}
+
+/**
+ * port_list_free - free dynamically created entries in port_list chain
+ * 
+ * Note: First item is on stack, so it doesn't need to be released
+ */
+
+static inline void port_list_free(struct port_list *pl_ptr)
+{
+	struct port_list *item;
+	struct port_list *next;
+
+	for (item = pl_ptr->next; item; item = next) {
+		next = item->next;
+		kfree(item);
+	}
+}
+
+
+int  bclink_init(void);
+void bclink_stop(void);
+void bclink_acknowledge(struct node *n_ptr, u32 acked);
+int  bclink_send_msg(struct sk_buff *buf);
+void bclink_recv_pkt(struct sk_buff *buf);
+u32  bclink_get_last_sent(void);
+u32  bclink_acks_missing(struct node *n_ptr);
+void bclink_check_gap(struct node *n_ptr, u32 seqno);
+int  bclink_stats(char *stats_buf, const u32 buf_size);
+int  bclink_reset_stats(void);
+int  bclink_set_queue_limits(u32 limit);
+void bcbearer_sort(void);
+void bcbearer_push(void);
+
+#endif
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
new file mode 100644
index 000000000000..c19465c5f035
--- /dev/null
+++ b/net/tipc/bearer.c
@@ -0,0 +1,689 @@
+/*
+ * net/tipc/bearer.c: TIPC bearer code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2004-2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "config.h"
+#include "dbg.h"
+#include "bearer.h"
+#include "link.h"
+#include "port.h"
+#include "discover.h"
+#include "bcast.h"
+
+#define MAX_ADDR_STR 32
+
+static struct media *media_list = 0;
+static u32 media_count = 0;
+
+struct bearer *bearers = 0;
+
+/**
+ * media_name_valid - validate media name
+ * 
+ * Returns 1 if media name is valid, otherwise 0.
+ */
+
+static int media_name_valid(const char *name)
+{
+	u32 len;
+
+	len = strlen(name);
+	if ((len + 1) > TIPC_MAX_MEDIA_NAME)
+		return 0;
+	return (strspn(name, tipc_alphabet) == len);
+}
+
+/**
+ * media_find - locates specified media object by name
+ */
+
+static struct media *media_find(const char *name)
+{
+	struct media *m_ptr;
+	u32 i;
+
+	for (i = 0, m_ptr = media_list; i < media_count; i++, m_ptr++) {
+		if (!strcmp(m_ptr->name, name))
+			return m_ptr;
+	}
+	return 0;
+}
+
+/**
+ * tipc_register_media - register a media type
+ * 
+ * Bearers for this media type must be activated separately at a later stage.
+ */
+
+int  tipc_register_media(u32 media_type,
+			 char *name, 
+			 int (*enable)(struct tipc_bearer *), 
+			 void (*disable)(struct tipc_bearer *), 
+			 int (*send_msg)(struct sk_buff *, 
+					 struct tipc_bearer *,
+					 struct tipc_media_addr *), 
+			 char *(*addr2str)(struct tipc_media_addr *a,
+					   char *str_buf, int str_size),
+			 struct tipc_media_addr *bcast_addr,
+			 const u32 bearer_priority,
+			 const u32 link_tolerance,  /* [ms] */
+			 const u32 send_window_limit)
+{
+	struct media *m_ptr;
+	u32 media_id;
+	u32 i;
+	int res = -EINVAL;
+
+	write_lock_bh(&net_lock);
+	if (!media_list)
+		goto exit;
+
+	if (!media_name_valid(name)) {
+		warn("Media registration error: illegal name <%s>\n", name);
+		goto exit;
+	}
+	if (!bcast_addr) {
+		warn("Media registration error: no broadcast address supplied\n");
+		goto exit;
+	}
+	if (bearer_priority >= TIPC_NUM_LINK_PRI) {
+		warn("Media registration error: priority %u\n", bearer_priority);
+		goto exit;
+	}
+	if ((link_tolerance < TIPC_MIN_LINK_TOL) || 
+	    (link_tolerance > TIPC_MAX_LINK_TOL)) {
+		warn("Media registration error: tolerance %u\n", link_tolerance);
+		goto exit;
+	}
+
+	media_id = media_count++;
+	if (media_id >= MAX_MEDIA) {
+		warn("Attempt to register more than %u media\n", MAX_MEDIA);
+		media_count--;
+		goto exit;
+	}
+	for (i = 0; i < media_id; i++) {
+		if (media_list[i].type_id == media_type) {
+			warn("Attempt to register second media with type %u\n", 
+			     media_type);
+			media_count--;
+			goto exit;
+		}
+		if (!strcmp(name, media_list[i].name)) {
+			warn("Attempt to re-register media name <%s>\n", name);
+			media_count--;
+			goto exit;
+		}
+	}
+
+	m_ptr = &media_list[media_id];
+	m_ptr->type_id = media_type;
+	m_ptr->send_msg = send_msg;
+	m_ptr->enable_bearer = enable;
+	m_ptr->disable_bearer = disable;
+	m_ptr->addr2str = addr2str;
+	memcpy(&m_ptr->bcast_addr, bcast_addr, sizeof(*bcast_addr));
+	m_ptr->bcast = 1;
+	strcpy(m_ptr->name, name);
+	m_ptr->priority = bearer_priority;
+	m_ptr->tolerance = link_tolerance;
+	m_ptr->window = send_window_limit;
+	dbg("Media <%s> registered\n", name);
+	res = 0;
+exit:
+	write_unlock_bh(&net_lock);
+	return res;
+}
+
+/**
+ * media_addr_printf - record media address in print buffer
+ */
+
+void media_addr_printf(struct print_buf *pb, struct tipc_media_addr *a)
+{
+	struct media *m_ptr;
+	u32 media_type;
+	u32 i;
+
+	media_type = ntohl(a->type);
+	for (i = 0, m_ptr = media_list; i < media_count; i++, m_ptr++) {
+		if (m_ptr->type_id == media_type)
+			break;
+	}
+
+	if ((i < media_count) && (m_ptr->addr2str != NULL)) {
+		char addr_str[MAX_ADDR_STR];
+
+		tipc_printf(pb, "%s(%s) ", m_ptr->name, 
+			    m_ptr->addr2str(a, addr_str, sizeof(addr_str)));
+	} else {
+		unchar *addr = (unchar *)&a->dev_addr;
+
+		tipc_printf(pb, "UNKNOWN(%u):", media_type);
+		for (i = 0; i < (sizeof(*a) - sizeof(a->type)); i++) {
+			tipc_printf(pb, "%02x ", addr[i]);
+		}
+	}
+}
+
+/**
+ * media_get_names - record names of registered media in buffer
+ */
+
+struct sk_buff *media_get_names(void)
+{
+	struct sk_buff *buf;
+	struct media *m_ptr;
+	int i;
+
+	buf = cfg_reply_alloc(MAX_MEDIA * TLV_SPACE(TIPC_MAX_MEDIA_NAME));
+	if (!buf)
+		return NULL;
+
+	read_lock_bh(&net_lock);
+	for (i = 0, m_ptr = media_list; i < media_count; i++, m_ptr++) {
+		cfg_append_tlv(buf, TIPC_TLV_MEDIA_NAME, m_ptr->name, 
+			       strlen(m_ptr->name) + 1);
+	}
+	read_unlock_bh(&net_lock);
+	return buf;
+}
+
+/**
+ * bearer_name_validate - validate & (optionally) deconstruct bearer name
+ * @name - ptr to bearer name string
+ * @name_parts - ptr to area for bearer name components (or NULL if not needed)
+ * 
+ * Returns 1 if bearer name is valid, otherwise 0.
+ */
+
+static int bearer_name_validate(const char *name, 
+				struct bearer_name *name_parts)
+{
+	char name_copy[TIPC_MAX_BEARER_NAME];
+	char *media_name;
+	char *if_name;
+	u32 media_len;
+	u32 if_len;
+
+	/* copy bearer name & ensure length is OK */
+
+	name_copy[TIPC_MAX_BEARER_NAME - 1] = 0;
+	/* need above in case non-Posix strncpy() doesn't pad with nulls */
+	strncpy(name_copy, name, TIPC_MAX_BEARER_NAME);
+	if (name_copy[TIPC_MAX_BEARER_NAME - 1] != 0)
+		return 0;
+
+	/* ensure all component parts of bearer name are present */
+
+	media_name = name_copy;
+	if ((if_name = strchr(media_name, ':')) == NULL)
+		return 0;
+	*(if_name++) = 0;
+	media_len = if_name - media_name;
+	if_len = strlen(if_name) + 1;
+
+	/* validate component parts of bearer name */
+
+	if ((media_len <= 1) || (media_len > TIPC_MAX_MEDIA_NAME) || 
+	    (if_len <= 1) || (if_len > TIPC_MAX_IF_NAME) || 
+	    (strspn(media_name, tipc_alphabet) != (media_len - 1)) ||
+	    (strspn(if_name, tipc_alphabet) != (if_len - 1)))
+		return 0;
+
+	/* return bearer name components, if necessary */
+
+	if (name_parts) {
+		strcpy(name_parts->media_name, media_name);
+		strcpy(name_parts->if_name, if_name);
+	}
+	return 1;
+}
+
+/**
+ * bearer_find - locates bearer object with matching bearer name
+ */
+
+static struct bearer *bearer_find(const char *name)
+{
+	struct bearer *b_ptr;
+	u32 i;
+
+	for (i = 0, b_ptr = bearers; i < MAX_BEARERS; i++, b_ptr++) {
+		if (b_ptr->active && (!strcmp(b_ptr->publ.name, name)))
+			return b_ptr;
+	}
+	return 0;
+}
+
+/**
+ * bearer_find - locates bearer object with matching interface name
+ */
+
+struct bearer *bearer_find_interface(const char *if_name)
+{
+	struct bearer *b_ptr;
+	char *b_if_name;
+	u32 i;
+
+	for (i = 0, b_ptr = bearers; i < MAX_BEARERS; i++, b_ptr++) {
+		if (!b_ptr->active)
+			continue;
+		b_if_name = strchr(b_ptr->publ.name, ':') + 1;
+		if (!strcmp(b_if_name, if_name))
+			return b_ptr;
+	}
+	return 0;
+}
+
+/**
+ * bearer_get_names - record names of bearers in buffer
+ */
+
+struct sk_buff *bearer_get_names(void)
+{
+	struct sk_buff *buf;
+	struct media *m_ptr;
+	struct bearer *b_ptr;
+	int i, j;
+
+	buf = cfg_reply_alloc(MAX_BEARERS * TLV_SPACE(TIPC_MAX_BEARER_NAME));
+	if (!buf)
+		return NULL;
+
+	read_lock_bh(&net_lock);
+	for (i = 0, m_ptr = media_list; i < media_count; i++, m_ptr++) {
+		for (j = 0; j < MAX_BEARERS; j++) {
+			b_ptr = &bearers[j];
+			if (b_ptr->active && (b_ptr->media == m_ptr)) {
+				cfg_append_tlv(buf, TIPC_TLV_BEARER_NAME, 
+					       b_ptr->publ.name, 
+					       strlen(b_ptr->publ.name) + 1);
+			}
+		}
+	}
+	read_unlock_bh(&net_lock);
+	return buf;
+}
+
+void bearer_add_dest(struct bearer *b_ptr, u32 dest)
+{
+	nmap_add(&b_ptr->nodes, dest);
+	disc_update_link_req(b_ptr->link_req);
+	bcbearer_sort();
+}
+
+void bearer_remove_dest(struct bearer *b_ptr, u32 dest)
+{
+	nmap_remove(&b_ptr->nodes, dest);
+	disc_update_link_req(b_ptr->link_req);
+	bcbearer_sort();
+}
+
+/*
+ * bearer_push(): Resolve bearer congestion. Force the waiting
+ * links to push out their unsent packets, one packet per link
+ * per iteration, until all packets are gone or congestion reoccurs.
+ * 'net_lock' is read_locked when this function is called
+ * bearer.lock must be taken before calling
+ * Returns binary true(1) ore false(0)
+ */
+static int bearer_push(struct bearer *b_ptr)
+{
+	u32 res = TIPC_OK;
+	struct link *ln, *tln;
+
+	if (b_ptr->publ.blocked)
+		return 0;
+
+	while (!list_empty(&b_ptr->cong_links) && (res != PUSH_FAILED)) {
+		list_for_each_entry_safe(ln, tln, &b_ptr->cong_links, link_list) {
+			res = link_push_packet(ln);
+			if (res == PUSH_FAILED)
+				break;
+			if (res == PUSH_FINISHED)
+				list_move_tail(&ln->link_list, &b_ptr->links);
+		}
+	}
+	return list_empty(&b_ptr->cong_links);
+}
+
+void bearer_lock_push(struct bearer *b_ptr)
+{
+	int res;
+
+	spin_lock_bh(&b_ptr->publ.lock);
+	res = bearer_push(b_ptr);
+	spin_unlock_bh(&b_ptr->publ.lock);
+	if (res)
+		bcbearer_push();
+}
+
+
+/*
+ * Interrupt enabling new requests after bearer congestion or blocking:    
+ * See bearer_send().   
+ */
+void tipc_continue(struct tipc_bearer *tb_ptr)
+{
+	struct bearer *b_ptr = (struct bearer *)tb_ptr;
+
+	spin_lock_bh(&b_ptr->publ.lock);
+	b_ptr->continue_count++;
+	if (!list_empty(&b_ptr->cong_links))
+		k_signal((Handler)bearer_lock_push, (unsigned long)b_ptr);
+	b_ptr->publ.blocked = 0;
+	spin_unlock_bh(&b_ptr->publ.lock);
+}
+
+/*
+ * Schedule link for sending of messages after the bearer 
+ * has been deblocked by 'continue()'. This method is called 
+ * when somebody tries to send a message via this link while 
+ * the bearer is congested. 'net_lock' is in read_lock here
+ * bearer.lock is busy
+ */
+
+static void bearer_schedule_unlocked(struct bearer *b_ptr, struct link *l_ptr)
+{
+	list_move_tail(&l_ptr->link_list, &b_ptr->cong_links);
+}
+
+/*
+ * Schedule link for sending of messages after the bearer 
+ * has been deblocked by 'continue()'. This method is called 
+ * when somebody tries to send a message via this link while 
+ * the bearer is congested. 'net_lock' is in read_lock here,
+ * bearer.lock is free
+ */
+
+void bearer_schedule(struct bearer *b_ptr, struct link *l_ptr)
+{
+	spin_lock_bh(&b_ptr->publ.lock);
+	bearer_schedule_unlocked(b_ptr, l_ptr);
+	spin_unlock_bh(&b_ptr->publ.lock);
+}
+
+
+/*
+ * bearer_resolve_congestion(): Check if there is bearer congestion,
+ * and if there is, try to resolve it before returning.
+ * 'net_lock' is read_locked when this function is called
+ */
+int bearer_resolve_congestion(struct bearer *b_ptr, struct link *l_ptr)
+{
+	int res = 1;
+
+	if (list_empty(&b_ptr->cong_links))
+		return 1;
+	spin_lock_bh(&b_ptr->publ.lock);
+	if (!bearer_push(b_ptr)) {
+		bearer_schedule_unlocked(b_ptr, l_ptr);
+		res = 0;
+	}
+	spin_unlock_bh(&b_ptr->publ.lock);
+	return res;
+}
+
+
+/**
+ * tipc_enable_bearer - enable bearer with the given name
+ */              
+
+int tipc_enable_bearer(const char *name, u32 bcast_scope, u32 priority)
+{
+	struct bearer *b_ptr;
+	struct media *m_ptr;
+	struct bearer_name b_name;
+	char addr_string[16];
+	u32 bearer_id;
+	u32 with_this_prio;
+	u32 i;
+	int res = -EINVAL;
+
+	if (tipc_mode != TIPC_NET_MODE)
+		return -ENOPROTOOPT;
+	if (!bearer_name_validate(name, &b_name) ||
+	    !addr_domain_valid(bcast_scope) ||
+	    !in_scope(bcast_scope, tipc_own_addr) ||
+	    (priority > TIPC_NUM_LINK_PRI))
+		return -EINVAL;
+
+	write_lock_bh(&net_lock);
+	if (!bearers)
+		goto failed;
+
+	m_ptr = media_find(b_name.media_name);
+	if (!m_ptr) {
+		warn("No media <%s>\n", b_name.media_name);
+		goto failed;
+	}
+	if (priority == TIPC_NUM_LINK_PRI)
+		priority = m_ptr->priority;
+
+restart:
+	bearer_id = MAX_BEARERS;
+	with_this_prio = 1;
+	for (i = MAX_BEARERS; i-- != 0; ) {
+		if (!bearers[i].active) {
+			bearer_id = i;
+			continue;
+		}
+		if (!strcmp(name, bearers[i].publ.name)) {
+			warn("Bearer <%s> already enabled\n", name);
+			goto failed;
+		}
+		if ((bearers[i].priority == priority) &&
+		    (++with_this_prio > 2)) {
+			if (priority-- == 0) {
+				warn("Third bearer <%s> with priority %u, unable to lower to %u\n",
+				     name, priority + 1, priority);
+				goto failed;
+			}
+			warn("Third bearer <%s> with priority %u, lowering to %u\n",
+			     name, priority + 1, priority);
+			goto restart;
+		}
+	}
+	if (bearer_id >= MAX_BEARERS) {
+		warn("Attempt to enable more than %d bearers\n", MAX_BEARERS);
+		goto failed;
+	}
+
+	b_ptr = &bearers[bearer_id];
+	memset(b_ptr, 0, sizeof(struct bearer));
+
+	strcpy(b_ptr->publ.name, name);
+	res = m_ptr->enable_bearer(&b_ptr->publ);
+	if (res) {
+		warn("Failed to enable bearer <%s>\n", name);
+		goto failed;
+	}
+
+	b_ptr->identity = bearer_id;
+	b_ptr->media = m_ptr;
+	b_ptr->net_plane = bearer_id + 'A';
+	b_ptr->active = 1;
+	b_ptr->detect_scope = bcast_scope;
+	b_ptr->priority = priority;
+	INIT_LIST_HEAD(&b_ptr->cong_links);
+	INIT_LIST_HEAD(&b_ptr->links);
+	if (m_ptr->bcast) {
+		b_ptr->link_req = disc_init_link_req(b_ptr, &m_ptr->bcast_addr,
+						     bcast_scope, 2);
+	}
+	b_ptr->publ.lock = SPIN_LOCK_UNLOCKED;
+	write_unlock_bh(&net_lock);
+	info("Enabled bearer <%s>, discovery domain %s\n",
+	     name, addr_string_fill(addr_string, bcast_scope));
+	return 0;
+failed:
+	write_unlock_bh(&net_lock);
+	return res;
+}
+
+/**
+ * tipc_block_bearer(): Block the bearer with the given name,
+ *                      and reset all its links
+ */
+
+int tipc_block_bearer(const char *name)
+{
+	struct bearer *b_ptr = 0;
+	struct link *l_ptr;
+	struct link *temp_l_ptr;
+
+	if (tipc_mode != TIPC_NET_MODE)
+		return -ENOPROTOOPT;
+
+	read_lock_bh(&net_lock);
+	b_ptr = bearer_find(name);
+	if (!b_ptr) {
+		warn("Attempt to block unknown bearer <%s>\n", name);
+		read_unlock_bh(&net_lock);
+		return -EINVAL;
+	}
+
+	spin_lock_bh(&b_ptr->publ.lock);
+	b_ptr->publ.blocked = 1;
+	list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) {
+		struct node *n_ptr = l_ptr->owner;
+
+		spin_lock_bh(&n_ptr->lock);
+		link_reset(l_ptr);
+		spin_unlock_bh(&n_ptr->lock);
+	}
+	spin_unlock_bh(&b_ptr->publ.lock);
+	read_unlock_bh(&net_lock);
+	info("Blocked bearer <%s>\n", name);
+	return TIPC_OK;
+}
+
+/**
+ * bearer_disable -
+ * 
+ * Note: This routine assumes caller holds net_lock.
+ */
+
+static int bearer_disable(const char *name)
+{
+	struct bearer *b_ptr;
+	struct link *l_ptr;
+	struct link *temp_l_ptr;
+
+	if (tipc_mode != TIPC_NET_MODE)
+		return -ENOPROTOOPT;
+
+	b_ptr = bearer_find(name);
+	if (!b_ptr) {
+		warn("Attempt to disable unknown bearer <%s>\n", name);
+		return -EINVAL;
+	}
+
+	disc_stop_link_req(b_ptr->link_req);
+	spin_lock_bh(&b_ptr->publ.lock);
+	b_ptr->link_req = NULL;
+	b_ptr->publ.blocked = 1;
+	if (b_ptr->media->disable_bearer) {
+		spin_unlock_bh(&b_ptr->publ.lock);
+		write_unlock_bh(&net_lock);
+		b_ptr->media->disable_bearer(&b_ptr->publ);
+		write_lock_bh(&net_lock);
+		spin_lock_bh(&b_ptr->publ.lock);
+	}
+	list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) {
+		link_delete(l_ptr);
+	}
+	spin_unlock_bh(&b_ptr->publ.lock);
+	info("Disabled bearer <%s>\n", name);
+	memset(b_ptr, 0, sizeof(struct bearer));
+	return TIPC_OK;
+}
+
+int tipc_disable_bearer(const char *name)
+{
+	int res;
+
+	write_lock_bh(&net_lock);
+	res = bearer_disable(name);
+	write_unlock_bh(&net_lock);
+	return res;
+}
+
+
+
+int bearer_init(void)
+{
+	int res;
+
+	write_lock_bh(&net_lock);
+	bearers = kmalloc(MAX_BEARERS * sizeof(struct bearer), GFP_ATOMIC);
+	media_list = kmalloc(MAX_MEDIA * sizeof(struct media), GFP_ATOMIC);
+	if (bearers && media_list) {
+		memset(bearers, 0, MAX_BEARERS * sizeof(struct bearer));
+		memset(media_list, 0, MAX_MEDIA * sizeof(struct media));
+		res = TIPC_OK;
+	} else {
+		kfree(bearers);
+		kfree(media_list);
+		bearers = 0;
+		media_list = 0;
+		res = -ENOMEM;
+	}
+	write_unlock_bh(&net_lock);
+	return res;
+}
+
+void bearer_stop(void)
+{
+	u32 i;
+
+	if (!bearers)
+		return;
+
+	for (i = 0; i < MAX_BEARERS; i++) {
+		if (bearers[i].active)
+			bearers[i].publ.blocked = 1;
+	}
+	for (i = 0; i < MAX_BEARERS; i++) {
+		if (bearers[i].active)
+			bearer_disable(bearers[i].publ.name);
+	}
+	kfree(bearers);
+	kfree(media_list);
+	bearers = 0;
+	media_list = 0;
+	media_count = 0;
+}
+
+
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
new file mode 100644
index 000000000000..02a16f86defa
--- /dev/null
+++ b/net/tipc/bearer.h
@@ -0,0 +1,169 @@
+/*
+ * net/tipc/bearer.h: Include file for TIPC bearer code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_BEARER_H
+#define _TIPC_BEARER_H
+
+#include <net/tipc/tipc_bearer.h>
+#include "bcast.h"
+
+#define MAX_BEARERS 8
+#define MAX_MEDIA 4
+
+
+/**
+ * struct media - TIPC media information available to internal users
+ * @send_msg: routine which handles buffer transmission
+ * @enable_bearer: routine which enables a bearer
+ * @disable_bearer: routine which disables a bearer
+ * @addr2str: routine which converts bearer's address to string form
+ * @bcast_addr: media address used in broadcasting
+ * @bcast: non-zero if media supports broadcasting [currently mandatory]
+ * @priority: default link (and bearer) priority
+ * @tolerance: default time (in ms) before declaring link failure
+ * @window: default window (in packets) before declaring link congestion
+ * @type_id: TIPC media identifier [defined in tipc_bearer.h]
+ * @name: media name
+ */
+ 
+struct media {
+	int (*send_msg)(struct sk_buff *buf, 
+			struct tipc_bearer *b_ptr,
+			struct tipc_media_addr *dest);
+	int (*enable_bearer)(struct tipc_bearer *b_ptr);
+	void (*disable_bearer)(struct tipc_bearer *b_ptr);
+	char *(*addr2str)(struct tipc_media_addr *a, 
+			  char *str_buf, int str_size);
+	struct tipc_media_addr bcast_addr;
+	int bcast;
+	u32 priority;
+	u32 tolerance;
+	u32 window;
+	u32 type_id;
+	char name[TIPC_MAX_MEDIA_NAME];
+};
+
+/**
+ * struct bearer - TIPC bearer information available to internal users
+ * @publ: bearer information available to privileged users
+ * @media: ptr to media structure associated with bearer
+ * @priority: default link priority for bearer
+ * @detect_scope: network address mask used during automatic link creation
+ * @identity: array index of this bearer within TIPC bearer array
+ * @link_req: ptr to (optional) structure making periodic link setup requests
+ * @links: list of non-congested links associated with bearer
+ * @cong_links: list of congested links associated with bearer
+ * @continue_count: # of times bearer has resumed after congestion or blocking
+ * @active: non-zero if bearer structure is represents a bearer
+ * @net_plane: network plane ('A' through 'H') currently associated with bearer
+ * @nodes: indicates which nodes in cluster can be reached through bearer
+ */
+ 
+struct bearer {
+	struct tipc_bearer publ;
+	struct media *media;
+	u32 priority;
+	u32 detect_scope;
+	u32 identity;
+	struct link_req *link_req;
+	struct list_head links;
+	struct list_head cong_links;
+	u32 continue_count;
+	int active;
+	char net_plane;
+	struct node_map nodes;
+};
+
+struct bearer_name {
+	char media_name[TIPC_MAX_MEDIA_NAME];
+	char if_name[TIPC_MAX_IF_NAME];
+};
+
+struct link;
+
+extern struct bearer *bearers;
+
+void media_addr_printf(struct print_buf *pb, struct tipc_media_addr *a);
+struct sk_buff *media_get_names(void);
+
+struct sk_buff *bearer_get_names(void);
+void bearer_add_dest(struct bearer *b_ptr, u32 dest);
+void bearer_remove_dest(struct bearer *b_ptr, u32 dest);
+void bearer_schedule(struct bearer *b_ptr, struct link *l_ptr);
+struct bearer *bearer_find_interface(const char *if_name);
+int bearer_resolve_congestion(struct bearer *b_ptr, struct link *l_ptr);
+int bearer_init(void);
+void bearer_stop(void);
+int bearer_broadcast(struct sk_buff *buf, struct tipc_bearer *b_ptr,
+		     struct tipc_media_addr *dest);
+void bearer_lock_push(struct bearer *b_ptr);
+
+
+/**
+ * bearer_send- sends buffer to destination over bearer 
+ * 
+ * Returns true (1) if successful, or false (0) if unable to send
+ * 
+ * IMPORTANT:
+ * The media send routine must not alter the buffer being passed in
+ * as it may be needed for later retransmission!
+ * 
+ * If the media send routine returns a non-zero value (indicating that 
+ * it was unable to send the buffer), it must:
+ *   1) mark the bearer as blocked,
+ *   2) call tipc_continue() once the bearer is able to send again.
+ * Media types that are unable to meet these two critera must ensure their
+ * send routine always returns success -- even if the buffer was not sent --
+ * and let TIPC's link code deal with the undelivered message. 
+ */
+
+static inline int bearer_send(struct bearer *b_ptr, struct sk_buff *buf,
+			      struct tipc_media_addr *dest)
+{
+	return !b_ptr->media->send_msg(buf, &b_ptr->publ, dest);
+}
+
+/**
+ * bearer_congested - determines if bearer is currently congested
+ */
+
+static inline int bearer_congested(struct bearer *b_ptr, struct link *l_ptr)
+{
+	if (unlikely(b_ptr->publ.blocked))
+		return 1;
+	if (likely(list_empty(&b_ptr->cong_links)))
+		return 0;
+	return !bearer_resolve_congestion(b_ptr, l_ptr);
+}
+
+#endif
diff --git a/net/tipc/cluster.c b/net/tipc/cluster.c
new file mode 100644
index 000000000000..d7d0f5f17e00
--- /dev/null
+++ b/net/tipc/cluster.c
@@ -0,0 +1,573 @@
+/*
+ * net/tipc/cluster.c: TIPC cluster management routines
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "cluster.h"
+#include "addr.h"
+#include "node_subscr.h"
+#include "link.h"
+#include "node.h"
+#include "net.h"
+#include "msg.h"
+#include "bearer.h"
+
+void cluster_multicast(struct cluster *c_ptr, struct sk_buff *buf, 
+		       u32 lower, u32 upper);
+struct sk_buff *cluster_prepare_routing_msg(u32 data_size, u32 dest);
+
+struct node **local_nodes = 0;
+struct node_map cluster_bcast_nodes = {0,{0,}};
+u32 highest_allowed_slave = 0;
+
+struct cluster *cluster_create(u32 addr)
+{
+	struct _zone *z_ptr;
+	struct cluster *c_ptr;
+	int max_nodes; 
+	int alloc;
+
+	c_ptr = (struct cluster *)kmalloc(sizeof(*c_ptr), GFP_ATOMIC);
+	if (c_ptr == NULL)
+		return 0;
+	memset(c_ptr, 0, sizeof(*c_ptr));
+
+	c_ptr->addr = tipc_addr(tipc_zone(addr), tipc_cluster(addr), 0);
+	if (in_own_cluster(addr))
+		max_nodes = LOWEST_SLAVE + tipc_max_slaves;
+	else
+		max_nodes = tipc_max_nodes + 1;
+	alloc = sizeof(void *) * (max_nodes + 1);
+	c_ptr->nodes = (struct node **)kmalloc(alloc, GFP_ATOMIC);
+	if (c_ptr->nodes == NULL) {
+		kfree(c_ptr);
+		return 0;
+	}
+	memset(c_ptr->nodes, 0, alloc);  
+	if (in_own_cluster(addr))
+		local_nodes = c_ptr->nodes;
+	c_ptr->highest_slave = LOWEST_SLAVE - 1;
+	c_ptr->highest_node = 0;
+	
+	z_ptr = zone_find(tipc_zone(addr));
+	if (z_ptr == NULL) {
+		z_ptr = zone_create(addr);
+	}
+	if (z_ptr != NULL) {
+		zone_attach_cluster(z_ptr, c_ptr);
+		c_ptr->owner = z_ptr;
+	}
+	else {
+		kfree(c_ptr);
+		c_ptr = 0;
+	}
+
+	return c_ptr;
+}
+
+void cluster_delete(struct cluster *c_ptr)
+{
+	u32 n_num;
+
+	if (!c_ptr)
+		return;
+	for (n_num = 1; n_num <= c_ptr->highest_node; n_num++) {
+		node_delete(c_ptr->nodes[n_num]);
+	}
+	for (n_num = LOWEST_SLAVE; n_num <= c_ptr->highest_slave; n_num++) {
+		node_delete(c_ptr->nodes[n_num]);
+	}
+	kfree(c_ptr->nodes);
+	kfree(c_ptr);
+}
+
+u32 cluster_next_node(struct cluster *c_ptr, u32 addr)
+{
+	struct node *n_ptr;
+	u32 n_num = tipc_node(addr) + 1;
+
+	if (!c_ptr)
+		return addr;
+	for (; n_num <= c_ptr->highest_node; n_num++) {
+		n_ptr = c_ptr->nodes[n_num];
+		if (n_ptr && node_has_active_links(n_ptr))
+			return n_ptr->addr;
+	}
+	for (n_num = 1; n_num < tipc_node(addr); n_num++) {
+		n_ptr = c_ptr->nodes[n_num];
+		if (n_ptr && node_has_active_links(n_ptr))
+			return n_ptr->addr;
+	}
+	return 0;
+}
+
+void cluster_attach_node(struct cluster *c_ptr, struct node *n_ptr)
+{
+	u32 n_num = tipc_node(n_ptr->addr);
+	u32 max_n_num = tipc_max_nodes;
+
+	if (in_own_cluster(n_ptr->addr))
+		max_n_num = highest_allowed_slave;
+	assert(n_num > 0);
+	assert(n_num <= max_n_num);
+	assert(c_ptr->nodes[n_num] == 0);
+	c_ptr->nodes[n_num] = n_ptr;
+	if (n_num > c_ptr->highest_node)
+		c_ptr->highest_node = n_num;
+}
+
+/**
+ * cluster_select_router - select router to a cluster
+ * 
+ * Uses deterministic and fair algorithm.
+ */
+
+u32 cluster_select_router(struct cluster *c_ptr, u32 ref)
+{
+	u32 n_num;
+	u32 ulim = c_ptr->highest_node;
+	u32 mask;
+	u32 tstart;
+
+	assert(!in_own_cluster(c_ptr->addr));
+	if (!ulim)
+		return 0;
+
+	/* Start entry must be random */
+	mask = tipc_max_nodes;
+	while (mask > ulim)
+		mask >>= 1;
+	tstart = ref & mask;
+	n_num = tstart;
+
+	/* Lookup upwards with wrap-around */
+	do {
+		if (node_is_up(c_ptr->nodes[n_num]))
+			break;
+	} while (++n_num <= ulim);
+	if (n_num > ulim) {
+		n_num = 1;
+		do {
+			if (node_is_up(c_ptr->nodes[n_num]))
+				break;
+		} while (++n_num < tstart);
+		if (n_num == tstart)
+			return 0;
+	}
+	assert(n_num <= ulim);
+	return node_select_router(c_ptr->nodes[n_num], ref);
+}
+
+/**
+ * cluster_select_node - select destination node within a remote cluster
+ * 
+ * Uses deterministic and fair algorithm.
+ */
+
+struct node *cluster_select_node(struct cluster *c_ptr, u32 selector)
+{
+	u32 n_num;
+	u32 mask = tipc_max_nodes;
+	u32 start_entry;
+
+	assert(!in_own_cluster(c_ptr->addr));
+	if (!c_ptr->highest_node)
+		return 0;
+
+	/* Start entry must be random */
+	while (mask > c_ptr->highest_node) {
+		mask >>= 1;
+	}
+	start_entry = (selector & mask) ? selector & mask : 1u;
+	assert(start_entry <= c_ptr->highest_node);
+
+	/* Lookup upwards with wrap-around */
+	for (n_num = start_entry; n_num <= c_ptr->highest_node; n_num++) {
+		if (node_has_active_links(c_ptr->nodes[n_num]))
+			return c_ptr->nodes[n_num];
+	}
+	for (n_num = 1; n_num < start_entry; n_num++) {
+		if (node_has_active_links(c_ptr->nodes[n_num]))
+			return c_ptr->nodes[n_num];
+	}
+	return 0;
+}
+
+/*
+ *    Routing table management: See description in node.c
+ */
+
+struct sk_buff *cluster_prepare_routing_msg(u32 data_size, u32 dest)
+{
+	u32 size = INT_H_SIZE + data_size;
+	struct sk_buff *buf = buf_acquire(size);
+	struct tipc_msg *msg;
+
+	if (buf) {
+		msg = buf_msg(buf);
+		memset((char *)msg, 0, size);
+		msg_init(msg, ROUTE_DISTRIBUTOR, 0, TIPC_OK, INT_H_SIZE, dest);
+	}
+	return buf;
+}
+
+void cluster_bcast_new_route(struct cluster *c_ptr, u32 dest,
+			     u32 lower, u32 upper)
+{
+	struct sk_buff *buf = cluster_prepare_routing_msg(0, c_ptr->addr);
+	struct tipc_msg *msg;
+
+	if (buf) {
+		msg = buf_msg(buf);
+		msg_set_remote_node(msg, dest);
+		msg_set_type(msg, ROUTE_ADDITION);
+		cluster_multicast(c_ptr, buf, lower, upper);
+	} else {
+		warn("Memory squeeze: broadcast of new route failed\n");
+	}
+}
+
+void cluster_bcast_lost_route(struct cluster *c_ptr, u32 dest,
+			      u32 lower, u32 upper)
+{
+	struct sk_buff *buf = cluster_prepare_routing_msg(0, c_ptr->addr);
+	struct tipc_msg *msg;
+
+	if (buf) {
+		msg = buf_msg(buf);
+		msg_set_remote_node(msg, dest);
+		msg_set_type(msg, ROUTE_REMOVAL);
+		cluster_multicast(c_ptr, buf, lower, upper);
+	} else {
+		warn("Memory squeeze: broadcast of lost route failed\n");
+	}
+}
+
+void cluster_send_slave_routes(struct cluster *c_ptr, u32 dest)
+{
+	struct sk_buff *buf;
+	struct tipc_msg *msg;
+	u32 highest = c_ptr->highest_slave;
+	u32 n_num;
+	int send = 0;
+
+	assert(!is_slave(dest));
+	assert(in_own_cluster(dest));
+	assert(in_own_cluster(c_ptr->addr));
+	if (highest <= LOWEST_SLAVE)
+		return;
+	buf = cluster_prepare_routing_msg(highest - LOWEST_SLAVE + 1,
+					  c_ptr->addr);
+	if (buf) {
+		msg = buf_msg(buf);
+		msg_set_remote_node(msg, c_ptr->addr);
+		msg_set_type(msg, SLAVE_ROUTING_TABLE);
+		for (n_num = LOWEST_SLAVE; n_num <= highest; n_num++) {
+			if (c_ptr->nodes[n_num] && 
+			    node_has_active_links(c_ptr->nodes[n_num])) {
+				send = 1;
+				msg_set_dataoctet(msg, n_num);
+			}
+		}
+		if (send)
+			link_send(buf, dest, dest);
+		else
+			buf_discard(buf);
+	} else {
+		warn("Memory squeeze: broadcast of lost route failed\n");
+	}
+}
+
+void cluster_send_ext_routes(struct cluster *c_ptr, u32 dest)
+{
+	struct sk_buff *buf;
+	struct tipc_msg *msg;
+	u32 highest = c_ptr->highest_node;
+	u32 n_num;
+	int send = 0;
+
+	if (in_own_cluster(c_ptr->addr))
+		return;
+	assert(!is_slave(dest));
+	assert(in_own_cluster(dest));
+	highest = c_ptr->highest_node;
+	buf = cluster_prepare_routing_msg(highest + 1, c_ptr->addr);
+	if (buf) {
+		msg = buf_msg(buf);
+		msg_set_remote_node(msg, c_ptr->addr);
+		msg_set_type(msg, EXT_ROUTING_TABLE);
+		for (n_num = 1; n_num <= highest; n_num++) {
+			if (c_ptr->nodes[n_num] && 
+			    node_has_active_links(c_ptr->nodes[n_num])) {
+				send = 1;
+				msg_set_dataoctet(msg, n_num);
+			}
+		}
+		if (send)
+			link_send(buf, dest, dest);
+		else
+			buf_discard(buf);
+	} else {
+		warn("Memory squeeze: broadcast of external route failed\n");
+	}
+}
+
+void cluster_send_local_routes(struct cluster *c_ptr, u32 dest)
+{
+	struct sk_buff *buf;
+	struct tipc_msg *msg;
+	u32 highest = c_ptr->highest_node;
+	u32 n_num;
+	int send = 0;
+
+	assert(is_slave(dest));
+	assert(in_own_cluster(c_ptr->addr));
+	buf = cluster_prepare_routing_msg(highest, c_ptr->addr);
+	if (buf) {
+		msg = buf_msg(buf);
+		msg_set_remote_node(msg, c_ptr->addr);
+		msg_set_type(msg, LOCAL_ROUTING_TABLE);
+		for (n_num = 1; n_num <= highest; n_num++) {
+			if (c_ptr->nodes[n_num] && 
+			    node_has_active_links(c_ptr->nodes[n_num])) {
+				send = 1;
+				msg_set_dataoctet(msg, n_num);
+			}
+		}
+		if (send)
+			link_send(buf, dest, dest);
+		else
+			buf_discard(buf);
+	} else {
+		warn("Memory squeeze: broadcast of local route failed\n");
+	}
+}
+
+void cluster_recv_routing_table(struct sk_buff *buf)
+{
+	struct tipc_msg *msg = buf_msg(buf);
+	struct cluster *c_ptr;
+	struct node *n_ptr;
+	unchar *node_table;
+	u32 table_size;
+	u32 router;
+	u32 rem_node = msg_remote_node(msg);
+	u32 z_num;
+	u32 c_num;
+	u32 n_num;
+
+	c_ptr = cluster_find(rem_node);
+	if (!c_ptr) {
+		c_ptr = cluster_create(rem_node);
+		if (!c_ptr) {
+			buf_discard(buf);
+			return;
+		}
+	}
+
+	node_table = buf->data + msg_hdr_sz(msg);
+	table_size = msg_size(msg) - msg_hdr_sz(msg);
+	router = msg_prevnode(msg);
+	z_num = tipc_zone(rem_node);
+	c_num = tipc_cluster(rem_node);
+
+	switch (msg_type(msg)) {
+	case LOCAL_ROUTING_TABLE:
+		assert(is_slave(tipc_own_addr));
+	case EXT_ROUTING_TABLE:
+		for (n_num = 1; n_num < table_size; n_num++) {
+			if (node_table[n_num]) {
+				u32 addr = tipc_addr(z_num, c_num, n_num);
+				n_ptr = c_ptr->nodes[n_num];
+				if (!n_ptr) {
+					n_ptr = node_create(addr);
+				}
+				if (n_ptr)
+					node_add_router(n_ptr, router);
+			}
+		}
+		break;
+	case SLAVE_ROUTING_TABLE:
+		assert(!is_slave(tipc_own_addr));
+		assert(in_own_cluster(c_ptr->addr));
+		for (n_num = 1; n_num < table_size; n_num++) {
+			if (node_table[n_num]) {
+				u32 slave_num = n_num + LOWEST_SLAVE;
+				u32 addr = tipc_addr(z_num, c_num, slave_num);
+				n_ptr = c_ptr->nodes[slave_num];
+				if (!n_ptr) {
+					n_ptr = node_create(addr);
+				}
+				if (n_ptr)
+					node_add_router(n_ptr, router);
+			}
+		}
+		break;
+	case ROUTE_ADDITION:
+		if (!is_slave(tipc_own_addr)) {
+			assert(!in_own_cluster(c_ptr->addr)
+			       || is_slave(rem_node));
+		} else {
+			assert(in_own_cluster(c_ptr->addr)
+			       && !is_slave(rem_node));
+		}
+		n_ptr = c_ptr->nodes[tipc_node(rem_node)];
+		if (!n_ptr)
+			n_ptr = node_create(rem_node);
+		if (n_ptr)
+			node_add_router(n_ptr, router);
+		break;
+	case ROUTE_REMOVAL:
+		if (!is_slave(tipc_own_addr)) {
+			assert(!in_own_cluster(c_ptr->addr)
+			       || is_slave(rem_node));
+		} else {
+			assert(in_own_cluster(c_ptr->addr)
+			       && !is_slave(rem_node));
+		}
+		n_ptr = c_ptr->nodes[tipc_node(rem_node)];
+		if (n_ptr)
+			node_remove_router(n_ptr, router);
+		break;
+	default:
+		assert(!"Illegal routing manager message received\n");
+	}
+	buf_discard(buf);
+}
+
+void cluster_remove_as_router(struct cluster *c_ptr, u32 router)
+{
+	u32 start_entry;
+	u32 tstop;
+	u32 n_num;
+
+	if (is_slave(router))
+		return;	/* Slave nodes can not be routers */
+
+	if (in_own_cluster(c_ptr->addr)) {
+		start_entry = LOWEST_SLAVE;
+		tstop = c_ptr->highest_slave;
+	} else {
+		start_entry = 1;
+		tstop = c_ptr->highest_node;
+	}
+
+	for (n_num = start_entry; n_num <= tstop; n_num++) {
+		if (c_ptr->nodes[n_num]) {
+			node_remove_router(c_ptr->nodes[n_num], router);
+		}
+	}
+}
+
+/**
+ * cluster_multicast - multicast message to local nodes 
+ */
+
+void cluster_multicast(struct cluster *c_ptr, struct sk_buff *buf, 
+		       u32 lower, u32 upper)
+{
+	struct sk_buff *buf_copy;
+	struct node *n_ptr;
+	u32 n_num;
+	u32 tstop;
+
+	assert(lower <= upper);
+	assert(((lower >= 1) && (lower <= tipc_max_nodes)) ||
+	       ((lower >= LOWEST_SLAVE) && (lower <= highest_allowed_slave)));
+	assert(((upper >= 1) && (upper <= tipc_max_nodes)) ||
+	       ((upper >= LOWEST_SLAVE) && (upper <= highest_allowed_slave)));
+	assert(in_own_cluster(c_ptr->addr));
+
+	tstop = is_slave(upper) ? c_ptr->highest_slave : c_ptr->highest_node;
+	if (tstop > upper)
+		tstop = upper;
+	for (n_num = lower; n_num <= tstop; n_num++) {
+		n_ptr = c_ptr->nodes[n_num];
+		if (n_ptr && node_has_active_links(n_ptr)) {
+			buf_copy = skb_copy(buf, GFP_ATOMIC);
+			if (buf_copy == NULL)
+				break;
+			msg_set_destnode(buf_msg(buf_copy), n_ptr->addr);
+			link_send(buf_copy, n_ptr->addr, n_ptr->addr);
+		}
+	}
+	buf_discard(buf);
+}
+
+/**
+ * cluster_broadcast - broadcast message to all nodes within cluster
+ */
+
+void cluster_broadcast(struct sk_buff *buf)
+{
+	struct sk_buff *buf_copy;
+	struct cluster *c_ptr;
+	struct node *n_ptr;
+	u32 n_num;
+	u32 tstart;
+	u32 tstop;
+	u32 node_type;
+
+	if (tipc_mode == TIPC_NET_MODE) {
+		c_ptr = cluster_find(tipc_own_addr);
+		assert(in_own_cluster(c_ptr->addr));	/* For now */
+
+		/* Send to standard nodes, then repeat loop sending to slaves */
+		tstart = 1;
+		tstop = c_ptr->highest_node;
+		for (node_type = 1; node_type <= 2; node_type++) {
+			for (n_num = tstart; n_num <= tstop; n_num++) {
+				n_ptr = c_ptr->nodes[n_num];
+				if (n_ptr && node_has_active_links(n_ptr)) {
+					buf_copy = skb_copy(buf, GFP_ATOMIC);
+					if (buf_copy == NULL)
+						goto exit;
+					msg_set_destnode(buf_msg(buf_copy), 
+							 n_ptr->addr);
+					link_send(buf_copy, n_ptr->addr, 
+						  n_ptr->addr);
+				}
+			}
+			tstart = LOWEST_SLAVE;
+			tstop = c_ptr->highest_slave;
+		}
+	}
+exit:
+	buf_discard(buf);
+}
+
+int cluster_init(void)
+{
+	highest_allowed_slave = LOWEST_SLAVE + tipc_max_slaves;
+	return cluster_create(tipc_own_addr) ? TIPC_OK : -ENOMEM;
+}
+
diff --git a/net/tipc/cluster.h b/net/tipc/cluster.h
new file mode 100644
index 000000000000..c12875ab46be
--- /dev/null
+++ b/net/tipc/cluster.h
@@ -0,0 +1,89 @@
+/*
+ * net/tipc/cluster.h: Include file for TIPC cluster management routines
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_CLUSTER_H
+#define _TIPC_CLUSTER_H
+
+#include "addr.h"
+#include "zone.h"
+
+#define LOWEST_SLAVE  2048u
+
+/**
+ * struct cluster - TIPC cluster structure
+ * @addr: network address of cluster
+ * @owner: pointer to zone that cluster belongs to
+ * @nodes: array of pointers to all nodes within cluster
+ * @highest_node: id of highest numbered node within cluster
+ * @highest_slave: (used for secondary node support)
+ */
+ 
+struct cluster {
+	u32 addr;
+	struct _zone *owner;
+	struct node **nodes;
+	u32 highest_node;
+	u32 highest_slave;
+};
+
+
+extern struct node **local_nodes;
+extern u32 highest_allowed_slave;
+extern struct node_map cluster_bcast_nodes;
+
+void cluster_remove_as_router(struct cluster *c_ptr, u32 router);
+void cluster_send_ext_routes(struct cluster *c_ptr, u32 dest);
+struct node *cluster_select_node(struct cluster *c_ptr, u32 selector);
+u32 cluster_select_router(struct cluster *c_ptr, u32 ref);
+void cluster_recv_routing_table(struct sk_buff *buf);
+struct cluster *cluster_create(u32 addr);
+void cluster_delete(struct cluster *c_ptr);
+void cluster_attach_node(struct cluster *c_ptr, struct node *n_ptr);
+void cluster_send_slave_routes(struct cluster *c_ptr, u32 dest);
+void cluster_broadcast(struct sk_buff *buf);
+int cluster_init(void);
+u32 cluster_next_node(struct cluster *c_ptr, u32 addr);
+void cluster_bcast_new_route(struct cluster *c_ptr, u32 dest, u32 lo, u32 hi);
+void cluster_send_local_routes(struct cluster *c_ptr, u32 dest);
+void cluster_bcast_lost_route(struct cluster *c_ptr, u32 dest, u32 lo, u32 hi);
+
+static inline struct cluster *cluster_find(u32 addr)
+{
+	struct _zone *z_ptr = zone_find(addr);
+
+	if (z_ptr)
+		return z_ptr->clusters[1];
+	return 0;
+}
+
+#endif
diff --git a/net/tipc/config.c b/net/tipc/config.c
new file mode 100644
index 000000000000..0f31c342d11c
--- /dev/null
+++ b/net/tipc/config.c
@@ -0,0 +1,715 @@
+/*
+ * net/tipc/config.c: TIPC configuration management code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2004-2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "dbg.h"
+#include "bearer.h"
+#include "port.h"
+#include "link.h"
+#include "zone.h"
+#include "addr.h"
+#include "name_table.h"
+#include "node.h"
+#include "config.h"
+#include "discover.h"
+
+struct subscr_data {
+	char usr_handle[8];
+	u32 domain;
+	u32 port_ref;
+	struct list_head subd_list;
+};
+
+struct manager {
+	u32 user_ref;
+	u32 port_ref;
+	u32 subscr_ref;
+	u32 link_subscriptions;
+	struct list_head link_subscribers;
+};
+
+static struct manager mng = { 0};
+
+static spinlock_t config_lock = SPIN_LOCK_UNLOCKED;
+
+static const void *req_tlv_area;	/* request message TLV area */
+static int req_tlv_space;		/* request message TLV area size */
+static int rep_headroom;		/* reply message headroom to use */
+
+
+void cfg_link_event(u32 addr, char *name, int up)
+{
+	/* TIPC DOESN'T HANDLE LINK EVENT SUBSCRIPTIONS AT THE MOMENT */
+}
+
+
+struct sk_buff *cfg_reply_alloc(int payload_size)
+{
+	struct sk_buff *buf;
+
+	buf = alloc_skb(rep_headroom + payload_size, GFP_ATOMIC);
+	if (buf)
+		skb_reserve(buf, rep_headroom);
+	return buf;
+}
+
+int cfg_append_tlv(struct sk_buff *buf, int tlv_type, 
+		   void *tlv_data, int tlv_data_size)
+{
+	struct tlv_desc *tlv = (struct tlv_desc *)buf->tail;
+	int new_tlv_space = TLV_SPACE(tlv_data_size);
+
+	if (skb_tailroom(buf) < new_tlv_space) {
+		dbg("cfg_append_tlv unable to append TLV\n");
+		return 0;
+	}
+	skb_put(buf, new_tlv_space);
+	tlv->tlv_type = htons(tlv_type);
+	tlv->tlv_len  = htons(TLV_LENGTH(tlv_data_size));
+	if (tlv_data_size && tlv_data)
+		memcpy(TLV_DATA(tlv), tlv_data, tlv_data_size);
+	return 1;
+}
+
+struct sk_buff *cfg_reply_unsigned_type(u16 tlv_type, u32 value)
+{
+	struct sk_buff *buf;
+	u32 value_net;
+
+	buf = cfg_reply_alloc(TLV_SPACE(sizeof(value)));
+	if (buf) {
+		value_net = htonl(value);
+		cfg_append_tlv(buf, tlv_type, &value_net, 
+			       sizeof(value_net));
+	}
+	return buf;
+}
+
+struct sk_buff *cfg_reply_string_type(u16 tlv_type, char *string)
+{
+	struct sk_buff *buf;
+	int string_len = strlen(string) + 1;
+
+	buf = cfg_reply_alloc(TLV_SPACE(string_len));
+	if (buf)
+		cfg_append_tlv(buf, tlv_type, string, string_len);
+	return buf;
+}
+
+
+
+
+#if 0
+
+/* Now obsolete code for handling commands not yet implemented the new way */
+
+int tipc_cfg_cmd(const struct tipc_cmd_msg * msg,
+		 char *data,
+		 u32 sz,
+		 u32 *ret_size,
+		 struct tipc_portid *orig)
+{
+	int rv = -EINVAL;
+	u32 cmd = msg->cmd;
+
+	*ret_size = 0;
+	switch (cmd) {
+	case TIPC_REMOVE_LINK:
+	case TIPC_CMD_BLOCK_LINK:
+	case TIPC_CMD_UNBLOCK_LINK:
+		if (!cfg_check_connection(orig))
+			rv = link_control(msg->argv.link_name, msg->cmd, 0);
+		break;
+	case TIPC_ESTABLISH:
+		{
+			int connected;
+
+			tipc_isconnected(mng.conn_port_ref, &connected);
+			if (connected || !orig) {
+				rv = TIPC_FAILURE;
+				break;
+			}
+			rv = tipc_connect2port(mng.conn_port_ref, orig);
+			if (rv == TIPC_OK)
+				orig = 0;
+			break;
+		}
+	case TIPC_GET_PEER_ADDRESS:
+		*ret_size = link_peer_addr(msg->argv.link_name, data, sz);
+		break;
+	case TIPC_GET_ROUTES:
+		rv = TIPC_OK;
+		break;
+	default: {}
+	}
+	if (*ret_size)
+		rv = TIPC_OK;
+	return rv;
+}
+
+static void cfg_cmd_event(struct tipc_cmd_msg *msg,
+			  char *data,
+			  u32 sz,        
+			  struct tipc_portid const *orig)
+{
+	int rv = -EINVAL;
+	struct tipc_cmd_result_msg rmsg;
+	struct iovec msg_sect[2];
+	int *arg;
+
+	msg->cmd = ntohl(msg->cmd);
+
+	cfg_prepare_res_msg(msg->cmd, msg->usr_handle, rv, &rmsg, msg_sect, 
+			    data, 0);
+	if (ntohl(msg->magic) != TIPC_MAGIC)
+		goto exit;
+
+	switch (msg->cmd) {
+	case TIPC_CREATE_LINK:
+		if (!cfg_check_connection(orig))
+			rv = disc_create_link(&msg->argv.create_link);
+		break;
+	case TIPC_LINK_SUBSCRIBE:
+		{
+			struct subscr_data *sub;
+
+			if (mng.link_subscriptions > 64)
+				break;
+			sub = (struct subscr_data *)kmalloc(sizeof(*sub),
+							    GFP_ATOMIC);
+			if (sub == NULL) {
+				warn("Memory squeeze; dropped remote link subscription\n");
+				break;
+			}
+			INIT_LIST_HEAD(&sub->subd_list);
+			tipc_createport(mng.user_ref,
+					(void *)sub,
+					TIPC_HIGH_IMPORTANCE,
+					0,
+					0,
+					(tipc_conn_shutdown_event)cfg_linksubscr_cancel,
+					0,
+					0,
+					(tipc_conn_msg_event)cfg_linksubscr_cancel,
+					0,
+					&sub->port_ref);
+			if (!sub->port_ref) {
+				kfree(sub);
+				break;
+			}
+			memcpy(sub->usr_handle,msg->usr_handle,
+			       sizeof(sub->usr_handle));
+			sub->domain = msg->argv.domain;
+			list_add_tail(&sub->subd_list, &mng.link_subscribers);
+			tipc_connect2port(sub->port_ref, orig);
+			rmsg.retval = TIPC_OK;
+			tipc_send(sub->port_ref, 2u, msg_sect);
+			mng.link_subscriptions++;
+			return;
+		}
+	default:
+		rv = tipc_cfg_cmd(msg, data, sz, (u32 *)&msg_sect[1].iov_len, orig);
+	}
+	exit:
+	rmsg.result_len = htonl(msg_sect[1].iov_len);
+	rmsg.retval = htonl(rv);
+	cfg_respond(msg_sect, 2u, orig);
+}
+#endif
+
+static struct sk_buff *cfg_enable_bearer(void)
+{
+	struct tipc_bearer_config *args;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_BEARER_CONFIG))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	args = (struct tipc_bearer_config *)TLV_DATA(req_tlv_area);
+	if (tipc_enable_bearer(args->name,
+			       ntohl(args->detect_scope),
+			       ntohl(args->priority)))
+		return cfg_reply_error_string("unable to enable bearer");
+
+	return cfg_reply_none();
+}
+
+static struct sk_buff *cfg_disable_bearer(void)
+{
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_BEARER_NAME))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	if (tipc_disable_bearer((char *)TLV_DATA(req_tlv_area)))
+		return cfg_reply_error_string("unable to disable bearer");
+
+	return cfg_reply_none();
+}
+
+static struct sk_buff *cfg_set_own_addr(void)
+{
+	u32 addr;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NET_ADDR))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	addr = *(u32 *)TLV_DATA(req_tlv_area);
+	addr = ntohl(addr);
+	if (addr == tipc_own_addr)
+		return cfg_reply_none();
+	if (!addr_node_valid(addr))
+		return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+					      " (node address)");
+	if (tipc_own_addr)
+		return cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
+					      " (cannot change node address once assigned)");
+
+	spin_unlock_bh(&config_lock);
+	stop_net();
+	tipc_own_addr = addr;
+	start_net();
+	spin_lock_bh(&config_lock);
+	return cfg_reply_none();
+}
+
+static struct sk_buff *cfg_set_remote_mng(void)
+{
+	u32 value;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	value = *(u32 *)TLV_DATA(req_tlv_area);
+	value = ntohl(value);
+	tipc_remote_management = (value != 0);
+	return cfg_reply_none();
+}
+
+static struct sk_buff *cfg_set_max_publications(void)
+{
+	u32 value;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	value = *(u32 *)TLV_DATA(req_tlv_area);
+	value = ntohl(value);
+	if (value != delimit(value, 1, 65535))
+		return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+					      " (max publications must be 1-65535)");
+	tipc_max_publications = value;
+	return cfg_reply_none();
+}
+
+static struct sk_buff *cfg_set_max_subscriptions(void)
+{
+	u32 value;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	value = *(u32 *)TLV_DATA(req_tlv_area);
+	value = ntohl(value);
+	if (value != delimit(value, 1, 65535))
+		return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+					      " (max subscriptions must be 1-65535");
+	tipc_max_subscriptions = value;
+	return cfg_reply_none();
+}
+
+static struct sk_buff *cfg_set_max_ports(void)
+{
+	int orig_mode;
+	u32 value;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+	value = *(u32 *)TLV_DATA(req_tlv_area);
+	value = ntohl(value);
+	if (value != delimit(value, 127, 65535))
+		return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+					      " (max ports must be 127-65535)");
+
+	if (value == tipc_max_ports)
+		return cfg_reply_none();
+
+	if (atomic_read(&tipc_user_count) > 2)
+		return cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
+					      " (cannot change max ports while TIPC users exist)");
+
+	spin_unlock_bh(&config_lock);
+	orig_mode = tipc_get_mode();
+	if (orig_mode == TIPC_NET_MODE)
+		stop_net();
+	stop_core();
+	tipc_max_ports = value;
+	start_core();
+	if (orig_mode == TIPC_NET_MODE)
+		start_net();
+	spin_lock_bh(&config_lock);
+	return cfg_reply_none();
+}
+
+static struct sk_buff *set_net_max(int value, int *parameter)
+{
+	int orig_mode;
+
+	if (value != *parameter) {
+		orig_mode = tipc_get_mode();
+		if (orig_mode == TIPC_NET_MODE)
+			stop_net();
+		*parameter = value;
+		if (orig_mode == TIPC_NET_MODE)
+			start_net();
+	}
+
+	return cfg_reply_none();
+}
+
+static struct sk_buff *cfg_set_max_zones(void)
+{
+	u32 value;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+	value = *(u32 *)TLV_DATA(req_tlv_area);
+	value = ntohl(value);
+	if (value != delimit(value, 1, 255))
+		return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+					      " (max zones must be 1-255)");
+	return set_net_max(value, &tipc_max_zones);
+}
+
+static struct sk_buff *cfg_set_max_clusters(void)
+{
+	u32 value;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+	value = *(u32 *)TLV_DATA(req_tlv_area);
+	value = ntohl(value);
+	if (value != 1)
+		return cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
+					      " (max clusters fixed at 1)");
+	return cfg_reply_none();
+}
+
+static struct sk_buff *cfg_set_max_nodes(void)
+{
+	u32 value;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+	value = *(u32 *)TLV_DATA(req_tlv_area);
+	value = ntohl(value);
+	if (value != delimit(value, 8, 2047))
+		return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+					      " (max nodes must be 8-2047)");
+	return set_net_max(value, &tipc_max_nodes);
+}
+
+static struct sk_buff *cfg_set_max_slaves(void)
+{
+	u32 value;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+	value = *(u32 *)TLV_DATA(req_tlv_area);
+	value = ntohl(value);
+	if (value != 0)
+		return cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
+					      " (max secondary nodes fixed at 0)");
+	return cfg_reply_none();
+}
+
+static struct sk_buff *cfg_set_netid(void)
+{
+	u32 value;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+	value = *(u32 *)TLV_DATA(req_tlv_area);
+	value = ntohl(value);
+	if (value != delimit(value, 1, 9999))
+		return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+					      " (network id must be 1-9999)");
+
+	if (tipc_own_addr)
+		return cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
+					      " (cannot change network id once part of network)");
+	
+	return set_net_max(value, &tipc_net_id);
+}
+
+struct sk_buff *cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area,
+			   int request_space, int reply_headroom)
+{
+	struct sk_buff *rep_tlv_buf;
+
+	spin_lock_bh(&config_lock);
+
+	/* Save request and reply details in a well-known location */
+
+	req_tlv_area = request_area;
+	req_tlv_space = request_space;
+	rep_headroom = reply_headroom;
+
+	/* Check command authorization */
+
+	if (likely(orig_node == tipc_own_addr)) {
+		/* command is permitted */
+	} else if (cmd >= 0x8000) {
+		rep_tlv_buf = cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
+						     " (cannot be done remotely)");
+		goto exit;
+	} else if (!tipc_remote_management) {
+		rep_tlv_buf = cfg_reply_error_string(TIPC_CFG_NO_REMOTE);
+		goto exit;
+	}
+	else if (cmd >= 0x4000) {
+		u32 domain = 0;
+
+		if ((nametbl_translate(TIPC_ZM_SRV, 0, &domain) == 0) ||
+		    (domain != orig_node)) {
+			rep_tlv_buf = cfg_reply_error_string(TIPC_CFG_NOT_ZONE_MSTR);
+			goto exit;
+		}
+	}
+
+	/* Call appropriate processing routine */
+
+	switch (cmd) {
+	case TIPC_CMD_NOOP:
+		rep_tlv_buf = cfg_reply_none();
+		break;
+	case TIPC_CMD_GET_NODES:
+		rep_tlv_buf = node_get_nodes(req_tlv_area, req_tlv_space);
+		break;
+	case TIPC_CMD_GET_LINKS:
+		rep_tlv_buf = node_get_links(req_tlv_area, req_tlv_space);
+		break;
+	case TIPC_CMD_SHOW_LINK_STATS:
+		rep_tlv_buf = link_cmd_show_stats(req_tlv_area, req_tlv_space);
+		break;
+	case TIPC_CMD_RESET_LINK_STATS:
+		rep_tlv_buf = link_cmd_reset_stats(req_tlv_area, req_tlv_space);
+		break;
+	case TIPC_CMD_SHOW_NAME_TABLE:
+		rep_tlv_buf = nametbl_get(req_tlv_area, req_tlv_space);
+		break;
+	case TIPC_CMD_GET_BEARER_NAMES:
+		rep_tlv_buf = bearer_get_names();
+		break;
+	case TIPC_CMD_GET_MEDIA_NAMES:
+		rep_tlv_buf = media_get_names();
+		break;
+	case TIPC_CMD_SHOW_PORTS:
+		rep_tlv_buf = port_get_ports();
+		break;
+#if 0
+	case TIPC_CMD_SHOW_PORT_STATS:
+		rep_tlv_buf = port_show_stats(req_tlv_area, req_tlv_space);
+		break;
+	case TIPC_CMD_RESET_PORT_STATS:
+		rep_tlv_buf = cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED);
+		break;
+#endif
+	case TIPC_CMD_SET_LOG_SIZE:
+		rep_tlv_buf = log_resize(req_tlv_area, req_tlv_space);
+		break;
+	case TIPC_CMD_DUMP_LOG:
+		rep_tlv_buf = log_dump();
+		break;
+	case TIPC_CMD_SET_LINK_TOL:
+	case TIPC_CMD_SET_LINK_PRI:
+	case TIPC_CMD_SET_LINK_WINDOW:
+		rep_tlv_buf = link_cmd_config(req_tlv_area, req_tlv_space, cmd);
+		break;
+	case TIPC_CMD_ENABLE_BEARER:
+		rep_tlv_buf = cfg_enable_bearer();
+		break;
+	case TIPC_CMD_DISABLE_BEARER:
+		rep_tlv_buf = cfg_disable_bearer();
+		break;
+	case TIPC_CMD_SET_NODE_ADDR:
+		rep_tlv_buf = cfg_set_own_addr();
+		break;
+	case TIPC_CMD_SET_REMOTE_MNG:
+		rep_tlv_buf = cfg_set_remote_mng();
+		break;
+	case TIPC_CMD_SET_MAX_PORTS:
+		rep_tlv_buf = cfg_set_max_ports();
+		break;
+	case TIPC_CMD_SET_MAX_PUBL:
+		rep_tlv_buf = cfg_set_max_publications();
+		break;
+	case TIPC_CMD_SET_MAX_SUBSCR:
+		rep_tlv_buf = cfg_set_max_subscriptions();
+		break;
+	case TIPC_CMD_SET_MAX_ZONES:
+		rep_tlv_buf = cfg_set_max_zones();
+		break;
+	case TIPC_CMD_SET_MAX_CLUSTERS:
+		rep_tlv_buf = cfg_set_max_clusters();
+		break;
+	case TIPC_CMD_SET_MAX_NODES:
+		rep_tlv_buf = cfg_set_max_nodes();
+		break;
+	case TIPC_CMD_SET_MAX_SLAVES:
+		rep_tlv_buf = cfg_set_max_slaves();
+		break;
+	case TIPC_CMD_SET_NETID:
+		rep_tlv_buf = cfg_set_netid();
+		break;
+	case TIPC_CMD_GET_REMOTE_MNG:
+		rep_tlv_buf = cfg_reply_unsigned(tipc_remote_management);
+		break;
+	case TIPC_CMD_GET_MAX_PORTS:
+		rep_tlv_buf = cfg_reply_unsigned(tipc_max_ports);
+		break;
+	case TIPC_CMD_GET_MAX_PUBL:
+		rep_tlv_buf = cfg_reply_unsigned(tipc_max_publications);
+		break;
+	case TIPC_CMD_GET_MAX_SUBSCR:
+		rep_tlv_buf = cfg_reply_unsigned(tipc_max_subscriptions);
+		break;
+	case TIPC_CMD_GET_MAX_ZONES:
+		rep_tlv_buf = cfg_reply_unsigned(tipc_max_zones);
+		break;
+	case TIPC_CMD_GET_MAX_CLUSTERS:
+		rep_tlv_buf = cfg_reply_unsigned(tipc_max_clusters);
+		break;
+	case TIPC_CMD_GET_MAX_NODES:
+		rep_tlv_buf = cfg_reply_unsigned(tipc_max_nodes);
+		break;
+	case TIPC_CMD_GET_MAX_SLAVES:
+		rep_tlv_buf = cfg_reply_unsigned(tipc_max_slaves);
+		break;
+	case TIPC_CMD_GET_NETID:
+		rep_tlv_buf = cfg_reply_unsigned(tipc_net_id);
+		break;
+	default:
+		rep_tlv_buf = NULL;
+		break;
+	}
+
+	/* Return reply buffer */
+exit:
+	spin_unlock_bh(&config_lock);
+	return rep_tlv_buf;
+}
+
+static void cfg_named_msg_event(void *userdata,
+				u32 port_ref,
+				struct sk_buff **buf,
+				const unchar *msg,
+				u32 size,
+				u32 importance, 
+				struct tipc_portid const *orig,
+				struct tipc_name_seq const *dest)
+{
+	struct tipc_cfg_msg_hdr *req_hdr;
+	struct tipc_cfg_msg_hdr *rep_hdr;
+	struct sk_buff *rep_buf;
+
+	/* Validate configuration message header (ignore invalid message) */
+
+	req_hdr = (struct tipc_cfg_msg_hdr *)msg;
+	if ((size < sizeof(*req_hdr)) ||
+	    (size != TCM_ALIGN(ntohl(req_hdr->tcm_len))) ||
+	    (ntohs(req_hdr->tcm_flags) != TCM_F_REQUEST)) {
+		warn("discarded invalid configuration message\n");
+		return;
+	}
+
+	/* Generate reply for request (if can't, return request) */
+
+	rep_buf = cfg_do_cmd(orig->node,
+			     ntohs(req_hdr->tcm_type), 
+			     msg + sizeof(*req_hdr),
+			     size - sizeof(*req_hdr),
+			     BUF_HEADROOM + MAX_H_SIZE + sizeof(*rep_hdr));
+	if (rep_buf) {
+		skb_push(rep_buf, sizeof(*rep_hdr));
+		rep_hdr = (struct tipc_cfg_msg_hdr *)rep_buf->data;
+		memcpy(rep_hdr, req_hdr, sizeof(*rep_hdr));
+		rep_hdr->tcm_len = htonl(rep_buf->len);
+		rep_hdr->tcm_flags &= htons(~TCM_F_REQUEST);
+	} else {
+		rep_buf = *buf;
+		*buf = NULL;
+	}
+
+	/* NEED TO ADD CODE TO HANDLE FAILED SEND (SUCH AS CONGESTION) */
+	tipc_send_buf2port(port_ref, orig, rep_buf, rep_buf->len);
+}
+
+int cfg_init(void)
+{
+	struct tipc_name_seq seq;
+	int res;
+
+	memset(&mng, 0, sizeof(mng));
+	INIT_LIST_HEAD(&mng.link_subscribers);
+
+	res = tipc_attach(&mng.user_ref, 0, 0);
+	if (res)
+		goto failed;
+
+	res = tipc_createport(mng.user_ref, 0, TIPC_CRITICAL_IMPORTANCE,
+			      NULL, NULL, NULL,
+			      NULL, cfg_named_msg_event, NULL,
+			      NULL, &mng.port_ref);
+	if (res)
+		goto failed;
+
+	seq.type = TIPC_CFG_SRV;
+	seq.lower = seq.upper = tipc_own_addr;
+	res = nametbl_publish_rsv(mng.port_ref, TIPC_ZONE_SCOPE, &seq);
+	if (res)
+		goto failed;
+
+	return 0;
+
+failed:
+	err("Unable to create configuration service\n");
+	tipc_detach(mng.user_ref);
+	mng.user_ref = 0;
+	return res;
+}
+
+void cfg_stop(void)
+{
+	if (mng.user_ref) {
+		tipc_detach(mng.user_ref);
+		mng.user_ref = 0;
+	}
+}
diff --git a/net/tipc/config.h b/net/tipc/config.h
new file mode 100644
index 000000000000..7ac0af57bf57
--- /dev/null
+++ b/net/tipc/config.h
@@ -0,0 +1,76 @@
+/*
+ * net/tipc/config.h: Include file for TIPC configuration service code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_CONFIG_H
+#define _TIPC_CONFIG_H
+
+/* ---------------------------------------------------------------------- */
+
+#include <linux/tipc.h>
+#include "link.h"
+
+struct sk_buff *cfg_reply_alloc(int payload_size);
+int cfg_append_tlv(struct sk_buff *buf, int tlv_type, 
+		   void *tlv_data, int tlv_data_size);
+struct sk_buff *cfg_reply_unsigned_type(u16 tlv_type, u32 value);
+struct sk_buff *cfg_reply_string_type(u16 tlv_type, char *string);
+
+static inline struct sk_buff *cfg_reply_none(void)
+{
+	return cfg_reply_alloc(0);
+}
+
+static inline struct sk_buff *cfg_reply_unsigned(u32 value)
+{
+	return cfg_reply_unsigned_type(TIPC_TLV_UNSIGNED, value);
+}
+
+static inline struct sk_buff *cfg_reply_error_string(char *string)
+{
+	return cfg_reply_string_type(TIPC_TLV_ERROR_STRING, string);
+}
+
+static inline struct sk_buff *cfg_reply_ultra_string(char *string)
+{
+	return cfg_reply_string_type(TIPC_TLV_ULTRA_STRING, string);
+}
+
+struct sk_buff *cfg_do_cmd(u32 orig_node, u16 cmd, 
+			   const void *req_tlv_area, int req_tlv_space, 
+			   int headroom);
+
+void cfg_link_event(u32 addr, char *name, int up);
+int  cfg_init(void);
+void cfg_stop(void);
+
+#endif
diff --git a/net/tipc/core.c b/net/tipc/core.c
new file mode 100644
index 000000000000..17c723f49185
--- /dev/null
+++ b/net/tipc/core.c
@@ -0,0 +1,282 @@
+/*
+ * net/tipc/core.c: TIPC module code
+ *
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/random.h>
+
+#include "core.h"
+#include "dbg.h"
+#include "ref.h"
+#include "net.h"
+#include "user_reg.h"
+#include "name_table.h"
+#include "subscr.h"
+#include "config.h"
+
+int  eth_media_start(void);
+void eth_media_stop(void);
+int  handler_start(void);
+void handler_stop(void);
+int  socket_init(void);
+void socket_stop(void);
+int  netlink_start(void);
+void netlink_stop(void);
+
+#define MOD_NAME "tipc_start: "
+
+#ifndef CONFIG_TIPC_ZONES
+#define CONFIG_TIPC_ZONES 3
+#endif
+
+#ifndef CONFIG_TIPC_CLUSTERS
+#define CONFIG_TIPC_CLUSTERS 1
+#endif
+
+#ifndef CONFIG_TIPC_NODES
+#define CONFIG_TIPC_NODES 255
+#endif
+
+#ifndef CONFIG_TIPC_SLAVE_NODES
+#define CONFIG_TIPC_SLAVE_NODES 0
+#endif
+
+#ifndef CONFIG_TIPC_PORTS
+#define CONFIG_TIPC_PORTS 8191
+#endif
+
+#ifndef CONFIG_TIPC_LOG
+#define CONFIG_TIPC_LOG 0
+#endif
+
+/* global variables used by multiple sub-systems within TIPC */
+
+int tipc_mode = TIPC_NOT_RUNNING;
+int tipc_random;
+atomic_t tipc_user_count = ATOMIC_INIT(0);
+
+const char tipc_alphabet[] = 
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
+
+/* configurable TIPC parameters */
+
+u32 tipc_own_addr;
+int tipc_max_zones;
+int tipc_max_clusters;
+int tipc_max_nodes;
+int tipc_max_slaves;
+int tipc_max_ports;
+int tipc_max_subscriptions;
+int tipc_max_publications;
+int tipc_net_id;
+int tipc_remote_management;
+
+
+int tipc_get_mode(void)
+{
+	return tipc_mode;
+}
+
+/**
+ * stop_net - shut down TIPC networking sub-systems
+ */
+
+void stop_net(void)
+{
+	eth_media_stop();
+	tipc_stop_net();
+}
+
+/**
+ * start_net - start TIPC networking sub-systems
+ */
+
+int start_net(void)
+{
+	int res;
+
+	if ((res = tipc_start_net()) ||
+	    (res = eth_media_start())) {
+		stop_net();
+	}
+	return res;
+}
+
+/**
+ * stop_core - switch TIPC from SINGLE NODE to NOT RUNNING mode
+ */
+
+void stop_core(void)
+{
+	if (tipc_mode != TIPC_NODE_MODE)
+		return;
+
+	tipc_mode = TIPC_NOT_RUNNING;
+
+	netlink_stop();
+	handler_stop();
+	cfg_stop();
+	subscr_stop();
+	reg_stop();
+	nametbl_stop();
+	ref_table_stop();
+	socket_stop();
+}
+
+/**
+ * start_core - switch TIPC from NOT RUNNING to SINGLE NODE mode
+ */
+
+int start_core(void)
+{
+	int res;
+
+	if (tipc_mode != TIPC_NOT_RUNNING)
+		return -ENOPROTOOPT;
+
+	get_random_bytes(&tipc_random, sizeof(tipc_random));
+	tipc_mode = TIPC_NODE_MODE;
+
+	if ((res = handler_start()) || 
+	    (res = ref_table_init(tipc_max_ports + tipc_max_subscriptions,
+				  tipc_random)) ||
+	    (res = reg_start()) ||
+	    (res = nametbl_init()) ||
+            (res = k_signal((Handler)subscr_start, 0)) ||
+	    (res = k_signal((Handler)cfg_init, 0)) || 
+	    (res = netlink_start()) ||
+	    (res = socket_init())) {
+		stop_core();
+	}
+	return res;
+}
+
+
+static int __init tipc_init(void)
+{
+	int res;
+
+	log_reinit(CONFIG_TIPC_LOG);
+	info("Activated (compiled " __DATE__ " " __TIME__ ")\n");
+
+	tipc_own_addr = 0;
+	tipc_remote_management = 1;
+	tipc_max_publications = 10000;
+	tipc_max_subscriptions = 2000;
+	tipc_max_ports = delimit(CONFIG_TIPC_PORTS, 127, 65536);
+	tipc_max_zones = delimit(CONFIG_TIPC_ZONES, 1, 511);
+	tipc_max_clusters = delimit(CONFIG_TIPC_CLUSTERS, 1, 1);
+	tipc_max_nodes = delimit(CONFIG_TIPC_NODES, 8, 2047);
+	tipc_max_slaves = delimit(CONFIG_TIPC_SLAVE_NODES, 0, 2047);
+	tipc_net_id = 4711;
+
+	if ((res = start_core()))
+		err("Unable to start in single node mode\n");
+	else	
+		info("Started in single node mode\n");
+        return res;
+}
+
+static void __exit tipc_exit(void)
+{
+	stop_net();
+	stop_core();
+	info("Deactivated\n");
+	log_stop();
+}
+
+module_init(tipc_init);
+module_exit(tipc_exit);
+
+MODULE_DESCRIPTION("TIPC: Transparent Inter Process Communication");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/* Native TIPC API for kernel-space applications (see tipc.h) */
+
+EXPORT_SYMBOL(tipc_attach);
+EXPORT_SYMBOL(tipc_detach);
+EXPORT_SYMBOL(tipc_get_addr);
+EXPORT_SYMBOL(tipc_get_mode);
+EXPORT_SYMBOL(tipc_createport);
+EXPORT_SYMBOL(tipc_deleteport);
+EXPORT_SYMBOL(tipc_ownidentity);
+EXPORT_SYMBOL(tipc_portimportance);
+EXPORT_SYMBOL(tipc_set_portimportance);
+EXPORT_SYMBOL(tipc_portunreliable);
+EXPORT_SYMBOL(tipc_set_portunreliable);
+EXPORT_SYMBOL(tipc_portunreturnable);
+EXPORT_SYMBOL(tipc_set_portunreturnable);
+EXPORT_SYMBOL(tipc_publish);
+EXPORT_SYMBOL(tipc_withdraw);
+EXPORT_SYMBOL(tipc_connect2port);
+EXPORT_SYMBOL(tipc_disconnect);
+EXPORT_SYMBOL(tipc_shutdown);
+EXPORT_SYMBOL(tipc_isconnected);
+EXPORT_SYMBOL(tipc_peer);
+EXPORT_SYMBOL(tipc_ref_valid);
+EXPORT_SYMBOL(tipc_send);
+EXPORT_SYMBOL(tipc_send_buf);
+EXPORT_SYMBOL(tipc_send2name);
+EXPORT_SYMBOL(tipc_forward2name);
+EXPORT_SYMBOL(tipc_send_buf2name);
+EXPORT_SYMBOL(tipc_forward_buf2name);
+EXPORT_SYMBOL(tipc_send2port);
+EXPORT_SYMBOL(tipc_forward2port);
+EXPORT_SYMBOL(tipc_send_buf2port);
+EXPORT_SYMBOL(tipc_forward_buf2port);
+EXPORT_SYMBOL(tipc_multicast);
+/* EXPORT_SYMBOL(tipc_multicast_buf); not available yet */
+EXPORT_SYMBOL(tipc_ispublished);
+EXPORT_SYMBOL(tipc_available_nodes);
+
+/* TIPC API for external bearers (see tipc_bearer.h) */
+
+EXPORT_SYMBOL(tipc_block_bearer);
+EXPORT_SYMBOL(tipc_continue); 
+EXPORT_SYMBOL(tipc_disable_bearer);
+EXPORT_SYMBOL(tipc_enable_bearer);
+EXPORT_SYMBOL(tipc_recv_msg);
+EXPORT_SYMBOL(tipc_register_media); 
+
+/* TIPC API for external APIs (see tipc_port.h) */
+
+EXPORT_SYMBOL(tipc_createport_raw);
+EXPORT_SYMBOL(tipc_set_msg_option);
+EXPORT_SYMBOL(tipc_reject_msg);
+EXPORT_SYMBOL(tipc_send_buf_fast);
+EXPORT_SYMBOL(tipc_acknowledge);
+EXPORT_SYMBOL(tipc_get_port);
+EXPORT_SYMBOL(tipc_get_handle);
+
diff --git a/net/tipc/core.h b/net/tipc/core.h
new file mode 100644
index 000000000000..d92898d6c093
--- /dev/null
+++ b/net/tipc/core.h
@@ -0,0 +1,313 @@
+/*
+ * net/tipc/core.h: Include file for TIPC global declarations
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_CORE_H
+#define _TIPC_CORE_H
+
+#include <net/tipc/tipc.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <asm/uaccess.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <asm/hardirq.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>	
+#include <linux/list.h>
+#include <linux/vmalloc.h>
+
+/*
+ * TIPC debugging code
+ */
+
+#define assert(i)  BUG_ON(!(i))
+
+struct tipc_msg;
+extern struct print_buf *CONS, *LOG;
+extern struct print_buf *TEE(struct print_buf *, struct print_buf *);
+void msg_print(struct print_buf*,struct tipc_msg *,const char*);
+void tipc_printf(struct print_buf *, const char *fmt, ...);
+void tipc_dump(struct print_buf*,const char *fmt, ...);
+
+#ifdef CONFIG_TIPC_DEBUG
+
+/*
+ * TIPC debug support included:
+ * - system messages are printed to TIPC_OUTPUT print buffer
+ * - debug messages are printed to DBG_OUTPUT print buffer
+ */
+
+#define err(fmt, arg...)  tipc_printf(TIPC_OUTPUT, KERN_ERR "TIPC: " fmt, ## arg)
+#define warn(fmt, arg...) tipc_printf(TIPC_OUTPUT, KERN_WARNING "TIPC: " fmt, ## arg)
+#define info(fmt, arg...) tipc_printf(TIPC_OUTPUT, KERN_NOTICE "TIPC: " fmt, ## arg)
+
+#define dbg(fmt, arg...)  do {if (DBG_OUTPUT) tipc_printf(DBG_OUTPUT, fmt, ## arg);} while(0)
+#define msg_dbg(msg, txt) do {if (DBG_OUTPUT) msg_print(DBG_OUTPUT, msg, txt);} while(0)
+#define dump(fmt, arg...) do {if (DBG_OUTPUT) tipc_dump(DBG_OUTPUT, fmt, ##arg);} while(0)
+
+
+/*	
+ * By default, TIPC_OUTPUT is defined to be system console and TIPC log buffer,
+ * while DBG_OUTPUT is the null print buffer.  These defaults can be changed
+ * here, or on a per .c file basis, by redefining these symbols.  The following
+ * print buffer options are available:
+ *
+ * NULL			: Output to null print buffer (i.e. print nowhere)
+ * CONS			: Output to system console
+ * LOG			: Output to TIPC log buffer 
+ * &buf 		: Output to user-defined buffer (struct print_buf *)
+ * TEE(&buf_a,&buf_b)	: Output to two print buffers (eg. TEE(CONS,LOG) )
+ */
+
+#ifndef TIPC_OUTPUT
+#define TIPC_OUTPUT TEE(CONS,LOG)
+#endif
+
+#ifndef DBG_OUTPUT
+#define DBG_OUTPUT NULL
+#endif
+
+#else
+
+#ifndef DBG_OUTPUT
+#define DBG_OUTPUT NULL
+#endif
+
+/*
+ * TIPC debug support not included:
+ * - system messages are printed to system console
+ * - debug messages are not printed
+ */
+
+#define err(fmt, arg...)  printk(KERN_ERR "%s: " fmt "\n" , __FILE__ , ## arg)
+#define info(fmt, arg...) printk(KERN_INFO "%s: " fmt "\n" , __FILE__ , ## arg)
+#define warn(fmt, arg...) printk(KERN_WARNING "%s: " fmt "\n" , __FILE__ , ## arg)
+
+#define dbg(fmt, arg...) do {} while (0)
+#define msg_dbg(msg,txt) do {} while (0)
+#define dump(fmt,arg...) do {} while (0)
+
+#endif			  
+
+
+/* 
+ * TIPC-specific error codes
+ */
+
+#define ELINKCONG EAGAIN	/* link congestion <=> resource unavailable */
+
+/*
+ * Global configuration variables
+ */
+
+extern u32 tipc_own_addr;
+extern int tipc_max_zones;
+extern int tipc_max_clusters;
+extern int tipc_max_nodes;
+extern int tipc_max_slaves;
+extern int tipc_max_ports;
+extern int tipc_max_subscriptions;
+extern int tipc_max_publications;
+extern int tipc_net_id;
+extern int tipc_remote_management;
+
+/*
+ * Other global variables
+ */
+
+extern int tipc_mode;
+extern int tipc_random;
+extern const char tipc_alphabet[];
+extern atomic_t tipc_user_count;
+
+
+/*
+ * Routines available to privileged subsystems
+ */
+
+extern int  start_core(void);
+extern void stop_core(void);
+extern int  start_net(void);
+extern void stop_net(void);
+
+static inline int delimit(int val, int min, int max)
+{
+	if (val > max)
+		return max;
+	if (val < min)
+		return min;
+	return val;
+}
+
+
+/*
+ * TIPC timer and signal code
+ */
+
+typedef void (*Handler) (unsigned long);
+
+u32 k_signal(Handler routine, unsigned long argument);
+
+/**
+ * k_init_timer - initialize a timer
+ * @timer: pointer to timer structure
+ * @routine: pointer to routine to invoke when timer expires
+ * @argument: value to pass to routine when timer expires
+ * 
+ * Timer must be initialized before use (and terminated when no longer needed).
+ */
+
+static inline void k_init_timer(struct timer_list *timer, Handler routine, 
+				unsigned long argument)
+{
+	dbg("initializing timer %p\n", timer);
+	init_timer(timer);
+	timer->function = routine;
+	timer->data = argument;
+}
+
+/**
+ * k_start_timer - start a timer
+ * @timer: pointer to timer structure
+ * @msec: time to delay (in ms)
+ * 
+ * Schedules a previously initialized timer for later execution.
+ * If timer is already running, the new timeout overrides the previous request.
+ * 
+ * To ensure the timer doesn't expire before the specified delay elapses,
+ * the amount of delay is rounded up when converting to the jiffies
+ * then an additional jiffy is added to account for the fact that 
+ * the starting time may be in the middle of the current jiffy.
+ */
+
+static inline void k_start_timer(struct timer_list *timer, unsigned long msec)
+{
+	dbg("starting timer %p for %u\n", timer, msec);
+	mod_timer(timer, jiffies + msecs_to_jiffies(msec) + 1);
+}
+
+/**
+ * k_cancel_timer - cancel a timer
+ * @timer: pointer to timer structure
+ * 
+ * Cancels a previously initialized timer.  
+ * Can be called safely even if the timer is already inactive.
+ * 
+ * WARNING: Must not be called when holding locks required by the timer's
+ *          timeout routine, otherwise deadlock can occur on SMP systems!
+ */
+
+static inline void k_cancel_timer(struct timer_list *timer)
+{
+	dbg("cancelling timer %p\n", timer);
+	del_timer_sync(timer);
+}
+
+/**
+ * k_term_timer - terminate a timer
+ * @timer: pointer to timer structure
+ * 
+ * Prevents further use of a previously initialized timer.
+ * 
+ * WARNING: Caller must ensure timer isn't currently running.
+ * 
+ * (Do not "enhance" this routine to automatically cancel an active timer,
+ * otherwise deadlock can arise when a timeout routine calls k_term_timer.)
+ */
+
+static inline void k_term_timer(struct timer_list *timer)
+{
+	dbg("terminating timer %p\n", timer);
+}
+
+
+/*
+ * TIPC message buffer code
+ *
+ * TIPC message buffer headroom leaves room for 14 byte Ethernet header, 
+ * while ensuring TIPC header is word aligned for quicker access
+ */
+
+#define BUF_HEADROOM 16u 
+
+struct tipc_skb_cb {
+	void *handle;
+};
+
+#define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0]))
+
+
+static inline struct tipc_msg *buf_msg(struct sk_buff *skb)
+{
+	return (struct tipc_msg *)skb->data;
+}
+
+/**
+ * buf_acquire - creates a TIPC message buffer
+ * @size: message size (including TIPC header)
+ *
+ * Returns a new buffer.  Space is reserved for a data link header.
+ */
+
+static inline struct sk_buff *buf_acquire(u32 size)
+{
+	struct sk_buff *skb;
+	unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u;
+
+	skb = alloc_skb(buf_size, GFP_ATOMIC);
+	if (skb) {
+		skb_reserve(skb, BUF_HEADROOM);
+		skb_put(skb, size);
+		skb->next = NULL;
+	}
+	return skb;
+}
+
+/**
+ * buf_discard - frees a TIPC message buffer
+ * @skb: message buffer
+ *
+ * Frees a new buffer.  If passed NULL, just returns.
+ */
+
+static inline void buf_discard(struct sk_buff *skb)
+{
+	if (likely(skb != NULL))
+		kfree_skb(skb);
+}
+
+#endif			
diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c
new file mode 100644
index 000000000000..efd6d652d052
--- /dev/null
+++ b/net/tipc/dbg.c
@@ -0,0 +1,392 @@
+/*
+ * net/tipc/dbg.c: TIPC print buffer routines for debuggign
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "config.h"
+#include "dbg.h"
+
+#define MAX_STRING 512
+
+static char print_string[MAX_STRING];
+static spinlock_t print_lock = SPIN_LOCK_UNLOCKED;
+
+static struct print_buf cons_buf = { NULL, 0, NULL, NULL };
+struct print_buf *CONS = &cons_buf;
+
+static struct print_buf log_buf = { NULL, 0, NULL, NULL };
+struct print_buf *LOG = &log_buf;
+
+
+#define FORMAT(PTR,LEN,FMT) \
+{\
+       va_list args;\
+       va_start(args, FMT);\
+       LEN = vsprintf(PTR, FMT, args);\
+       va_end(args);\
+       *(PTR + LEN) = '\0';\
+}
+
+/*
+ * Locking policy when using print buffers.
+ *
+ * 1) Routines of the form printbuf_XXX() rely on the caller to prevent
+ *    simultaneous use of the print buffer(s) being manipulated.
+ * 2) tipc_printf() uses 'print_lock' to prevent simultaneous use of
+ *    'print_string' and to protect its print buffer(s).
+ * 3) TEE() uses 'print_lock' to protect its print buffer(s).
+ * 4) Routines of the form log_XXX() uses 'print_lock' to protect LOG.
+ */
+
+/**
+ * printbuf_init - initialize print buffer to empty
+ */
+
+void printbuf_init(struct print_buf *pb, char *raw, u32 sz)
+{
+	if (!pb || !raw || (sz < (MAX_STRING + 1)))
+		return;
+
+	pb->crs = pb->buf = raw;
+	pb->size = sz;
+	pb->next = 0;
+	pb->buf[0] = 0;
+	pb->buf[sz-1] = ~0;
+}
+
+/**
+ * printbuf_reset - reinitialize print buffer to empty state
+ */
+
+void printbuf_reset(struct print_buf *pb)
+{
+	if (pb && pb->buf)
+		printbuf_init(pb, pb->buf, pb->size);
+}
+
+/**
+ * printbuf_empty - test if print buffer is in empty state
+ */
+
+int printbuf_empty(struct print_buf *pb)
+{
+	return (!pb || !pb->buf || (pb->crs == pb->buf));
+}
+
+/**
+ * printbuf_validate - check for print buffer overflow
+ * 
+ * Verifies that a print buffer has captured all data written to it. 
+ * If data has been lost, linearize buffer and prepend an error message
+ * 
+ * Returns length of print buffer data string (including trailing NULL)
+ */
+
+int printbuf_validate(struct print_buf *pb)
+{
+        char *err = "             *** PRINT BUFFER WRAPPED AROUND ***\n";
+        char *cp_buf;
+        struct print_buf cb;
+
+	if (!pb || !pb->buf)
+		return 0;
+
+	if (pb->buf[pb->size - 1] == '\0') {
+                cp_buf = kmalloc(pb->size, GFP_ATOMIC);
+                if (cp_buf != NULL){
+                        printbuf_init(&cb, cp_buf, pb->size);
+                        printbuf_move(&cb, pb);
+                        printbuf_move(pb, &cb);
+                        kfree(cp_buf);
+                        memcpy(pb->buf, err, strlen(err));
+                } else {
+                        printbuf_reset(pb);
+                        tipc_printf(pb, err);
+                }
+	}
+	return (pb->crs - pb->buf + 1);
+}
+
+/**
+ * printbuf_move - move print buffer contents to another print buffer
+ * 
+ * Current contents of destination print buffer (if any) are discarded.
+ * Source print buffer becomes empty if a successful move occurs.
+ */
+
+void printbuf_move(struct print_buf *pb_to, struct print_buf *pb_from)
+{
+	int len;
+
+	/* Handle the cases where contents can't be moved */
+
+	if (!pb_to || !pb_to->buf)
+		return;
+
+	if (!pb_from || !pb_from->buf) {
+		printbuf_reset(pb_to);
+		return;
+	}
+
+	if (pb_to->size < pb_from->size) {
+		printbuf_reset(pb_to);
+		tipc_printf(pb_to, "*** PRINT BUFFER OVERFLOW ***");
+		return;
+	}
+
+	/* Copy data from char after cursor to end (if used) */
+	len = pb_from->buf + pb_from->size - pb_from->crs - 2;
+	if ((pb_from->buf[pb_from->size-1] == 0) && (len > 0)) {
+		strcpy(pb_to->buf, pb_from->crs + 1);
+		pb_to->crs = pb_to->buf + len;
+	} else
+		pb_to->crs = pb_to->buf;
+
+	/* Copy data from start to cursor (always) */
+	len = pb_from->crs - pb_from->buf;
+	strcpy(pb_to->crs, pb_from->buf);
+	pb_to->crs += len;
+
+	printbuf_reset(pb_from);
+}
+
+/**
+ * tipc_printf - append formatted output to print buffer chain
+ */
+
+void tipc_printf(struct print_buf *pb, const char *fmt, ...)
+{
+	int chars_to_add;
+	int chars_left;
+	char save_char;
+	struct print_buf *pb_next;
+
+	spin_lock_bh(&print_lock);
+	FORMAT(print_string, chars_to_add, fmt);
+	if (chars_to_add >= MAX_STRING)
+		strcpy(print_string, "*** STRING TOO LONG ***");
+
+	while (pb) {
+		if (pb == CONS)
+			printk(print_string);
+		else if (pb->buf) {
+			chars_left = pb->buf + pb->size - pb->crs - 1;
+			if (chars_to_add <= chars_left) {
+				strcpy(pb->crs, print_string);
+				pb->crs += chars_to_add;
+			} else {
+				strcpy(pb->buf, print_string + chars_left);
+                                save_char = print_string[chars_left];
+                                print_string[chars_left] = 0;
+                                strcpy(pb->crs, print_string);
+                                print_string[chars_left] = save_char;
+                                pb->crs = pb->buf + chars_to_add - chars_left;
+                        }
+                }
+		pb_next = pb->next;
+		pb->next = 0;
+		pb = pb_next;
+	}
+	spin_unlock_bh(&print_lock);
+}
+
+/**
+ * TEE - perform next output operation on both print buffers  
+ */
+
+struct print_buf *TEE(struct print_buf *b0, struct print_buf *b1)
+{
+	struct print_buf *pb = b0;
+
+	if (!b0 || (b0 == b1))
+		return b1;
+	if (!b1)
+		return b0;
+
+	spin_lock_bh(&print_lock);
+	while (pb->next) {
+		if ((pb->next == b1) || (pb->next == b0))
+			pb->next = pb->next->next;
+		else
+			pb = pb->next;
+	}
+	pb->next = b1;
+	spin_unlock_bh(&print_lock);
+	return b0;
+}
+
+/**
+ * print_to_console - write string of bytes to console in multiple chunks
+ */
+
+static void print_to_console(char *crs, int len)
+{
+	int rest = len;
+
+	while (rest > 0) {
+		int sz = rest < MAX_STRING ? rest : MAX_STRING;
+		char c = crs[sz];
+
+		crs[sz] = 0;
+		printk((const char *)crs);
+		crs[sz] = c;
+		rest -= sz;
+		crs += sz;
+	}
+}
+
+/**
+ * printbuf_dump - write print buffer contents to console
+ */
+
+static void printbuf_dump(struct print_buf *pb)
+{
+	int len;
+
+	/* Dump print buffer from char after cursor to end (if used) */
+	len = pb->buf + pb->size - pb->crs - 2;
+	if ((pb->buf[pb->size - 1] == 0) && (len > 0))
+		print_to_console(pb->crs + 1, len);
+
+	/* Dump print buffer from start to cursor (always) */
+	len = pb->crs - pb->buf;
+	print_to_console(pb->buf, len);
+}
+
+/**
+ * tipc_dump - dump non-console print buffer(s) to console
+ */
+
+void tipc_dump(struct print_buf *pb, const char *fmt, ...)
+{
+	int len;
+
+	spin_lock_bh(&print_lock);
+	FORMAT(CONS->buf, len, fmt);
+	printk(CONS->buf);
+
+	for (; pb; pb = pb->next) {
+		if (pb == CONS)
+			continue;
+		printk("\n---- Start of dump,%s log ----\n\n", 
+		       (pb == LOG) ? "global" : "local");
+		printbuf_dump(pb);
+		printbuf_reset(pb);
+		printk("\n-------- End of dump --------\n");
+	}
+	spin_unlock_bh(&print_lock);
+}
+
+/**
+ * log_stop - free up TIPC log print buffer 
+ */
+
+void log_stop(void)
+{
+	spin_lock_bh(&print_lock);
+	if (LOG->buf) {
+		kfree(LOG->buf);
+		LOG->buf = NULL;
+	}
+	spin_unlock_bh(&print_lock);
+}
+
+/**
+ * log_reinit - set TIPC log print buffer to specified size
+ */
+
+void log_reinit(int log_size)
+{
+	log_stop();
+
+	if (log_size) {
+		if (log_size <= MAX_STRING)
+			log_size = MAX_STRING + 1;
+		spin_lock_bh(&print_lock);
+		printbuf_init(LOG, kmalloc(log_size, GFP_ATOMIC), log_size);
+		spin_unlock_bh(&print_lock);
+	}
+}
+
+/**
+ * log_resize - reconfigure size of TIPC log buffer
+ */
+
+struct sk_buff *log_resize(const void *req_tlv_area, int req_tlv_space)
+{
+	u32 value;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	value = *(u32 *)TLV_DATA(req_tlv_area);
+	value = ntohl(value);
+	if (value != delimit(value, 0, 32768))
+		return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+					      " (log size must be 0-32768)");
+	log_reinit(value);
+	return cfg_reply_none();
+}
+
+/**
+ * log_dump - capture TIPC log buffer contents in configuration message
+ */
+
+struct sk_buff *log_dump(void)
+{
+	struct sk_buff *reply;
+
+	spin_lock_bh(&print_lock);
+	if (!LOG->buf)
+		reply = cfg_reply_ultra_string("log not activated\n");
+	else if (printbuf_empty(LOG))
+		reply = cfg_reply_ultra_string("log is empty\n");
+	else {
+		struct tlv_desc *rep_tlv;
+		struct print_buf pb;
+		int str_len;
+
+		str_len = min(LOG->size, 32768u);
+		reply = cfg_reply_alloc(TLV_SPACE(str_len));
+		if (reply) {
+			rep_tlv = (struct tlv_desc *)reply->data;
+			printbuf_init(&pb, TLV_DATA(rep_tlv), str_len);
+			printbuf_move(&pb, LOG);
+			str_len = strlen(TLV_DATA(rep_tlv)) + 1;
+			skb_put(reply, TLV_SPACE(str_len));
+			TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len);
+		}
+	}
+	spin_unlock_bh(&print_lock);
+	return reply;
+}
+
diff --git a/net/tipc/dbg.h b/net/tipc/dbg.h
new file mode 100644
index 000000000000..af4217a993cf
--- /dev/null
+++ b/net/tipc/dbg.h
@@ -0,0 +1,56 @@
+/*
+ * net/tipc/dbg.h: Include file for TIPC print buffer routines
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_DBG_H
+#define _TIPC_DBG_H
+
+struct print_buf {
+	char *buf;
+	u32 size;
+	char *crs;
+	struct print_buf *next;
+};
+
+void printbuf_init(struct print_buf *pb, char *buf, u32 sz);
+void printbuf_reset(struct print_buf *pb);
+int  printbuf_empty(struct print_buf *pb);
+int  printbuf_validate(struct print_buf *pb);
+void printbuf_move(struct print_buf *pb_to, struct print_buf *pb_from);
+
+void log_reinit(int log_size);
+void log_stop(void);
+
+struct sk_buff *log_resize(const void *req_tlv_area, int req_tlv_space);
+struct sk_buff *log_dump(void);
+
+#endif
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
new file mode 100644
index 000000000000..c83c1be17f85
--- /dev/null
+++ b/net/tipc/discover.c
@@ -0,0 +1,339 @@
+/*
+ * net/tipc/discover.c
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "dbg.h"
+#include "link.h"
+#include "zone.h"
+#include "discover.h"
+#include "port.h"
+#include "name_table.h"
+
+#define TIPC_LINK_REQ_INIT	125	/* min delay during bearer start up */
+#define TIPC_LINK_REQ_FAST	2000	/* normal delay if bearer has no links */
+#define TIPC_LINK_REQ_SLOW	600000	/* normal delay if bearer has links */
+
+#if 0
+#define  GET_NODE_INFO         300
+#define  GET_NODE_INFO_RESULT  301
+#define  FORWARD_LINK_PROBE    302
+#define  LINK_REQUEST_REJECTED 303
+#define  LINK_REQUEST_ACCEPTED 304
+#define  DROP_LINK_REQUEST     305
+#define  CHECK_LINK_COUNT      306
+#endif
+
+/* 
+ * TODO: Most of the inter-cluster setup stuff should be
+ * rewritten, and be made conformant with specification.
+ */ 
+
+
+/**
+ * struct link_req - information about an ongoing link setup request
+ * @bearer: bearer issuing requests
+ * @dest: destination address for request messages
+ * @buf: request message to be (repeatedly) sent
+ * @timer: timer governing period between requests
+ * @timer_intv: current interval between requests (in ms)
+ */
+struct link_req {
+	struct bearer *bearer;
+	struct tipc_media_addr dest;
+	struct sk_buff *buf;
+	struct timer_list timer;
+	unsigned int timer_intv;
+};
+
+
+#if 0
+int disc_create_link(const struct tipc_link_create *argv) 
+{
+	/* 
+	 * Code for inter cluster link setup here 
+	 */
+	return TIPC_OK;
+}
+#endif
+
+/*
+ * disc_lost_link(): A link has lost contact
+ */
+
+void disc_link_event(u32 addr, char *name, int up) 
+{
+	if (in_own_cluster(addr))
+		return;
+	/* 
+	 * Code for inter cluster link setup here 
+	 */
+}
+
+/** 
+ * disc_init_msg - initialize a link setup message
+ * @type: message type (request or response)
+ * @req_links: number of links associated with message
+ * @dest_domain: network domain of node(s) which should respond to message
+ * @b_ptr: ptr to bearer issuing message
+ */
+
+struct sk_buff *disc_init_msg(u32 type,
+			      u32 req_links,
+			      u32 dest_domain,
+			      struct bearer *b_ptr)
+{
+	struct sk_buff *buf = buf_acquire(DSC_H_SIZE);
+	struct tipc_msg *msg;
+
+	if (buf) {
+		msg = buf_msg(buf);
+		msg_init(msg, LINK_CONFIG, type, TIPC_OK, DSC_H_SIZE,
+			 dest_domain);
+		msg_set_non_seq(msg);
+		msg_set_req_links(msg, req_links);
+		msg_set_dest_domain(msg, dest_domain);
+		msg_set_bc_netid(msg, tipc_net_id);
+		msg_set_media_addr(msg, &b_ptr->publ.addr);
+	}
+	return buf;
+}
+
+/**
+ * disc_recv_msg - handle incoming link setup message (request or response)
+ * @buf: buffer containing message
+ */
+
+void disc_recv_msg(struct sk_buff *buf)
+{
+	struct bearer *b_ptr = (struct bearer *)TIPC_SKB_CB(buf)->handle;
+	struct link *link;
+	struct tipc_media_addr media_addr;
+	struct tipc_msg *msg = buf_msg(buf);
+	u32 dest = msg_dest_domain(msg);
+	u32 orig = msg_prevnode(msg);
+	u32 net_id = msg_bc_netid(msg);
+	u32 type = msg_type(msg);
+
+	msg_get_media_addr(msg,&media_addr);
+	msg_dbg(msg, "RECV:");
+	buf_discard(buf);
+
+	if (net_id != tipc_net_id)
+		return;
+	if (!addr_domain_valid(dest))
+		return;
+	if (!addr_node_valid(orig))
+		return;
+	if (orig == tipc_own_addr)
+		return;
+	if (!in_scope(dest, tipc_own_addr))
+		return;
+	if (is_slave(tipc_own_addr) && is_slave(orig))
+		return;
+	if (is_slave(orig) && !in_own_cluster(orig))
+		return;
+	if (in_own_cluster(orig)) {
+		/* Always accept link here */
+		struct sk_buff *rbuf;
+		struct tipc_media_addr *addr;
+		struct node *n_ptr = node_find(orig);
+		int link_up;
+		dbg(" in own cluster\n");
+		if (n_ptr == NULL) {
+			n_ptr = node_create(orig);
+		}
+		if (n_ptr == NULL) {
+			warn("Memory squeeze; Failed to create node\n");
+			return;
+		}
+		spin_lock_bh(&n_ptr->lock);
+		link = n_ptr->links[b_ptr->identity];
+		if (!link) {
+			dbg("creating link\n");
+			link = link_create(b_ptr, orig, &media_addr);
+			if (!link) {
+				spin_unlock_bh(&n_ptr->lock);                
+				return;
+			}
+		}
+		addr = &link->media_addr;
+		if (memcmp(addr, &media_addr, sizeof(*addr))) {
+			char addr_string[16];
+
+			warn("New bearer address for %s\n", 
+			     addr_string_fill(addr_string, orig));
+			memcpy(addr, &media_addr, sizeof(*addr));
+			link_reset(link);     
+		}
+		link_up = link_is_up(link);
+		spin_unlock_bh(&n_ptr->lock);                
+		if ((type == DSC_RESP_MSG) || link_up)
+			return;
+		rbuf = disc_init_msg(DSC_RESP_MSG, 1, orig, b_ptr);
+		if (rbuf != NULL) {
+			msg_dbg(buf_msg(rbuf),"SEND:");
+			b_ptr->media->send_msg(rbuf, &b_ptr->publ, &media_addr);
+			buf_discard(rbuf);
+		}
+	}
+}
+
+/**
+ * disc_stop_link_req - stop sending periodic link setup requests
+ * @req: ptr to link request structure
+ */
+
+void disc_stop_link_req(struct link_req *req) 
+{
+	if (!req)
+		return;
+		
+	k_cancel_timer(&req->timer);
+	k_term_timer(&req->timer);
+	buf_discard(req->buf);
+	kfree(req);
+} 
+
+/**
+ * disc_update_link_req - update frequency of periodic link setup requests
+ * @req: ptr to link request structure
+ */
+
+void disc_update_link_req(struct link_req *req) 
+{
+	if (!req)
+		return;
+
+	if (req->timer_intv == TIPC_LINK_REQ_SLOW) {
+		if (!req->bearer->nodes.count) {
+			req->timer_intv = TIPC_LINK_REQ_FAST;
+			k_start_timer(&req->timer, req->timer_intv);
+		}
+	} else if (req->timer_intv == TIPC_LINK_REQ_FAST) {
+		if (req->bearer->nodes.count) {
+			req->timer_intv = TIPC_LINK_REQ_SLOW;
+			k_start_timer(&req->timer, req->timer_intv);
+		}
+	} else {
+		/* leave timer "as is" if haven't yet reached a "normal" rate */
+	}
+} 
+
+/**
+ * disc_timeout - send a periodic link setup request
+ * @req: ptr to link request structure
+ * 
+ * Called whenever a link setup request timer associated with a bearer expires.
+ */
+
+static void disc_timeout(struct link_req *req) 
+{
+	struct tipc_msg *msg = buf_msg(req->buf);
+
+	spin_lock_bh(&req->bearer->publ.lock);
+
+#if 0
+	/* CURRENTLY DON'T SUPPORT INTER-ZONE LINKS */
+	u32 dest_domain = msg_dest_domain(msg);
+	int stop = 0;
+	if (!in_scope(dest_domain, tipc_own_addr)) {
+		struct _zone *z_ptr = zone_find(dest_domain);
+
+		if (z_ptr && (z_ptr->links >= msg_req_links(msg)))
+			stop = 1;
+		if (req->timer_intv >= 32000)
+			stop = 1;
+	}
+	if (stop) {
+		k_cancel_timer(&req->timer);
+		buf_discard(req->buf);
+		kfree(req);
+		spin_unlock_bh(&req->bearer->publ.lock);
+		return;
+	}
+#endif
+
+	msg_dbg(msg,"SEND:");
+	req->bearer->media->send_msg(req->buf, &req->bearer->publ, &req->dest);
+
+	if ((req->timer_intv == TIPC_LINK_REQ_SLOW) ||
+	    (req->timer_intv == TIPC_LINK_REQ_FAST)) {
+		/* leave timer interval "as is" if already at a "normal" rate */
+	} else {
+		req->timer_intv *= 2;
+		if (req->timer_intv > TIPC_LINK_REQ_FAST)
+			req->timer_intv = TIPC_LINK_REQ_FAST;
+		if ((req->timer_intv == TIPC_LINK_REQ_FAST) && 
+		    (req->bearer->nodes.count))
+			req->timer_intv = TIPC_LINK_REQ_SLOW;
+	}
+	k_start_timer(&req->timer, req->timer_intv);
+
+	spin_unlock_bh(&req->bearer->publ.lock);
+}
+
+/**
+ * disc_init_link_req - start sending periodic link setup requests
+ * @b_ptr: ptr to bearer issuing requests
+ * @dest: destination address for request messages
+ * @dest_domain: network domain of node(s) which should respond to message
+ * @req_links: max number of desired links
+ * 
+ * Returns pointer to link request structure, or NULL if unable to create.
+ */
+
+struct link_req *disc_init_link_req(struct bearer *b_ptr, 
+				    const struct tipc_media_addr *dest,
+				    u32 dest_domain,
+				    u32 req_links) 
+{
+	struct link_req *req;
+
+	req = (struct link_req *)kmalloc(sizeof(*req), GFP_ATOMIC);
+	if (!req)
+		return NULL;
+
+	req->buf = disc_init_msg(DSC_REQ_MSG, req_links, dest_domain, b_ptr);
+	if (!req->buf) {
+		kfree(req);
+		return NULL;
+	}
+
+	memcpy(&req->dest, dest, sizeof(*dest));
+	req->bearer = b_ptr;
+	req->timer_intv = TIPC_LINK_REQ_INIT;
+	k_init_timer(&req->timer, (Handler)disc_timeout, (unsigned long)req);
+	k_start_timer(&req->timer, req->timer_intv);
+	return req;
+} 
+
diff --git a/net/tipc/discover.h b/net/tipc/discover.h
new file mode 100644
index 000000000000..90c1de905e78
--- /dev/null
+++ b/net/tipc/discover.h
@@ -0,0 +1,55 @@
+/*
+ * net/tipc/discover.h
+ *
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_DISCOVER_H
+#define _TIPC_DISCOVER_H
+
+#include <linux/tipc.h>
+
+struct link_req;
+
+struct link_req *disc_init_link_req(struct bearer *b_ptr, 
+			            const struct tipc_media_addr *dest,
+			            u32 dest_domain,
+			            u32 req_links);
+void disc_update_link_req(struct link_req *req);
+void disc_stop_link_req(struct link_req *req);
+
+void disc_recv_msg(struct sk_buff *buf);
+
+void disc_link_event(u32 addr, char *name, int up);
+#if 0
+int  disc_create_link(const struct tipc_link_create *argv);
+#endif
+
+#endif
diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c
new file mode 100644
index 000000000000..b634d7a5640e
--- /dev/null
+++ b/net/tipc/eth_media.c
@@ -0,0 +1,296 @@
+/*
+ * net/tipc/eth_media.c: Ethernet bearer support for TIPC
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <net/tipc/tipc.h>
+#include <net/tipc/tipc_bearer.h>
+#include <net/tipc/tipc_msg.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+
+#define MAX_ETH_BEARERS		2
+#define TIPC_PROTOCOL		0x88ca
+#define ETH_LINK_PRIORITY	10
+#define ETH_LINK_TOLERANCE	TIPC_DEF_LINK_TOL
+
+
+/**
+ * struct eth_bearer - Ethernet bearer data structure
+ * @bearer: ptr to associated "generic" bearer structure
+ * @dev: ptr to associated Ethernet network device
+ * @tipc_packet_type: used in binding TIPC to Ethernet driver
+ */
+ 
+struct eth_bearer {
+	struct tipc_bearer *bearer;
+	struct net_device *dev;
+	struct packet_type tipc_packet_type;
+};
+
+static struct eth_bearer eth_bearers[MAX_ETH_BEARERS];
+static int eth_started = 0;
+static struct notifier_block notifier;
+
+/**
+ * send_msg - send a TIPC message out over an Ethernet interface 
+ */
+
+static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr, 
+		    struct tipc_media_addr *dest)
+{
+	struct sk_buff *clone;
+	struct net_device *dev;
+
+	clone = skb_clone(buf, GFP_ATOMIC);
+	if (clone) {
+		clone->nh.raw = clone->data;
+		dev = ((struct eth_bearer *)(tb_ptr->usr_handle))->dev;
+		clone->dev = dev;
+		dev->hard_header(clone, dev, TIPC_PROTOCOL, 
+				 &dest->dev_addr.eth_addr,
+				 dev->dev_addr, clone->len);
+		dev_queue_xmit(clone);
+	}
+	return TIPC_OK;
+}
+
+/**
+ * recv_msg - handle incoming TIPC message from an Ethernet interface
+ * 
+ * Routine truncates any Ethernet padding/CRC appended to the message,
+ * and ensures message size matches actual length
+ */
+
+static int recv_msg(struct sk_buff *buf, struct net_device *dev, 
+		    struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct eth_bearer *eb_ptr = (struct eth_bearer *)pt->af_packet_priv;
+	u32 size;
+
+	if (likely(eb_ptr->bearer)) {
+		size = msg_size((struct tipc_msg *)buf->data);
+		skb_trim(buf, size);
+		if (likely(buf->len == size)) {
+			buf->next = NULL;
+			tipc_recv_msg(buf, eb_ptr->bearer);
+		} else {
+			kfree_skb(buf);
+		}
+	} else {
+		kfree_skb(buf);
+	}
+	return TIPC_OK;
+}
+
+/**
+ * enable_bearer - attach TIPC bearer to an Ethernet interface 
+ */
+
+static int enable_bearer(struct tipc_bearer *tb_ptr)
+{
+	struct net_device *dev = dev_base;
+	struct eth_bearer *eb_ptr = &eth_bearers[0];
+	struct eth_bearer *stop = &eth_bearers[MAX_ETH_BEARERS];
+	char *driver_name = strchr((const char *)tb_ptr->name, ':') + 1;
+
+	/* Find device with specified name */
+
+	while (dev && dev->name &&
+	       (memcmp(dev->name, driver_name, strlen(dev->name)))) {
+		dev = dev->next;
+	}
+	if (!dev)
+		return -ENODEV;
+
+	/* Find Ethernet bearer for device (or create one) */
+
+	for (;(eb_ptr != stop) && eb_ptr->dev && (eb_ptr->dev != dev); eb_ptr++);
+	if (eb_ptr == stop)
+		return -EDQUOT;
+	if (!eb_ptr->dev) {
+		eb_ptr->dev = dev;
+		eb_ptr->tipc_packet_type.type = __constant_htons(TIPC_PROTOCOL);
+		eb_ptr->tipc_packet_type.dev = dev;
+		eb_ptr->tipc_packet_type.func = recv_msg;
+		eb_ptr->tipc_packet_type.af_packet_priv = eb_ptr;
+		INIT_LIST_HEAD(&(eb_ptr->tipc_packet_type.list));
+		dev_hold(dev);
+		dev_add_pack(&eb_ptr->tipc_packet_type);
+	}
+
+	/* Associate TIPC bearer with Ethernet bearer */
+
+	eb_ptr->bearer = tb_ptr;
+	tb_ptr->usr_handle = (void *)eb_ptr;
+	tb_ptr->mtu = dev->mtu;
+	tb_ptr->blocked = 0; 
+	tb_ptr->addr.type = htonl(TIPC_MEDIA_TYPE_ETH);
+	memcpy(&tb_ptr->addr.dev_addr, &dev->dev_addr, ETH_ALEN);
+	return 0;
+}
+
+/**
+ * disable_bearer - detach TIPC bearer from an Ethernet interface 
+ *
+ * We really should do dev_remove_pack() here, but this function can not be
+ * called at tasklet level. => Use eth_bearer->bearer as a flag to throw away
+ * incoming buffers, & postpone dev_remove_pack() to eth_media_stop() on exit.
+ */
+
+static void disable_bearer(struct tipc_bearer *tb_ptr)
+{
+	((struct eth_bearer *)tb_ptr->usr_handle)->bearer = 0;
+}
+
+/**
+ * recv_notification - handle device updates from OS
+ *
+ * Change the state of the Ethernet bearer (if any) associated with the 
+ * specified device.
+ */
+
+static int recv_notification(struct notifier_block *nb, unsigned long evt, 
+			     void *dv)
+{
+	struct net_device *dev = (struct net_device *)dv;
+	struct eth_bearer *eb_ptr = &eth_bearers[0];
+	struct eth_bearer *stop = &eth_bearers[MAX_ETH_BEARERS];
+
+	while ((eb_ptr->dev != dev)) {
+		if (++eb_ptr == stop)
+			return NOTIFY_DONE;	/* couldn't find device */
+	}
+	if (!eb_ptr->bearer)
+		return NOTIFY_DONE;		/* bearer had been disabled */
+
+        eb_ptr->bearer->mtu = dev->mtu;
+
+	switch (evt) {
+	case NETDEV_CHANGE:
+		if (netif_carrier_ok(dev))
+			tipc_continue(eb_ptr->bearer);
+		else
+			tipc_block_bearer(eb_ptr->bearer->name);
+		break;
+	case NETDEV_UP:
+		tipc_continue(eb_ptr->bearer);
+		break;
+	case NETDEV_DOWN:
+		tipc_block_bearer(eb_ptr->bearer->name);
+		break;
+	case NETDEV_CHANGEMTU:
+        case NETDEV_CHANGEADDR:
+		tipc_block_bearer(eb_ptr->bearer->name);
+                tipc_continue(eb_ptr->bearer);
+		break;
+	case NETDEV_UNREGISTER:
+        case NETDEV_CHANGENAME:
+		tipc_disable_bearer(eb_ptr->bearer->name);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+/**
+ * eth_addr2str - convert Ethernet address to string
+ */
+
+static char *eth_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size)
+{                       
+	unchar *addr = (unchar *)&a->dev_addr;
+
+	if (str_size < 18)
+		*str_buf = '\0';
+	else
+		sprintf(str_buf, "%02x:%02x:%02x:%02x:%02x:%02x",
+			addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]);
+	return str_buf;
+}
+
+/**
+ * eth_media_start - activate Ethernet bearer support
+ *
+ * Register Ethernet media type with TIPC bearer code.  Also register
+ * with OS for notifications about device state changes.
+ */
+
+int eth_media_start(void)
+{                       
+	struct tipc_media_addr bcast_addr;
+	int res;
+
+	if (eth_started)
+		return -EINVAL;
+
+	memset(&bcast_addr, 0xff, sizeof(bcast_addr));
+	memset(eth_bearers, 0, sizeof(eth_bearers));
+
+	res = tipc_register_media(TIPC_MEDIA_TYPE_ETH, "eth",
+				  enable_bearer, disable_bearer, send_msg, 
+				  eth_addr2str, &bcast_addr, ETH_LINK_PRIORITY, 
+				  ETH_LINK_TOLERANCE, TIPC_DEF_LINK_WIN);
+	if (res)
+		return res;
+
+	notifier.notifier_call = &recv_notification;
+	notifier.priority = 0;
+	res = register_netdevice_notifier(&notifier);
+	if (!res)
+		eth_started = 1;
+	return res;
+}
+
+/**
+ * eth_media_stop - deactivate Ethernet bearer support
+ */
+
+void eth_media_stop(void)
+{
+	int i;
+
+	if (!eth_started)
+		return;
+
+	unregister_netdevice_notifier(&notifier);
+	for (i = 0; i < MAX_ETH_BEARERS ; i++) {
+		if (eth_bearers[i].bearer) {
+			eth_bearers[i].bearer->blocked = 1;
+			eth_bearers[i].bearer = 0;
+		}
+		if (eth_bearers[i].dev) {
+			dev_remove_pack(&eth_bearers[i].tipc_packet_type);
+			dev_put(eth_bearers[i].dev);
+		}
+	}
+	memset(&eth_bearers, 0, sizeof(eth_bearers));
+	eth_started = 0;
+}
diff --git a/net/tipc/handler.c b/net/tipc/handler.c
new file mode 100644
index 000000000000..c8fbb2071dfc
--- /dev/null
+++ b/net/tipc/handler.c
@@ -0,0 +1,129 @@
+/*
+ * net/tipc/handler.c: TIPC signal handling
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+
+struct queue_item {
+	struct list_head next_signal;
+	void (*handler) (unsigned long);
+	unsigned long data;
+};
+
+static kmem_cache_t *tipc_queue_item_cache;
+static struct list_head signal_queue_head;
+static spinlock_t qitem_lock = SPIN_LOCK_UNLOCKED;
+static int handler_enabled = 0;
+
+static void process_signal_queue(unsigned long dummy);
+
+static DECLARE_TASKLET_DISABLED(tipc_tasklet, process_signal_queue, 0);
+
+
+unsigned int k_signal(Handler routine, unsigned long argument)
+{
+	struct queue_item *item;
+
+	if (!handler_enabled) {
+		err("Signal request ignored by handler\n");
+		return -ENOPROTOOPT;
+	}
+
+	spin_lock_bh(&qitem_lock);
+	item = kmem_cache_alloc(tipc_queue_item_cache, GFP_ATOMIC);
+	if (!item) {
+		err("Signal queue out of memory\n");
+		spin_unlock_bh(&qitem_lock);
+		return -ENOMEM;
+	}
+	item->handler = routine;
+	item->data = argument;
+	list_add_tail(&item->next_signal, &signal_queue_head);
+	spin_unlock_bh(&qitem_lock);
+	tasklet_schedule(&tipc_tasklet);
+	return 0;
+}
+
+static void process_signal_queue(unsigned long dummy)
+{
+	struct queue_item *__volatile__ item;
+	struct list_head *l, *n;
+
+	spin_lock_bh(&qitem_lock);
+	list_for_each_safe(l, n, &signal_queue_head) {
+		item = list_entry(l, struct queue_item, next_signal);
+		list_del(&item->next_signal);
+		spin_unlock_bh(&qitem_lock);
+		item->handler(item->data);
+		spin_lock_bh(&qitem_lock);
+		kmem_cache_free(tipc_queue_item_cache, item);
+	}
+	spin_unlock_bh(&qitem_lock);
+}
+
+int handler_start(void)
+{
+	tipc_queue_item_cache = 
+		kmem_cache_create("tipc_queue_items", sizeof(struct queue_item),
+				  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!tipc_queue_item_cache)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&signal_queue_head);
+	tasklet_enable(&tipc_tasklet);
+	handler_enabled = 1;
+	return 0;
+}
+
+void handler_stop(void)
+{
+	struct list_head *l, *n;
+	struct queue_item *item; 
+
+	if (!handler_enabled)
+		return;
+
+	handler_enabled = 0;
+	tasklet_disable(&tipc_tasklet);
+	tasklet_kill(&tipc_tasklet);
+
+	spin_lock_bh(&qitem_lock);
+	list_for_each_safe(l, n, &signal_queue_head) {
+		item = list_entry(l, struct queue_item, next_signal);
+		list_del(&item->next_signal);
+		kmem_cache_free(tipc_queue_item_cache, item);
+	}
+	spin_unlock_bh(&qitem_lock);
+
+	kmem_cache_destroy(tipc_queue_item_cache);
+}
+
diff --git a/net/tipc/link.c b/net/tipc/link.c
new file mode 100644
index 000000000000..92acb80bb24d
--- /dev/null
+++ b/net/tipc/link.c
@@ -0,0 +1,3164 @@
+/*
+ * net/tipc/link.c: TIPC link code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2004-2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "dbg.h"
+#include "link.h"
+#include "net.h"
+#include "node.h"
+#include "port.h"
+#include "addr.h"
+#include "node_subscr.h"
+#include "name_distr.h"
+#include "bearer.h"
+#include "name_table.h"
+#include "discover.h"
+#include "config.h"
+#include "bcast.h"
+
+
+/* 
+ * Limit for deferred reception queue: 
+ */
+
+#define DEF_QUEUE_LIMIT 256u
+
+/* 
+ * Link state events: 
+ */
+
+#define  STARTING_EVT    856384768	/* link processing trigger */
+#define  TRAFFIC_MSG_EVT 560815u	/* rx'd ??? */
+#define  TIMEOUT_EVT     560817u	/* link timer expired */
+
+/*   
+ * The following two 'message types' is really just implementation 
+ * data conveniently stored in the message header. 
+ * They must not be considered part of the protocol
+ */
+#define OPEN_MSG   0
+#define CLOSED_MSG 1
+
+/* 
+ * State value stored in 'exp_msg_count'
+ */
+
+#define START_CHANGEOVER 100000u
+
+/**
+ * struct link_name - deconstructed link name
+ * @addr_local: network address of node at this end
+ * @if_local: name of interface at this end
+ * @addr_peer: network address of node at far end
+ * @if_peer: name of interface at far end
+ */
+
+struct link_name {
+	u32 addr_local;
+	char if_local[TIPC_MAX_IF_NAME];
+	u32 addr_peer;
+	char if_peer[TIPC_MAX_IF_NAME];
+};
+
+#if 0
+
+/* LINK EVENT CODE IS NOT SUPPORTED AT PRESENT */
+
+/** 
+ * struct link_event - link up/down event notification
+ */
+
+struct link_event {
+	u32 addr;
+	int up;
+	void (*fcn)(u32, char *, int);
+	char name[TIPC_MAX_LINK_NAME];
+};
+
+#endif
+
+static void link_handle_out_of_seq_msg(struct link *l_ptr,
+				       struct sk_buff *buf);
+static void link_recv_proto_msg(struct link *l_ptr, struct sk_buff *buf);
+static int  link_recv_changeover_msg(struct link **l_ptr, struct sk_buff **buf);
+static void link_set_supervision_props(struct link *l_ptr, u32 tolerance);
+static int  link_send_sections_long(struct port *sender,
+				    struct iovec const *msg_sect,
+				    u32 num_sect, u32 destnode);
+static void link_check_defragm_bufs(struct link *l_ptr);
+static void link_state_event(struct link *l_ptr, u32 event);
+static void link_reset_statistics(struct link *l_ptr);
+static void link_print(struct link *l_ptr, struct print_buf *buf, 
+		       const char *str);
+
+/*
+ * Debugging code used by link routines only
+ *
+ * When debugging link problems on a system that has multiple links,
+ * the standard TIPC debugging routines may not be useful since they
+ * allow the output from multiple links to be intermixed.  For this reason
+ * routines of the form "dbg_link_XXX()" have been created that will capture
+ * debug info into a link's personal print buffer, which can then be dumped
+ * into the TIPC system log (LOG) upon request.
+ *
+ * To enable per-link debugging, use LINK_LOG_BUF_SIZE to specify the size
+ * of the print buffer used by each link.  If LINK_LOG_BUF_SIZE is set to 0,
+ * the dbg_link_XXX() routines simply send their output to the standard 
+ * debug print buffer (DBG_OUTPUT), if it has been defined; this can be useful
+ * when there is only a single link in the system being debugged.
+ *
+ * Notes:
+ * - When enabled, LINK_LOG_BUF_SIZE should be set to at least 1000 (bytes)
+ * - "l_ptr" must be valid when using dbg_link_XXX() macros  
+ */
+
+#define LINK_LOG_BUF_SIZE 0
+
+#define dbg_link(fmt, arg...)  do {if (LINK_LOG_BUF_SIZE) tipc_printf(&l_ptr->print_buf, fmt, ## arg); } while(0)
+#define dbg_link_msg(msg, txt) do {if (LINK_LOG_BUF_SIZE) msg_print(&l_ptr->print_buf, msg, txt); } while(0)
+#define dbg_link_state(txt) do {if (LINK_LOG_BUF_SIZE) link_print(l_ptr, &l_ptr->print_buf, txt); } while(0)
+#define dbg_link_dump() do { \
+	if (LINK_LOG_BUF_SIZE) { \
+		tipc_printf(LOG, "\n\nDumping link <%s>:\n", l_ptr->name); \
+		printbuf_move(LOG, &l_ptr->print_buf); \
+	} \
+} while (0)
+
+static inline void dbg_print_link(struct link *l_ptr, const char *str)
+{
+	if (DBG_OUTPUT)
+		link_print(l_ptr, DBG_OUTPUT, str);
+}
+
+static inline void dbg_print_buf_chain(struct sk_buff *root_buf)
+{
+	if (DBG_OUTPUT) {
+		struct sk_buff *buf = root_buf;
+
+		while (buf) {
+			msg_dbg(buf_msg(buf), "In chain: ");
+			buf = buf->next;
+		}
+	}
+}
+
+/*
+ *  Simple inlined link routines
+ */
+
+static inline unsigned int align(unsigned int i)
+{
+	return (i + 3) & ~3u;
+}
+
+static inline int link_working_working(struct link *l_ptr)
+{
+	return (l_ptr->state == WORKING_WORKING);
+}
+
+static inline int link_working_unknown(struct link *l_ptr)
+{
+	return (l_ptr->state == WORKING_UNKNOWN);
+}
+
+static inline int link_reset_unknown(struct link *l_ptr)
+{
+	return (l_ptr->state == RESET_UNKNOWN);
+}
+
+static inline int link_reset_reset(struct link *l_ptr)
+{
+	return (l_ptr->state == RESET_RESET);
+}
+
+static inline int link_blocked(struct link *l_ptr)
+{
+	return (l_ptr->exp_msg_count || l_ptr->blocked);
+}
+
+static inline int link_congested(struct link *l_ptr)
+{
+	return (l_ptr->out_queue_size >= l_ptr->queue_limit[0]);
+}
+
+static inline u32 link_max_pkt(struct link *l_ptr)
+{
+	return l_ptr->max_pkt;
+}
+
+static inline void link_init_max_pkt(struct link *l_ptr)
+{
+	u32 max_pkt;
+	
+	max_pkt = (l_ptr->b_ptr->publ.mtu & ~3);
+	if (max_pkt > MAX_MSG_SIZE)
+		max_pkt = MAX_MSG_SIZE;
+
+        l_ptr->max_pkt_target = max_pkt;
+	if (l_ptr->max_pkt_target < MAX_PKT_DEFAULT)
+		l_ptr->max_pkt = l_ptr->max_pkt_target;
+	else 
+		l_ptr->max_pkt = MAX_PKT_DEFAULT;
+
+        l_ptr->max_pkt_probes = 0;
+}
+
+static inline u32 link_next_sent(struct link *l_ptr)
+{
+	if (l_ptr->next_out)
+		return msg_seqno(buf_msg(l_ptr->next_out));
+	return mod(l_ptr->next_out_no);
+}
+
+static inline u32 link_last_sent(struct link *l_ptr)
+{
+	return mod(link_next_sent(l_ptr) - 1);
+}
+
+/*
+ *  Simple non-inlined link routines (i.e. referenced outside this file)
+ */
+
+int link_is_up(struct link *l_ptr)
+{
+	if (!l_ptr)
+		return 0;
+	return (link_working_working(l_ptr) || link_working_unknown(l_ptr));
+}
+
+int link_is_active(struct link *l_ptr)
+{
+	return ((l_ptr->owner->active_links[0] == l_ptr) ||
+		(l_ptr->owner->active_links[1] == l_ptr));
+}
+
+/**
+ * link_name_validate - validate & (optionally) deconstruct link name
+ * @name - ptr to link name string
+ * @name_parts - ptr to area for link name components (or NULL if not needed)
+ * 
+ * Returns 1 if link name is valid, otherwise 0.
+ */
+
+static int link_name_validate(const char *name, struct link_name *name_parts)
+{
+	char name_copy[TIPC_MAX_LINK_NAME];
+	char *addr_local;
+	char *if_local;
+	char *addr_peer;
+	char *if_peer;
+	char dummy;
+	u32 z_local, c_local, n_local;
+	u32 z_peer, c_peer, n_peer;
+	u32 if_local_len;
+	u32 if_peer_len;
+
+	/* copy link name & ensure length is OK */
+
+	name_copy[TIPC_MAX_LINK_NAME - 1] = 0;
+	/* need above in case non-Posix strncpy() doesn't pad with nulls */
+	strncpy(name_copy, name, TIPC_MAX_LINK_NAME);
+	if (name_copy[TIPC_MAX_LINK_NAME - 1] != 0)
+		return 0;
+
+	/* ensure all component parts of link name are present */
+
+	addr_local = name_copy;
+	if ((if_local = strchr(addr_local, ':')) == NULL)
+		return 0;
+	*(if_local++) = 0;
+	if ((addr_peer = strchr(if_local, '-')) == NULL)
+		return 0;
+	*(addr_peer++) = 0;
+	if_local_len = addr_peer - if_local;
+	if ((if_peer = strchr(addr_peer, ':')) == NULL)
+		return 0;
+	*(if_peer++) = 0;
+	if_peer_len = strlen(if_peer) + 1;
+
+	/* validate component parts of link name */
+
+	if ((sscanf(addr_local, "%u.%u.%u%c",
+		    &z_local, &c_local, &n_local, &dummy) != 3) ||
+	    (sscanf(addr_peer, "%u.%u.%u%c",
+		    &z_peer, &c_peer, &n_peer, &dummy) != 3) ||
+	    (z_local > 255) || (c_local > 4095) || (n_local > 4095) ||
+	    (z_peer  > 255) || (c_peer  > 4095) || (n_peer  > 4095) ||
+	    (if_local_len <= 1) || (if_local_len > TIPC_MAX_IF_NAME) || 
+	    (if_peer_len  <= 1) || (if_peer_len  > TIPC_MAX_IF_NAME) || 
+	    (strspn(if_local, tipc_alphabet) != (if_local_len - 1)) ||
+	    (strspn(if_peer, tipc_alphabet) != (if_peer_len - 1)))
+		return 0;
+
+	/* return link name components, if necessary */
+
+	if (name_parts) {
+		name_parts->addr_local = tipc_addr(z_local, c_local, n_local);
+		strcpy(name_parts->if_local, if_local);
+		name_parts->addr_peer = tipc_addr(z_peer, c_peer, n_peer);
+		strcpy(name_parts->if_peer, if_peer);
+	}
+	return 1;
+}
+
+/**
+ * link_timeout - handle expiration of link timer
+ * @l_ptr: pointer to link
+ * 
+ * This routine must not grab "net_lock" to avoid a potential deadlock conflict
+ * with link_delete().  (There is no risk that the node will be deleted by
+ * another thread because link_delete() always cancels the link timer before
+ * node_delete() is called.)
+ */
+
+static void link_timeout(struct link *l_ptr)
+{
+	node_lock(l_ptr->owner);
+
+	/* update counters used in statistical profiling of send traffic */
+
+	l_ptr->stats.accu_queue_sz += l_ptr->out_queue_size;
+	l_ptr->stats.queue_sz_counts++;
+
+	if (l_ptr->out_queue_size > l_ptr->stats.max_queue_sz)
+		l_ptr->stats.max_queue_sz = l_ptr->out_queue_size;
+
+	if (l_ptr->first_out) {
+		struct tipc_msg *msg = buf_msg(l_ptr->first_out);
+		u32 length = msg_size(msg);
+
+		if ((msg_user(msg) == MSG_FRAGMENTER)
+		    && (msg_type(msg) == FIRST_FRAGMENT)) {
+			length = msg_size(msg_get_wrapped(msg));
+		}
+		if (length) {
+			l_ptr->stats.msg_lengths_total += length;
+			l_ptr->stats.msg_length_counts++;
+			if (length <= 64)
+				l_ptr->stats.msg_length_profile[0]++;
+			else if (length <= 256)
+				l_ptr->stats.msg_length_profile[1]++;
+			else if (length <= 1024)
+				l_ptr->stats.msg_length_profile[2]++;
+			else if (length <= 4096)
+				l_ptr->stats.msg_length_profile[3]++;
+			else if (length <= 16384)
+				l_ptr->stats.msg_length_profile[4]++;
+			else if (length <= 32768)
+				l_ptr->stats.msg_length_profile[5]++;
+			else
+				l_ptr->stats.msg_length_profile[6]++;
+		}
+	}
+
+	/* do all other link processing performed on a periodic basis */
+
+	link_check_defragm_bufs(l_ptr);
+
+	link_state_event(l_ptr, TIMEOUT_EVT);
+
+	if (l_ptr->next_out)
+		link_push_queue(l_ptr);
+
+	node_unlock(l_ptr->owner);
+}
+
+static inline void link_set_timer(struct link *l_ptr, u32 time)
+{
+	k_start_timer(&l_ptr->timer, time);
+}
+
+/**
+ * link_create - create a new link
+ * @b_ptr: pointer to associated bearer
+ * @peer: network address of node at other end of link
+ * @media_addr: media address to use when sending messages over link
+ * 
+ * Returns pointer to link.
+ */
+
+struct link *link_create(struct bearer *b_ptr, const u32 peer,
+			 const struct tipc_media_addr *media_addr)
+{
+	struct link *l_ptr;
+	struct tipc_msg *msg;
+	char *if_name;
+
+	l_ptr = (struct link *)kmalloc(sizeof(*l_ptr), GFP_ATOMIC);
+	if (!l_ptr) {
+		warn("Memory squeeze; Failed to create link\n");
+		return NULL;
+	}
+	memset(l_ptr, 0, sizeof(*l_ptr));
+
+	l_ptr->addr = peer;
+	if_name = strchr(b_ptr->publ.name, ':') + 1;
+	sprintf(l_ptr->name, "%u.%u.%u:%s-%u.%u.%u:",
+		tipc_zone(tipc_own_addr), tipc_cluster(tipc_own_addr),
+		tipc_node(tipc_own_addr), 
+		if_name,
+		tipc_zone(peer), tipc_cluster(peer), tipc_node(peer));
+		/* note: peer i/f is appended to link name by reset/activate */
+	memcpy(&l_ptr->media_addr, media_addr, sizeof(*media_addr));
+	k_init_timer(&l_ptr->timer, (Handler)link_timeout, (unsigned long)l_ptr);
+	list_add_tail(&l_ptr->link_list, &b_ptr->links);
+	l_ptr->checkpoint = 1;
+	l_ptr->b_ptr = b_ptr;
+	link_set_supervision_props(l_ptr, b_ptr->media->tolerance);
+	l_ptr->state = RESET_UNKNOWN;
+
+	l_ptr->pmsg = (struct tipc_msg *)&l_ptr->proto_msg;
+	msg = l_ptr->pmsg;
+	msg_init(msg, LINK_PROTOCOL, RESET_MSG, TIPC_OK, INT_H_SIZE, l_ptr->addr);
+	msg_set_size(msg, sizeof(l_ptr->proto_msg));
+	msg_set_session(msg, tipc_random);
+	msg_set_bearer_id(msg, b_ptr->identity);
+	strcpy((char *)msg_data(msg), if_name);
+
+	l_ptr->priority = b_ptr->priority;
+	link_set_queue_limits(l_ptr, b_ptr->media->window);
+
+	link_init_max_pkt(l_ptr);
+
+	l_ptr->next_out_no = 1;
+	INIT_LIST_HEAD(&l_ptr->waiting_ports);
+
+	link_reset_statistics(l_ptr);
+
+	l_ptr->owner = node_attach_link(l_ptr);
+	if (!l_ptr->owner) {
+		kfree(l_ptr);
+		return NULL;
+	}
+
+	if (LINK_LOG_BUF_SIZE) {
+		char *pb = kmalloc(LINK_LOG_BUF_SIZE, GFP_ATOMIC);
+
+		if (!pb) {
+			kfree(l_ptr);
+			warn("Memory squeeze; Failed to create link\n");
+			return NULL;
+		}
+		printbuf_init(&l_ptr->print_buf, pb, LINK_LOG_BUF_SIZE);
+	}
+
+	k_signal((Handler)link_start, (unsigned long)l_ptr);
+
+	dbg("link_create(): tolerance = %u,cont intv = %u, abort_limit = %u\n",
+	    l_ptr->tolerance, l_ptr->continuity_interval, l_ptr->abort_limit);
+	
+	return l_ptr;
+}
+
+/** 
+ * link_delete - delete a link
+ * @l_ptr: pointer to link
+ * 
+ * Note: 'net_lock' is write_locked, bearer is locked.
+ * This routine must not grab the node lock until after link timer cancellation
+ * to avoid a potential deadlock situation.  
+ */
+
+void link_delete(struct link *l_ptr)
+{
+	if (!l_ptr) {
+		err("Attempt to delete non-existent link\n");
+		return;
+	}
+
+	dbg("link_delete()\n");
+
+	k_cancel_timer(&l_ptr->timer);
+	
+	node_lock(l_ptr->owner);
+	link_reset(l_ptr);
+	node_detach_link(l_ptr->owner, l_ptr);
+	link_stop(l_ptr);
+	list_del_init(&l_ptr->link_list);
+	if (LINK_LOG_BUF_SIZE)
+		kfree(l_ptr->print_buf.buf);
+	node_unlock(l_ptr->owner);
+	k_term_timer(&l_ptr->timer);
+	kfree(l_ptr);
+}
+
+void link_start(struct link *l_ptr)
+{
+	dbg("link_start %x\n", l_ptr);
+	link_state_event(l_ptr, STARTING_EVT);
+}
+
+/**
+ * link_schedule_port - schedule port for deferred sending 
+ * @l_ptr: pointer to link
+ * @origport: reference to sending port
+ * @sz: amount of data to be sent
+ * 
+ * Schedules port for renewed sending of messages after link congestion 
+ * has abated.
+ */
+
+static int link_schedule_port(struct link *l_ptr, u32 origport, u32 sz)
+{
+	struct port *p_ptr;
+
+	spin_lock_bh(&port_list_lock);
+	p_ptr = port_lock(origport);
+	if (p_ptr) {
+		if (!p_ptr->wakeup)
+			goto exit;
+		if (!list_empty(&p_ptr->wait_list))
+			goto exit;
+		p_ptr->congested_link = l_ptr;
+		p_ptr->publ.congested = 1;
+		p_ptr->waiting_pkts = 1 + ((sz - 1) / link_max_pkt(l_ptr));
+		list_add_tail(&p_ptr->wait_list, &l_ptr->waiting_ports);
+		l_ptr->stats.link_congs++;
+exit:
+		port_unlock(p_ptr);
+	}
+	spin_unlock_bh(&port_list_lock);
+	return -ELINKCONG;
+}
+
+void link_wakeup_ports(struct link *l_ptr, int all)
+{
+	struct port *p_ptr;
+	struct port *temp_p_ptr;
+	int win = l_ptr->queue_limit[0] - l_ptr->out_queue_size;
+
+	if (all)
+		win = 100000;
+	if (win <= 0)
+		return;
+	if (!spin_trylock_bh(&port_list_lock))
+		return;
+	if (link_congested(l_ptr))
+		goto exit;
+	list_for_each_entry_safe(p_ptr, temp_p_ptr, &l_ptr->waiting_ports, 
+				 wait_list) {
+		if (win <= 0)
+			break;
+		list_del_init(&p_ptr->wait_list);
+		p_ptr->congested_link = 0;
+		assert(p_ptr->wakeup);
+		spin_lock_bh(p_ptr->publ.lock);
+		p_ptr->publ.congested = 0;
+		p_ptr->wakeup(&p_ptr->publ);
+		win -= p_ptr->waiting_pkts;
+		spin_unlock_bh(p_ptr->publ.lock);
+	}
+
+exit:
+	spin_unlock_bh(&port_list_lock);
+}
+
+/** 
+ * link_release_outqueue - purge link's outbound message queue
+ * @l_ptr: pointer to link
+ */
+
+static void link_release_outqueue(struct link *l_ptr)
+{
+	struct sk_buff *buf = l_ptr->first_out;
+	struct sk_buff *next;
+
+	while (buf) {
+		next = buf->next;
+		buf_discard(buf);
+		buf = next;
+	}
+	l_ptr->first_out = NULL;
+	l_ptr->out_queue_size = 0;
+}
+
+/**
+ * link_reset_fragments - purge link's inbound message fragments queue
+ * @l_ptr: pointer to link
+ */
+
+void link_reset_fragments(struct link *l_ptr)
+{
+	struct sk_buff *buf = l_ptr->defragm_buf;
+	struct sk_buff *next;
+
+	while (buf) {
+		next = buf->next;
+		buf_discard(buf);
+		buf = next;
+	}
+	l_ptr->defragm_buf = NULL;
+}
+
+/** 
+ * link_stop - purge all inbound and outbound messages associated with link
+ * @l_ptr: pointer to link
+ */
+
+void link_stop(struct link *l_ptr)
+{
+	struct sk_buff *buf;
+	struct sk_buff *next;
+
+	buf = l_ptr->oldest_deferred_in;
+	while (buf) {
+		next = buf->next;
+		buf_discard(buf);
+		buf = next;
+	}
+
+	buf = l_ptr->first_out;
+	while (buf) {
+		next = buf->next;
+		buf_discard(buf);
+		buf = next;
+	}
+
+	link_reset_fragments(l_ptr);
+
+	buf_discard(l_ptr->proto_msg_queue);
+	l_ptr->proto_msg_queue = NULL;
+}
+
+#if 0
+
+/* LINK EVENT CODE IS NOT SUPPORTED AT PRESENT */
+
+static void link_recv_event(struct link_event *ev)
+{
+	ev->fcn(ev->addr, ev->name, ev->up);
+	kfree(ev);
+}
+
+static void link_send_event(void (*fcn)(u32 a, char *n, int up),
+			    struct link *l_ptr, int up)
+{
+	struct link_event *ev;
+	
+	ev = kmalloc(sizeof(*ev), GFP_ATOMIC);
+	if (!ev) {
+		warn("Link event allocation failure\n");
+		return;
+	}
+	ev->addr = l_ptr->addr;
+	ev->up = up;
+	ev->fcn = fcn;
+	memcpy(ev->name, l_ptr->name, TIPC_MAX_LINK_NAME);
+	k_signal((Handler)link_recv_event, (unsigned long)ev);
+}
+
+#else
+
+#define link_send_event(fcn, l_ptr, up) do { } while (0)
+
+#endif
+
+void link_reset(struct link *l_ptr)
+{
+	struct sk_buff *buf;
+	u32 prev_state = l_ptr->state;
+	u32 checkpoint = l_ptr->next_in_no;
+	
+	msg_set_session(l_ptr->pmsg, msg_session(l_ptr->pmsg) + 1);
+
+        /* Link is down, accept any session: */
+	l_ptr->peer_session = 0;
+
+        /* Prepare for max packet size negotiation */
+	link_init_max_pkt(l_ptr);
+	
+	l_ptr->state = RESET_UNKNOWN;
+	dbg_link_state("Resetting Link\n");
+
+	if ((prev_state == RESET_UNKNOWN) || (prev_state == RESET_RESET))
+		return;
+
+	node_link_down(l_ptr->owner, l_ptr);
+	bearer_remove_dest(l_ptr->b_ptr, l_ptr->addr);
+#if 0
+	tipc_printf(CONS, "\nReset link <%s>\n", l_ptr->name);
+	dbg_link_dump();
+#endif
+	if (node_has_active_links(l_ptr->owner) &&
+	    l_ptr->owner->permit_changeover) {
+		l_ptr->reset_checkpoint = checkpoint;
+		l_ptr->exp_msg_count = START_CHANGEOVER;
+	}
+
+	/* Clean up all queues: */
+
+	link_release_outqueue(l_ptr);
+	buf_discard(l_ptr->proto_msg_queue);
+	l_ptr->proto_msg_queue = NULL;
+	buf = l_ptr->oldest_deferred_in;
+	while (buf) {
+		struct sk_buff *next = buf->next;
+		buf_discard(buf);
+		buf = next;
+	}
+	if (!list_empty(&l_ptr->waiting_ports))
+		link_wakeup_ports(l_ptr, 1);
+
+	l_ptr->retransm_queue_head = 0;
+	l_ptr->retransm_queue_size = 0;
+	l_ptr->last_out = NULL;
+	l_ptr->first_out = NULL;
+	l_ptr->next_out = NULL;
+	l_ptr->unacked_window = 0;
+	l_ptr->checkpoint = 1;
+	l_ptr->next_out_no = 1;
+	l_ptr->deferred_inqueue_sz = 0;
+	l_ptr->oldest_deferred_in = NULL;
+	l_ptr->newest_deferred_in = NULL;
+	l_ptr->fsm_msg_cnt = 0;
+	l_ptr->stale_count = 0;
+	link_reset_statistics(l_ptr);
+
+	link_send_event(cfg_link_event, l_ptr, 0);
+	if (!in_own_cluster(l_ptr->addr))
+		link_send_event(disc_link_event, l_ptr, 0);
+}
+
+
+static void link_activate(struct link *l_ptr)
+{
+	l_ptr->next_in_no = 1;
+	node_link_up(l_ptr->owner, l_ptr);
+	bearer_add_dest(l_ptr->b_ptr, l_ptr->addr);
+	link_send_event(cfg_link_event, l_ptr, 1);
+	if (!in_own_cluster(l_ptr->addr))
+		link_send_event(disc_link_event, l_ptr, 1);
+}
+
+/**
+ * link_state_event - link finite state machine
+ * @l_ptr: pointer to link
+ * @event: state machine event to process
+ */
+
+static void link_state_event(struct link *l_ptr, unsigned event)
+{
+	struct link *other; 
+	u32 cont_intv = l_ptr->continuity_interval;
+
+	if (!l_ptr->started && (event != STARTING_EVT))
+		return;		/* Not yet. */
+
+	if (link_blocked(l_ptr)) {
+		if (event == TIMEOUT_EVT) {
+			link_set_timer(l_ptr, cont_intv);
+		}
+		return;	  /* Changeover going on */
+	}
+	dbg_link("STATE_EV: <%s> ", l_ptr->name);
+
+	switch (l_ptr->state) {
+	case WORKING_WORKING:
+		dbg_link("WW/");
+		switch (event) {
+		case TRAFFIC_MSG_EVT:
+			dbg_link("TRF-");
+			/* fall through */
+		case ACTIVATE_MSG:
+			dbg_link("ACT\n");
+			break;
+		case TIMEOUT_EVT:
+			dbg_link("TIM ");
+			if (l_ptr->next_in_no != l_ptr->checkpoint) {
+				l_ptr->checkpoint = l_ptr->next_in_no;
+				if (bclink_acks_missing(l_ptr->owner)) {
+					link_send_proto_msg(l_ptr, STATE_MSG, 
+							    0, 0, 0, 0, 0);
+					l_ptr->fsm_msg_cnt++;
+				} else if (l_ptr->max_pkt < l_ptr->max_pkt_target) {
+					link_send_proto_msg(l_ptr, STATE_MSG, 
+							    1, 0, 0, 0, 0);
+					l_ptr->fsm_msg_cnt++;
+				}
+				link_set_timer(l_ptr, cont_intv);
+				break;
+			}
+			dbg_link(" -> WU\n");
+			l_ptr->state = WORKING_UNKNOWN;
+			l_ptr->fsm_msg_cnt = 0;
+			link_send_proto_msg(l_ptr, STATE_MSG, 1, 0, 0, 0, 0);
+			l_ptr->fsm_msg_cnt++;
+			link_set_timer(l_ptr, cont_intv / 4);
+			break;
+		case RESET_MSG:
+			dbg_link("RES -> RR\n");
+			link_reset(l_ptr);
+			l_ptr->state = RESET_RESET;
+			l_ptr->fsm_msg_cnt = 0;
+			link_send_proto_msg(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0, 0);
+			l_ptr->fsm_msg_cnt++;
+			link_set_timer(l_ptr, cont_intv);
+			break;
+		default:
+			err("Unknown link event %u in WW state\n", event);
+		}
+		break;
+	case WORKING_UNKNOWN:
+		dbg_link("WU/");
+		switch (event) {
+		case TRAFFIC_MSG_EVT:
+			dbg_link("TRF-");
+		case ACTIVATE_MSG:
+			dbg_link("ACT -> WW\n");
+			l_ptr->state = WORKING_WORKING;
+			l_ptr->fsm_msg_cnt = 0;
+			link_set_timer(l_ptr, cont_intv);
+			break;
+		case RESET_MSG:
+			dbg_link("RES -> RR\n");
+			link_reset(l_ptr);
+			l_ptr->state = RESET_RESET;
+			l_ptr->fsm_msg_cnt = 0;
+			link_send_proto_msg(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0, 0);
+			l_ptr->fsm_msg_cnt++;
+			link_set_timer(l_ptr, cont_intv);
+			break;
+		case TIMEOUT_EVT:
+			dbg_link("TIM ");
+			if (l_ptr->next_in_no != l_ptr->checkpoint) {
+				dbg_link("-> WW \n");
+				l_ptr->state = WORKING_WORKING;
+				l_ptr->fsm_msg_cnt = 0;
+				l_ptr->checkpoint = l_ptr->next_in_no;
+				if (bclink_acks_missing(l_ptr->owner)) {
+					link_send_proto_msg(l_ptr, STATE_MSG,
+							    0, 0, 0, 0, 0);
+					l_ptr->fsm_msg_cnt++;
+				}
+				link_set_timer(l_ptr, cont_intv);
+			} else if (l_ptr->fsm_msg_cnt < l_ptr->abort_limit) {
+				dbg_link("Probing %u/%u,timer = %u ms)\n",
+					 l_ptr->fsm_msg_cnt, l_ptr->abort_limit,
+					 cont_intv / 4);
+				link_send_proto_msg(l_ptr, STATE_MSG, 
+						    1, 0, 0, 0, 0);
+				l_ptr->fsm_msg_cnt++;
+				link_set_timer(l_ptr, cont_intv / 4);
+			} else {	/* Link has failed */
+				dbg_link("-> RU (%u probes unanswered)\n",
+					 l_ptr->fsm_msg_cnt);
+				link_reset(l_ptr);
+				l_ptr->state = RESET_UNKNOWN;
+				l_ptr->fsm_msg_cnt = 0;
+				link_send_proto_msg(l_ptr, RESET_MSG,
+						    0, 0, 0, 0, 0);
+				l_ptr->fsm_msg_cnt++;
+				link_set_timer(l_ptr, cont_intv);
+			}
+			break;
+		default:
+			err("Unknown link event %u in WU state\n", event);
+		}
+		break;
+	case RESET_UNKNOWN:
+		dbg_link("RU/");
+		switch (event) {
+		case TRAFFIC_MSG_EVT:
+			dbg_link("TRF-\n");
+			break;
+		case ACTIVATE_MSG:
+			other = l_ptr->owner->active_links[0];
+			if (other && link_working_unknown(other)) {
+				dbg_link("ACT\n");
+				break;
+			}
+			dbg_link("ACT -> WW\n");
+			l_ptr->state = WORKING_WORKING;
+			l_ptr->fsm_msg_cnt = 0;
+			link_activate(l_ptr);
+			link_send_proto_msg(l_ptr, STATE_MSG, 1, 0, 0, 0, 0);
+			l_ptr->fsm_msg_cnt++;
+			link_set_timer(l_ptr, cont_intv);
+			break;
+		case RESET_MSG:
+			dbg_link("RES \n");
+			dbg_link(" -> RR\n");
+			l_ptr->state = RESET_RESET;
+			l_ptr->fsm_msg_cnt = 0;
+			link_send_proto_msg(l_ptr, ACTIVATE_MSG, 1, 0, 0, 0, 0);
+			l_ptr->fsm_msg_cnt++;
+			link_set_timer(l_ptr, cont_intv);
+			break;
+		case STARTING_EVT:
+			dbg_link("START-");
+			l_ptr->started = 1;
+			/* fall through */
+		case TIMEOUT_EVT:
+			dbg_link("TIM \n");
+			link_send_proto_msg(l_ptr, RESET_MSG, 0, 0, 0, 0, 0);
+			l_ptr->fsm_msg_cnt++;
+			link_set_timer(l_ptr, cont_intv);
+			break;
+		default:
+			err("Unknown link event %u in RU state\n", event);
+		}
+		break;
+	case RESET_RESET:
+		dbg_link("RR/ ");
+		switch (event) {
+		case TRAFFIC_MSG_EVT:
+			dbg_link("TRF-");
+			/* fall through */
+		case ACTIVATE_MSG:
+			other = l_ptr->owner->active_links[0];
+			if (other && link_working_unknown(other)) {
+				dbg_link("ACT\n");
+				break;
+			}
+			dbg_link("ACT -> WW\n");
+			l_ptr->state = WORKING_WORKING;
+			l_ptr->fsm_msg_cnt = 0;
+			link_activate(l_ptr);
+			link_send_proto_msg(l_ptr, STATE_MSG, 1, 0, 0, 0, 0);
+			l_ptr->fsm_msg_cnt++;
+			link_set_timer(l_ptr, cont_intv);
+			break;
+		case RESET_MSG:
+			dbg_link("RES\n");
+			break;
+		case TIMEOUT_EVT:
+			dbg_link("TIM\n");
+			link_send_proto_msg(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0, 0);
+			l_ptr->fsm_msg_cnt++;
+			link_set_timer(l_ptr, cont_intv);
+			dbg_link("fsm_msg_cnt %u\n", l_ptr->fsm_msg_cnt);
+			break;
+		default:
+			err("Unknown link event %u in RR state\n", event);
+		}
+		break;
+	default:
+		err("Unknown link state %u/%u\n", l_ptr->state, event);
+	}
+}
+
+/*
+ * link_bundle_buf(): Append contents of a buffer to
+ * the tail of an existing one. 
+ */
+
+static int link_bundle_buf(struct link *l_ptr,
+			   struct sk_buff *bundler, 
+			   struct sk_buff *buf)
+{
+	struct tipc_msg *bundler_msg = buf_msg(bundler);
+	struct tipc_msg *msg = buf_msg(buf);
+	u32 size = msg_size(msg);
+	u32 to_pos = align(msg_size(bundler_msg));
+	u32 rest = link_max_pkt(l_ptr) - to_pos;
+
+	if (msg_user(bundler_msg) != MSG_BUNDLER)
+		return 0;
+	if (msg_type(bundler_msg) != OPEN_MSG)
+		return 0;
+	if (rest < align(size))
+		return 0;
+
+	skb_put(bundler, (to_pos - msg_size(bundler_msg)) + size);
+	memcpy(bundler->data + to_pos, buf->data, size);
+	msg_set_size(bundler_msg, to_pos + size);
+	msg_set_msgcnt(bundler_msg, msg_msgcnt(bundler_msg) + 1);
+	dbg("Packed msg # %u(%u octets) into pos %u in buf(#%u)\n",
+	    msg_msgcnt(bundler_msg), size, to_pos, msg_seqno(bundler_msg));
+	msg_dbg(msg, "PACKD:");
+	buf_discard(buf);
+	l_ptr->stats.sent_bundled++;
+	return 1;
+}
+
+static inline void link_add_to_outqueue(struct link *l_ptr, 
+					struct sk_buff *buf, 
+					struct tipc_msg *msg)
+{
+	u32 ack = mod(l_ptr->next_in_no - 1);
+	u32 seqno = mod(l_ptr->next_out_no++);
+
+	msg_set_word(msg, 2, ((ack << 16) | seqno));
+	msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in);
+	buf->next = NULL;
+	if (l_ptr->first_out) {
+		l_ptr->last_out->next = buf;
+		l_ptr->last_out = buf;
+	} else
+		l_ptr->first_out = l_ptr->last_out = buf;
+	l_ptr->out_queue_size++;
+}
+
+/* 
+ * link_send_buf() is the 'full path' for messages, called from 
+ * inside TIPC when the 'fast path' in tipc_send_buf
+ * has failed, and from link_send()
+ */
+
+int link_send_buf(struct link *l_ptr, struct sk_buff *buf)
+{
+	struct tipc_msg *msg = buf_msg(buf);
+	u32 size = msg_size(msg);
+	u32 dsz = msg_data_sz(msg);
+	u32 queue_size = l_ptr->out_queue_size;
+	u32 imp = msg_tot_importance(msg);
+	u32 queue_limit = l_ptr->queue_limit[imp];
+	u32 max_packet = link_max_pkt(l_ptr);
+
+	msg_set_prevnode(msg, tipc_own_addr);	/* If routed message */
+
+	/* Match msg importance against queue limits: */
+
+	if (unlikely(queue_size >= queue_limit)) {
+		if (imp <= TIPC_CRITICAL_IMPORTANCE) {
+			return link_schedule_port(l_ptr, msg_origport(msg),
+						  size);
+		}
+		msg_dbg(msg, "TIPC: Congestion, throwing away\n");
+		buf_discard(buf);
+		if (imp > CONN_MANAGER) {
+			warn("Resetting <%s>, send queue full", l_ptr->name);
+			link_reset(l_ptr);
+		}
+		return dsz;
+	}
+
+	/* Fragmentation needed ? */
+
+	if (size > max_packet)
+		return link_send_long_buf(l_ptr, buf);
+
+	/* Packet can be queued or sent: */
+
+	if (queue_size > l_ptr->stats.max_queue_sz)
+		l_ptr->stats.max_queue_sz = queue_size;
+
+	if (likely(!bearer_congested(l_ptr->b_ptr, l_ptr) && 
+		   !link_congested(l_ptr))) {
+		link_add_to_outqueue(l_ptr, buf, msg);
+
+		if (likely(bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr))) {
+			l_ptr->unacked_window = 0;
+		} else {
+			bearer_schedule(l_ptr->b_ptr, l_ptr);
+			l_ptr->stats.bearer_congs++;
+			l_ptr->next_out = buf;
+		}
+		return dsz;
+	}
+	/* Congestion: can message be bundled ?: */
+
+	if ((msg_user(msg) != CHANGEOVER_PROTOCOL) &&
+	    (msg_user(msg) != MSG_FRAGMENTER)) {
+
+		/* Try adding message to an existing bundle */
+
+		if (l_ptr->next_out && 
+		    link_bundle_buf(l_ptr, l_ptr->last_out, buf)) {
+			bearer_resolve_congestion(l_ptr->b_ptr, l_ptr);
+			return dsz;
+		}
+
+		/* Try creating a new bundle */
+
+		if (size <= max_packet * 2 / 3) {
+			struct sk_buff *bundler = buf_acquire(max_packet);
+			struct tipc_msg bundler_hdr;
+
+			if (bundler) {
+				msg_init(&bundler_hdr, MSG_BUNDLER, OPEN_MSG,
+					 TIPC_OK, INT_H_SIZE, l_ptr->addr);
+				memcpy(bundler->data, (unchar *)&bundler_hdr, 
+				       INT_H_SIZE);
+				skb_trim(bundler, INT_H_SIZE);
+				link_bundle_buf(l_ptr, bundler, buf);
+				buf = bundler;
+				msg = buf_msg(buf);
+				l_ptr->stats.sent_bundles++;
+			}
+		}
+	}
+	if (!l_ptr->next_out)
+		l_ptr->next_out = buf;
+	link_add_to_outqueue(l_ptr, buf, msg);
+	bearer_resolve_congestion(l_ptr->b_ptr, l_ptr);
+	return dsz;
+}
+
+/* 
+ * link_send(): same as link_send_buf(), but the link to use has 
+ * not been selected yet, and the the owner node is not locked
+ * Called by TIPC internal users, e.g. the name distributor
+ */
+
+int link_send(struct sk_buff *buf, u32 dest, u32 selector)
+{
+	struct link *l_ptr;
+	struct node *n_ptr;
+	int res = -ELINKCONG;
+
+	read_lock_bh(&net_lock);
+	n_ptr = node_select(dest, selector);
+	if (n_ptr) {
+		node_lock(n_ptr);
+		l_ptr = n_ptr->active_links[selector & 1];
+		dbg("link_send: found link %x for dest %x\n", l_ptr, dest);
+		if (l_ptr) {
+			res = link_send_buf(l_ptr, buf);
+		}
+		node_unlock(n_ptr);
+	} else {
+		dbg("Attempt to send msg to unknown node:\n");
+		msg_dbg(buf_msg(buf),">>>");
+		buf_discard(buf);
+	}
+	read_unlock_bh(&net_lock);
+	return res;
+}
+
+/* 
+ * link_send_buf_fast: Entry for data messages where the 
+ * destination link is known and the header is complete,
+ * inclusive total message length. Very time critical.
+ * Link is locked. Returns user data length.
+ */
+
+static inline int link_send_buf_fast(struct link *l_ptr, struct sk_buff *buf,
+				     u32 *used_max_pkt)
+{
+	struct tipc_msg *msg = buf_msg(buf);
+	int res = msg_data_sz(msg);
+
+	if (likely(!link_congested(l_ptr))) {
+		if (likely(msg_size(msg) <= link_max_pkt(l_ptr))) {
+			if (likely(list_empty(&l_ptr->b_ptr->cong_links))) {
+				link_add_to_outqueue(l_ptr, buf, msg);
+				if (likely(bearer_send(l_ptr->b_ptr, buf,
+						       &l_ptr->media_addr))) {
+					l_ptr->unacked_window = 0;
+					msg_dbg(msg,"SENT_FAST:");
+					return res;
+				}
+				dbg("failed sent fast...\n");
+				bearer_schedule(l_ptr->b_ptr, l_ptr);
+				l_ptr->stats.bearer_congs++;
+				l_ptr->next_out = buf;
+				return res;
+			}
+		}
+		else
+			*used_max_pkt = link_max_pkt(l_ptr);
+	}
+	return link_send_buf(l_ptr, buf);  /* All other cases */
+}
+
+/* 
+ * tipc_send_buf_fast: Entry for data messages where the 
+ * destination node is known and the header is complete,
+ * inclusive total message length.
+ * Returns user data length.
+ */
+int tipc_send_buf_fast(struct sk_buff *buf, u32 destnode)
+{
+	struct link *l_ptr;
+	struct node *n_ptr;
+	int res;
+	u32 selector = msg_origport(buf_msg(buf)) & 1;
+	u32 dummy;
+
+	if (destnode == tipc_own_addr)
+		return port_recv_msg(buf);
+
+	read_lock_bh(&net_lock);
+	n_ptr = node_select(destnode, selector);
+	if (likely(n_ptr)) {
+		node_lock(n_ptr);
+		l_ptr = n_ptr->active_links[selector];
+		dbg("send_fast: buf %x selected %x, destnode = %x\n",
+		    buf, l_ptr, destnode);
+		if (likely(l_ptr)) {
+			res = link_send_buf_fast(l_ptr, buf, &dummy);
+			node_unlock(n_ptr);
+			read_unlock_bh(&net_lock);
+			return res;
+		}
+		node_unlock(n_ptr);
+	}
+	read_unlock_bh(&net_lock);
+	res = msg_data_sz(buf_msg(buf));
+	tipc_reject_msg(buf, TIPC_ERR_NO_NODE);
+	return res;
+}
+
+
+/* 
+ * link_send_sections_fast: Entry for messages where the 
+ * destination processor is known and the header is complete,
+ * except for total message length. 
+ * Returns user data length or errno.
+ */
+int link_send_sections_fast(struct port *sender, 
+			    struct iovec const *msg_sect,
+			    const u32 num_sect, 
+			    u32 destaddr)
+{
+	struct tipc_msg *hdr = &sender->publ.phdr;
+	struct link *l_ptr;
+	struct sk_buff *buf;
+	struct node *node;
+	int res;
+	u32 selector = msg_origport(hdr) & 1;
+
+	assert(destaddr != tipc_own_addr);
+
+again:
+	/*
+	 * Try building message using port's max_pkt hint.
+	 * (Must not hold any locks while building message.)
+	 */
+
+	res = msg_build(hdr, msg_sect, num_sect, sender->max_pkt,
+			!sender->user_port, &buf);
+
+	read_lock_bh(&net_lock);
+	node = node_select(destaddr, selector);
+	if (likely(node)) {
+		node_lock(node);
+		l_ptr = node->active_links[selector];
+		if (likely(l_ptr)) {
+			if (likely(buf)) {
+				res = link_send_buf_fast(l_ptr, buf,
+							 &sender->max_pkt);
+				if (unlikely(res < 0))
+					buf_discard(buf);
+exit:
+				node_unlock(node);
+				read_unlock_bh(&net_lock);
+				return res;
+			}
+
+			/* Exit if build request was invalid */
+
+			if (unlikely(res < 0))
+				goto exit;
+
+			/* Exit if link (or bearer) is congested */
+
+			if (link_congested(l_ptr) || 
+			    !list_empty(&l_ptr->b_ptr->cong_links)) {
+				res = link_schedule_port(l_ptr,
+							 sender->publ.ref, res);
+				goto exit;
+			}
+
+			/* 
+			 * Message size exceeds max_pkt hint; update hint,
+			 * then re-try fast path or fragment the message
+			 */
+
+			sender->max_pkt = link_max_pkt(l_ptr);
+			node_unlock(node);
+			read_unlock_bh(&net_lock);
+
+
+			if ((msg_hdr_sz(hdr) + res) <= sender->max_pkt)
+				goto again;
+
+			return link_send_sections_long(sender, msg_sect,
+						       num_sect, destaddr);
+		}
+		node_unlock(node);
+	}
+	read_unlock_bh(&net_lock);
+
+	/* Couldn't find a link to the destination node */
+
+	if (buf)
+		return tipc_reject_msg(buf, TIPC_ERR_NO_NODE);
+	if (res >= 0)
+		return port_reject_sections(sender, hdr, msg_sect, num_sect,
+					    TIPC_ERR_NO_NODE);
+	return res;
+}
+
+/* 
+ * link_send_sections_long(): Entry for long messages where the 
+ * destination node is known and the header is complete,
+ * inclusive total message length. 
+ * Link and bearer congestion status have been checked to be ok,
+ * and are ignored if they change.
+ *
+ * Note that fragments do not use the full link MTU so that they won't have
+ * to undergo refragmentation if link changeover causes them to be sent
+ * over another link with an additional tunnel header added as prefix.
+ * (Refragmentation will still occur if the other link has a smaller MTU.)
+ *
+ * Returns user data length or errno.
+ */
+static int link_send_sections_long(struct port *sender,
+				   struct iovec const *msg_sect,
+				   u32 num_sect,
+				   u32 destaddr)
+{
+	struct link *l_ptr;
+	struct node *node;
+	struct tipc_msg *hdr = &sender->publ.phdr;
+	u32 dsz = msg_data_sz(hdr);
+	u32 max_pkt,fragm_sz,rest;
+	struct tipc_msg fragm_hdr;
+	struct sk_buff *buf,*buf_chain,*prev;
+	u32 fragm_crs,fragm_rest,hsz,sect_rest;
+	const unchar *sect_crs;
+	int curr_sect;
+	u32 fragm_no;
+
+again:
+	fragm_no = 1;
+	max_pkt = sender->max_pkt - INT_H_SIZE;  
+		/* leave room for tunnel header in case of link changeover */
+	fragm_sz = max_pkt - INT_H_SIZE; 
+		/* leave room for fragmentation header in each fragment */
+	rest = dsz;
+	fragm_crs = 0;
+	fragm_rest = 0;
+	sect_rest = 0;
+	sect_crs = 0;
+	curr_sect = -1;
+
+	/* Prepare reusable fragment header: */
+
+	msg_dbg(hdr, ">FRAGMENTING>");
+	msg_init(&fragm_hdr, MSG_FRAGMENTER, FIRST_FRAGMENT,
+		 TIPC_OK, INT_H_SIZE, msg_destnode(hdr));
+	msg_set_link_selector(&fragm_hdr, sender->publ.ref);
+	msg_set_size(&fragm_hdr, max_pkt);
+	msg_set_fragm_no(&fragm_hdr, 1);
+
+	/* Prepare header of first fragment: */
+
+	buf_chain = buf = buf_acquire(max_pkt);
+	if (!buf)
+		return -ENOMEM;
+	buf->next = NULL;
+	memcpy(buf->data, (unchar *)&fragm_hdr, INT_H_SIZE);
+	hsz = msg_hdr_sz(hdr);
+	memcpy(buf->data + INT_H_SIZE, (unchar *)hdr, hsz);
+	msg_dbg(buf_msg(buf), ">BUILD>");
+
+	/* Chop up message: */
+
+	fragm_crs = INT_H_SIZE + hsz;
+	fragm_rest = fragm_sz - hsz;
+
+	do {		/* For all sections */
+		u32 sz;
+
+		if (!sect_rest) {
+			sect_rest = msg_sect[++curr_sect].iov_len;
+			sect_crs = (const unchar *)msg_sect[curr_sect].iov_base;
+		}
+
+		if (sect_rest < fragm_rest)
+			sz = sect_rest;
+		else
+			sz = fragm_rest;
+
+		if (likely(!sender->user_port)) {
+			if (copy_from_user(buf->data + fragm_crs, sect_crs, sz)) {
+error:
+				for (; buf_chain; buf_chain = buf) {
+					buf = buf_chain->next;
+					buf_discard(buf_chain);
+				}
+				return -EFAULT;
+			}
+		} else
+			memcpy(buf->data + fragm_crs, sect_crs, sz);
+
+		sect_crs += sz;
+		sect_rest -= sz;
+		fragm_crs += sz;
+		fragm_rest -= sz;
+		rest -= sz;
+
+		if (!fragm_rest && rest) {
+
+			/* Initiate new fragment: */
+			if (rest <= fragm_sz) {
+				fragm_sz = rest;
+				msg_set_type(&fragm_hdr,LAST_FRAGMENT);
+			} else {
+				msg_set_type(&fragm_hdr, FRAGMENT);
+			}
+			msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE);
+			msg_set_fragm_no(&fragm_hdr, ++fragm_no);
+			prev = buf;
+			buf = buf_acquire(fragm_sz + INT_H_SIZE);
+			if (!buf)
+				goto error;
+
+			buf->next = NULL;                                
+			prev->next = buf;
+			memcpy(buf->data, (unchar *)&fragm_hdr, INT_H_SIZE);
+			fragm_crs = INT_H_SIZE;
+			fragm_rest = fragm_sz;
+			msg_dbg(buf_msg(buf),"  >BUILD>");
+		}
+	}
+	while (rest > 0);
+
+	/* 
+	 * Now we have a buffer chain. Select a link and check
+	 * that packet size is still OK
+	 */
+	node = node_select(destaddr, sender->publ.ref & 1);
+	if (likely(node)) {
+		node_lock(node);
+		l_ptr = node->active_links[sender->publ.ref & 1];
+		if (!l_ptr) {
+			node_unlock(node);
+			goto reject;
+		}
+		if (link_max_pkt(l_ptr) < max_pkt) {
+			sender->max_pkt = link_max_pkt(l_ptr);
+			node_unlock(node);
+			for (; buf_chain; buf_chain = buf) {
+				buf = buf_chain->next;
+				buf_discard(buf_chain);
+			}
+			goto again;
+		}
+	} else {
+reject:
+		for (; buf_chain; buf_chain = buf) {
+			buf = buf_chain->next;
+			buf_discard(buf_chain);
+		}
+		return port_reject_sections(sender, hdr, msg_sect, num_sect,
+					    TIPC_ERR_NO_NODE);
+	}
+
+	/* Append whole chain to send queue: */
+
+	buf = buf_chain;
+	l_ptr->long_msg_seq_no = mod(l_ptr->long_msg_seq_no + 1);
+	if (!l_ptr->next_out)
+		l_ptr->next_out = buf_chain;
+	l_ptr->stats.sent_fragmented++;
+	while (buf) {
+		struct sk_buff *next = buf->next;
+		struct tipc_msg *msg = buf_msg(buf);
+
+		l_ptr->stats.sent_fragments++;
+		msg_set_long_msgno(msg, l_ptr->long_msg_seq_no);
+		link_add_to_outqueue(l_ptr, buf, msg);
+		msg_dbg(msg, ">ADD>");
+		buf = next;
+	}
+
+	/* Send it, if possible: */
+
+	link_push_queue(l_ptr);
+	node_unlock(node);
+	return dsz;
+}
+
+/* 
+ * link_push_packet: Push one unsent packet to the media
+ */
+u32 link_push_packet(struct link *l_ptr)
+{
+	struct sk_buff *buf = l_ptr->first_out;
+	u32 r_q_size = l_ptr->retransm_queue_size;
+	u32 r_q_head = l_ptr->retransm_queue_head;
+
+	/* Step to position where retransmission failed, if any,    */
+	/* consider that buffers may have been released in meantime */
+
+	if (r_q_size && buf) {
+		u32 last = lesser(mod(r_q_head + r_q_size), 
+				  link_last_sent(l_ptr));
+		u32 first = msg_seqno(buf_msg(buf));
+
+		while (buf && less(first, r_q_head)) {
+			first = mod(first + 1);
+			buf = buf->next;
+		}
+		l_ptr->retransm_queue_head = r_q_head = first;
+		l_ptr->retransm_queue_size = r_q_size = mod(last - first);
+	}
+
+	/* Continue retransmission now, if there is anything: */
+
+	if (r_q_size && buf && !skb_cloned(buf)) {
+		msg_set_ack(buf_msg(buf), mod(l_ptr->next_in_no - 1));
+		msg_set_bcast_ack(buf_msg(buf), l_ptr->owner->bclink.last_in); 
+		if (bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) {
+			msg_dbg(buf_msg(buf), ">DEF-RETR>");
+			l_ptr->retransm_queue_head = mod(++r_q_head);
+			l_ptr->retransm_queue_size = --r_q_size;
+			l_ptr->stats.retransmitted++;
+			return TIPC_OK;
+		} else {
+			l_ptr->stats.bearer_congs++;
+			msg_dbg(buf_msg(buf), "|>DEF-RETR>");
+			return PUSH_FAILED;
+		}
+	}
+
+	/* Send deferred protocol message, if any: */
+
+	buf = l_ptr->proto_msg_queue;
+	if (buf) {
+		msg_set_ack(buf_msg(buf), mod(l_ptr->next_in_no - 1));
+		msg_set_bcast_ack(buf_msg(buf),l_ptr->owner->bclink.last_in); 
+		if (bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) {
+			msg_dbg(buf_msg(buf), ">DEF-PROT>");
+			l_ptr->unacked_window = 0;
+			buf_discard(buf);
+			l_ptr->proto_msg_queue = 0;
+			return TIPC_OK;
+		} else {
+			msg_dbg(buf_msg(buf), "|>DEF-PROT>");
+			l_ptr->stats.bearer_congs++;
+			return PUSH_FAILED;
+		}
+	}
+
+	/* Send one deferred data message, if send window not full: */
+
+	buf = l_ptr->next_out;
+	if (buf) {
+		struct tipc_msg *msg = buf_msg(buf);
+		u32 next = msg_seqno(msg);
+		u32 first = msg_seqno(buf_msg(l_ptr->first_out));
+
+		if (mod(next - first) < l_ptr->queue_limit[0]) {
+			msg_set_ack(msg, mod(l_ptr->next_in_no - 1));
+			msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); 
+			if (bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) {
+				if (msg_user(msg) == MSG_BUNDLER)
+					msg_set_type(msg, CLOSED_MSG);
+				msg_dbg(msg, ">PUSH-DATA>");
+				l_ptr->next_out = buf->next;
+				return TIPC_OK;
+			} else {
+				msg_dbg(msg, "|PUSH-DATA|");
+				l_ptr->stats.bearer_congs++;
+				return PUSH_FAILED;
+			}
+		}
+	}
+	return PUSH_FINISHED;
+}
+
+/*
+ * push_queue(): push out the unsent messages of a link where
+ *               congestion has abated. Node is locked
+ */
+void link_push_queue(struct link *l_ptr)
+{
+	u32 res;
+
+	if (bearer_congested(l_ptr->b_ptr, l_ptr))
+		return;
+
+	do {
+		res = link_push_packet(l_ptr);
+	}
+	while (res == TIPC_OK);
+	if (res == PUSH_FAILED)
+		bearer_schedule(l_ptr->b_ptr, l_ptr);
+}
+
+void link_retransmit(struct link *l_ptr, struct sk_buff *buf, 
+		     u32 retransmits)
+{
+	struct tipc_msg *msg;
+
+	dbg("Retransmitting %u in link %x\n", retransmits, l_ptr);
+
+	if (bearer_congested(l_ptr->b_ptr, l_ptr) && buf && !skb_cloned(buf)) {
+		msg_dbg(buf_msg(buf), ">NO_RETR->BCONG>");
+		dbg_print_link(l_ptr, "   ");
+		l_ptr->retransm_queue_head = msg_seqno(buf_msg(buf));
+		l_ptr->retransm_queue_size = retransmits;
+		return;
+	}
+	while (retransmits && (buf != l_ptr->next_out) && buf && !skb_cloned(buf)) {
+		msg = buf_msg(buf);
+		msg_set_ack(msg, mod(l_ptr->next_in_no - 1));
+		msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); 
+		if (bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) {
+                        /* Catch if retransmissions fail repeatedly: */
+                        if (l_ptr->last_retransmitted == msg_seqno(msg)) {
+                                if (++l_ptr->stale_count > 100) {
+                                        msg_print(CONS, buf_msg(buf), ">RETR>");
+                                        info("...Retransmitted %u times\n",
+					     l_ptr->stale_count);
+                                        link_print(l_ptr, CONS, "Resetting Link\n");;
+                                        link_reset(l_ptr);
+                                        break;
+                                }
+                        } else {
+                                l_ptr->stale_count = 0;
+                        }
+                        l_ptr->last_retransmitted = msg_seqno(msg);
+
+			msg_dbg(buf_msg(buf), ">RETR>");
+			buf = buf->next;
+			retransmits--;
+			l_ptr->stats.retransmitted++;
+		} else {
+			bearer_schedule(l_ptr->b_ptr, l_ptr);
+			l_ptr->stats.bearer_congs++;
+			l_ptr->retransm_queue_head = msg_seqno(buf_msg(buf));
+			l_ptr->retransm_queue_size = retransmits;
+			return;
+		}
+	}
+	l_ptr->retransm_queue_head = l_ptr->retransm_queue_size = 0;
+}
+
+/* 
+ * link_recv_non_seq: Receive packets which are outside
+ *                    the link sequence flow
+ */
+
+static void link_recv_non_seq(struct sk_buff *buf)
+{
+	struct tipc_msg *msg = buf_msg(buf);
+
+	if (msg_user(msg) ==  LINK_CONFIG)
+		disc_recv_msg(buf);
+	else
+		bclink_recv_pkt(buf);
+}
+
+/** 
+ * link_insert_deferred_queue - insert deferred messages back into receive chain
+ */
+
+static struct sk_buff *link_insert_deferred_queue(struct link *l_ptr, 
+						  struct sk_buff *buf)
+{
+	u32 seq_no;
+
+	if (l_ptr->oldest_deferred_in == NULL)
+		return buf;
+
+	seq_no = msg_seqno(buf_msg(l_ptr->oldest_deferred_in));
+	if (seq_no == mod(l_ptr->next_in_no)) {
+		l_ptr->newest_deferred_in->next = buf;
+		buf = l_ptr->oldest_deferred_in;
+		l_ptr->oldest_deferred_in = NULL;
+		l_ptr->deferred_inqueue_sz = 0;
+	}
+	return buf;
+}
+
+void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr)
+{
+	read_lock_bh(&net_lock);
+	while (head) {
+		struct bearer *b_ptr;
+		struct node *n_ptr;
+		struct link *l_ptr;
+		struct sk_buff *crs;
+		struct sk_buff *buf = head;
+		struct tipc_msg *msg = buf_msg(buf);
+		u32 seq_no = msg_seqno(msg);
+		u32 ackd = msg_ack(msg);
+		u32 released = 0;
+		int type;
+
+		b_ptr = (struct bearer *)tb_ptr;
+		TIPC_SKB_CB(buf)->handle = b_ptr;
+
+		head = head->next;
+		if (unlikely(msg_version(msg) != TIPC_VERSION))
+			goto cont;
+#if 0
+		if (msg_user(msg) != LINK_PROTOCOL)
+#endif
+			msg_dbg(msg,"<REC<");
+
+		if (unlikely(msg_non_seq(msg))) {
+			link_recv_non_seq(buf);
+			continue;
+		}
+		n_ptr = node_find(msg_prevnode(msg));
+		if (unlikely(!n_ptr))
+			goto cont;
+
+		node_lock(n_ptr);
+		l_ptr = n_ptr->links[b_ptr->identity];
+		if (unlikely(!l_ptr)) {
+			node_unlock(n_ptr);
+			goto cont;
+		}
+		/* 
+		 * Release acked messages 
+		 */
+		if (less(n_ptr->bclink.acked, msg_bcast_ack(msg))) {
+			if (node_is_up(n_ptr) && n_ptr->bclink.supported)
+				bclink_acknowledge(n_ptr, msg_bcast_ack(msg));
+		}
+
+		crs = l_ptr->first_out;
+		while ((crs != l_ptr->next_out) && 
+		       less_eq(msg_seqno(buf_msg(crs)), ackd)) {
+			struct sk_buff *next = crs->next;
+
+			buf_discard(crs);
+			crs = next;
+			released++;
+		}
+		if (released) {
+			l_ptr->first_out = crs;
+			l_ptr->out_queue_size -= released;
+		}
+		if (unlikely(l_ptr->next_out))
+			link_push_queue(l_ptr);
+		if (unlikely(!list_empty(&l_ptr->waiting_ports)))
+			link_wakeup_ports(l_ptr, 0);
+		if (unlikely(++l_ptr->unacked_window >= TIPC_MIN_LINK_WIN)) {
+			l_ptr->stats.sent_acks++;
+			link_send_proto_msg(l_ptr, STATE_MSG, 0, 0, 0, 0, 0);
+		}
+
+protocol_check:
+		if (likely(link_working_working(l_ptr))) {
+			if (likely(seq_no == mod(l_ptr->next_in_no))) {
+				l_ptr->next_in_no++;
+				if (unlikely(l_ptr->oldest_deferred_in))
+					head = link_insert_deferred_queue(l_ptr,
+									  head);
+				if (likely(msg_is_dest(msg, tipc_own_addr))) {
+deliver:
+					if (likely(msg_isdata(msg))) {
+						node_unlock(n_ptr);
+						port_recv_msg(buf);
+						continue;
+					}
+					switch (msg_user(msg)) {
+					case MSG_BUNDLER:
+						l_ptr->stats.recv_bundles++;
+						l_ptr->stats.recv_bundled += 
+							msg_msgcnt(msg);
+						node_unlock(n_ptr);
+						link_recv_bundle(buf);
+						continue;
+					case ROUTE_DISTRIBUTOR:
+						node_unlock(n_ptr);
+						cluster_recv_routing_table(buf);
+						continue;
+					case NAME_DISTRIBUTOR:
+						node_unlock(n_ptr);
+						named_recv(buf);
+						continue;
+					case CONN_MANAGER:
+						node_unlock(n_ptr);
+						port_recv_proto_msg(buf);
+						continue;
+					case MSG_FRAGMENTER:
+						l_ptr->stats.recv_fragments++;
+						if (link_recv_fragment(
+							&l_ptr->defragm_buf, 
+							&buf, &msg)) {
+							l_ptr->stats.recv_fragmented++;
+							goto deliver;
+						}
+						break;
+					case CHANGEOVER_PROTOCOL:
+						type = msg_type(msg);
+						if (link_recv_changeover_msg(
+							&l_ptr, &buf)) {
+							msg = buf_msg(buf);
+							seq_no = msg_seqno(msg);
+							TIPC_SKB_CB(buf)->handle 
+								= b_ptr;
+							if (type == ORIGINAL_MSG)
+								goto deliver;
+							goto protocol_check;
+						}
+						break;
+					}
+				}
+				node_unlock(n_ptr);
+				net_route_msg(buf);
+				continue;
+			}
+			link_handle_out_of_seq_msg(l_ptr, buf);
+			head = link_insert_deferred_queue(l_ptr, head);
+			node_unlock(n_ptr);
+			continue;
+		}
+
+		if (msg_user(msg) == LINK_PROTOCOL) {
+			link_recv_proto_msg(l_ptr, buf);
+			head = link_insert_deferred_queue(l_ptr, head);
+			node_unlock(n_ptr);
+			continue;
+		}
+		msg_dbg(msg,"NSEQ<REC<");
+		link_state_event(l_ptr, TRAFFIC_MSG_EVT);
+
+		if (link_working_working(l_ptr)) {
+			/* Re-insert in front of queue */
+			msg_dbg(msg,"RECV-REINS:");
+			buf->next = head;
+			head = buf;
+			node_unlock(n_ptr);
+			continue;
+		}
+		node_unlock(n_ptr);
+cont:
+		buf_discard(buf);
+	}
+	read_unlock_bh(&net_lock);
+}
+
+/* 
+ * link_defer_buf(): Sort a received out-of-sequence packet 
+ *                   into the deferred reception queue.
+ * Returns the increase of the queue length,i.e. 0 or 1
+ */
+
+u32 link_defer_pkt(struct sk_buff **head,
+		   struct sk_buff **tail,
+		   struct sk_buff *buf)
+{
+	struct sk_buff *prev = 0;
+	struct sk_buff *crs = *head;
+	u32 seq_no = msg_seqno(buf_msg(buf));
+
+	buf->next = NULL;
+
+	/* Empty queue ? */
+	if (*head == NULL) {
+		*head = *tail = buf;
+		return 1;
+	}
+
+	/* Last ? */
+	if (less(msg_seqno(buf_msg(*tail)), seq_no)) {
+		(*tail)->next = buf;
+		*tail = buf;
+		return 1;
+	}
+
+	/* Scan through queue and sort it in */
+	do {
+		struct tipc_msg *msg = buf_msg(crs);
+
+		if (less(seq_no, msg_seqno(msg))) {
+			buf->next = crs;
+			if (prev)
+				prev->next = buf;
+			else
+				*head = buf;   
+			return 1;
+		}
+		if (seq_no == msg_seqno(msg)) {
+			break;
+		}
+		prev = crs;
+		crs = crs->next;
+	}
+	while (crs);
+
+	/* Message is a duplicate of an existing message */
+
+	buf_discard(buf);
+	return 0;
+}
+
+/** 
+ * link_handle_out_of_seq_msg - handle arrival of out-of-sequence packet
+ */
+
+static void link_handle_out_of_seq_msg(struct link *l_ptr, 
+				       struct sk_buff *buf)
+{
+	u32 seq_no = msg_seqno(buf_msg(buf));
+
+	if (likely(msg_user(buf_msg(buf)) == LINK_PROTOCOL)) {
+		link_recv_proto_msg(l_ptr, buf);
+		return;
+	}
+
+	dbg("rx OOS msg: seq_no %u, expecting %u (%u)\n", 
+	    seq_no, mod(l_ptr->next_in_no), l_ptr->next_in_no);
+
+	/* Record OOS packet arrival (force mismatch on next timeout) */
+
+	l_ptr->checkpoint--;
+
+	/* 
+	 * Discard packet if a duplicate; otherwise add it to deferred queue
+	 * and notify peer of gap as per protocol specification
+	 */
+
+	if (less(seq_no, mod(l_ptr->next_in_no))) {
+		l_ptr->stats.duplicates++;
+		buf_discard(buf);
+		return;
+	}
+
+	if (link_defer_pkt(&l_ptr->oldest_deferred_in,
+			   &l_ptr->newest_deferred_in, buf)) {
+		l_ptr->deferred_inqueue_sz++;
+		l_ptr->stats.deferred_recv++;
+		if ((l_ptr->deferred_inqueue_sz % 16) == 1)
+			link_send_proto_msg(l_ptr, STATE_MSG, 0, 0, 0, 0, 0);
+	} else
+		l_ptr->stats.duplicates++;
+}
+
+/*
+ * Send protocol message to the other endpoint.
+ */
+void link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int probe_msg,
+			 u32 gap, u32 tolerance, u32 priority, u32 ack_mtu)
+{
+	struct sk_buff *buf = 0;
+	struct tipc_msg *msg = l_ptr->pmsg;
+        u32 msg_size = sizeof(l_ptr->proto_msg);
+
+	if (link_blocked(l_ptr))
+		return;
+	msg_set_type(msg, msg_typ);
+	msg_set_net_plane(msg, l_ptr->b_ptr->net_plane);
+	msg_set_bcast_ack(msg, mod(l_ptr->owner->bclink.last_in)); 
+	msg_set_last_bcast(msg, bclink_get_last_sent());
+
+	if (msg_typ == STATE_MSG) {
+		u32 next_sent = mod(l_ptr->next_out_no);
+
+		if (!link_is_up(l_ptr))
+			return;
+		if (l_ptr->next_out)
+			next_sent = msg_seqno(buf_msg(l_ptr->next_out));
+		msg_set_next_sent(msg, next_sent);
+		if (l_ptr->oldest_deferred_in) {
+			u32 rec = msg_seqno(buf_msg(l_ptr->oldest_deferred_in));
+			gap = mod(rec - mod(l_ptr->next_in_no));
+		}
+		msg_set_seq_gap(msg, gap);
+		if (gap)
+			l_ptr->stats.sent_nacks++;
+		msg_set_link_tolerance(msg, tolerance);
+		msg_set_linkprio(msg, priority);
+		msg_set_max_pkt(msg, ack_mtu);
+		msg_set_ack(msg, mod(l_ptr->next_in_no - 1));
+		msg_set_probe(msg, probe_msg != 0);
+		if (probe_msg) { 
+			u32 mtu = l_ptr->max_pkt;
+
+                        if ((mtu < l_ptr->max_pkt_target) &&
+			    link_working_working(l_ptr) &&
+			    l_ptr->fsm_msg_cnt) {
+				msg_size = (mtu + (l_ptr->max_pkt_target - mtu)/2 + 2) & ~3;
+                                if (l_ptr->max_pkt_probes == 10) {
+                                        l_ptr->max_pkt_target = (msg_size - 4);
+                                        l_ptr->max_pkt_probes = 0;
+					msg_size = (mtu + (l_ptr->max_pkt_target - mtu)/2 + 2) & ~3;
+                                }
+				l_ptr->max_pkt_probes++;
+                        }
+
+			l_ptr->stats.sent_probes++;
+                }
+		l_ptr->stats.sent_states++;
+	} else {		/* RESET_MSG or ACTIVATE_MSG */
+		msg_set_ack(msg, mod(l_ptr->reset_checkpoint - 1));
+		msg_set_seq_gap(msg, 0);
+		msg_set_next_sent(msg, 1);
+		msg_set_link_tolerance(msg, l_ptr->tolerance);
+		msg_set_linkprio(msg, l_ptr->priority);
+		msg_set_max_pkt(msg, l_ptr->max_pkt_target);
+	}
+
+	if (node_has_redundant_links(l_ptr->owner)) {
+		msg_set_redundant_link(msg);
+	} else {
+		msg_clear_redundant_link(msg);
+	}
+	msg_set_linkprio(msg, l_ptr->priority);
+
+	/* Ensure sequence number will not fit : */
+
+	msg_set_seqno(msg, mod(l_ptr->next_out_no + (0xffff/2)));
+
+	/* Congestion? */
+
+	if (bearer_congested(l_ptr->b_ptr, l_ptr)) {
+		if (!l_ptr->proto_msg_queue) {
+			l_ptr->proto_msg_queue =
+				buf_acquire(sizeof(l_ptr->proto_msg));
+		}
+		buf = l_ptr->proto_msg_queue;
+		if (!buf)
+			return;
+		memcpy(buf->data, (unchar *)msg, sizeof(l_ptr->proto_msg));
+		return;
+	}
+	msg_set_timestamp(msg, jiffies_to_msecs(jiffies));
+
+	/* Message can be sent */
+
+	msg_dbg(msg, ">>");
+
+	buf = buf_acquire(msg_size);
+	if (!buf)
+		return;
+
+	memcpy(buf->data, (unchar *)msg, sizeof(l_ptr->proto_msg));
+        msg_set_size(buf_msg(buf), msg_size);
+
+	if (bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) {
+		l_ptr->unacked_window = 0;
+		buf_discard(buf);
+		return;
+	}
+
+	/* New congestion */
+	bearer_schedule(l_ptr->b_ptr, l_ptr);
+	l_ptr->proto_msg_queue = buf;
+	l_ptr->stats.bearer_congs++;
+}
+
+/*
+ * Receive protocol message :
+ * Note that network plane id propagates through the network, and may 
+ * change at any time. The node with lowest address rules    
+ */
+
+static void link_recv_proto_msg(struct link *l_ptr, struct sk_buff *buf)
+{
+	u32 rec_gap = 0;
+	u32 max_pkt_info;
+        u32 max_pkt_ack;
+	u32 msg_tol;
+	struct tipc_msg *msg = buf_msg(buf);
+
+	dbg("AT(%u):", jiffies_to_msecs(jiffies));
+	msg_dbg(msg, "<<");
+	if (link_blocked(l_ptr))
+		goto exit;
+
+	/* record unnumbered packet arrival (force mismatch on next timeout) */
+
+	l_ptr->checkpoint--;
+
+	if (l_ptr->b_ptr->net_plane != msg_net_plane(msg))
+		if (tipc_own_addr > msg_prevnode(msg))
+			l_ptr->b_ptr->net_plane = msg_net_plane(msg);
+
+	l_ptr->owner->permit_changeover = msg_redundant_link(msg);
+
+	switch (msg_type(msg)) {
+	
+	case RESET_MSG:
+		if (!link_working_unknown(l_ptr) && l_ptr->peer_session) {
+			if (msg_session(msg) == l_ptr->peer_session) {
+				dbg("Duplicate RESET: %u<->%u\n",
+				    msg_session(msg), l_ptr->peer_session);                                     
+				break; /* duplicate: ignore */
+			}
+		}
+		/* fall thru' */
+	case ACTIVATE_MSG:
+		/* Update link settings according other endpoint's values */
+
+		strcpy((strrchr(l_ptr->name, ':') + 1), (char *)msg_data(msg));
+
+		if ((msg_tol = msg_link_tolerance(msg)) &&
+		    (msg_tol > l_ptr->tolerance))
+			link_set_supervision_props(l_ptr, msg_tol);
+
+		if (msg_linkprio(msg) > l_ptr->priority)
+			l_ptr->priority = msg_linkprio(msg);
+
+		max_pkt_info = msg_max_pkt(msg);
+                if (max_pkt_info) {
+			if (max_pkt_info < l_ptr->max_pkt_target)
+				l_ptr->max_pkt_target = max_pkt_info;
+			if (l_ptr->max_pkt > l_ptr->max_pkt_target)
+				l_ptr->max_pkt = l_ptr->max_pkt_target;
+		} else {
+                        l_ptr->max_pkt = l_ptr->max_pkt_target;
+		}
+		l_ptr->owner->bclink.supported = (max_pkt_info != 0);
+
+		link_state_event(l_ptr, msg_type(msg));
+
+		l_ptr->peer_session = msg_session(msg);
+		l_ptr->peer_bearer_id = msg_bearer_id(msg);
+
+		/* Synchronize broadcast sequence numbers */
+		if (!node_has_redundant_links(l_ptr->owner)) {
+			l_ptr->owner->bclink.last_in = mod(msg_last_bcast(msg));
+		}
+		break;
+	case STATE_MSG:
+
+		if ((msg_tol = msg_link_tolerance(msg)))
+			link_set_supervision_props(l_ptr, msg_tol);
+		
+		if (msg_linkprio(msg) && 
+		    (msg_linkprio(msg) != l_ptr->priority)) {
+			warn("Changing prio <%s>: %u->%u\n",
+			     l_ptr->name, l_ptr->priority, msg_linkprio(msg));
+			l_ptr->priority = msg_linkprio(msg);
+			link_reset(l_ptr); /* Enforce change to take effect */
+			break;
+		}
+		link_state_event(l_ptr, TRAFFIC_MSG_EVT);
+		l_ptr->stats.recv_states++;
+		if (link_reset_unknown(l_ptr))
+			break;
+
+		if (less_eq(mod(l_ptr->next_in_no), msg_next_sent(msg))) {
+			rec_gap = mod(msg_next_sent(msg) - 
+				      mod(l_ptr->next_in_no));
+		}
+
+		max_pkt_ack = msg_max_pkt(msg);
+                if (max_pkt_ack > l_ptr->max_pkt) {
+                        dbg("Link <%s> updated MTU %u -> %u\n",
+                            l_ptr->name, l_ptr->max_pkt, max_pkt_ack);
+                        l_ptr->max_pkt = max_pkt_ack;
+                        l_ptr->max_pkt_probes = 0;
+                }
+
+		max_pkt_ack = 0;
+                if (msg_probe(msg)) {
+			l_ptr->stats.recv_probes++;
+                        if (msg_size(msg) > sizeof(l_ptr->proto_msg)) {
+                                max_pkt_ack = msg_size(msg);
+                        }
+                }
+
+		/* Protocol message before retransmits, reduce loss risk */
+
+		bclink_check_gap(l_ptr->owner, msg_last_bcast(msg));
+
+		if (rec_gap || (msg_probe(msg))) {
+			link_send_proto_msg(l_ptr, STATE_MSG,
+					    0, rec_gap, 0, 0, max_pkt_ack);
+		}
+		if (msg_seq_gap(msg)) {
+			msg_dbg(msg, "With Gap:");
+			l_ptr->stats.recv_nacks++;
+			link_retransmit(l_ptr, l_ptr->first_out,
+					msg_seq_gap(msg));
+		}
+		break;
+	default:
+		msg_dbg(buf_msg(buf), "<DISCARDING UNKNOWN<");
+	}
+exit:
+	buf_discard(buf);
+}
+
+
+/*
+ * link_tunnel(): Send one message via a link belonging to 
+ * another bearer. Owner node is locked.
+ */
+void link_tunnel(struct link *l_ptr, 
+	    struct tipc_msg *tunnel_hdr, 
+	    struct tipc_msg  *msg,
+	    u32 selector)
+{
+	struct link *tunnel;
+	struct sk_buff *buf;
+	u32 length = msg_size(msg);
+
+	tunnel = l_ptr->owner->active_links[selector & 1];
+	if (!link_is_up(tunnel))
+		return;
+	msg_set_size(tunnel_hdr, length + INT_H_SIZE);
+	buf = buf_acquire(length + INT_H_SIZE);
+	if (!buf)
+		return;
+	memcpy(buf->data, (unchar *)tunnel_hdr, INT_H_SIZE);
+	memcpy(buf->data + INT_H_SIZE, (unchar *)msg, length);
+	dbg("%c->%c:", l_ptr->b_ptr->net_plane, tunnel->b_ptr->net_plane);
+	msg_dbg(buf_msg(buf), ">SEND>");
+	assert(tunnel);
+	link_send_buf(tunnel, buf);
+}
+
+
+
+/*
+ * changeover(): Send whole message queue via the remaining link
+ *               Owner node is locked.
+ */
+
+void link_changeover(struct link *l_ptr)
+{
+	u32 msgcount = l_ptr->out_queue_size;
+	struct sk_buff *crs = l_ptr->first_out;
+	struct link *tunnel = l_ptr->owner->active_links[0];
+	int split_bundles = node_has_redundant_links(l_ptr->owner);
+	struct tipc_msg tunnel_hdr;
+
+	if (!tunnel)
+		return;
+
+	if (!l_ptr->owner->permit_changeover)
+		return;
+
+	msg_init(&tunnel_hdr, CHANGEOVER_PROTOCOL,
+		 ORIGINAL_MSG, TIPC_OK, INT_H_SIZE, l_ptr->addr);
+	msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id);
+	msg_set_msgcnt(&tunnel_hdr, msgcount);
+	if (!l_ptr->first_out) {
+		struct sk_buff *buf;
+
+		assert(!msgcount);
+		buf = buf_acquire(INT_H_SIZE);
+		if (buf) {
+			memcpy(buf->data, (unchar *)&tunnel_hdr, INT_H_SIZE);
+			msg_set_size(&tunnel_hdr, INT_H_SIZE);
+			dbg("%c->%c:", l_ptr->b_ptr->net_plane,
+			    tunnel->b_ptr->net_plane);
+			msg_dbg(&tunnel_hdr, "EMPTY>SEND>");
+			link_send_buf(tunnel, buf);
+		} else {
+			warn("Memory squeeze; link changeover failed\n");
+		}
+		return;
+	}
+	while (crs) {
+		struct tipc_msg *msg = buf_msg(crs);
+
+		if ((msg_user(msg) == MSG_BUNDLER) && split_bundles) {
+			u32 msgcount = msg_msgcnt(msg);
+			struct tipc_msg *m = msg_get_wrapped(msg);
+			unchar* pos = (unchar*)m;
+
+			while (msgcount--) {
+				msg_set_seqno(m,msg_seqno(msg));
+				link_tunnel(l_ptr, &tunnel_hdr, m,
+					    msg_link_selector(m));
+				pos += align(msg_size(m));
+				m = (struct tipc_msg *)pos;
+			}
+		} else {
+			link_tunnel(l_ptr, &tunnel_hdr, msg,
+				    msg_link_selector(msg));
+		}
+		crs = crs->next;
+	}
+}
+
+void link_send_duplicate(struct link *l_ptr, struct link *tunnel)
+{
+	struct sk_buff *iter;
+	struct tipc_msg tunnel_hdr;
+
+	msg_init(&tunnel_hdr, CHANGEOVER_PROTOCOL,
+		 DUPLICATE_MSG, TIPC_OK, INT_H_SIZE, l_ptr->addr);
+	msg_set_msgcnt(&tunnel_hdr, l_ptr->out_queue_size);
+	msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id);
+	iter = l_ptr->first_out;
+	while (iter) {
+		struct sk_buff *outbuf;
+		struct tipc_msg *msg = buf_msg(iter);
+		u32 length = msg_size(msg);
+
+		if (msg_user(msg) == MSG_BUNDLER)
+			msg_set_type(msg, CLOSED_MSG);
+		msg_set_ack(msg, mod(l_ptr->next_in_no - 1));	/* Update */
+		msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); 
+		msg_set_size(&tunnel_hdr, length + INT_H_SIZE);
+		outbuf = buf_acquire(length + INT_H_SIZE);
+		if (outbuf == NULL) {
+			warn("Memory squeeze; buffer duplication failed\n");
+			return;
+		}
+		memcpy(outbuf->data, (unchar *)&tunnel_hdr, INT_H_SIZE);
+		memcpy(outbuf->data + INT_H_SIZE, iter->data, length);
+		dbg("%c->%c:", l_ptr->b_ptr->net_plane,
+		    tunnel->b_ptr->net_plane);
+		msg_dbg(buf_msg(outbuf), ">SEND>");
+		link_send_buf(tunnel, outbuf);
+		if (!link_is_up(l_ptr))
+			return;
+		iter = iter->next;
+	}
+}
+
+
+
+/**
+ * buf_extract - extracts embedded TIPC message from another message
+ * @skb: encapsulating message buffer
+ * @from_pos: offset to extract from
+ *
+ * Returns a new message buffer containing an embedded message.  The 
+ * encapsulating message itself is left unchanged.
+ */
+
+static struct sk_buff *buf_extract(struct sk_buff *skb, u32 from_pos)
+{
+	struct tipc_msg *msg = (struct tipc_msg *)(skb->data + from_pos);
+	u32 size = msg_size(msg);
+	struct sk_buff *eb;
+
+	eb = buf_acquire(size);
+	if (eb)
+		memcpy(eb->data, (unchar *)msg, size);
+	return eb;
+}
+
+/* 
+ *  link_recv_changeover_msg(): Receive tunneled packet sent
+ *  via other link. Node is locked. Return extracted buffer.
+ */
+
+static int link_recv_changeover_msg(struct link **l_ptr,
+				    struct sk_buff **buf)
+{
+	struct sk_buff *tunnel_buf = *buf;
+	struct link *dest_link;
+	struct tipc_msg *msg;
+	struct tipc_msg *tunnel_msg = buf_msg(tunnel_buf);
+	u32 msg_typ = msg_type(tunnel_msg);
+	u32 msg_count = msg_msgcnt(tunnel_msg);
+
+	dest_link = (*l_ptr)->owner->links[msg_bearer_id(tunnel_msg)];
+	assert(dest_link != *l_ptr);
+	if (!dest_link) {
+		msg_dbg(tunnel_msg, "NOLINK/<REC<");
+		goto exit;
+	}
+	dbg("%c<-%c:", dest_link->b_ptr->net_plane,
+	    (*l_ptr)->b_ptr->net_plane);
+	*l_ptr = dest_link;
+	msg = msg_get_wrapped(tunnel_msg);
+
+	if (msg_typ == DUPLICATE_MSG) {
+		if (less(msg_seqno(msg), mod(dest_link->next_in_no))) {
+			msg_dbg(tunnel_msg, "DROP/<REC<");
+			goto exit;
+		}
+		*buf = buf_extract(tunnel_buf,INT_H_SIZE);
+		if (*buf == NULL) {
+			warn("Memory squeeze; failed to extract msg\n");
+			goto exit;
+		}
+		msg_dbg(tunnel_msg, "TNL<REC<");
+		buf_discard(tunnel_buf);
+		return 1;
+	}
+
+	/* First original message ?: */
+
+	if (link_is_up(dest_link)) {
+		msg_dbg(tunnel_msg, "UP/FIRST/<REC<");
+		link_reset(dest_link);
+		dest_link->exp_msg_count = msg_count;
+		if (!msg_count)
+			goto exit;
+	} else if (dest_link->exp_msg_count == START_CHANGEOVER) {
+		msg_dbg(tunnel_msg, "BLK/FIRST/<REC<");
+		dest_link->exp_msg_count = msg_count;
+		if (!msg_count)
+			goto exit;
+	}
+
+	/* Receive original message */
+
+	if (dest_link->exp_msg_count == 0) {
+		msg_dbg(tunnel_msg, "OVERDUE/DROP/<REC<");
+		dbg_print_link(dest_link, "LINK:");
+		goto exit;
+	}
+	dest_link->exp_msg_count--;
+	if (less(msg_seqno(msg), dest_link->reset_checkpoint)) {
+		msg_dbg(tunnel_msg, "DROP/DUPL/<REC<");
+		goto exit;
+	} else {
+		*buf = buf_extract(tunnel_buf, INT_H_SIZE);
+		if (*buf != NULL) {
+			msg_dbg(tunnel_msg, "TNL<REC<");
+			buf_discard(tunnel_buf);
+			return 1;
+		} else {
+			warn("Memory squeeze; dropped incoming msg\n");
+		}
+	}
+exit:
+	*buf = 0;
+	buf_discard(tunnel_buf);
+	return 0;
+}
+
+/*
+ *  Bundler functionality:
+ */
+void link_recv_bundle(struct sk_buff *buf)
+{
+	u32 msgcount = msg_msgcnt(buf_msg(buf));
+	u32 pos = INT_H_SIZE;
+	struct sk_buff *obuf;
+
+	msg_dbg(buf_msg(buf), "<BNDL<: ");
+	while (msgcount--) {
+		obuf = buf_extract(buf, pos);
+		if (obuf == NULL) {
+			char addr_string[16];
+
+			warn("Buffer allocation failure;\n");
+			warn("  incoming message(s) from %s lost\n",
+			     addr_string_fill(addr_string, 
+					      msg_orignode(buf_msg(buf))));
+			return;
+		};
+		pos += align(msg_size(buf_msg(obuf)));
+		msg_dbg(buf_msg(obuf), "     /");
+		net_route_msg(obuf);
+	}
+	buf_discard(buf);
+}
+
+/*
+ *  Fragmentation/defragmentation:
+ */
+
+
+/* 
+ * link_send_long_buf: Entry for buffers needing fragmentation.
+ * The buffer is complete, inclusive total message length. 
+ * Returns user data length.
+ */
+int link_send_long_buf(struct link *l_ptr, struct sk_buff *buf)
+{
+	struct tipc_msg *inmsg = buf_msg(buf);
+	struct tipc_msg fragm_hdr;
+	u32 insize = msg_size(inmsg);
+	u32 dsz = msg_data_sz(inmsg);
+	unchar *crs = buf->data;
+	u32 rest = insize;
+	u32 pack_sz = link_max_pkt(l_ptr);
+	u32 fragm_sz = pack_sz - INT_H_SIZE;
+	u32 fragm_no = 1;
+	u32 destaddr = msg_destnode(inmsg);
+
+	if (msg_short(inmsg))
+		destaddr = l_ptr->addr;
+
+	if (msg_routed(inmsg))
+		msg_set_prevnode(inmsg, tipc_own_addr);
+
+	/* Prepare reusable fragment header: */
+
+	msg_init(&fragm_hdr, MSG_FRAGMENTER, FIRST_FRAGMENT,
+		 TIPC_OK, INT_H_SIZE, destaddr);
+	msg_set_link_selector(&fragm_hdr, msg_link_selector(inmsg));
+	msg_set_long_msgno(&fragm_hdr, mod(l_ptr->long_msg_seq_no++));
+	msg_set_fragm_no(&fragm_hdr, fragm_no);
+	l_ptr->stats.sent_fragmented++;
+
+	/* Chop up message: */
+
+	while (rest > 0) {
+		struct sk_buff *fragm;
+
+		if (rest <= fragm_sz) {
+			fragm_sz = rest;
+			msg_set_type(&fragm_hdr, LAST_FRAGMENT);
+		}
+		fragm = buf_acquire(fragm_sz + INT_H_SIZE);
+		if (fragm == NULL) {
+			warn("Memory squeeze; failed to fragment msg\n");
+			dsz = -ENOMEM;
+			goto exit;
+		}
+		msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE);
+		memcpy(fragm->data, (unchar *)&fragm_hdr, INT_H_SIZE);
+		memcpy(fragm->data + INT_H_SIZE, crs, fragm_sz);
+
+		/*  Send queued messages first, if any: */
+
+		l_ptr->stats.sent_fragments++;
+		link_send_buf(l_ptr, fragm);
+		if (!link_is_up(l_ptr))
+			return dsz;
+		msg_set_fragm_no(&fragm_hdr, ++fragm_no);
+		rest -= fragm_sz;
+		crs += fragm_sz;
+		msg_set_type(&fragm_hdr, FRAGMENT);
+	}
+exit:
+	buf_discard(buf);
+	return dsz;
+}
+
+/* 
+ * A pending message being re-assembled must store certain values 
+ * to handle subsequent fragments correctly. The following functions 
+ * help storing these values in unused, available fields in the
+ * pending message. This makes dynamic memory allocation unecessary.
+ */
+
+static inline u32 get_long_msg_seqno(struct sk_buff *buf)
+{
+	return msg_seqno(buf_msg(buf));
+}
+
+static inline void set_long_msg_seqno(struct sk_buff *buf, u32 seqno)
+{
+	msg_set_seqno(buf_msg(buf), seqno);
+}
+
+static inline u32 get_fragm_size(struct sk_buff *buf)
+{
+	return msg_ack(buf_msg(buf));
+}
+
+static inline void set_fragm_size(struct sk_buff *buf, u32 sz)
+{
+	msg_set_ack(buf_msg(buf), sz);
+}
+
+static inline u32 get_expected_frags(struct sk_buff *buf)
+{
+	return msg_bcast_ack(buf_msg(buf));
+}
+
+static inline void set_expected_frags(struct sk_buff *buf, u32 exp)
+{
+	msg_set_bcast_ack(buf_msg(buf), exp);
+}
+
+static inline u32 get_timer_cnt(struct sk_buff *buf)
+{
+	return msg_reroute_cnt(buf_msg(buf));
+}
+
+static inline void incr_timer_cnt(struct sk_buff *buf)
+{
+	msg_incr_reroute_cnt(buf_msg(buf));
+}
+
+/* 
+ * link_recv_fragment(): Called with node lock on. Returns 
+ * the reassembled buffer if message is complete.
+ */
+int link_recv_fragment(struct sk_buff **pending, struct sk_buff **fb, 
+		       struct tipc_msg **m)
+{
+	struct sk_buff *prev = 0;
+	struct sk_buff *fbuf = *fb;
+	struct tipc_msg *fragm = buf_msg(fbuf);
+	struct sk_buff *pbuf = *pending;
+	u32 long_msg_seq_no = msg_long_msgno(fragm);
+
+	*fb = 0;
+	msg_dbg(fragm,"FRG<REC<");
+
+	/* Is there an incomplete message waiting for this fragment? */
+
+	while (pbuf && ((msg_seqno(buf_msg(pbuf)) != long_msg_seq_no)
+			|| (msg_orignode(fragm) != msg_orignode(buf_msg(pbuf))))) {
+		prev = pbuf;
+		pbuf = pbuf->next;
+	}
+
+	if (!pbuf && (msg_type(fragm) == FIRST_FRAGMENT)) {
+		struct tipc_msg *imsg = (struct tipc_msg *)msg_data(fragm);
+		u32 msg_sz = msg_size(imsg);
+		u32 fragm_sz = msg_data_sz(fragm);
+		u32 exp_fragm_cnt = msg_sz/fragm_sz + !!(msg_sz % fragm_sz);
+		u32 max =  TIPC_MAX_USER_MSG_SIZE + LONG_H_SIZE;
+		if (msg_type(imsg) == TIPC_MCAST_MSG)
+			max = TIPC_MAX_USER_MSG_SIZE + MCAST_H_SIZE;
+		if (msg_size(imsg) > max) {
+			msg_dbg(fragm,"<REC<Oversized: ");
+			buf_discard(fbuf);
+			return 0;
+		}
+		pbuf = buf_acquire(msg_size(imsg));
+		if (pbuf != NULL) {
+			pbuf->next = *pending;
+			*pending = pbuf;
+			memcpy(pbuf->data, (unchar *)imsg, msg_data_sz(fragm));
+
+			/*  Prepare buffer for subsequent fragments. */
+
+			set_long_msg_seqno(pbuf, long_msg_seq_no); 
+			set_fragm_size(pbuf,fragm_sz); 
+			set_expected_frags(pbuf,exp_fragm_cnt - 1); 
+		} else {
+			warn("Memory squeeze; got no defragmenting buffer\n");
+		}
+		buf_discard(fbuf);
+		return 0;
+	} else if (pbuf && (msg_type(fragm) != FIRST_FRAGMENT)) {
+		u32 dsz = msg_data_sz(fragm);
+		u32 fsz = get_fragm_size(pbuf);
+		u32 crs = ((msg_fragm_no(fragm) - 1) * fsz);
+		u32 exp_frags = get_expected_frags(pbuf) - 1;
+		memcpy(pbuf->data + crs, msg_data(fragm), dsz);
+		buf_discard(fbuf);
+
+		/* Is message complete? */
+
+		if (exp_frags == 0) {
+			if (prev)
+				prev->next = pbuf->next;
+			else
+				*pending = pbuf->next;
+			msg_reset_reroute_cnt(buf_msg(pbuf));
+			*fb = pbuf;
+			*m = buf_msg(pbuf);
+			return 1;
+		}
+		set_expected_frags(pbuf,exp_frags);     
+		return 0;
+	}
+	dbg(" Discarding orphan fragment %x\n",fbuf);
+	msg_dbg(fragm,"ORPHAN:");
+	dbg("Pending long buffers:\n");
+	dbg_print_buf_chain(*pending);
+	buf_discard(fbuf);
+	return 0;
+}
+
+/**
+ * link_check_defragm_bufs - flush stale incoming message fragments
+ * @l_ptr: pointer to link
+ */
+
+static void link_check_defragm_bufs(struct link *l_ptr)
+{
+	struct sk_buff *prev = 0;
+	struct sk_buff *next = 0;
+	struct sk_buff *buf = l_ptr->defragm_buf;
+
+	if (!buf)
+		return;
+	if (!link_working_working(l_ptr))
+		return;
+	while (buf) {
+		u32 cnt = get_timer_cnt(buf);
+
+		next = buf->next;
+		if (cnt < 4) {
+			incr_timer_cnt(buf);
+			prev = buf;
+		} else {
+			dbg(" Discarding incomplete long buffer\n");
+			msg_dbg(buf_msg(buf), "LONG:");
+			dbg_print_link(l_ptr, "curr:");
+			dbg("Pending long buffers:\n");
+			dbg_print_buf_chain(l_ptr->defragm_buf);
+			if (prev)
+				prev->next = buf->next;
+			else
+				l_ptr->defragm_buf = buf->next;
+			buf_discard(buf);
+		}
+		buf = next;
+	}
+}
+
+
+
+static void link_set_supervision_props(struct link *l_ptr, u32 tolerance)
+{
+	l_ptr->tolerance = tolerance;
+	l_ptr->continuity_interval =
+		((tolerance / 4) > 500) ? 500 : tolerance / 4;
+	l_ptr->abort_limit = tolerance / (l_ptr->continuity_interval / 4);
+}
+
+
+void link_set_queue_limits(struct link *l_ptr, u32 window)
+{
+	/* Data messages from this node, inclusive FIRST_FRAGM */
+	l_ptr->queue_limit[DATA_LOW] = window;
+	l_ptr->queue_limit[DATA_MEDIUM] = (window / 3) * 4;
+	l_ptr->queue_limit[DATA_HIGH] = (window / 3) * 5;
+	l_ptr->queue_limit[DATA_CRITICAL] = (window / 3) * 6;
+	/* Transiting data messages,inclusive FIRST_FRAGM */
+	l_ptr->queue_limit[DATA_LOW + 4] = 300;
+	l_ptr->queue_limit[DATA_MEDIUM + 4] = 600;
+	l_ptr->queue_limit[DATA_HIGH + 4] = 900;
+	l_ptr->queue_limit[DATA_CRITICAL + 4] = 1200;
+	l_ptr->queue_limit[CONN_MANAGER] = 1200;
+	l_ptr->queue_limit[ROUTE_DISTRIBUTOR] = 1200;
+	l_ptr->queue_limit[CHANGEOVER_PROTOCOL] = 2500;
+	l_ptr->queue_limit[NAME_DISTRIBUTOR] = 3000;
+	/* FRAGMENT and LAST_FRAGMENT packets */
+	l_ptr->queue_limit[MSG_FRAGMENTER] = 4000;
+}
+
+/**
+ * link_find_link - locate link by name
+ * @name - ptr to link name string
+ * @node - ptr to area to be filled with ptr to associated node
+ * 
+ * Caller must hold 'net_lock' to ensure node and bearer are not deleted;
+ * this also prevents link deletion.
+ * 
+ * Returns pointer to link (or 0 if invalid link name).
+ */
+
+static struct link *link_find_link(const char *name, struct node **node)
+{
+	struct link_name link_name_parts;
+	struct bearer *b_ptr;
+	struct link *l_ptr; 
+
+	if (!link_name_validate(name, &link_name_parts))
+		return 0;
+
+	b_ptr = bearer_find_interface(link_name_parts.if_local);
+	if (!b_ptr)
+		return 0;
+
+	*node = node_find(link_name_parts.addr_peer); 
+	if (!*node)
+		return 0;
+
+	l_ptr = (*node)->links[b_ptr->identity];
+	if (!l_ptr || strcmp(l_ptr->name, name))
+		return 0;
+
+	return l_ptr;
+}
+
+struct sk_buff *link_cmd_config(const void *req_tlv_area, int req_tlv_space, 
+			        u16 cmd)
+{
+	struct tipc_link_config *args;
+        u32 new_value;
+	struct link *l_ptr;
+	struct node *node;
+        int res;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_LINK_CONFIG))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	args = (struct tipc_link_config *)TLV_DATA(req_tlv_area);
+	new_value = ntohl(args->value);
+
+	if (!strcmp(args->name, bc_link_name)) {
+		if ((cmd == TIPC_CMD_SET_LINK_WINDOW) &&
+		    (bclink_set_queue_limits(new_value) == 0))
+			return cfg_reply_none();
+	       	return cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
+					      " (cannot change setting on broadcast link)");
+	}
+
+	read_lock_bh(&net_lock);
+	l_ptr = link_find_link(args->name, &node); 
+	if (!l_ptr) {
+		read_unlock_bh(&net_lock);
+	       	return cfg_reply_error_string("link not found");
+	}
+
+	node_lock(node);
+	res = -EINVAL;
+	switch (cmd) {
+	case TIPC_CMD_SET_LINK_TOL: 
+		if ((new_value >= TIPC_MIN_LINK_TOL) && 
+		    (new_value <= TIPC_MAX_LINK_TOL)) {
+			link_set_supervision_props(l_ptr, new_value);
+			link_send_proto_msg(l_ptr, STATE_MSG, 
+					    0, 0, new_value, 0, 0);
+			res = TIPC_OK;
+		}
+		break;
+	case TIPC_CMD_SET_LINK_PRI: 
+		if (new_value < TIPC_NUM_LINK_PRI) {
+			l_ptr->priority = new_value;
+			link_send_proto_msg(l_ptr, STATE_MSG, 
+					    0, 0, 0, new_value, 0);
+			res = TIPC_OK;
+		}
+		break;
+	case TIPC_CMD_SET_LINK_WINDOW: 
+		if ((new_value >= TIPC_MIN_LINK_WIN) && 
+		    (new_value <= TIPC_MAX_LINK_WIN)) {
+			link_set_queue_limits(l_ptr, new_value);
+			res = TIPC_OK;
+		}
+		break;
+	}
+	node_unlock(node);
+
+	read_unlock_bh(&net_lock);
+	if (res)
+	       	return cfg_reply_error_string("cannot change link setting");
+
+	return cfg_reply_none();
+}
+
+/**
+ * link_reset_statistics - reset link statistics
+ * @l_ptr: pointer to link
+ */
+
+static void link_reset_statistics(struct link *l_ptr)
+{
+	memset(&l_ptr->stats, 0, sizeof(l_ptr->stats));
+	l_ptr->stats.sent_info = l_ptr->next_out_no;
+	l_ptr->stats.recv_info = l_ptr->next_in_no;
+}
+
+struct sk_buff *link_cmd_reset_stats(const void *req_tlv_area, int req_tlv_space)
+{
+	char *link_name;
+	struct link *l_ptr; 
+	struct node *node;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_LINK_NAME))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	link_name = (char *)TLV_DATA(req_tlv_area);
+	if (!strcmp(link_name, bc_link_name)) {
+		if (bclink_reset_stats())
+			return cfg_reply_error_string("link not found");
+		return cfg_reply_none();
+	}
+
+	read_lock_bh(&net_lock);
+	l_ptr = link_find_link(link_name, &node); 
+	if (!l_ptr) {
+		read_unlock_bh(&net_lock);
+		return cfg_reply_error_string("link not found");
+	}
+
+	node_lock(node);
+	link_reset_statistics(l_ptr);
+	node_unlock(node);
+	read_unlock_bh(&net_lock);
+	return cfg_reply_none();
+}
+
+/**
+ * percent - convert count to a percentage of total (rounding up or down)
+ */
+
+static u32 percent(u32 count, u32 total)
+{
+	return (count * 100 + (total / 2)) / total;
+}
+
+/**
+ * link_stats - print link statistics
+ * @name: link name
+ * @buf: print buffer area
+ * @buf_size: size of print buffer area
+ * 
+ * Returns length of print buffer data string (or 0 if error)
+ */
+
+static int link_stats(const char *name, char *buf, const u32 buf_size)
+{
+	struct print_buf pb;
+	struct link *l_ptr; 
+	struct node *node;
+	char *status;
+	u32 profile_total = 0;
+
+	if (!strcmp(name, bc_link_name))
+		return bclink_stats(buf, buf_size);
+
+	printbuf_init(&pb, buf, buf_size);
+
+	read_lock_bh(&net_lock);
+	l_ptr = link_find_link(name, &node); 
+	if (!l_ptr) {
+		read_unlock_bh(&net_lock);
+		return 0;
+	}
+	node_lock(node);
+
+	if (link_is_active(l_ptr))
+		status = "ACTIVE";
+	else if (link_is_up(l_ptr))
+		status = "STANDBY";
+	else
+		status = "DEFUNCT";
+	tipc_printf(&pb, "Link <%s>\n"
+		         "  %s  MTU:%u  Priority:%u  Tolerance:%u ms"
+		         "  Window:%u packets\n", 
+		    l_ptr->name, status, link_max_pkt(l_ptr), 
+		    l_ptr->priority, l_ptr->tolerance, l_ptr->queue_limit[0]);
+	tipc_printf(&pb, "  RX packets:%u fragments:%u/%u bundles:%u/%u\n", 
+		    l_ptr->next_in_no - l_ptr->stats.recv_info,
+		    l_ptr->stats.recv_fragments,
+		    l_ptr->stats.recv_fragmented,
+		    l_ptr->stats.recv_bundles,
+		    l_ptr->stats.recv_bundled);
+	tipc_printf(&pb, "  TX packets:%u fragments:%u/%u bundles:%u/%u\n", 
+		    l_ptr->next_out_no - l_ptr->stats.sent_info,
+		    l_ptr->stats.sent_fragments,
+		    l_ptr->stats.sent_fragmented, 
+		    l_ptr->stats.sent_bundles,
+		    l_ptr->stats.sent_bundled);
+	profile_total = l_ptr->stats.msg_length_counts;
+	if (!profile_total)
+		profile_total = 1;
+	tipc_printf(&pb, "  TX profile sample:%u packets  average:%u octets\n"
+		         "  0-64:%u%% -256:%u%% -1024:%u%% -4096:%u%% "
+		         "-16354:%u%% -32768:%u%% -66000:%u%%\n",
+		    l_ptr->stats.msg_length_counts,
+		    l_ptr->stats.msg_lengths_total / profile_total,
+		    percent(l_ptr->stats.msg_length_profile[0], profile_total),
+		    percent(l_ptr->stats.msg_length_profile[1], profile_total),
+		    percent(l_ptr->stats.msg_length_profile[2], profile_total),
+		    percent(l_ptr->stats.msg_length_profile[3], profile_total),
+		    percent(l_ptr->stats.msg_length_profile[4], profile_total),
+		    percent(l_ptr->stats.msg_length_profile[5], profile_total),
+		    percent(l_ptr->stats.msg_length_profile[6], profile_total));
+	tipc_printf(&pb, "  RX states:%u probes:%u naks:%u defs:%u dups:%u\n", 
+		    l_ptr->stats.recv_states,
+		    l_ptr->stats.recv_probes,
+		    l_ptr->stats.recv_nacks,
+		    l_ptr->stats.deferred_recv, 
+		    l_ptr->stats.duplicates);
+	tipc_printf(&pb, "  TX states:%u probes:%u naks:%u acks:%u dups:%u\n", 
+		    l_ptr->stats.sent_states, 
+		    l_ptr->stats.sent_probes, 
+		    l_ptr->stats.sent_nacks, 
+		    l_ptr->stats.sent_acks, 
+		    l_ptr->stats.retransmitted);
+	tipc_printf(&pb, "  Congestion bearer:%u link:%u  Send queue max:%u avg:%u\n",
+		    l_ptr->stats.bearer_congs,
+		    l_ptr->stats.link_congs, 
+		    l_ptr->stats.max_queue_sz,
+		    l_ptr->stats.queue_sz_counts
+		    ? (l_ptr->stats.accu_queue_sz / l_ptr->stats.queue_sz_counts)
+		    : 0);
+
+	node_unlock(node);
+	read_unlock_bh(&net_lock);
+	return printbuf_validate(&pb);
+}
+
+#define MAX_LINK_STATS_INFO 2000
+
+struct sk_buff *link_cmd_show_stats(const void *req_tlv_area, int req_tlv_space)
+{
+	struct sk_buff *buf;
+	struct tlv_desc *rep_tlv;
+	int str_len;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_LINK_NAME))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	buf = cfg_reply_alloc(TLV_SPACE(MAX_LINK_STATS_INFO));
+	if (!buf)
+		return NULL;
+
+	rep_tlv = (struct tlv_desc *)buf->data;
+
+	str_len = link_stats((char *)TLV_DATA(req_tlv_area),
+			     (char *)TLV_DATA(rep_tlv), MAX_LINK_STATS_INFO);
+	if (!str_len) {
+		buf_discard(buf);
+	       	return cfg_reply_error_string("link not found");
+	}
+
+	skb_put(buf, TLV_SPACE(str_len));
+	TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len);
+
+	return buf;
+}
+
+#if 0
+int link_control(const char *name, u32 op, u32 val)
+{
+	int res = -EINVAL;
+	struct link *l_ptr;
+	u32 bearer_id;
+	struct node * node;
+	u32 a;
+
+	a = link_name2addr(name, &bearer_id);
+	read_lock_bh(&net_lock);
+	node = node_find(a);
+	if (node) {
+		node_lock(node);
+		l_ptr = node->links[bearer_id];
+		if (l_ptr) {
+			if (op == TIPC_REMOVE_LINK) {
+				struct bearer *b_ptr = l_ptr->b_ptr;
+				spin_lock_bh(&b_ptr->publ.lock);
+				link_delete(l_ptr);
+				spin_unlock_bh(&b_ptr->publ.lock);
+			}
+			if (op == TIPC_CMD_BLOCK_LINK) {
+				link_reset(l_ptr);
+				l_ptr->blocked = 1;
+			}
+			if (op == TIPC_CMD_UNBLOCK_LINK) {
+				l_ptr->blocked = 0;
+			}
+			res = TIPC_OK;
+		}
+		node_unlock(node);
+	}
+	read_unlock_bh(&net_lock);
+	return res;
+}
+#endif
+
+/**
+ * link_get_max_pkt - get maximum packet size to use when sending to destination
+ * @dest: network address of destination node
+ * @selector: used to select from set of active links
+ * 
+ * If no active link can be found, uses default maximum packet size.
+ */
+
+u32 link_get_max_pkt(u32 dest, u32 selector)
+{
+	struct node *n_ptr;
+	struct link *l_ptr;
+	u32 res = MAX_PKT_DEFAULT;
+	
+	if (dest == tipc_own_addr)
+		return MAX_MSG_SIZE;
+
+	read_lock_bh(&net_lock);        
+	n_ptr = node_select(dest, selector);
+	if (n_ptr) {
+		node_lock(n_ptr);
+		l_ptr = n_ptr->active_links[selector & 1];
+		if (l_ptr)
+			res = link_max_pkt(l_ptr);
+		node_unlock(n_ptr);
+	}
+	read_unlock_bh(&net_lock);       
+	return res;
+}
+
+#if 0
+static void link_dump_rec_queue(struct link *l_ptr)
+{
+	struct sk_buff *crs;
+
+	if (!l_ptr->oldest_deferred_in) {
+		info("Reception queue empty\n");
+		return;
+	}
+	info("Contents of Reception queue:\n");
+	crs = l_ptr->oldest_deferred_in;
+	while (crs) {
+		if (crs->data == (void *)0x0000a3a3) {
+			info("buffer %x invalid\n", crs);
+			return;
+		}
+		msg_dbg(buf_msg(crs), "In rec queue: \n");
+		crs = crs->next;
+	}
+}
+#endif
+
+static void link_dump_send_queue(struct link *l_ptr)
+{
+	if (l_ptr->next_out) {
+		info("\nContents of unsent queue:\n");
+		dbg_print_buf_chain(l_ptr->next_out);
+	}
+	info("\nContents of send queue:\n");
+	if (l_ptr->first_out) {
+		dbg_print_buf_chain(l_ptr->first_out);
+	}
+	info("Empty send queue\n");
+}
+
+static void link_print(struct link *l_ptr, struct print_buf *buf,
+		       const char *str)
+{
+	tipc_printf(buf, str);
+	if (link_reset_reset(l_ptr) || link_reset_unknown(l_ptr))
+		return;
+	tipc_printf(buf, "Link %x<%s>:",
+		    l_ptr->addr, l_ptr->b_ptr->publ.name);
+	tipc_printf(buf, ": NXO(%u):", mod(l_ptr->next_out_no));
+	tipc_printf(buf, "NXI(%u):", mod(l_ptr->next_in_no));
+	tipc_printf(buf, "SQUE");
+	if (l_ptr->first_out) {
+		tipc_printf(buf, "[%u..", msg_seqno(buf_msg(l_ptr->first_out)));
+		if (l_ptr->next_out)
+			tipc_printf(buf, "%u..",
+				    msg_seqno(buf_msg(l_ptr->next_out)));
+		tipc_printf(buf, "%u]",
+			    msg_seqno(buf_msg
+				      (l_ptr->last_out)), l_ptr->out_queue_size);
+		if ((mod(msg_seqno(buf_msg(l_ptr->last_out)) - 
+			 msg_seqno(buf_msg(l_ptr->first_out))) 
+		     != (l_ptr->out_queue_size - 1))
+		    || (l_ptr->last_out->next != 0)) {
+			tipc_printf(buf, "\nSend queue inconsistency\n");
+			tipc_printf(buf, "first_out= %x ", l_ptr->first_out);
+			tipc_printf(buf, "next_out= %x ", l_ptr->next_out);
+			tipc_printf(buf, "last_out= %x ", l_ptr->last_out);
+			link_dump_send_queue(l_ptr);
+		}
+	} else
+		tipc_printf(buf, "[]");
+	tipc_printf(buf, "SQSIZ(%u)", l_ptr->out_queue_size);
+	if (l_ptr->oldest_deferred_in) {
+		u32 o = msg_seqno(buf_msg(l_ptr->oldest_deferred_in));
+		u32 n = msg_seqno(buf_msg(l_ptr->newest_deferred_in));
+		tipc_printf(buf, ":RQUE[%u..%u]", o, n);
+		if (l_ptr->deferred_inqueue_sz != mod((n + 1) - o)) {
+			tipc_printf(buf, ":RQSIZ(%u)",
+				    l_ptr->deferred_inqueue_sz);
+		}
+	}
+	if (link_working_unknown(l_ptr))
+		tipc_printf(buf, ":WU");
+	if (link_reset_reset(l_ptr))
+		tipc_printf(buf, ":RR");
+	if (link_reset_unknown(l_ptr))
+		tipc_printf(buf, ":RU");
+	if (link_working_working(l_ptr))
+		tipc_printf(buf, ":WW");
+	tipc_printf(buf, "\n");
+}
+
diff --git a/net/tipc/link.h b/net/tipc/link.h
new file mode 100644
index 000000000000..e7db3476d84e
--- /dev/null
+++ b/net/tipc/link.h
@@ -0,0 +1,293 @@
+/*
+ * net/tipc/link.h: Include file for TIPC link code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2004-2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_LINK_H
+#define _TIPC_LINK_H
+
+#include "dbg.h"
+#include "msg.h"
+#include "bearer.h"
+#include "node.h"
+
+#define PUSH_FAILED   1
+#define PUSH_FINISHED 2
+
+/* 
+ * Link states 
+ */
+
+#define WORKING_WORKING 560810u
+#define WORKING_UNKNOWN 560811u
+#define RESET_UNKNOWN   560812u
+#define RESET_RESET     560813u
+
+/* 
+ * Starting value for maximum packet size negotiation on unicast links
+ * (unless bearer MTU is less)
+ */
+
+#define MAX_PKT_DEFAULT 1500
+
+/**
+ * struct link - TIPC link data structure
+ * @addr: network address of link's peer node
+ * @name: link name character string
+ * @media_addr: media address to use when sending messages over link
+ * @timer: link timer
+ * @owner: pointer to peer node
+ * @link_list: adjacent links in bearer's list of links
+ * @started: indicates if link has been started
+ * @checkpoint: reference point for triggering link continuity checking
+ * @peer_session: link session # being used by peer end of link
+ * @peer_bearer_id: bearer id used by link's peer endpoint
+ * @b_ptr: pointer to bearer used by link
+ * @tolerance: minimum link continuity loss needed to reset link [in ms] 
+ * @continuity_interval: link continuity testing interval [in ms]
+ * @abort_limit: # of unacknowledged continuity probes needed to reset link
+ * @state: current state of link FSM
+ * @blocked: indicates if link has been administratively blocked
+ * @fsm_msg_cnt: # of protocol messages link FSM has sent in current state
+ * @proto_msg: template for control messages generated by link
+ * @pmsg: convenience pointer to "proto_msg" field
+ * @priority: current link priority
+ * @queue_limit: outbound message queue congestion thresholds (indexed by user)
+ * @exp_msg_count: # of tunnelled messages expected during link changeover
+ * @reset_checkpoint: seq # of last acknowledged message at time of link reset
+ * @max_pkt: current maximum packet size for this link
+ * @max_pkt_target: desired maximum packet size for this link
+ * @max_pkt_probes: # of probes based on current (max_pkt, max_pkt_target)
+ * @out_queue_size: # of messages in outbound message queue
+ * @first_out: ptr to first outbound message in queue
+ * @last_out: ptr to last outbound message in queue
+ * @next_out_no: next sequence number to use for outbound messages
+ * @last_retransmitted: sequence number of most recently retransmitted message
+ * @stale_count: # of identical retransmit requests made by peer
+ * @next_in_no: next sequence number to expect for inbound messages
+ * @deferred_inqueue_sz: # of messages in inbound message queue
+ * @oldest_deferred_in: ptr to first inbound message in queue
+ * @newest_deferred_in: ptr to last inbound message in queue
+ * @unacked_window: # of inbound messages rx'd without ack'ing back to peer
+ * @proto_msg_queue: ptr to (single) outbound control message
+ * @retransm_queue_size: number of messages to retransmit
+ * @retransm_queue_head: sequence number of first message to retransmit
+ * @next_out: ptr to first unsent outbound message in queue
+ * @waiting_ports: linked list of ports waiting for link congestion to abate
+ * @long_msg_seq_no: next identifier to use for outbound fragmented messages
+ * @defragm_buf: list of partially reassembled inbound message fragments
+ * @stats: collects statistics regarding link activity
+ * @print_buf: print buffer used to log link activity
+ */
+ 
+struct link {
+	u32 addr;
+	char name[TIPC_MAX_LINK_NAME];
+	struct tipc_media_addr media_addr;
+	struct timer_list timer;
+	struct node *owner;
+	struct list_head link_list;
+
+	/* Management and link supervision data */
+	int started;
+	u32 checkpoint;
+	u32 peer_session;
+	u32 peer_bearer_id;
+	struct bearer *b_ptr;
+	u32 tolerance;
+	u32 continuity_interval;
+	u32 abort_limit;
+	int state;
+	int blocked;
+	u32 fsm_msg_cnt;
+	struct {
+		unchar hdr[INT_H_SIZE];
+		unchar body[TIPC_MAX_IF_NAME];
+	} proto_msg;
+	struct tipc_msg *pmsg;
+	u32 priority;
+	u32 queue_limit[15];	/* queue_limit[0]==window limit */
+
+	/* Changeover */
+	u32 exp_msg_count;
+	u32 reset_checkpoint;
+
+        /* Max packet negotiation */
+        u32 max_pkt;
+        u32 max_pkt_target;
+        u32 max_pkt_probes;
+
+	/* Sending */
+	u32 out_queue_size;
+	struct sk_buff *first_out;
+	struct sk_buff *last_out;
+	u32 next_out_no;
+        u32 last_retransmitted;
+        u32 stale_count;
+
+	/* Reception */
+	u32 next_in_no;
+	u32 deferred_inqueue_sz;
+	struct sk_buff *oldest_deferred_in;
+	struct sk_buff *newest_deferred_in;
+	u32 unacked_window;
+
+	/* Congestion handling */
+	struct sk_buff *proto_msg_queue;
+	u32 retransm_queue_size;
+	u32 retransm_queue_head;
+	struct sk_buff *next_out;
+	struct list_head waiting_ports;
+
+	/* Fragmentation/defragmentation */
+	u32 long_msg_seq_no;
+	struct sk_buff *defragm_buf;
+
+        /* Statistics */
+	struct {
+		u32 sent_info;		/* used in counting # sent packets */
+		u32 recv_info;		/* used in counting # recv'd packets */
+		u32 sent_states;
+		u32 recv_states;
+		u32 sent_probes;
+		u32 recv_probes;
+		u32 sent_nacks;
+		u32 recv_nacks;
+		u32 sent_acks;
+		u32 sent_bundled;
+		u32 sent_bundles;
+		u32 recv_bundled;
+		u32 recv_bundles;
+		u32 retransmitted;
+		u32 sent_fragmented;
+		u32 sent_fragments;
+		u32 recv_fragmented;
+		u32 recv_fragments;
+		u32 link_congs;		/* # port sends blocked by congestion */
+		u32 bearer_congs;
+		u32 deferred_recv;
+		u32 duplicates;
+
+		/* for statistical profiling of send queue size */
+
+		u32 max_queue_sz;
+		u32 accu_queue_sz;
+		u32 queue_sz_counts;
+
+		/* for statistical profiling of message lengths */
+
+		u32 msg_length_counts;
+		u32 msg_lengths_total;
+		u32 msg_length_profile[7];
+#if 0
+		u32 sent_tunneled;
+		u32 recv_tunneled;
+#endif
+	} stats;
+
+	struct print_buf print_buf;
+};
+
+struct port;
+
+struct link *link_create(struct bearer *b_ptr, const u32 peer,
+			 const struct tipc_media_addr *media_addr);
+void link_delete(struct link *l_ptr);
+void link_changeover(struct link *l_ptr);
+void link_send_duplicate(struct link *l_ptr, struct link *dest);
+void link_reset_fragments(struct link *l_ptr);
+int link_is_up(struct link *l_ptr);
+int link_is_active(struct link *l_ptr);
+void link_start(struct link *l_ptr);
+u32 link_push_packet(struct link *l_ptr);
+void link_stop(struct link *l_ptr);
+struct sk_buff *link_cmd_config(const void *req_tlv_area, int req_tlv_space, u16 cmd);
+struct sk_buff *link_cmd_show_stats(const void *req_tlv_area, int req_tlv_space);
+struct sk_buff *link_cmd_reset_stats(const void *req_tlv_area, int req_tlv_space);
+void link_reset(struct link *l_ptr);
+int link_send(struct sk_buff *buf, u32 dest, u32 selector);
+int link_send_buf(struct link *l_ptr, struct sk_buff *buf);
+u32 link_get_max_pkt(u32 dest,u32 selector);
+int link_send_sections_fast(struct port* sender, 
+			    struct iovec const *msg_sect,
+			    const u32 num_sect, 
+			    u32 destnode);
+
+int link_send_long_buf(struct link *l_ptr, struct sk_buff *buf);
+void link_tunnel(struct link *l_ptr, struct tipc_msg *tnl_hdr,
+		 struct tipc_msg *msg, u32 selector);
+void link_recv_bundle(struct sk_buff *buf);
+int  link_recv_fragment(struct sk_buff **pending,
+			struct sk_buff **fb,
+			struct tipc_msg **msg);
+void link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int prob, u32 gap, 
+			 u32 tolerance, u32 priority, u32 acked_mtu);
+void link_push_queue(struct link *l_ptr);
+u32 link_defer_pkt(struct sk_buff **head, struct sk_buff **tail,
+		   struct sk_buff *buf);
+void link_wakeup_ports(struct link *l_ptr, int all);
+void link_set_queue_limits(struct link *l_ptr, u32 window);
+void link_retransmit(struct link *l_ptr, struct sk_buff *start, u32 retransmits);
+
+/*
+ * Link sequence number manipulation routines (uses modulo 2**16 arithmetic)
+ */
+
+static inline u32 mod(u32 x)
+{
+	return x & 0xffffu;
+}
+
+static inline int between(u32 lower, u32 upper, u32 n)
+{
+	if ((lower < n) && (n < upper))
+		return 1;
+	if ((upper < lower) && ((n > lower) || (n < upper)))
+		return 1;
+	return 0;
+}
+
+static inline int less_eq(u32 left, u32 right)
+{
+	return (mod(right - left) < 32768u);
+}
+
+static inline int less(u32 left, u32 right)
+{
+	return (less_eq(left, right) && (mod(right) != mod(left)));
+}
+
+static inline u32 lesser(u32 left, u32 right)
+{
+	return less_eq(left, right) ? left : right;
+}
+
+#endif
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
new file mode 100644
index 000000000000..c6e90efd59f2
--- /dev/null
+++ b/net/tipc/msg.c
@@ -0,0 +1,331 @@
+/*
+ * net/tipc/msg.c: TIPC message header routines
+ *     
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "addr.h"
+#include "dbg.h"
+#include "msg.h"
+#include "bearer.h"
+
+
+void msg_set_media_addr(struct tipc_msg *m, struct tipc_media_addr *a)
+{
+	memcpy(&((int *)m)[5], a, sizeof(*a));
+}
+
+void msg_get_media_addr(struct tipc_msg *m, struct tipc_media_addr *a)
+{
+	memcpy(a, &((int*)m)[5], sizeof(*a));
+}
+
+
+void msg_print(struct print_buf *buf, struct tipc_msg *msg, const char *str)
+{
+	u32 usr = msg_user(msg);
+	tipc_printf(buf, str);
+
+	switch (usr) {
+	case MSG_BUNDLER:
+		tipc_printf(buf, "BNDL::");
+		tipc_printf(buf, "MSGS(%u):", msg_msgcnt(msg));
+		break;
+	case BCAST_PROTOCOL:
+		tipc_printf(buf, "BCASTP::");
+		break;
+	case MSG_FRAGMENTER:
+		tipc_printf(buf, "FRAGM::");
+		switch (msg_type(msg)) {
+		case FIRST_FRAGMENT:
+			tipc_printf(buf, "FIRST:");
+			break;
+		case FRAGMENT:
+			tipc_printf(buf, "BODY:");
+			break;
+		case LAST_FRAGMENT:
+			tipc_printf(buf, "LAST:");
+			break;
+		default:
+			tipc_printf(buf, "UNKNOWN:%x",msg_type(msg));
+
+		}
+		tipc_printf(buf, "NO(%u/%u):",msg_long_msgno(msg),
+			    msg_fragm_no(msg));
+		break;
+	case DATA_LOW:
+	case DATA_MEDIUM:
+	case DATA_HIGH:
+	case DATA_CRITICAL:
+		tipc_printf(buf, "DAT%u:", msg_user(msg));
+		if (msg_short(msg)) {
+			tipc_printf(buf, "CON:");
+			break;
+		}
+		switch (msg_type(msg)) {
+		case TIPC_CONN_MSG:
+			tipc_printf(buf, "CON:");
+			break;
+		case TIPC_MCAST_MSG:
+			tipc_printf(buf, "MCST:");
+			break;
+		case TIPC_NAMED_MSG:
+			tipc_printf(buf, "NAM:");
+			break;
+		case TIPC_DIRECT_MSG:
+			tipc_printf(buf, "DIR:");
+			break;
+		default:
+			tipc_printf(buf, "UNKNOWN TYPE %u",msg_type(msg));
+		}
+		if (msg_routed(msg) && !msg_non_seq(msg))
+			tipc_printf(buf, "ROUT:");
+		if (msg_reroute_cnt(msg))
+			tipc_printf(buf, "REROUTED(%u):",
+				    msg_reroute_cnt(msg));
+		break;
+	case NAME_DISTRIBUTOR:
+		tipc_printf(buf, "NMD::");
+		switch (msg_type(msg)) {
+		case PUBLICATION:
+			tipc_printf(buf, "PUBL(%u):", (msg_size(msg) - msg_hdr_sz(msg)) / 20);	/* Items */
+			break;
+		case WITHDRAWAL:
+			tipc_printf(buf, "WDRW:");
+			break;
+		default:
+			tipc_printf(buf, "UNKNOWN:%x",msg_type(msg));
+		}
+		if (msg_routed(msg))
+			tipc_printf(buf, "ROUT:");
+		if (msg_reroute_cnt(msg))
+			tipc_printf(buf, "REROUTED(%u):",
+				    msg_reroute_cnt(msg));
+		break;
+	case CONN_MANAGER:
+		tipc_printf(buf, "CONN_MNG:");
+		switch (msg_type(msg)) {
+		case CONN_PROBE:
+			tipc_printf(buf, "PROBE:");
+			break;
+		case CONN_PROBE_REPLY:
+			tipc_printf(buf, "PROBE_REPLY:");
+			break;
+		case CONN_ACK:
+			tipc_printf(buf, "CONN_ACK:");
+			tipc_printf(buf, "ACK(%u):",msg_msgcnt(msg));
+			break;
+		default:
+			tipc_printf(buf, "UNKNOWN TYPE:%x",msg_type(msg));
+		}
+		if (msg_routed(msg))
+			tipc_printf(buf, "ROUT:");
+		if (msg_reroute_cnt(msg))
+			tipc_printf(buf, "REROUTED(%u):",msg_reroute_cnt(msg));
+		break;
+	case LINK_PROTOCOL:
+		tipc_printf(buf, "PROT:TIM(%u):",msg_timestamp(msg));
+		switch (msg_type(msg)) {
+		case STATE_MSG:
+			tipc_printf(buf, "STATE:");
+			tipc_printf(buf, "%s:",msg_probe(msg) ? "PRB" :"");
+			tipc_printf(buf, "NXS(%u):",msg_next_sent(msg));
+			tipc_printf(buf, "GAP(%u):",msg_seq_gap(msg));
+			tipc_printf(buf, "LSTBC(%u):",msg_last_bcast(msg));
+			break;
+		case RESET_MSG:
+			tipc_printf(buf, "RESET:");
+			if (msg_size(msg) != msg_hdr_sz(msg))
+				tipc_printf(buf, "BEAR:%s:",msg_data(msg));
+			break;
+		case ACTIVATE_MSG:
+			tipc_printf(buf, "ACTIVATE:");
+			break;
+		default:
+			tipc_printf(buf, "UNKNOWN TYPE:%x",msg_type(msg));
+		}
+		tipc_printf(buf, "PLANE(%c):",msg_net_plane(msg));
+		tipc_printf(buf, "SESS(%u):",msg_session(msg));
+		break;
+	case CHANGEOVER_PROTOCOL:
+		tipc_printf(buf, "TUNL:");
+		switch (msg_type(msg)) {
+		case DUPLICATE_MSG:
+			tipc_printf(buf, "DUPL:");
+			break;
+		case ORIGINAL_MSG:
+			tipc_printf(buf, "ORIG:");
+			tipc_printf(buf, "EXP(%u)",msg_msgcnt(msg));
+			break;
+		default:
+			tipc_printf(buf, "UNKNOWN TYPE:%x",msg_type(msg));
+		}
+		break;
+	case ROUTE_DISTRIBUTOR:
+		tipc_printf(buf, "ROUTING_MNG:");
+		switch (msg_type(msg)) {
+		case EXT_ROUTING_TABLE:
+			tipc_printf(buf, "EXT_TBL:");
+			tipc_printf(buf, "TO:%x:",msg_remote_node(msg));
+			break;
+		case LOCAL_ROUTING_TABLE:
+			tipc_printf(buf, "LOCAL_TBL:");
+			tipc_printf(buf, "TO:%x:",msg_remote_node(msg));
+			break;
+		case SLAVE_ROUTING_TABLE:
+			tipc_printf(buf, "DP_TBL:");
+			tipc_printf(buf, "TO:%x:",msg_remote_node(msg));
+			break;
+		case ROUTE_ADDITION:
+			tipc_printf(buf, "ADD:");
+			tipc_printf(buf, "TO:%x:",msg_remote_node(msg));
+			break;
+		case ROUTE_REMOVAL:
+			tipc_printf(buf, "REMOVE:");
+			tipc_printf(buf, "TO:%x:",msg_remote_node(msg));
+			break;
+		default:
+			tipc_printf(buf, "UNKNOWN TYPE:%x",msg_type(msg));
+		}
+		break;
+	case LINK_CONFIG:
+		tipc_printf(buf, "CFG:");
+		switch (msg_type(msg)) {
+		case DSC_REQ_MSG:
+			tipc_printf(buf, "DSC_REQ:");
+			break;
+		case DSC_RESP_MSG:
+			tipc_printf(buf, "DSC_RESP:");
+			break;
+		default:
+			tipc_printf(buf, "UNKNOWN TYPE:%x:",msg_type(msg));
+			break;
+		}
+		break;
+	default:
+		tipc_printf(buf, "UNKNOWN USER:");
+	}
+
+	switch (usr) {
+	case CONN_MANAGER:
+	case NAME_DISTRIBUTOR:
+	case DATA_LOW:
+	case DATA_MEDIUM:
+	case DATA_HIGH:
+	case DATA_CRITICAL:
+		if (msg_short(msg))
+			break;	/* No error */
+		switch (msg_errcode(msg)) {
+		case TIPC_OK:
+			break;
+		case TIPC_ERR_NO_NAME:
+			tipc_printf(buf, "NO_NAME:");
+			break;
+		case TIPC_ERR_NO_PORT:
+			tipc_printf(buf, "NO_PORT:");
+			break;
+		case TIPC_ERR_NO_NODE:
+			tipc_printf(buf, "NO_PROC:");
+			break;
+		case TIPC_ERR_OVERLOAD:
+			tipc_printf(buf, "OVERLOAD:");
+			break;
+		case TIPC_CONN_SHUTDOWN:
+			tipc_printf(buf, "SHUTDOWN:");
+			break;
+		default:
+			tipc_printf(buf, "UNKNOWN ERROR(%x):",
+				    msg_errcode(msg));
+		}
+	default:{}
+	}
+
+	tipc_printf(buf, "HZ(%u):", msg_hdr_sz(msg));
+	tipc_printf(buf, "SZ(%u):", msg_size(msg));
+	tipc_printf(buf, "SQNO(%u):", msg_seqno(msg));
+
+	if (msg_non_seq(msg))
+		tipc_printf(buf, "NOSEQ:");
+	else {
+		tipc_printf(buf, "ACK(%u):", msg_ack(msg));
+	}
+	tipc_printf(buf, "BACK(%u):", msg_bcast_ack(msg));
+	tipc_printf(buf, "PRND(%x)", msg_prevnode(msg));
+
+	if (msg_isdata(msg)) {
+		if (msg_named(msg)) {
+			tipc_printf(buf, "NTYP(%u):", msg_nametype(msg));
+			tipc_printf(buf, "NINST(%u)", msg_nameinst(msg));
+		}
+	}
+
+	if ((usr != LINK_PROTOCOL) && (usr != LINK_CONFIG) &&
+	    (usr != MSG_BUNDLER)) {
+		if (!msg_short(msg)) {
+			tipc_printf(buf, ":ORIG(%x:%u):",
+				    msg_orignode(msg), msg_origport(msg));
+			tipc_printf(buf, ":DEST(%x:%u):",
+				    msg_destnode(msg), msg_destport(msg));
+		} else {
+			tipc_printf(buf, ":OPRT(%u):", msg_origport(msg));
+			tipc_printf(buf, ":DPRT(%u):", msg_destport(msg));
+		}
+		if (msg_routed(msg) && !msg_non_seq(msg))
+			tipc_printf(buf, ":TSEQN(%u)", msg_transp_seqno(msg));
+	}
+	if (msg_user(msg) == NAME_DISTRIBUTOR) {
+		tipc_printf(buf, ":ONOD(%x):", msg_orignode(msg));
+		tipc_printf(buf, ":DNOD(%x):", msg_destnode(msg));
+		if (msg_routed(msg)) {
+			tipc_printf(buf, ":CSEQN(%u)", msg_transp_seqno(msg));
+		}
+	}
+
+	if (msg_user(msg) ==  LINK_CONFIG) {
+		u32* raw = (u32*)msg;
+		struct tipc_media_addr* orig = (struct tipc_media_addr*)&raw[5];
+		tipc_printf(buf, ":REQL(%u):", msg_req_links(msg));
+		tipc_printf(buf, ":DDOM(%x):", msg_dest_domain(msg));
+		tipc_printf(buf, ":NETID(%u):", msg_bc_netid(msg));
+		media_addr_printf(buf, orig);
+	}
+	if (msg_user(msg) == BCAST_PROTOCOL) {
+		tipc_printf(buf, "BCNACK:AFTER(%u):", msg_bcgap_after(msg));
+		tipc_printf(buf, "TO(%u):", msg_bcgap_to(msg));
+	}
+	tipc_printf(buf, "\n");
+	if ((usr == CHANGEOVER_PROTOCOL) && (msg_msgcnt(msg))) {
+		msg_print(buf,msg_get_wrapped(msg),"      /");
+	}
+	if ((usr == MSG_FRAGMENTER) && (msg_type(msg) == FIRST_FRAGMENT)) {
+		msg_print(buf,msg_get_wrapped(msg),"      /");
+	}
+}
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
new file mode 100644
index 000000000000..3dcd23f51883
--- /dev/null
+++ b/net/tipc/msg.h
@@ -0,0 +1,815 @@
+/*
+ * net/tipc/msg.h: Include file for TIPC message header routines
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_MSG_H
+#define _TIPC_MSG_H
+
+#include <net/tipc/tipc_msg.h>
+
+#define TIPC_VERSION              2
+#define DATA_LOW                  TIPC_LOW_IMPORTANCE
+#define DATA_MEDIUM               TIPC_MEDIUM_IMPORTANCE
+#define DATA_HIGH                 TIPC_HIGH_IMPORTANCE
+#define DATA_CRITICAL             TIPC_CRITICAL_IMPORTANCE
+#define SHORT_H_SIZE              24	/* Connected,in cluster */
+#define DIR_MSG_H_SIZE            32	/* Directly addressed messages */
+#define CONN_MSG_H_SIZE           36	/* Routed connected msgs*/
+#define LONG_H_SIZE               40	/* Named Messages */
+#define MCAST_H_SIZE              44	/* Multicast messages */
+#define MAX_H_SIZE                60	/* Inclusive full options */
+#define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE)
+#define LINK_CONFIG               13
+
+
+/*
+		TIPC user data message header format, version 2
+		
+	- Fundamental definitions available to privileged TIPC users
+	  are located in tipc_msg.h.
+	- Remaining definitions available to TIPC internal users appear below. 
+*/
+
+
+static inline void msg_set_word(struct tipc_msg *m, u32 w, u32 val)
+{
+	m->hdr[w] = htonl(val);
+}
+
+static inline void msg_set_bits(struct tipc_msg *m, u32 w,
+				u32 pos, u32 mask, u32 val)
+{
+	u32 word = msg_word(m,w) & ~(mask << pos);
+	msg_set_word(m, w, (word |= (val << pos)));
+}
+
+/* 
+ * Word 0
+ */
+
+static inline u32 msg_version(struct tipc_msg *m)
+{
+	return msg_bits(m, 0, 29, 7);
+}
+
+static inline void msg_set_version(struct tipc_msg *m) 
+{
+	msg_set_bits(m, 0, 29, 0xf, TIPC_VERSION);
+}
+
+static inline u32 msg_user(struct tipc_msg *m)
+{
+	return msg_bits(m, 0, 25, 0xf);
+}
+
+static inline u32 msg_isdata(struct tipc_msg *m)
+{
+	return (msg_user(m) <= DATA_CRITICAL);
+}
+
+static inline void msg_set_user(struct tipc_msg *m, u32 n) 
+{
+	msg_set_bits(m, 0, 25, 0xf, n);
+}
+
+static inline void msg_set_importance(struct tipc_msg *m, u32 i) 
+{
+	msg_set_user(m, i);
+}
+
+static inline void msg_set_hdr_sz(struct tipc_msg *m,u32 n) 
+{
+	msg_set_bits(m, 0, 21, 0xf, n>>2);
+}
+
+static inline int msg_non_seq(struct tipc_msg *m) 
+{
+	return msg_bits(m, 0, 20, 1);
+}
+
+static inline void msg_set_non_seq(struct tipc_msg *m) 
+{
+	msg_set_bits(m, 0, 20, 1, 1);
+}
+
+static inline int msg_dest_droppable(struct tipc_msg *m) 
+{
+	return msg_bits(m, 0, 19, 1);
+}
+
+static inline void msg_set_dest_droppable(struct tipc_msg *m, u32 d) 
+{
+	msg_set_bits(m, 0, 19, 1, d);
+}
+
+static inline int msg_src_droppable(struct tipc_msg *m) 
+{
+	return msg_bits(m, 0, 18, 1);
+}
+
+static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d) 
+{
+	msg_set_bits(m, 0, 18, 1, d);
+}
+
+static inline void msg_set_size(struct tipc_msg *m, u32 sz)
+{
+	m->hdr[0] = htonl((msg_word(m, 0) & ~0x1ffff) | sz);
+}
+
+
+/* 
+ * Word 1
+ */
+
+static inline void msg_set_type(struct tipc_msg *m, u32 n) 
+{
+	msg_set_bits(m, 1, 29, 0x7, n);
+}
+
+static inline void msg_set_errcode(struct tipc_msg *m, u32 err) 
+{
+	msg_set_bits(m, 1, 25, 0xf, err);
+}
+
+static inline u32 msg_reroute_cnt(struct tipc_msg *m) 
+{
+	return msg_bits(m, 1, 21, 0xf);
+}
+
+static inline void msg_incr_reroute_cnt(struct tipc_msg *m) 
+{
+	msg_set_bits(m, 1, 21, 0xf, msg_reroute_cnt(m) + 1);
+}
+
+static inline void msg_reset_reroute_cnt(struct tipc_msg *m) 
+{
+	msg_set_bits(m, 1, 21, 0xf, 0);
+}
+
+static inline u32 msg_lookup_scope(struct tipc_msg *m)
+{
+	return msg_bits(m, 1, 19, 0x3);
+}
+
+static inline void msg_set_lookup_scope(struct tipc_msg *m, u32 n) 
+{
+	msg_set_bits(m, 1, 19, 0x3, n);
+}
+
+static inline void msg_set_options(struct tipc_msg *m, const char *opt, u32 sz) 
+{
+	u32 hsz = msg_hdr_sz(m);
+	char *to = (char *)&m->hdr[hsz/4];
+
+	if ((hsz < DIR_MSG_H_SIZE) || ((hsz + sz) > MAX_H_SIZE))
+		return;
+	msg_set_bits(m, 1, 16, 0x7, (hsz - 28)/4);
+	msg_set_hdr_sz(m, hsz + sz);
+	memcpy(to, opt, sz);
+}
+
+static inline u32 msg_bcast_ack(struct tipc_msg *m)
+{
+	return msg_bits(m, 1, 0, 0xffff);
+}
+
+static inline void msg_set_bcast_ack(struct tipc_msg *m, u32 n) 
+{
+	msg_set_bits(m, 1, 0, 0xffff, n);
+}
+
+
+/* 
+ * Word 2
+ */
+
+static inline u32 msg_ack(struct tipc_msg *m)
+{
+	return msg_bits(m, 2, 16, 0xffff);
+}
+
+static inline void msg_set_ack(struct tipc_msg *m, u32 n) 
+{
+	msg_set_bits(m, 2, 16, 0xffff, n);
+}
+
+static inline u32 msg_seqno(struct tipc_msg *m)
+{
+	return msg_bits(m, 2, 0, 0xffff);
+}
+
+static inline void msg_set_seqno(struct tipc_msg *m, u32 n) 
+{
+	msg_set_bits(m, 2, 0, 0xffff, n);
+}
+
+
+/* 
+ * Words 3-10
+ */
+
+
+static inline void msg_set_prevnode(struct tipc_msg *m, u32 a) 
+{
+	msg_set_word(m, 3, a);
+}
+
+static inline void msg_set_origport(struct tipc_msg *m, u32 p) 
+{
+	msg_set_word(m, 4, p);
+}
+
+static inline void msg_set_destport(struct tipc_msg *m, u32 p) 
+{
+	msg_set_word(m, 5, p);
+}
+
+static inline void msg_set_mc_netid(struct tipc_msg *m, u32 p) 
+{
+	msg_set_word(m, 5, p);
+}
+
+static inline void msg_set_orignode(struct tipc_msg *m, u32 a) 
+{
+	msg_set_word(m, 6, a);
+}
+
+static inline void msg_set_destnode(struct tipc_msg *m, u32 a) 
+{
+	msg_set_word(m, 7, a);
+}
+
+static inline int msg_is_dest(struct tipc_msg *m, u32 d) 
+{
+	return(msg_short(m) || (msg_destnode(m) == d));
+}
+
+static inline u32 msg_routed(struct tipc_msg *m)
+{
+	if (likely(msg_short(m)))
+		return 0;
+	return(msg_destnode(m) ^ msg_orignode(m)) >> 11;
+}
+
+static inline void msg_set_nametype(struct tipc_msg *m, u32 n) 
+{
+	msg_set_word(m, 8, n);
+}
+
+static inline u32 msg_transp_seqno(struct tipc_msg *m)
+{
+	return msg_word(m, 8);
+}
+
+static inline void msg_set_timestamp(struct tipc_msg *m, u32 n)
+{
+	msg_set_word(m, 8, n);
+}
+
+static inline u32 msg_timestamp(struct tipc_msg *m)
+{
+	return msg_word(m, 8);
+}
+
+static inline void msg_set_transp_seqno(struct tipc_msg *m, u32 n)
+{
+	msg_set_word(m, 8, n);
+}
+
+static inline void msg_set_namelower(struct tipc_msg *m, u32 n) 
+{
+	msg_set_word(m, 9, n);
+}
+
+static inline void msg_set_nameinst(struct tipc_msg *m, u32 n) 
+{
+	msg_set_namelower(m, n);
+}
+
+static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) 
+{
+	msg_set_word(m, 10, n);
+}
+
+static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m)
+{
+	return (struct tipc_msg *)msg_data(m);
+}
+
+static inline void msg_expand(struct tipc_msg *m, u32 destnode) 
+{
+	if (!msg_short(m))
+		return;
+	msg_set_hdr_sz(m, LONG_H_SIZE);
+	msg_set_orignode(m, msg_prevnode(m));
+	msg_set_destnode(m, destnode);
+	memset(&m->hdr[8], 0, 12);
+}
+
+
+
+/*
+		TIPC internal message header format, version 2
+
+       1 0 9 8 7 6 5 4|3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8|7 6 5 4 3 2 1 0 
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w0:|vers |msg usr|hdr sz |n|resrv|            packet size          |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w1:|m typ|rsv=0|   sequence gap    |       broadcast ack no        |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w2:| link level ack no/bc_gap_from |     seq no / bcast_gap_to     |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w3:|                       previous node                           |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w4:|  next sent broadcast/fragm no | next sent pkt/ fragm msg no   |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w5:|          session no           |rsv=0|r|berid|link prio|netpl|p|
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w6:|                      originating node                         |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w7:|                      destination node                         |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w8:|                   transport sequence number                   |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w9:|   msg count / bcast tag       |       link tolerance          |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      \                                                               \
+      /                     User Specific Data                        /
+      \                                                               \
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+      NB: CONN_MANAGER use data message format. LINK_CONFIG has own format.
+*/   
+
+/* 
+ * Internal users
+ */
+
+#define  BCAST_PROTOCOL       5
+#define  MSG_BUNDLER          6
+#define  LINK_PROTOCOL        7
+#define  CONN_MANAGER         8
+#define  ROUTE_DISTRIBUTOR    9
+#define  CHANGEOVER_PROTOCOL  10
+#define  NAME_DISTRIBUTOR     11
+#define  MSG_FRAGMENTER       12
+#define  LINK_CONFIG          13
+#define  INT_H_SIZE           40
+#define  DSC_H_SIZE           40
+
+/* 
+ *  Connection management protocol messages
+ */
+
+#define CONN_PROBE        0
+#define CONN_PROBE_REPLY  1
+#define CONN_ACK          2
+
+/* 
+ * Name distributor messages
+ */
+
+#define PUBLICATION       0
+#define WITHDRAWAL        1
+
+
+/* 
+ * Word 1
+ */
+
+static inline u32 msg_seq_gap(struct tipc_msg *m)
+{
+	return msg_bits(m, 1, 16, 0xff);
+}
+
+static inline void msg_set_seq_gap(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 1, 16, 0xff, n);
+}
+
+static inline u32 msg_req_links(struct tipc_msg *m)
+{
+	return msg_bits(m, 1, 16, 0xfff);
+}
+
+static inline void msg_set_req_links(struct tipc_msg *m, u32 n) 
+{
+	msg_set_bits(m, 1, 16, 0xfff, n);
+}
+
+
+/* 
+ * Word 2
+ */
+
+static inline u32 msg_dest_domain(struct tipc_msg *m)
+{
+	return msg_word(m, 2);
+}
+
+static inline void msg_set_dest_domain(struct tipc_msg *m, u32 n) 
+{
+	msg_set_word(m, 2, n);
+}
+
+static inline u32 msg_bcgap_after(struct tipc_msg *m)
+{
+	return msg_bits(m, 2, 16, 0xffff);
+}
+
+static inline void msg_set_bcgap_after(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 2, 16, 0xffff, n);
+}
+
+static inline u32 msg_bcgap_to(struct tipc_msg *m)
+{
+	return msg_bits(m, 2, 0, 0xffff);
+}
+
+static inline void msg_set_bcgap_to(struct tipc_msg *m, u32 n) 
+{
+	msg_set_bits(m, 2, 0, 0xffff, n);
+}
+
+
+/* 
+ * Word 4
+ */
+
+static inline u32 msg_last_bcast(struct tipc_msg *m)
+{
+	return msg_bits(m, 4, 16, 0xffff);
+}
+
+static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 4, 16, 0xffff, n);
+}
+
+
+static inline u32 msg_fragm_no(struct tipc_msg *m)
+{
+	return msg_bits(m, 4, 16, 0xffff);
+}
+
+static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 4, 16, 0xffff, n);
+}
+
+
+static inline u32 msg_next_sent(struct tipc_msg *m)
+{
+	return msg_bits(m, 4, 0, 0xffff);
+}
+
+static inline void msg_set_next_sent(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 4, 0, 0xffff, n);
+}
+
+
+static inline u32 msg_long_msgno(struct tipc_msg *m)
+{
+	return msg_bits(m, 4, 0, 0xffff);
+}
+
+static inline void msg_set_long_msgno(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 4, 0, 0xffff, n);
+}
+
+static inline u32 msg_bc_netid(struct tipc_msg *m)
+{
+	return msg_word(m, 4);
+}
+
+static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id)
+{
+	msg_set_word(m, 4, id);
+}
+
+static inline u32 msg_link_selector(struct tipc_msg *m)
+{
+	return msg_bits(m, 4, 0, 1);
+}
+
+static inline void msg_set_link_selector(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 4, 0, 1, (n & 1));
+}
+
+/* 
+ * Word 5
+ */
+
+static inline u32 msg_session(struct tipc_msg *m)
+{
+	return msg_bits(m, 5, 16, 0xffff);
+}
+
+static inline void msg_set_session(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 5, 16, 0xffff, n);
+}
+
+static inline u32 msg_probe(struct tipc_msg *m)
+{
+	return msg_bits(m, 5, 0, 1);
+}
+
+static inline void msg_set_probe(struct tipc_msg *m, u32 val)
+{
+	msg_set_bits(m, 5, 0, 1, (val & 1));
+}
+
+static inline char msg_net_plane(struct tipc_msg *m)
+{
+	return msg_bits(m, 5, 1, 7) + 'A';
+}
+
+static inline void msg_set_net_plane(struct tipc_msg *m, char n)
+{
+	msg_set_bits(m, 5, 1, 7, (n - 'A'));
+}
+
+static inline u32 msg_linkprio(struct tipc_msg *m)
+{
+	return msg_bits(m, 5, 4, 0x1f);
+}
+
+static inline void msg_set_linkprio(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 5, 4, 0x1f, n);
+}
+
+static inline u32 msg_bearer_id(struct tipc_msg *m)
+{
+	return msg_bits(m, 5, 9, 0x7);
+}
+
+static inline void msg_set_bearer_id(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 5, 9, 0x7, n);
+}
+
+static inline u32 msg_redundant_link(struct tipc_msg *m)
+{
+	return msg_bits(m, 5, 12, 0x1);
+}
+
+static inline void msg_set_redundant_link(struct tipc_msg *m)
+{
+	msg_set_bits(m, 5, 12, 0x1, 1);
+}
+
+static inline void msg_clear_redundant_link(struct tipc_msg *m)
+{
+	msg_set_bits(m, 5, 12, 0x1, 0);
+}
+
+
+/* 
+ * Word 9
+ */
+
+static inline u32 msg_msgcnt(struct tipc_msg *m)
+{
+	return msg_bits(m, 9, 16, 0xffff);
+}
+
+static inline void msg_set_msgcnt(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 9, 16, 0xffff, n);
+}
+
+static inline u32 msg_bcast_tag(struct tipc_msg *m)
+{
+	return msg_bits(m, 9, 16, 0xffff);
+}
+
+static inline void msg_set_bcast_tag(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 9, 16, 0xffff, n);
+}
+
+static inline u32 msg_max_pkt(struct tipc_msg *m) 
+{
+	return (msg_bits(m, 9, 16, 0xffff) * 4);
+}
+
+static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n) 
+{
+	msg_set_bits(m, 9, 16, 0xffff, (n / 4));
+}
+
+static inline u32 msg_link_tolerance(struct tipc_msg *m)
+{
+	return msg_bits(m, 9, 0, 0xffff);
+}
+
+static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 9, 0, 0xffff, n);
+}
+
+/* 
+ * Routing table message data
+ */
+
+
+static inline u32 msg_remote_node(struct tipc_msg *m)
+{
+	return msg_word(m, msg_hdr_sz(m)/4);
+}
+
+static inline void msg_set_remote_node(struct tipc_msg *m, u32 a)
+{
+	msg_set_word(m, msg_hdr_sz(m)/4, a);
+}
+
+static inline int msg_dataoctet(struct tipc_msg *m, u32 pos)
+{
+	return(msg_data(m)[pos + 4] != 0);
+}
+
+static inline void msg_set_dataoctet(struct tipc_msg *m, u32 pos)
+{
+	msg_data(m)[pos + 4] = 1;
+}
+
+/* 
+ * Segmentation message types
+ */
+
+#define FIRST_FRAGMENT     0
+#define FRAGMENT           1
+#define LAST_FRAGMENT      2
+
+/* 
+ * Link management protocol message types
+ */
+
+#define STATE_MSG       0
+#define RESET_MSG       1
+#define ACTIVATE_MSG    2
+
+/* 
+ * Changeover tunnel message types
+ */
+#define DUPLICATE_MSG    0
+#define ORIGINAL_MSG     1
+
+/* 
+ * Routing table message types
+ */
+#define EXT_ROUTING_TABLE    0
+#define LOCAL_ROUTING_TABLE  1
+#define SLAVE_ROUTING_TABLE  2
+#define ROUTE_ADDITION       3
+#define ROUTE_REMOVAL        4
+
+/* 
+ * Config protocol message types
+ */
+
+#define DSC_REQ_MSG          0
+#define DSC_RESP_MSG         1
+
+static inline u32 msg_tot_importance(struct tipc_msg *m)
+{
+	if (likely(msg_isdata(m))) {
+		if (likely(msg_orignode(m) == tipc_own_addr))
+			return msg_importance(m);
+		return msg_importance(m) + 4;
+	}
+	if ((msg_user(m) == MSG_FRAGMENTER)  &&
+	    (msg_type(m) == FIRST_FRAGMENT))
+		return msg_importance(msg_get_wrapped(m));
+	return msg_importance(m);
+}
+
+
+static inline void msg_init(struct tipc_msg *m, u32 user, u32 type, 
+			    u32 err, u32 hsize, u32 destnode)
+{
+	memset(m, 0, hsize);
+	msg_set_version(m);
+	msg_set_user(m, user);
+	msg_set_hdr_sz(m, hsize);
+	msg_set_size(m, hsize);
+	msg_set_prevnode(m, tipc_own_addr);
+	msg_set_type(m, type);
+	msg_set_errcode(m, err);
+	if (!msg_short(m)) {
+		msg_set_orignode(m, tipc_own_addr);
+		msg_set_destnode(m, destnode);
+	}
+}
+
+/** 
+ * msg_calc_data_size - determine total data size for message
+ */
+
+static inline int msg_calc_data_size(struct iovec const *msg_sect, u32 num_sect)
+{
+	int dsz = 0;
+	int i;
+
+	for (i = 0; i < num_sect; i++)
+		dsz += msg_sect[i].iov_len;
+	return dsz;
+}
+
+/** 
+ * msg_build - create message using specified header and data
+ * 
+ * Note: Caller must not hold any locks in case copy_from_user() is interrupted!
+ * 
+ * Returns message data size or errno
+ */
+
+static inline int msg_build(struct tipc_msg *hdr, 
+			    struct iovec const *msg_sect, u32 num_sect,
+			    int max_size, int usrmem, struct sk_buff** buf)
+{
+	int dsz, sz, hsz, pos, res, cnt;
+
+	dsz = msg_calc_data_size(msg_sect, num_sect);
+	if (unlikely(dsz > TIPC_MAX_USER_MSG_SIZE)) {
+		*buf = NULL;
+		return -EINVAL;
+	}
+
+	pos = hsz = msg_hdr_sz(hdr);
+	sz = hsz + dsz;
+	msg_set_size(hdr, sz);
+	if (unlikely(sz > max_size)) {
+		*buf = NULL;
+		return dsz;
+	}
+
+	*buf = buf_acquire(sz);
+	if (!(*buf))
+		return -ENOMEM;
+	memcpy((*buf)->data, (unchar *)hdr, hsz);
+	for (res = 1, cnt = 0; res && (cnt < num_sect); cnt++) {
+		if (likely(usrmem))
+			res = !copy_from_user((*buf)->data + pos, 
+					      msg_sect[cnt].iov_base, 
+					      msg_sect[cnt].iov_len);
+		else
+			memcpy((*buf)->data + pos, msg_sect[cnt].iov_base, 
+			       msg_sect[cnt].iov_len);
+		pos += msg_sect[cnt].iov_len;
+	}
+	if (likely(res))
+		return dsz;
+
+	buf_discard(*buf);
+	*buf = NULL;
+	return -EFAULT;
+}
+
+
+struct tipc_media_addr;
+
+extern void msg_set_media_addr(struct tipc_msg *m,
+			       struct tipc_media_addr *a);
+
+extern void msg_get_media_addr(struct tipc_msg *m,
+			       struct tipc_media_addr *a);
+
+
+#endif
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
new file mode 100644
index 000000000000..5e0604cb3aeb
--- /dev/null
+++ b/net/tipc/name_distr.c
@@ -0,0 +1,306 @@
+/*
+ * net/tipc/name_distr.c: TIPC name distribution code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "cluster.h"
+#include "dbg.h"
+#include "link.h"
+#include "msg.h"
+#include "name_distr.h"
+
+#undef  DBG_OUTPUT
+#define DBG_OUTPUT NULL
+
+#define ITEM_SIZE sizeof(struct distr_item)
+
+/**
+ * struct distr_item - publication info distributed to other nodes
+ * @type: name sequence type
+ * @lower: name sequence lower bound
+ * @upper: name sequence upper bound
+ * @ref: publishing port reference
+ * @key: publication key
+ * 
+ * ===> All fields are stored in network byte order. <===
+ * 
+ * First 3 fields identify (name or) name sequence being published.
+ * Reference field uniquely identifies port that published name sequence.
+ * Key field uniquely identifies publication, in the event a port has
+ * multiple publications of the same name sequence.
+ * 
+ * Note: There is no field that identifies the publishing node because it is 
+ * the same for all items contained within a publication message.
+ */
+
+struct distr_item {
+	u32 type;
+	u32 lower;
+	u32 upper;
+	u32 ref;
+	u32 key;
+};
+
+/**
+ * List of externally visible publications by this node -- 
+ * that is, all publications having scope > TIPC_NODE_SCOPE.
+ */
+
+static LIST_HEAD(publ_root);
+static u32 publ_cnt = 0;		
+
+/**
+ * publ_to_item - add publication info to a publication message
+ */
+
+static void publ_to_item(struct distr_item *i, struct publication *p)
+{
+	i->type = htonl(p->type);
+	i->lower = htonl(p->lower);
+	i->upper = htonl(p->upper);
+	i->ref = htonl(p->ref);
+	i->key = htonl(p->key);
+	dbg("publ_to_item: %u, %u, %u\n", p->type, p->lower, p->upper);
+}
+
+/**
+ * named_prepare_buf - allocate & initialize a publication message
+ */
+
+static struct sk_buff *named_prepare_buf(u32 type, u32 size, u32 dest)
+{
+	struct sk_buff *buf = buf_acquire(LONG_H_SIZE + size);  
+	struct tipc_msg *msg;
+
+	if (buf != NULL) {
+		msg = buf_msg(buf);
+		msg_init(msg, NAME_DISTRIBUTOR, type, TIPC_OK, 
+			 LONG_H_SIZE, dest);
+		msg_set_size(msg, LONG_H_SIZE + size);
+	}
+	return buf;
+}
+
+/**
+ * named_publish - tell other nodes about a new publication by this node
+ */
+
+void named_publish(struct publication *publ)
+{
+	struct sk_buff *buf;
+	struct distr_item *item;
+
+	list_add(&publ->local_list, &publ_root);
+	publ_cnt++;
+
+	buf = named_prepare_buf(PUBLICATION, ITEM_SIZE, 0);
+	if (!buf) {
+		warn("Memory squeeze; failed to distribute publication\n");
+		return;
+	}
+
+	item = (struct distr_item *)msg_data(buf_msg(buf));
+	publ_to_item(item, publ);
+	dbg("named_withdraw: broadcasting publish msg\n");
+	cluster_broadcast(buf);
+}
+
+/**
+ * named_withdraw - tell other nodes about a withdrawn publication by this node
+ */
+
+void named_withdraw(struct publication *publ)
+{
+	struct sk_buff *buf;
+	struct distr_item *item;
+
+	list_del(&publ->local_list);
+	publ_cnt--;
+
+	buf = named_prepare_buf(WITHDRAWAL, ITEM_SIZE, 0);
+	if (!buf) {
+		warn("Memory squeeze; failed to distribute withdrawal\n");
+		return;
+	}
+
+	item = (struct distr_item *)msg_data(buf_msg(buf));
+	publ_to_item(item, publ);
+	dbg("named_withdraw: broadcasting withdraw msg\n");
+	cluster_broadcast(buf);
+}
+
+/**
+ * named_node_up - tell specified node about all publications by this node
+ */
+
+void named_node_up(unsigned long node)
+{
+	struct publication *publ;
+	struct distr_item *item = 0;
+	struct sk_buff *buf = 0;
+	u32 left = 0;
+	u32 rest;
+	u32 max_item_buf;
+
+	assert(in_own_cluster(node));
+	read_lock_bh(&nametbl_lock); 
+	max_item_buf = TIPC_MAX_USER_MSG_SIZE / ITEM_SIZE;
+	max_item_buf *= ITEM_SIZE;
+	rest = publ_cnt * ITEM_SIZE;
+
+	list_for_each_entry(publ, &publ_root, local_list) {
+		if (!buf) {
+			left = (rest <= max_item_buf) ? rest : max_item_buf;
+			rest -= left;
+			buf = named_prepare_buf(PUBLICATION, left, node);       
+			if (buf == NULL) {
+				warn("Memory Squeeze; could not send publication\n");
+				goto exit;
+			}
+			item = (struct distr_item *)msg_data(buf_msg(buf));
+		}
+		publ_to_item(item, publ);
+		item++;
+		left -= ITEM_SIZE;
+		if (!left) {
+			msg_set_link_selector(buf_msg(buf), node);
+			dbg("named_node_up: sending publish msg to "
+			    "<%u.%u.%u>\n", tipc_zone(node), 
+			    tipc_cluster(node), tipc_node(node));
+			link_send(buf, node, node);
+			buf = 0;
+		}
+	}
+exit:
+	read_unlock_bh(&nametbl_lock); 
+}
+
+/**
+ * node_is_down - remove publication associated with a failed node
+ * 
+ * Invoked for each publication issued by a newly failed node.  
+ * Removes publication structure from name table & deletes it.
+ * In rare cases the link may have come back up again when this
+ * function is called, and we have two items representing the same
+ * publication. Nudge this item's key to distinguish it from the other.
+ * (Note: Publication's node subscription is already unsubscribed.)
+ */
+
+static void node_is_down(struct publication *publ)
+{
+	struct publication *p;
+        write_lock_bh(&nametbl_lock);
+	dbg("node_is_down: withdrawing %u, %u, %u\n", 
+	    publ->type, publ->lower, publ->upper);
+        publ->key += 1222345;
+	p = nametbl_remove_publ(publ->type, publ->lower, 
+				publ->node, publ->ref, publ->key);
+        assert(p == publ);
+	write_unlock_bh(&nametbl_lock);
+	if (publ)
+		kfree(publ);
+}
+
+/**
+ * named_recv - process name table update message sent by another node
+ */
+
+void named_recv(struct sk_buff *buf)
+{
+	struct publication *publ;
+	struct tipc_msg *msg = buf_msg(buf);
+	struct distr_item *item = (struct distr_item *)msg_data(msg);
+	u32 count = msg_data_sz(msg) / ITEM_SIZE;
+
+	write_lock_bh(&nametbl_lock); 
+	while (count--) {
+		if (msg_type(msg) == PUBLICATION) {
+			dbg("named_recv: got publication for %u, %u, %u\n", 
+			    ntohl(item->type), ntohl(item->lower),
+			    ntohl(item->upper));
+			publ = nametbl_insert_publ(ntohl(item->type), 
+						   ntohl(item->lower),
+						   ntohl(item->upper),
+						   TIPC_CLUSTER_SCOPE,
+						   msg_orignode(msg), 
+						   ntohl(item->ref),
+						   ntohl(item->key));
+			if (publ) {
+				nodesub_subscribe(&publ->subscr, 
+						  msg_orignode(msg), 
+						  publ,
+						  (net_ev_handler)node_is_down);
+			}
+		} else if (msg_type(msg) == WITHDRAWAL) {
+			dbg("named_recv: got withdrawl for %u, %u, %u\n", 
+			    ntohl(item->type), ntohl(item->lower),
+			    ntohl(item->upper));
+			publ = nametbl_remove_publ(ntohl(item->type),
+						   ntohl(item->lower),
+						   msg_orignode(msg),
+						   ntohl(item->ref),
+						   ntohl(item->key));
+
+			if (publ) {
+				nodesub_unsubscribe(&publ->subscr);
+        			kfree(publ);
+			}
+		} else {
+			warn("named_recv: unknown msg\n");
+		}
+		item++;
+	}
+	write_unlock_bh(&nametbl_lock); 
+	buf_discard(buf);
+}
+
+/**
+ * named_reinit - re-initialize local publication list
+ * 
+ * This routine is called whenever TIPC networking is (re)enabled.
+ * All existing publications by this node that have "cluster" or "zone" scope
+ * are updated to reflect the node's current network address.
+ * (If the node's address is unchanged, the update loop terminates immediately.)
+ */
+
+void named_reinit(void)
+{
+	struct publication *publ;
+
+	write_lock_bh(&nametbl_lock); 
+	list_for_each_entry(publ, &publ_root, local_list) {
+		if (publ->node == tipc_own_addr)
+			break;
+		publ->node = tipc_own_addr;
+	}
+	write_unlock_bh(&nametbl_lock); 
+}
diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h
new file mode 100644
index 000000000000..7c6616aecc03
--- /dev/null
+++ b/net/tipc/name_distr.h
@@ -0,0 +1,45 @@
+/*
+ * net/tipc/name_distr.h: Include file for TIPC name distribution code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_NAME_DISTR_H
+#define _TIPC_NAME_DISTR_H
+
+#include "name_table.h"
+
+void named_publish(struct publication *publ);
+void named_withdraw(struct publication *publ);
+void named_node_up(unsigned long node);
+void named_recv(struct sk_buff *buf);
+void named_reinit(void);
+
+#endif
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
new file mode 100644
index 000000000000..9626c5b30678
--- /dev/null
+++ b/net/tipc/name_table.c
@@ -0,0 +1,1076 @@
+/*
+ * net/tipc/name_table.c: TIPC name table code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2004-2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "config.h"
+#include "dbg.h"
+#include "name_table.h"
+#include "name_distr.h"
+#include "addr.h"
+#include "node_subscr.h"
+#include "subscr.h"
+#include "port.h"
+#include "cluster.h"
+#include "bcast.h"
+
+int tipc_nametbl_size = 1024;		/* must be a power of 2 */
+
+/**
+ * struct sub_seq - container for all published instances of a name sequence
+ * @lower: name sequence lower bound
+ * @upper: name sequence upper bound
+ * @node_list: circular list of matching publications with >= node scope
+ * @cluster_list: circular list of matching publications with >= cluster scope
+ * @zone_list: circular list of matching publications with >= zone scope
+ */
+
+struct sub_seq {
+	u32 lower;
+	u32 upper;
+	struct publication *node_list;
+	struct publication *cluster_list;
+	struct publication *zone_list;
+};
+
+/** 
+ * struct name_seq - container for all published instances of a name type
+ * @type: 32 bit 'type' value for name sequence
+ * @sseq: pointer to dynamically-sized array of sub-sequences of this 'type';
+ *        sub-sequences are sorted in ascending order
+ * @alloc: number of sub-sequences currently in array
+ * @first_free: upper bound of highest sub-sequence + 1
+ * @ns_list: links to adjacent name sequences in hash chain
+ * @subscriptions: list of subscriptions for this 'type'
+ * @lock: spinlock controlling access to name sequence structure
+ */
+
+struct name_seq {
+	u32 type;
+	struct sub_seq *sseqs;
+	u32 alloc;
+	u32 first_free;
+	struct hlist_node ns_list;
+	struct list_head subscriptions;
+	spinlock_t lock;
+};
+
+/**
+ * struct name_table - table containing all existing port name publications
+ * @types: pointer to fixed-sized array of name sequence lists, 
+ *         accessed via hashing on 'type'; name sequence lists are *not* sorted
+ * @local_publ_count: number of publications issued by this node
+ */
+
+struct name_table {
+	struct hlist_head *types;
+	u32 local_publ_count;
+};
+
+struct name_table table = { NULL } ;
+static atomic_t rsv_publ_ok = ATOMIC_INIT(0);
+rwlock_t nametbl_lock = RW_LOCK_UNLOCKED;
+
+
+static inline int hash(int x)
+{
+	return(x & (tipc_nametbl_size - 1));
+}
+
+/**
+ * publ_create - create a publication structure
+ */
+
+static struct publication *publ_create(u32 type, u32 lower, u32 upper, 
+				       u32 scope, u32 node, u32 port_ref,   
+				       u32 key)
+{
+	struct publication *publ =
+		(struct publication *)kmalloc(sizeof(*publ), GFP_ATOMIC);
+	if (publ == NULL) {
+		warn("Memory squeeze; failed to create publication\n");
+		return 0;
+	}
+
+	memset(publ, 0, sizeof(*publ));
+	publ->type = type;
+	publ->lower = lower;
+	publ->upper = upper;
+	publ->scope = scope;
+	publ->node = node;
+	publ->ref = port_ref;
+	publ->key = key;
+	INIT_LIST_HEAD(&publ->local_list);
+	INIT_LIST_HEAD(&publ->pport_list);
+	INIT_LIST_HEAD(&publ->subscr.nodesub_list);
+	return publ;
+}
+
+/**
+ * subseq_alloc - allocate a specified number of sub-sequence structures
+ */
+
+struct sub_seq *subseq_alloc(u32 cnt)
+{
+	u32 sz = cnt * sizeof(struct sub_seq);
+	struct sub_seq *sseq = (struct sub_seq *)kmalloc(sz, GFP_ATOMIC);
+
+	if (sseq)
+		memset(sseq, 0, sz);
+	return sseq;
+}
+
+/**
+ * nameseq_create - create a name sequence structure for the specified 'type'
+ * 
+ * Allocates a single sub-sequence structure and sets it to all 0's.
+ */
+
+struct name_seq *nameseq_create(u32 type, struct hlist_head *seq_head)
+{
+	struct name_seq *nseq = 
+		(struct name_seq *)kmalloc(sizeof(*nseq), GFP_ATOMIC);
+	struct sub_seq *sseq = subseq_alloc(1);
+
+	if (!nseq || !sseq) {
+		warn("Memory squeeze; failed to create name sequence\n");
+		kfree(nseq);
+		kfree(sseq);
+		return 0;
+	}
+
+	memset(nseq, 0, sizeof(*nseq));
+	nseq->lock = SPIN_LOCK_UNLOCKED;
+	nseq->type = type;
+	nseq->sseqs = sseq;
+	dbg("nameseq_create() nseq = %x type %u, ssseqs %x, ff: %u\n",
+	    nseq, type, nseq->sseqs, nseq->first_free);
+	nseq->alloc = 1;
+	INIT_HLIST_NODE(&nseq->ns_list);
+	INIT_LIST_HEAD(&nseq->subscriptions);
+	hlist_add_head(&nseq->ns_list, seq_head);
+	return nseq;
+}
+
+/**
+ * nameseq_find_subseq - find sub-sequence (if any) matching a name instance
+ *  
+ * Very time-critical, so binary searches through sub-sequence array.
+ */
+
+static inline struct sub_seq *nameseq_find_subseq(struct name_seq *nseq, 
+						  u32 instance)
+{
+	struct sub_seq *sseqs = nseq->sseqs;
+	int low = 0;
+	int high = nseq->first_free - 1;
+	int mid;
+
+	while (low <= high) {
+		mid = (low + high) / 2;
+		if (instance < sseqs[mid].lower)
+			high = mid - 1;
+		else if (instance > sseqs[mid].upper)
+			low = mid + 1;
+		else
+			return &sseqs[mid];
+	}
+	return 0;
+}
+
+/**
+ * nameseq_locate_subseq - determine position of name instance in sub-sequence
+ * 
+ * Returns index in sub-sequence array of the entry that contains the specified
+ * instance value; if no entry contains that value, returns the position
+ * where a new entry for it would be inserted in the array.
+ *
+ * Note: Similar to binary search code for locating a sub-sequence.
+ */
+
+static u32 nameseq_locate_subseq(struct name_seq *nseq, u32 instance)
+{
+	struct sub_seq *sseqs = nseq->sseqs;
+	int low = 0;
+	int high = nseq->first_free - 1;
+	int mid;
+
+	while (low <= high) {
+		mid = (low + high) / 2;
+		if (instance < sseqs[mid].lower)
+			high = mid - 1;
+		else if (instance > sseqs[mid].upper)
+			low = mid + 1;
+		else
+			return mid;
+	}
+	return low;
+}
+
+/**
+ * nameseq_insert_publ - 
+ */
+
+struct publication *nameseq_insert_publ(struct name_seq *nseq,
+					u32 type, u32 lower, u32 upper,
+					u32 scope, u32 node, u32 port, u32 key)
+{
+	struct subscription *s;
+	struct subscription *st;
+	struct publication *publ;
+	struct sub_seq *sseq;
+	int created_subseq = 0;
+
+	assert(nseq->first_free <= nseq->alloc);
+	sseq = nameseq_find_subseq(nseq, lower);
+	dbg("nameseq_ins: for seq %x,<%u,%u>, found sseq %x\n",
+	    nseq, type, lower, sseq);
+	if (sseq) {
+
+		/* Lower end overlaps existing entry => need an exact match */
+
+		if ((sseq->lower != lower) || (sseq->upper != upper)) {
+			warn("Overlapping publ <%u,%u,%u>\n", type, lower, upper);
+			return 0;
+		}
+	} else {
+		u32 inspos;
+		struct sub_seq *freesseq;
+
+		/* Find where lower end should be inserted */
+
+		inspos = nameseq_locate_subseq(nseq, lower);
+
+		/* Fail if upper end overlaps into an existing entry */
+
+		if ((inspos < nseq->first_free) &&
+		    (upper >= nseq->sseqs[inspos].lower)) {
+			warn("Overlapping publ <%u,%u,%u>\n", type, lower, upper);
+			return 0;
+		}
+
+		/* Ensure there is space for new sub-sequence */
+
+		if (nseq->first_free == nseq->alloc) {
+			struct sub_seq *sseqs = nseq->sseqs;
+			nseq->sseqs = subseq_alloc(nseq->alloc * 2);
+			if (nseq->sseqs != NULL) {
+				memcpy(nseq->sseqs, sseqs,
+				       nseq->alloc * sizeof (struct sub_seq));
+				kfree(sseqs);
+				dbg("Allocated %u sseqs\n", nseq->alloc);
+				nseq->alloc *= 2;
+			} else {
+				warn("Memory squeeze; failed to create sub-sequence\n");
+				return 0;
+			}
+		}
+		dbg("Have %u sseqs for type %u\n", nseq->alloc, type);
+
+		/* Insert new sub-sequence */
+
+		dbg("ins in pos %u, ff = %u\n", inspos, nseq->first_free);
+		sseq = &nseq->sseqs[inspos];
+		freesseq = &nseq->sseqs[nseq->first_free];
+		memmove(sseq + 1, sseq, (freesseq - sseq) * sizeof (*sseq));
+		memset(sseq, 0, sizeof (*sseq));
+		nseq->first_free++;
+		sseq->lower = lower;
+		sseq->upper = upper;
+		created_subseq = 1;
+	}
+	dbg("inserting (%u %u %u) from %x:%u into sseq %x(%u,%u) of seq %x\n",
+	    type, lower, upper, node, port, sseq,
+	    sseq->lower, sseq->upper, nseq);
+
+	/* Insert a publication: */
+
+	publ = publ_create(type, lower, upper, scope, node, port, key);
+	if (!publ)
+		return 0;
+	dbg("inserting publ %x, node=%x publ->node=%x, subscr->node=%x\n",
+	    publ, node, publ->node, publ->subscr.node);
+
+	if (!sseq->zone_list)
+		sseq->zone_list = publ->zone_list_next = publ;
+	else {
+		publ->zone_list_next = sseq->zone_list->zone_list_next;
+		sseq->zone_list->zone_list_next = publ;
+	}
+
+	if (in_own_cluster(node)) {
+		if (!sseq->cluster_list)
+			sseq->cluster_list = publ->cluster_list_next = publ;
+		else {
+			publ->cluster_list_next =
+			sseq->cluster_list->cluster_list_next;
+			sseq->cluster_list->cluster_list_next = publ;
+		}
+	}
+
+	if (node == tipc_own_addr) {
+		if (!sseq->node_list)
+			sseq->node_list = publ->node_list_next = publ;
+		else {
+			publ->node_list_next = sseq->node_list->node_list_next;
+			sseq->node_list->node_list_next = publ;
+		}
+	}
+
+	/* 
+	 * Any subscriptions waiting for notification? 
+	 */
+	list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
+		dbg("calling report_overlap()\n");
+		subscr_report_overlap(s,
+				      publ->lower,
+				      publ->upper,
+				      TIPC_PUBLISHED,
+				      publ->ref, 
+				      publ->node,
+				      created_subseq);
+	}
+	return publ;
+}
+
+/**
+ * nameseq_remove_publ -
+ */
+
+struct publication *nameseq_remove_publ(struct name_seq *nseq, u32 inst,
+					u32 node, u32 ref, u32 key)
+{
+	struct publication *publ;
+	struct publication *prev;
+	struct sub_seq *sseq = nameseq_find_subseq(nseq, inst);
+	struct sub_seq *free;
+	struct subscription *s, *st;
+	int removed_subseq = 0;
+
+	assert(nseq);
+
+	if (!sseq) {
+		int i;
+
+		warn("Withdraw unknown <%u,%u>?\n", nseq->type, inst);
+		assert(nseq->sseqs);
+		dbg("Dumping subseqs %x for %x, alloc = %u,ff=%u\n",
+		    nseq->sseqs, nseq, nseq->alloc, 
+		    nseq->first_free);
+		for (i = 0; i < nseq->first_free; i++) {
+			dbg("Subseq %u(%x): lower = %u,upper = %u\n",
+			    i, &nseq->sseqs[i], nseq->sseqs[i].lower,
+			    nseq->sseqs[i].upper);
+		}
+		return 0;
+	}
+	dbg("nameseq_remove: seq: %x, sseq %x, <%u,%u> key %u\n",
+	    nseq, sseq, nseq->type, inst, key);
+
+	prev = sseq->zone_list;
+	publ = sseq->zone_list->zone_list_next;
+	while ((publ->key != key) || (publ->ref != ref) || 
+	       (publ->node && (publ->node != node))) {
+		prev = publ;
+		publ = publ->zone_list_next;
+		assert(prev != sseq->zone_list);
+	}
+	if (publ != sseq->zone_list)
+		prev->zone_list_next = publ->zone_list_next;
+	else if (publ->zone_list_next != publ) {
+		prev->zone_list_next = publ->zone_list_next;
+		sseq->zone_list = publ->zone_list_next;
+	} else {
+		sseq->zone_list = 0;
+	}
+
+	if (in_own_cluster(node)) {
+		prev = sseq->cluster_list;
+		publ = sseq->cluster_list->cluster_list_next;
+		while ((publ->key != key) || (publ->ref != ref) || 
+		       (publ->node && (publ->node != node))) {
+			prev = publ;
+			publ = publ->cluster_list_next;
+			assert(prev != sseq->cluster_list);
+		}
+		if (publ != sseq->cluster_list)
+			prev->cluster_list_next = publ->cluster_list_next;
+		else if (publ->cluster_list_next != publ) {
+			prev->cluster_list_next = publ->cluster_list_next;
+			sseq->cluster_list = publ->cluster_list_next;
+		} else {
+			sseq->cluster_list = 0;
+		}
+	}
+
+	if (node == tipc_own_addr) {
+		prev = sseq->node_list;
+		publ = sseq->node_list->node_list_next;
+		while ((publ->key != key) || (publ->ref != ref) || 
+		       (publ->node && (publ->node != node))) {
+			prev = publ;
+			publ = publ->node_list_next;
+			assert(prev != sseq->node_list);
+		}
+		if (publ != sseq->node_list)
+			prev->node_list_next = publ->node_list_next;
+		else if (publ->node_list_next != publ) {
+			prev->node_list_next = publ->node_list_next;
+			sseq->node_list = publ->node_list_next;
+		} else {
+			sseq->node_list = 0;
+		}
+	}
+	assert(!publ->node || (publ->node == node));
+	assert(publ->ref == ref);
+	assert(publ->key == key);
+
+	/* 
+	 * Contract subseq list if no more publications:
+	 */
+	if (!sseq->node_list && !sseq->cluster_list && !sseq->zone_list) {
+		free = &nseq->sseqs[nseq->first_free--];
+		memmove(sseq, sseq + 1, (free - (sseq + 1)) * sizeof (*sseq));
+		removed_subseq = 1;
+	}
+
+	/* 
+	 * Any subscriptions waiting ? 
+	 */
+	list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
+		subscr_report_overlap(s,
+				      publ->lower,
+				      publ->upper,
+				      TIPC_WITHDRAWN, 
+				      publ->ref, 
+				      publ->node,
+				      removed_subseq);
+	}
+	return publ;
+}
+
+/**
+ * nameseq_subscribe: attach a subscription, and issue
+ * the prescribed number of events if there is any sub-
+ * sequence overlapping with the requested sequence
+ */
+
+void nameseq_subscribe(struct name_seq *nseq, struct subscription *s)
+{
+	struct sub_seq *sseq = nseq->sseqs;
+
+	list_add(&s->nameseq_list, &nseq->subscriptions);
+
+	if (!sseq)
+		return;
+
+	while (sseq != &nseq->sseqs[nseq->first_free]) {
+		struct publication *zl = sseq->zone_list;
+		if (zl && subscr_overlap(s,sseq->lower,sseq->upper)) {
+			struct publication *crs = zl;
+			int must_report = 1;
+
+			do {
+				subscr_report_overlap(s, 
+						       sseq->lower, 
+						       sseq->upper,
+						       TIPC_PUBLISHED,
+						       crs->ref,
+						       crs->node,
+						       must_report);
+				must_report = 0;
+				crs = crs->zone_list_next;
+			} while (crs != zl);
+		}
+		sseq++;
+	}
+}
+
+static struct name_seq *nametbl_find_seq(u32 type)
+{
+	struct hlist_head *seq_head;
+	struct hlist_node *seq_node;
+	struct name_seq *ns;
+
+	dbg("find_seq %u,(%u,0x%x) table = %p, hash[type] = %u\n",
+	    type, ntohl(type), type, table.types, hash(type));
+
+	seq_head = &table.types[hash(type)];
+	hlist_for_each_entry(ns, seq_node, seq_head, ns_list) {
+		if (ns->type == type) {
+			dbg("found %x\n", ns);
+			return ns;
+		}
+	}
+
+	return 0;
+};
+
+struct publication *nametbl_insert_publ(u32 type, u32 lower, u32 upper,
+		    u32 scope, u32 node, u32 port, u32 key)
+{
+	struct name_seq *seq = nametbl_find_seq(type);
+
+	dbg("ins_publ: <%u,%x,%x> found %x\n", type, lower, upper, seq);
+	if (lower > upper) {
+		warn("Failed to publish illegal <%u,%u,%u>\n",
+		     type, lower, upper);
+		return 0;
+	}
+
+	dbg("Publishing <%u,%u,%u> from %x\n", type, lower, upper, node);
+	if (!seq) {
+		seq = nameseq_create(type, &table.types[hash(type)]);
+		dbg("nametbl_insert_publ: created %x\n", seq);
+	}
+	if (!seq)
+		return 0;
+
+	assert(seq->type == type);
+	return nameseq_insert_publ(seq, type, lower, upper,
+				   scope, node, port, key);
+}
+
+struct publication *nametbl_remove_publ(u32 type, u32 lower, 
+					u32 node, u32 ref, u32 key)
+{
+	struct publication *publ;
+	struct name_seq *seq = nametbl_find_seq(type);
+
+	if (!seq)
+		return 0;
+
+	dbg("Withdrawing <%u,%u> from %x\n", type, lower, node);
+	publ = nameseq_remove_publ(seq, lower, node, ref, key);
+
+	if (!seq->first_free && list_empty(&seq->subscriptions)) {
+		hlist_del_init(&seq->ns_list);
+		kfree(seq->sseqs);
+		kfree(seq);
+	}
+	return publ;
+}
+
+/*
+ * nametbl_translate(): Translate tipc_name -> tipc_portid.
+ *                      Very time-critical.
+ *
+ * Note: on entry 'destnode' is the search domain used during translation;
+ *       on exit it passes back the node address of the matching port (if any)
+ */
+
+u32 nametbl_translate(u32 type, u32 instance, u32 *destnode)
+{
+	struct sub_seq *sseq;
+	struct publication *publ = 0;
+	struct name_seq *seq;
+	u32 ref;
+
+	if (!in_scope(*destnode, tipc_own_addr))
+		return 0;
+
+	read_lock_bh(&nametbl_lock);
+	seq = nametbl_find_seq(type);
+	if (unlikely(!seq))
+		goto not_found;
+	sseq = nameseq_find_subseq(seq, instance);
+	if (unlikely(!sseq))
+		goto not_found;
+	spin_lock_bh(&seq->lock);
+
+	/* Closest-First Algorithm: */
+	if (likely(!*destnode)) {
+		publ = sseq->node_list;
+		if (publ) {
+			sseq->node_list = publ->node_list_next;
+found:
+			ref = publ->ref;
+			*destnode = publ->node;
+			spin_unlock_bh(&seq->lock);
+			read_unlock_bh(&nametbl_lock);
+			return ref;
+		}
+		publ = sseq->cluster_list;
+		if (publ) {
+			sseq->cluster_list = publ->cluster_list_next;
+			goto found;
+		}
+		publ = sseq->zone_list;
+		if (publ) {
+			sseq->zone_list = publ->zone_list_next;
+			goto found;
+		}
+	}
+
+	/* Round-Robin Algorithm: */
+	else if (*destnode == tipc_own_addr) {
+		publ = sseq->node_list;
+		if (publ) {
+			sseq->node_list = publ->node_list_next;
+			goto found;
+		}
+	} else if (in_own_cluster(*destnode)) {
+		publ = sseq->cluster_list;
+		if (publ) {
+			sseq->cluster_list = publ->cluster_list_next;
+			goto found;
+		}
+	} else {
+		publ = sseq->zone_list;
+		if (publ) {
+			sseq->zone_list = publ->zone_list_next;
+			goto found;
+		}
+	}
+	spin_unlock_bh(&seq->lock);
+not_found:
+	*destnode = 0;
+	read_unlock_bh(&nametbl_lock);
+	return 0;
+}
+
+/**
+ * nametbl_mc_translate - find multicast destinations
+ * 
+ * Creates list of all local ports that overlap the given multicast address;
+ * also determines if any off-node ports overlap.
+ *
+ * Note: Publications with a scope narrower than 'limit' are ignored.
+ * (i.e. local node-scope publications mustn't receive messages arriving
+ * from another node, even if the multcast link brought it here)
+ * 
+ * Returns non-zero if any off-node ports overlap
+ */
+
+int nametbl_mc_translate(u32 type, u32 lower, u32 upper, u32 limit,
+			 struct port_list *dports)
+{
+	struct name_seq *seq;
+	struct sub_seq *sseq;
+	struct sub_seq *sseq_stop;
+	int res = 0;
+
+	read_lock_bh(&nametbl_lock);
+	seq = nametbl_find_seq(type);
+	if (!seq)
+		goto exit;
+
+	spin_lock_bh(&seq->lock);
+
+	sseq = seq->sseqs + nameseq_locate_subseq(seq, lower);
+	sseq_stop = seq->sseqs + seq->first_free;
+	for (; sseq != sseq_stop; sseq++) {
+		struct publication *publ;
+
+		if (sseq->lower > upper)
+			break;
+		publ = sseq->cluster_list;
+		if (publ && (publ->scope <= limit))
+			do {
+				if (publ->node == tipc_own_addr)
+					port_list_add(dports, publ->ref);
+				else
+					res = 1;
+				publ = publ->cluster_list_next;
+			} while (publ != sseq->cluster_list);
+	}
+
+	spin_unlock_bh(&seq->lock);
+exit:
+	read_unlock_bh(&nametbl_lock);
+	return res;
+}
+
+/**
+ * nametbl_publish_rsv - publish port name using a reserved name type
+ */
+
+int nametbl_publish_rsv(u32 ref, unsigned int scope, 
+			struct tipc_name_seq const *seq)
+{
+	int res;
+
+	atomic_inc(&rsv_publ_ok);
+	res = tipc_publish(ref, scope, seq);
+	atomic_dec(&rsv_publ_ok);
+	return res;
+}
+
+/**
+ * nametbl_publish - add name publication to network name tables
+ */
+
+struct publication *nametbl_publish(u32 type, u32 lower, u32 upper, 
+				    u32 scope, u32 port_ref, u32 key)
+{
+	struct publication *publ;
+
+	if (table.local_publ_count >= tipc_max_publications) {
+		warn("Failed publish: max %u local publication\n", 
+		     tipc_max_publications);
+		return 0;
+	}
+	if ((type < TIPC_RESERVED_TYPES) && !atomic_read(&rsv_publ_ok)) {
+		warn("Failed to publish reserved name <%u,%u,%u>\n",
+		     type, lower, upper);
+		return 0;
+	}
+
+	write_lock_bh(&nametbl_lock);
+	table.local_publ_count++;
+	publ = nametbl_insert_publ(type, lower, upper, scope,
+				   tipc_own_addr, port_ref, key);
+	if (publ && (scope != TIPC_NODE_SCOPE)) {
+		named_publish(publ);
+	}
+	write_unlock_bh(&nametbl_lock);
+	return publ;
+}
+
+/**
+ * nametbl_withdraw - withdraw name publication from network name tables
+ */
+
+int nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key)
+{
+	struct publication *publ;
+
+	dbg("nametbl_withdraw:<%d,%d,%d>\n", type, lower, key);
+	write_lock_bh(&nametbl_lock);
+	publ = nametbl_remove_publ(type, lower, tipc_own_addr, ref, key);
+	if (publ) {
+		table.local_publ_count--;
+		if (publ->scope != TIPC_NODE_SCOPE)
+			named_withdraw(publ);
+		write_unlock_bh(&nametbl_lock);
+		list_del_init(&publ->pport_list);
+		kfree(publ);
+		return 1;
+	}
+	write_unlock_bh(&nametbl_lock);
+	return 0;
+}
+
+/**
+ * nametbl_subscribe - add a subscription object to the name table
+ */
+
+void
+nametbl_subscribe(struct subscription *s)
+{
+	u32 type = s->seq.type;
+	struct name_seq *seq;
+
+        write_lock_bh(&nametbl_lock);
+	seq = nametbl_find_seq(type);
+	if (!seq) {
+		seq = nameseq_create(type, &table.types[hash(type)]);
+	}
+        if (seq){
+                spin_lock_bh(&seq->lock);
+                dbg("nametbl_subscribe:found %x for <%u,%u,%u>\n",
+                    seq, type, s->seq.lower, s->seq.upper);
+                assert(seq->type == type);
+                nameseq_subscribe(seq, s);
+                spin_unlock_bh(&seq->lock);
+        }
+        write_unlock_bh(&nametbl_lock);
+}
+
+/**
+ * nametbl_unsubscribe - remove a subscription object from name table
+ */
+
+void
+nametbl_unsubscribe(struct subscription *s)
+{
+	struct name_seq *seq;
+
+        write_lock_bh(&nametbl_lock);
+        seq = nametbl_find_seq(s->seq.type);
+	if (seq != NULL){
+                spin_lock_bh(&seq->lock);
+                list_del_init(&s->nameseq_list);
+                spin_unlock_bh(&seq->lock);
+                if ((seq->first_free == 0) && list_empty(&seq->subscriptions)) {
+                        hlist_del_init(&seq->ns_list);
+                        kfree(seq->sseqs);
+                        kfree(seq);
+                }
+        }
+        write_unlock_bh(&nametbl_lock);
+}
+
+
+/**
+ * subseq_list: print specified sub-sequence contents into the given buffer
+ */
+
+static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth,
+			u32 index)
+{
+	char portIdStr[27];
+	char *scopeStr;
+	struct publication *publ = sseq->zone_list;
+
+	tipc_printf(buf, "%-10u %-10u ", sseq->lower, sseq->upper);
+
+	if (depth == 2 || !publ) {
+		tipc_printf(buf, "\n");
+		return;
+	}
+
+	do {
+		sprintf (portIdStr, "<%u.%u.%u:%u>",
+			 tipc_zone(publ->node), tipc_cluster(publ->node),
+			 tipc_node(publ->node), publ->ref);
+		tipc_printf(buf, "%-26s ", portIdStr);
+		if (depth > 3) {
+			if (publ->node != tipc_own_addr)
+				scopeStr = "";
+			else if (publ->scope == TIPC_NODE_SCOPE)
+				scopeStr = "node";
+			else if (publ->scope == TIPC_CLUSTER_SCOPE)
+				scopeStr = "cluster";
+			else
+				scopeStr = "zone";
+			tipc_printf(buf, "%-10u %s", publ->key, scopeStr);
+		}
+
+		publ = publ->zone_list_next;
+		if (publ == sseq->zone_list)
+			break;
+
+		tipc_printf(buf, "\n%33s", " ");
+	} while (1);
+
+	tipc_printf(buf, "\n");
+}
+
+/**
+ * nameseq_list: print specified name sequence contents into the given buffer
+ */
+
+static void nameseq_list(struct name_seq *seq, struct print_buf *buf, u32 depth,
+			 u32 type, u32 lowbound, u32 upbound, u32 index)
+{
+	struct sub_seq *sseq;
+	char typearea[11];
+
+	sprintf(typearea, "%-10u", seq->type);
+
+	if (depth == 1) {
+		tipc_printf(buf, "%s\n", typearea);
+		return;
+	}
+
+	for (sseq = seq->sseqs; sseq != &seq->sseqs[seq->first_free]; sseq++) {
+		if ((lowbound <= sseq->upper) && (upbound >= sseq->lower)) {
+			tipc_printf(buf, "%s ", typearea);
+			subseq_list(sseq, buf, depth, index);
+			sprintf(typearea, "%10s", " ");
+		}
+	}
+}
+
+/**
+ * nametbl_header - print name table header into the given buffer
+ */
+
+static void nametbl_header(struct print_buf *buf, u32 depth)
+{
+	tipc_printf(buf, "Type       ");
+
+	if (depth > 1)
+		tipc_printf(buf, "Lower      Upper      ");
+	if (depth > 2)
+		tipc_printf(buf, "Port Identity              ");
+	if (depth > 3)
+		tipc_printf(buf, "Publication");
+
+	tipc_printf(buf, "\n-----------");
+
+	if (depth > 1)
+		tipc_printf(buf, "--------------------- ");
+	if (depth > 2)
+		tipc_printf(buf, "-------------------------- ");
+	if (depth > 3)
+		tipc_printf(buf, "------------------");
+
+	tipc_printf(buf, "\n");
+}
+
+/**
+ * nametbl_list - print specified name table contents into the given buffer
+ */
+
+static void nametbl_list(struct print_buf *buf, u32 depth_info, 
+			 u32 type, u32 lowbound, u32 upbound)
+{
+	struct hlist_head *seq_head;
+	struct hlist_node *seq_node;
+	struct name_seq *seq;
+	int all_types;
+	u32 depth;
+	u32 i;
+
+	all_types = (depth_info & TIPC_NTQ_ALLTYPES);
+	depth = (depth_info & ~TIPC_NTQ_ALLTYPES);
+
+	if (depth == 0)
+		return;
+
+	if (all_types) {
+		/* display all entries in name table to specified depth */
+		nametbl_header(buf, depth);
+		lowbound = 0;
+		upbound = ~0;
+		for (i = 0; i < tipc_nametbl_size; i++) {
+			seq_head = &table.types[i];
+			hlist_for_each_entry(seq, seq_node, seq_head, ns_list) {
+				nameseq_list(seq, buf, depth, seq->type, 
+					     lowbound, upbound, i);
+			}
+		}
+	} else {
+		/* display only the sequence that matches the specified type */
+		if (upbound < lowbound) {
+			tipc_printf(buf, "invalid name sequence specified\n");
+			return;
+		}
+		nametbl_header(buf, depth);
+		i = hash(type);
+		seq_head = &table.types[i];
+		hlist_for_each_entry(seq, seq_node, seq_head, ns_list) {
+			if (seq->type == type) {
+				nameseq_list(seq, buf, depth, type, 
+					     lowbound, upbound, i);
+				break;
+			}
+		}
+	}
+}
+
+void nametbl_print(struct print_buf *buf, const char *str)
+{
+	tipc_printf(buf, str);
+	read_lock_bh(&nametbl_lock);
+	nametbl_list(buf, 0, 0, 0, 0);
+	read_unlock_bh(&nametbl_lock);
+}
+
+#define MAX_NAME_TBL_QUERY 32768
+
+struct sk_buff *nametbl_get(const void *req_tlv_area, int req_tlv_space)
+{
+	struct sk_buff *buf;
+	struct tipc_name_table_query *argv;
+	struct tlv_desc *rep_tlv;
+	struct print_buf b;
+	int str_len;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NAME_TBL_QUERY))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	buf = cfg_reply_alloc(TLV_SPACE(MAX_NAME_TBL_QUERY));
+	if (!buf)
+		return NULL;
+
+	rep_tlv = (struct tlv_desc *)buf->data;
+	printbuf_init(&b, TLV_DATA(rep_tlv), MAX_NAME_TBL_QUERY);
+	argv = (struct tipc_name_table_query *)TLV_DATA(req_tlv_area);
+	read_lock_bh(&nametbl_lock);
+	nametbl_list(&b, ntohl(argv->depth), ntohl(argv->type), 
+		     ntohl(argv->lowbound), ntohl(argv->upbound));
+	read_unlock_bh(&nametbl_lock);
+	str_len = printbuf_validate(&b);
+
+	skb_put(buf, TLV_SPACE(str_len));
+	TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len);
+
+	return buf;
+}
+
+void nametbl_dump(void)
+{
+	nametbl_list(CONS, 0, 0, 0, 0);
+}
+
+int nametbl_init(void)
+{
+	int array_size = sizeof(struct hlist_head) * tipc_nametbl_size;
+
+	table.types = (struct hlist_head *)kmalloc(array_size, GFP_ATOMIC);
+	if (!table.types)
+		return -ENOMEM;
+
+	write_lock_bh(&nametbl_lock);
+	memset(table.types, 0, array_size);
+	table.local_publ_count = 0;
+	write_unlock_bh(&nametbl_lock);
+	return 0;
+}
+
+void nametbl_stop(void)
+{
+	struct hlist_head *seq_head;
+	struct hlist_node *seq_node;
+	struct hlist_node *tmp;
+	struct name_seq *seq;
+	u32 i;
+
+	if (!table.types)
+		return;
+
+	write_lock_bh(&nametbl_lock);
+	for (i = 0; i < tipc_nametbl_size; i++) {
+		seq_head = &table.types[i];
+		hlist_for_each_entry_safe(seq, seq_node, tmp, seq_head, ns_list) {
+			struct sub_seq *sseq = seq->sseqs;
+
+			for (; sseq != &seq->sseqs[seq->first_free]; sseq++) {
+				struct publication *publ = sseq->zone_list;
+				assert(publ);
+				do {
+					struct publication *next =
+						publ->zone_list_next;
+					kfree(publ);
+					publ = next;
+				}
+				while (publ != sseq->zone_list);
+			}
+		}
+	}
+	kfree(table.types);
+	table.types = NULL;
+	write_unlock_bh(&nametbl_lock);
+}
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
new file mode 100644
index 000000000000..5036b5bac360
--- /dev/null
+++ b/net/tipc/name_table.h
@@ -0,0 +1,105 @@
+/*
+ * net/tipc/name_table.h: Include file for TIPC name table code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2004-2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_NAME_TABLE_H
+#define _TIPC_NAME_TABLE_H
+
+#include "node_subscr.h"
+
+struct subscription;
+struct port_list;
+
+/*
+ * TIPC name types reserved for internal TIPC use (both current and planned)
+ */
+
+#define TIPC_ZM_SRV 3  		/* zone master service name type */
+
+
+/**
+ * struct publication - info about a published (name or) name sequence
+ * @type: name sequence type
+ * @lower: name sequence lower bound
+ * @upper: name sequence upper bound
+ * @scope: scope of publication
+ * @node: network address of publishing port's node
+ * @ref: publishing port
+ * @key: publication key
+ * @subscr: subscription to "node down" event (for off-node publications only)
+ * @local_list: adjacent entries in list of publications made by this node
+ * @pport_list: adjacent entries in list of publications made by this port
+ * @node_list: next matching name seq publication with >= node scope
+ * @cluster_list: next matching name seq publication with >= cluster scope
+ * @zone_list: next matching name seq publication with >= zone scope
+ * 
+ * Note that the node list, cluster list, and zone list are circular lists.
+ */
+
+struct publication {
+	u32 type;
+	u32 lower;
+	u32 upper;
+	u32 scope;
+	u32 node;
+	u32 ref;
+	u32 key;
+	struct node_subscr subscr;
+	struct list_head local_list;
+	struct list_head pport_list;
+	struct publication *node_list_next;
+	struct publication *cluster_list_next;
+	struct publication *zone_list_next;
+};
+
+
+extern rwlock_t nametbl_lock;
+
+struct sk_buff *nametbl_get(const void *req_tlv_area, int req_tlv_space);
+u32 nametbl_translate(u32 type, u32 instance, u32 *node);
+int nametbl_mc_translate(u32 type, u32 lower, u32 upper, u32 limit, 
+			 struct port_list *dports);
+int nametbl_publish_rsv(u32 ref, unsigned int scope, 
+			struct tipc_name_seq const *seq);
+struct publication *nametbl_publish(u32 type, u32 lower, u32 upper,
+				    u32 scope, u32 port_ref, u32 key);
+int nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key);
+struct publication *nametbl_insert_publ(u32 type, u32 lower, u32 upper,
+					u32 scope, u32 node, u32 ref, u32 key);
+struct publication *nametbl_remove_publ(u32 type, u32 lower, 
+					u32 node, u32 ref, u32 key);
+void nametbl_subscribe(struct subscription *s);
+void nametbl_unsubscribe(struct subscription *s);
+int nametbl_init(void);
+void nametbl_stop(void);
+
+#endif
diff --git a/net/tipc/net.c b/net/tipc/net.c
new file mode 100644
index 000000000000..eba88033b90e
--- /dev/null
+++ b/net/tipc/net.c
@@ -0,0 +1,308 @@
+/*
+ * net/tipc/net.c: TIPC network routing code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "bearer.h"
+#include "net.h"
+#include "zone.h"
+#include "addr.h"
+#include "name_table.h"
+#include "name_distr.h"
+#include "subscr.h"
+#include "link.h"
+#include "msg.h"
+#include "port.h"
+#include "bcast.h"
+#include "discover.h"
+#include "config.h"
+
+/* 
+ * The TIPC locking policy is designed to ensure a very fine locking
+ * granularity, permitting complete parallel access to individual
+ * port and node/link instances. The code consists of three major 
+ * locking domains, each protected with their own disjunct set of locks.
+ *
+ * 1: The routing hierarchy.
+ *    Comprises the structures 'zone', 'cluster', 'node', 'link' 
+ *    and 'bearer'. The whole hierarchy is protected by a big 
+ *    read/write lock, net_lock, to enssure that nothing is added 
+ *    or removed while code is accessing any of these structures. 
+ *    This layer must not be called from the two others while they 
+ *    hold any of their own locks.
+ *    Neither must it itself do any upcalls to the other two before
+ *    it has released net_lock and other protective locks.
+ *
+ *   Within the net_lock domain there are two sub-domains;'node' and 
+ *   'bearer', where local write operations are permitted,
+ *   provided that those are protected by individual spin_locks
+ *   per instance. Code holding net_lock(read) and a node spin_lock 
+ *   is permitted to poke around in both the node itself and its
+ *   subordinate links. I.e, it can update link counters and queues, 
+ *   change link state, send protocol messages, and alter the 
+ *   "active_links" array in the node; but it can _not_ remove a link 
+ *   or a node from the overall structure.
+ *   Correspondingly, individual bearers may change status within a 
+ *   net_lock(read), protected by an individual spin_lock ber bearer 
+ *   instance, but it needs net_lock(write) to remove/add any bearers.
+ *     
+ *
+ *  2: The transport level of the protocol. 
+ *     This consists of the structures port, (and its user level 
+ *     representations, such as user_port and tipc_sock), reference and 
+ *     tipc_user (port.c, reg.c, socket.c). 
+ *
+ *     This layer has four different locks:
+ *     - The tipc_port spin_lock. This is protecting each port instance
+ *       from parallel data access and removal. Since we can not place 
+ *       this lock in the port itself, it has been placed in the 
+ *       corresponding reference table entry, which has the same life
+ *       cycle as the module. This entry is difficult to access from 
+ *       outside the TIPC core, however, so a pointer to the lock has 
+ *       been added in the port instance, -to be used for unlocking 
+ *       only.
+ *     - A read/write lock to protect the reference table itself (teg.c). 
+ *       (Nobody is using read-only access to this, so it can just as 
+ *       well be changed to a spin_lock)
+ *     - A spin lock to protect the registry of kernel/driver users (reg.c)
+ *     - A global spin_lock (port_lock), which only task is to ensure 
+ *       consistency where more than one port is involved in an operation,
+ *       i.e., whe a port is part of a linked list of ports.
+ *       There are two such lists; 'port_list', which is used for management,
+ *       and 'wait_list', which is used to queue ports during congestion.
+ *     
+ *  3: The name table (name_table.c, name_distr.c, subscription.c)
+ *     - There is one big read/write-lock (nametbl_lock) protecting the 
+ *       overall name table structure. Nothing must be added/removed to 
+ *       this structure without holding write access to it.
+ *     - There is one local spin_lock per sub_sequence, which can be seen
+ *       as a sub-domain to the nametbl_lock domain. It is used only
+ *       for translation operations, and is needed because a translation
+ *       steps the root of the 'publication' linked list between each lookup.
+ *       This is always used within the scope of a nametbl_lock(read).
+ *     - A local spin_lock protecting the queue of subscriber events.
+*/
+
+rwlock_t net_lock = RW_LOCK_UNLOCKED;
+struct network net = { 0 };
+
+struct node *net_select_remote_node(u32 addr, u32 ref) 
+{
+	return zone_select_remote_node(net.zones[tipc_zone(addr)], addr, ref);
+}
+
+u32 net_select_router(u32 addr, u32 ref)
+{
+	return zone_select_router(net.zones[tipc_zone(addr)], addr, ref);
+}
+
+
+u32 net_next_node(u32 a)
+{
+	if (net.zones[tipc_zone(a)])
+		return zone_next_node(a);
+	return 0;
+}
+
+void net_remove_as_router(u32 router)
+{
+	u32 z_num;
+
+	for (z_num = 1; z_num <= tipc_max_zones; z_num++) {
+		if (!net.zones[z_num])
+			continue;
+		zone_remove_as_router(net.zones[z_num], router);
+	}
+}
+
+void net_send_external_routes(u32 dest)
+{
+	u32 z_num;
+
+	for (z_num = 1; z_num <= tipc_max_zones; z_num++) {
+		if (net.zones[z_num])
+			zone_send_external_routes(net.zones[z_num], dest);
+	}
+}
+
+int net_init(void)
+{
+	u32 sz = sizeof(struct _zone *) * (tipc_max_zones + 1);
+
+	memset(&net, 0, sizeof(net));
+	net.zones = (struct _zone **)kmalloc(sz, GFP_ATOMIC);
+	if (!net.zones) {
+		return -ENOMEM;
+	}
+	memset(net.zones, 0, sz);
+	return TIPC_OK;
+}
+
+void net_stop(void)
+{
+	u32 z_num;
+
+	if (!net.zones)
+		return;
+
+	for (z_num = 1; z_num <= tipc_max_zones; z_num++) {
+		zone_delete(net.zones[z_num]);
+	}
+	kfree(net.zones);
+	net.zones = 0;
+}
+
+static void net_route_named_msg(struct sk_buff *buf)
+{
+	struct tipc_msg *msg = buf_msg(buf);
+	u32 dnode;
+	u32 dport;
+
+	if (!msg_named(msg)) {
+		msg_dbg(msg, "net->drop_nam:");
+		buf_discard(buf);
+		return;
+	}
+
+	dnode = addr_domain(msg_lookup_scope(msg));
+	dport = nametbl_translate(msg_nametype(msg), msg_nameinst(msg), &dnode);
+	dbg("net->lookup<%u,%u>-><%u,%x>\n",
+	    msg_nametype(msg), msg_nameinst(msg), dport, dnode);
+	if (dport) {
+		msg_set_destnode(msg, dnode);
+		msg_set_destport(msg, dport);
+		net_route_msg(buf);
+		return;
+	}
+	msg_dbg(msg, "net->rej:NO NAME: ");
+	tipc_reject_msg(buf, TIPC_ERR_NO_NAME);
+}
+
+void net_route_msg(struct sk_buff *buf)
+{
+	struct tipc_msg *msg;
+	u32 dnode;
+
+	if (!buf)
+		return;
+	msg = buf_msg(buf);
+
+	msg_incr_reroute_cnt(msg);
+	if (msg_reroute_cnt(msg) > 6) {
+		if (msg_errcode(msg)) {
+			msg_dbg(msg, "NET>DISC>:");
+			buf_discard(buf);
+		} else {
+			msg_dbg(msg, "NET>REJ>:");
+			tipc_reject_msg(buf, msg_destport(msg) ? 
+					TIPC_ERR_NO_PORT : TIPC_ERR_NO_NAME);
+		}
+		return;
+	}
+
+	msg_dbg(msg, "net->rout: ");
+
+	/* Handle message for this node */
+	dnode = msg_short(msg) ? tipc_own_addr : msg_destnode(msg);
+	if (in_scope(dnode, tipc_own_addr)) {
+		if (msg_isdata(msg)) {
+			if (msg_mcast(msg)) 
+				port_recv_mcast(buf, NULL);
+			else if (msg_destport(msg))
+				port_recv_msg(buf);
+			else
+				net_route_named_msg(buf);
+			return;
+		}
+		switch (msg_user(msg)) {
+		case ROUTE_DISTRIBUTOR:
+			cluster_recv_routing_table(buf);
+			break;
+		case NAME_DISTRIBUTOR:
+			named_recv(buf);
+			break;
+		case CONN_MANAGER:
+			port_recv_proto_msg(buf);
+			break;
+		default:
+			msg_dbg(msg,"DROP/NET/<REC<");
+			buf_discard(buf);
+		}
+		return;
+	}
+
+	/* Handle message for another node */
+	msg_dbg(msg, "NET>SEND>: ");
+	link_send(buf, dnode, msg_link_selector(msg));
+}
+
+int tipc_start_net(void)
+{
+	char addr_string[16];
+	int res;
+
+	if (tipc_mode != TIPC_NODE_MODE)
+		return -ENOPROTOOPT;
+
+	tipc_mode = TIPC_NET_MODE;
+	named_reinit();
+	port_reinit();
+
+	if ((res = bearer_init()) ||
+	    (res = net_init()) ||
+	    (res = cluster_init()) ||
+	    (res = bclink_init())) {
+		return res;
+	}
+        subscr_stop();
+	cfg_stop();
+	k_signal((Handler)subscr_start, 0);
+	k_signal((Handler)cfg_init, 0);
+	info("Started in network mode\n");
+	info("Own node address %s, network identity %u\n",
+	     addr_string_fill(addr_string, tipc_own_addr), tipc_net_id);
+	return TIPC_OK;
+}
+
+void tipc_stop_net(void)
+{
+	if (tipc_mode != TIPC_NET_MODE)
+		return;
+        write_lock_bh(&net_lock);
+	bearer_stop();
+	tipc_mode = TIPC_NODE_MODE;
+	bclink_stop();
+	net_stop();
+        write_unlock_bh(&net_lock);
+	info("Left network mode \n");
+}
+
diff --git a/net/tipc/net.h b/net/tipc/net.h
new file mode 100644
index 000000000000..50098455a898
--- /dev/null
+++ b/net/tipc/net.h
@@ -0,0 +1,63 @@
+/*
+ * net/tipc/net.h: Include file for TIPC network routing code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_NET_H
+#define _TIPC_NET_H
+
+struct _zone;
+
+/**
+ * struct network - TIPC network structure
+ * @zones: array of pointers to all zones within network
+ */
+ 
+struct network {
+	struct _zone **zones;
+};
+
+
+extern struct network net;
+extern rwlock_t net_lock;
+
+int net_init(void);
+void net_stop(void);
+void net_remove_as_router(u32 router);
+void net_send_external_routes(u32 dest);
+void net_route_msg(struct sk_buff *buf);
+struct node *net_select_remote_node(u32 addr, u32 ref);
+u32 net_select_router(u32 addr, u32 ref);
+
+int tipc_start_net(void);
+void tipc_stop_net(void);
+
+#endif
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
new file mode 100644
index 000000000000..35fbc7933604
--- /dev/null
+++ b/net/tipc/netlink.c
@@ -0,0 +1,107 @@
+/*
+ * net/tipc/netlink.c: TIPC configuration handling
+ * 
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "config.h"
+#include <net/genetlink.h>
+
+static int handle_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+	struct sk_buff *rep_buf;
+	struct nlmsghdr *rep_nlh;
+	struct nlmsghdr *req_nlh = info->nlhdr;
+	struct tipc_genlmsghdr *req_userhdr = info->userhdr;
+	int hdr_space = NLMSG_SPACE(0);
+
+	if ((req_userhdr->cmd & 0xC000) && (!capable(CAP_NET_ADMIN)))
+		rep_buf = cfg_reply_error_string(TIPC_CFG_NOT_NET_ADMIN);
+	else
+		rep_buf = cfg_do_cmd(req_userhdr->dest,
+				     req_userhdr->cmd,
+				     NLMSG_DATA(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN,
+				     NLMSG_PAYLOAD(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN),
+				     hdr_space);
+
+	if (rep_buf) {
+		skb_push(rep_buf, hdr_space);
+		rep_nlh = (struct nlmsghdr *)rep_buf->data;
+		memcpy(rep_nlh, req_nlh, hdr_space);
+		rep_nlh->nlmsg_len = rep_buf->len;
+		genlmsg_unicast(rep_buf, req_nlh->nlmsg_pid);
+	}
+
+        return 0;
+}
+
+static struct genl_family family = {
+        .id		= TIPC_GENL_FAMILY,
+        .name		= TIPC_GENL_NAME,
+        .version	= TIPC_GENL_VERSION,
+        .hdrsize	= TIPC_GENL_HDRLEN,
+        .maxattr	= 0,
+	.owner		= THIS_MODULE,
+};
+
+static struct genl_ops ops = {
+	.cmd		= TIPC_GENL_CMD,
+	.doit		= handle_cmd,
+};
+
+static int family_registered = 0;
+
+int netlink_start(void)
+{
+	if (genl_register_family(&family))
+		goto err;
+
+	family_registered = 1;
+
+	if (genl_register_ops(&family, &ops))
+		goto err_unregister;
+
+	return 0;
+
+ err_unregister:
+	genl_unregister_family(&family);
+	family_registered = 0;
+ err:
+	err("Failed to register netlink interface");
+	return -EFAULT;
+}
+
+void netlink_stop(void)
+{
+	if (family_registered) {
+		genl_unregister_family(&family);
+		family_registered = 0;
+	}
+}
diff --git a/net/tipc/node.c b/net/tipc/node.c
new file mode 100644
index 000000000000..e311638b9b3d
--- /dev/null
+++ b/net/tipc/node.c
@@ -0,0 +1,676 @@
+/*
+ * net/tipc/node.c: TIPC node management routines
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "config.h"
+#include "node.h"
+#include "cluster.h"
+#include "net.h"
+#include "addr.h"
+#include "node_subscr.h"
+#include "link.h"
+#include "port.h"
+#include "bearer.h"
+#include "name_distr.h"
+#include "net.h"
+
+void node_print(struct print_buf *buf, struct node *n_ptr, char *str);
+static void node_lost_contact(struct node *n_ptr);
+static void node_established_contact(struct node *n_ptr);
+
+struct node *nodes = NULL;	/* sorted list of nodes within cluster */
+
+u32 tipc_own_tag = 0;
+
+struct node *node_create(u32 addr)
+{
+	struct cluster *c_ptr;
+	struct node *n_ptr;
+        struct node **curr_node;
+
+	n_ptr = kmalloc(sizeof(*n_ptr),GFP_ATOMIC);
+        if (n_ptr != NULL) {
+                memset(n_ptr, 0, sizeof(*n_ptr));
+                n_ptr->addr = addr;
+                n_ptr->lock =  SPIN_LOCK_UNLOCKED;	
+                INIT_LIST_HEAD(&n_ptr->nsub);
+	
+		c_ptr = cluster_find(addr);
+                if (c_ptr == NULL)
+                        c_ptr = cluster_create(addr);
+                if (c_ptr != NULL) {
+                        n_ptr->owner = c_ptr;
+                        cluster_attach_node(c_ptr, n_ptr);
+                        n_ptr->last_router = -1;
+
+                        /* Insert node into ordered list */
+                        for (curr_node = &nodes; *curr_node; 
+			     curr_node = &(*curr_node)->next) {
+                                if (addr < (*curr_node)->addr) {
+                                        n_ptr->next = *curr_node;
+                                        break;
+                                }
+                        }
+                        (*curr_node) = n_ptr;
+                } else {
+                        kfree(n_ptr);
+                        n_ptr = NULL;
+                }
+        }
+	return n_ptr;
+}
+
+void node_delete(struct node *n_ptr)
+{
+	if (!n_ptr)
+		return;
+
+#if 0
+	/* Not needed because links are already deleted via bearer_stop() */
+
+	u32 l_num;
+
+	for (l_num = 0; l_num < MAX_BEARERS; l_num++) {
+		link_delete(n_ptr->links[l_num]);
+	}
+#endif
+
+	dbg("node %x deleted\n", n_ptr->addr);
+	kfree(n_ptr);
+}
+
+
+/**
+ * node_link_up - handle addition of link
+ * 
+ * Link becomes active (alone or shared) or standby, depending on its priority.
+ */
+
+void node_link_up(struct node *n_ptr, struct link *l_ptr)
+{
+	struct link **active = &n_ptr->active_links[0];
+
+	info("Established link <%s> on network plane %c\n",
+	     l_ptr->name, l_ptr->b_ptr->net_plane);
+	
+	if (!active[0]) {
+		dbg(" link %x into %x/%x\n", l_ptr, &active[0], &active[1]);
+		active[0] = active[1] = l_ptr;
+		node_established_contact(n_ptr);
+		return;
+	}
+	if (l_ptr->priority < active[0]->priority) { 
+		info("Link is standby\n");
+		return;
+	}
+	link_send_duplicate(active[0], l_ptr);
+	if (l_ptr->priority == active[0]->priority) { 
+		active[0] = l_ptr;
+		return;
+	}
+	info("Link <%s> on network plane %c becomes standby\n",
+	     active[0]->name, active[0]->b_ptr->net_plane);
+	active[0] = active[1] = l_ptr;
+}
+
+/**
+ * node_select_active_links - select active link
+ */
+
+static void node_select_active_links(struct node *n_ptr)
+{
+	struct link **active = &n_ptr->active_links[0];
+	u32 i;
+	u32 highest_prio = 0;
+
+        active[0] = active[1] = 0;
+
+	for (i = 0; i < MAX_BEARERS; i++) {
+                struct link *l_ptr = n_ptr->links[i];
+
+		if (!l_ptr || !link_is_up(l_ptr) ||
+		    (l_ptr->priority < highest_prio))
+			continue;
+
+		if (l_ptr->priority > highest_prio) {
+                        highest_prio = l_ptr->priority;
+			active[0] = active[1] = l_ptr;
+		} else {
+			active[1] = l_ptr;
+		}
+	}
+}
+
+/**
+ * node_link_down - handle loss of link
+ */
+
+void node_link_down(struct node *n_ptr, struct link *l_ptr)
+{
+	struct link **active;
+
+	if (!link_is_active(l_ptr)) {
+		info("Lost standby link <%s> on network plane %c\n",
+		     l_ptr->name, l_ptr->b_ptr->net_plane);
+		return;
+	}
+	info("Lost link <%s> on network plane %c\n",
+		l_ptr->name, l_ptr->b_ptr->net_plane);
+
+	active = &n_ptr->active_links[0];
+	if (active[0] == l_ptr)
+		active[0] = active[1];
+	if (active[1] == l_ptr)
+		active[1] = active[0];
+	if (active[0] == l_ptr)
+		node_select_active_links(n_ptr);
+	if (node_is_up(n_ptr)) 
+		link_changeover(l_ptr);
+	else 
+		node_lost_contact(n_ptr);
+}
+
+int node_has_active_links(struct node *n_ptr)
+{
+	return (n_ptr && 
+		((n_ptr->active_links[0]) || (n_ptr->active_links[1])));
+}
+
+int node_has_redundant_links(struct node *n_ptr)
+{
+	return (node_has_active_links(n_ptr) &&
+		(n_ptr->active_links[0] != n_ptr->active_links[1]));
+}
+
+int node_has_active_routes(struct node *n_ptr)
+{
+	return (n_ptr && (n_ptr->last_router >= 0));
+}
+
+int node_is_up(struct node *n_ptr)
+{
+	return (node_has_active_links(n_ptr) || node_has_active_routes(n_ptr));
+}
+
+struct node *node_attach_link(struct link *l_ptr)
+{
+	struct node *n_ptr = node_find(l_ptr->addr);
+
+	if (!n_ptr)
+		n_ptr = node_create(l_ptr->addr);
+        if (n_ptr) {
+		u32 bearer_id = l_ptr->b_ptr->identity;
+		char addr_string[16];
+
+                assert(bearer_id < MAX_BEARERS);
+                if (n_ptr->link_cnt >= 2) {
+			char addr_string[16];
+
+                        err("Attempt to create third link to %s\n",
+			    addr_string_fill(addr_string, n_ptr->addr));
+                        return 0;
+                }
+
+                if (!n_ptr->links[bearer_id]) {
+                        n_ptr->links[bearer_id] = l_ptr;
+                        net.zones[tipc_zone(l_ptr->addr)]->links++;
+                        n_ptr->link_cnt++;
+                        return n_ptr;
+                }
+                err("Attempt to establish second link on <%s> to <%s> \n",
+                    l_ptr->b_ptr->publ.name, 
+		    addr_string_fill(addr_string, l_ptr->addr));
+        }
+	return 0;
+}
+
+void node_detach_link(struct node *n_ptr, struct link *l_ptr)
+{
+	n_ptr->links[l_ptr->b_ptr->identity] = 0;
+	net.zones[tipc_zone(l_ptr->addr)]->links--;
+	n_ptr->link_cnt--;
+}
+
+/*
+ * Routing table management - five cases to handle:
+ *
+ * 1: A link towards a zone/cluster external node comes up.
+ *    => Send a multicast message updating routing tables of all 
+ *    system nodes within own cluster that the new destination 
+ *    can be reached via this node. 
+ *    (node.establishedContact()=>cluster.multicastNewRoute())
+ *
+ * 2: A link towards a slave node comes up.
+ *    => Send a multicast message updating routing tables of all 
+ *    system nodes within own cluster that the new destination 
+ *    can be reached via this node. 
+ *    (node.establishedContact()=>cluster.multicastNewRoute())
+ *    => Send a  message to the slave node about existence 
+ *    of all system nodes within cluster:
+ *    (node.establishedContact()=>cluster.sendLocalRoutes())
+ *
+ * 3: A new cluster local system node becomes available.
+ *    => Send message(s) to this particular node containing
+ *    information about all cluster external and slave
+ *     nodes which can be reached via this node.
+ *    (node.establishedContact()==>network.sendExternalRoutes())
+ *    (node.establishedContact()==>network.sendSlaveRoutes())
+ *    => Send messages to all directly connected slave nodes 
+ *    containing information about the existence of the new node
+ *    (node.establishedContact()=>cluster.multicastNewRoute())
+ *    
+ * 4: The link towards a zone/cluster external node or slave
+ *    node goes down.
+ *    => Send a multcast message updating routing tables of all 
+ *    nodes within cluster that the new destination can not any
+ *    longer be reached via this node.
+ *    (node.lostAllLinks()=>cluster.bcastLostRoute())
+ *
+ * 5: A cluster local system node becomes unavailable.
+ *    => Remove all references to this node from the local
+ *    routing tables. Note: This is a completely node
+ *    local operation.
+ *    (node.lostAllLinks()=>network.removeAsRouter())
+ *    => Send messages to all directly connected slave nodes 
+ *    containing information about loss of the node
+ *    (node.establishedContact()=>cluster.multicastLostRoute())
+ *
+ */
+
+static void node_established_contact(struct node *n_ptr)
+{
+	struct cluster *c_ptr;
+
+	dbg("node_established_contact:-> %x\n", n_ptr->addr);
+	if (!node_has_active_routes(n_ptr)) { 
+		k_signal((Handler)named_node_up, n_ptr->addr);
+	}
+
+        /* Syncronize broadcast acks */
+        n_ptr->bclink.acked = bclink_get_last_sent();
+
+	if (is_slave(tipc_own_addr))
+		return;
+	if (!in_own_cluster(n_ptr->addr)) {
+		/* Usage case 1 (see above) */
+		c_ptr = cluster_find(tipc_own_addr);
+		if (!c_ptr)
+			c_ptr = cluster_create(tipc_own_addr);
+                if (c_ptr)
+                        cluster_bcast_new_route(c_ptr, n_ptr->addr, 1, 
+						tipc_max_nodes);
+		return;
+	} 
+
+	c_ptr = n_ptr->owner;
+	if (is_slave(n_ptr->addr)) {
+		/* Usage case 2 (see above) */
+		cluster_bcast_new_route(c_ptr, n_ptr->addr, 1, tipc_max_nodes);
+		cluster_send_local_routes(c_ptr, n_ptr->addr);
+		return;
+	}
+
+	if (n_ptr->bclink.supported) {
+		nmap_add(&cluster_bcast_nodes, n_ptr->addr);
+		if (n_ptr->addr < tipc_own_addr)
+			tipc_own_tag++;
+	}
+
+	/* Case 3 (see above) */
+	net_send_external_routes(n_ptr->addr);
+	cluster_send_slave_routes(c_ptr, n_ptr->addr);
+	cluster_bcast_new_route(c_ptr, n_ptr->addr, LOWEST_SLAVE,
+				highest_allowed_slave);
+}
+
+static void node_lost_contact(struct node *n_ptr)
+{
+	struct cluster *c_ptr;
+	struct node_subscr *ns, *tns;
+	char addr_string[16];
+	u32 i;
+
+        /* Clean up broadcast reception remains */
+        n_ptr->bclink.gap_after = n_ptr->bclink.gap_to = 0;
+        while (n_ptr->bclink.deferred_head) {
+                struct sk_buff* buf = n_ptr->bclink.deferred_head;
+                n_ptr->bclink.deferred_head = buf->next;
+                buf_discard(buf);
+        }
+        if (n_ptr->bclink.defragm) {
+                buf_discard(n_ptr->bclink.defragm);  
+                n_ptr->bclink.defragm = NULL;
+        }            
+        if (in_own_cluster(n_ptr->addr) && n_ptr->bclink.supported) { 
+                bclink_acknowledge(n_ptr, mod(n_ptr->bclink.acked + 10000));
+        }
+
+        /* Update routing tables */
+	if (is_slave(tipc_own_addr)) {
+		net_remove_as_router(n_ptr->addr);
+	} else {
+		if (!in_own_cluster(n_ptr->addr)) { 
+			/* Case 4 (see above) */
+			c_ptr = cluster_find(tipc_own_addr);
+			cluster_bcast_lost_route(c_ptr, n_ptr->addr, 1,
+						 tipc_max_nodes);
+		} else {
+			/* Case 5 (see above) */
+			c_ptr = cluster_find(n_ptr->addr);
+			if (is_slave(n_ptr->addr)) {
+				cluster_bcast_lost_route(c_ptr, n_ptr->addr, 1,
+							 tipc_max_nodes);
+			} else {
+				if (n_ptr->bclink.supported) {
+					nmap_remove(&cluster_bcast_nodes, 
+						    n_ptr->addr);
+					if (n_ptr->addr < tipc_own_addr)
+						tipc_own_tag--;
+				}
+				net_remove_as_router(n_ptr->addr);
+				cluster_bcast_lost_route(c_ptr, n_ptr->addr,
+							 LOWEST_SLAVE,
+							 highest_allowed_slave);
+			}
+		}
+	}
+	if (node_has_active_routes(n_ptr))
+		return;
+
+	info("Lost contact with %s\n", 
+	     addr_string_fill(addr_string, n_ptr->addr));
+
+	/* Abort link changeover */
+	for (i = 0; i < MAX_BEARERS; i++) {
+		struct link *l_ptr = n_ptr->links[i];
+		if (!l_ptr) 
+			continue;
+		l_ptr->reset_checkpoint = l_ptr->next_in_no;
+		l_ptr->exp_msg_count = 0;
+		link_reset_fragments(l_ptr);
+	}
+
+	/* Notify subscribers */
+	list_for_each_entry_safe(ns, tns, &n_ptr->nsub, nodesub_list) {
+                ns->node = 0;
+		list_del_init(&ns->nodesub_list);
+		k_signal((Handler)ns->handle_node_down,
+			 (unsigned long)ns->usr_handle);
+	}
+}
+
+/**
+ * node_select_next_hop - find the next-hop node for a message
+ * 
+ * Called by when cluster local lookup has failed.
+ */
+
+struct node *node_select_next_hop(u32 addr, u32 selector)
+{
+	struct node *n_ptr;
+	u32 router_addr;
+
+        if (!addr_domain_valid(addr))
+                return 0;
+
+	/* Look for direct link to destination processsor */
+	n_ptr = node_find(addr);
+	if (n_ptr && node_has_active_links(n_ptr))
+                return n_ptr;
+
+	/* Cluster local system nodes *must* have direct links */
+	if (!is_slave(addr) && in_own_cluster(addr))
+		return 0;
+
+	/* Look for cluster local router with direct link to node */
+	router_addr = node_select_router(n_ptr, selector);
+	if (router_addr) 
+                return node_select(router_addr, selector);
+
+	/* Slave nodes can only be accessed within own cluster via a 
+	   known router with direct link -- if no router was found,give up */
+	if (is_slave(addr))
+		return 0;
+
+	/* Inter zone/cluster -- find any direct link to remote cluster */
+	addr = tipc_addr(tipc_zone(addr), tipc_cluster(addr), 0);
+	n_ptr = net_select_remote_node(addr, selector);
+	if (n_ptr && node_has_active_links(n_ptr))
+                return n_ptr;
+
+	/* Last resort -- look for any router to anywhere in remote zone */
+	router_addr =  net_select_router(addr, selector);
+	if (router_addr) 
+                return node_select(router_addr, selector);
+
+        return 0;
+}
+
+/**
+ * node_select_router - select router to reach specified node
+ * 
+ * Uses a deterministic and fair algorithm for selecting router node. 
+ */
+
+u32 node_select_router(struct node *n_ptr, u32 ref)
+{
+	u32 ulim;
+	u32 mask;
+	u32 start;
+	u32 r;
+
+        if (!n_ptr)
+                return 0;
+
+	if (n_ptr->last_router < 0)
+		return 0;
+	ulim = ((n_ptr->last_router + 1) * 32) - 1;
+
+	/* Start entry must be random */
+	mask = tipc_max_nodes;
+	while (mask > ulim)
+		mask >>= 1;
+	start = ref & mask;
+	r = start;
+
+	/* Lookup upwards with wrap-around */
+	do {
+		if (((n_ptr->routers[r / 32]) >> (r % 32)) & 1)
+			break;
+	} while (++r <= ulim);
+	if (r > ulim) {
+		r = 1;
+		do {
+			if (((n_ptr->routers[r / 32]) >> (r % 32)) & 1)
+				break;
+		} while (++r < start);
+		assert(r != start);
+	}
+	assert(r && (r <= ulim));
+	return tipc_addr(own_zone(), own_cluster(), r);
+}
+
+void node_add_router(struct node *n_ptr, u32 router)
+{
+	u32 r_num = tipc_node(router);
+
+	n_ptr->routers[r_num / 32] = 
+		((1 << (r_num % 32)) | n_ptr->routers[r_num / 32]);
+	n_ptr->last_router = tipc_max_nodes / 32;
+	while ((--n_ptr->last_router >= 0) && 
+	       !n_ptr->routers[n_ptr->last_router]);
+}
+
+void node_remove_router(struct node *n_ptr, u32 router)
+{
+	u32 r_num = tipc_node(router);
+
+	if (n_ptr->last_router < 0)
+		return;		/* No routes */
+
+	n_ptr->routers[r_num / 32] =
+		((~(1 << (r_num % 32))) & (n_ptr->routers[r_num / 32]));
+	n_ptr->last_router = tipc_max_nodes / 32;
+	while ((--n_ptr->last_router >= 0) && 
+	       !n_ptr->routers[n_ptr->last_router]);
+
+	if (!node_is_up(n_ptr))
+		node_lost_contact(n_ptr);
+}
+
+#if 0
+void node_print(struct print_buf *buf, struct node *n_ptr, char *str)
+{
+	u32 i;
+
+	tipc_printf(buf, "\n\n%s", str);
+	for (i = 0; i < MAX_BEARERS; i++) {
+		if (!n_ptr->links[i]) 
+			continue;
+		tipc_printf(buf, "Links[%u]: %x, ", i, n_ptr->links[i]);
+	}
+	tipc_printf(buf, "Active links: [%x,%x]\n",
+		    n_ptr->active_links[0], n_ptr->active_links[1]);
+}
+#endif
+
+u32 tipc_available_nodes(const u32 domain)
+{
+	struct node *n_ptr;
+	u32 cnt = 0;
+
+	for (n_ptr = nodes; n_ptr; n_ptr = n_ptr->next) {
+		if (!in_scope(domain, n_ptr->addr))
+			continue;
+		if (node_is_up(n_ptr))
+			cnt++;
+	}
+	return cnt;
+}
+
+struct sk_buff *node_get_nodes(const void *req_tlv_area, int req_tlv_space)
+{
+	u32 domain;
+	struct sk_buff *buf;
+	struct node *n_ptr;
+        struct tipc_node_info node_info;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NET_ADDR))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	domain = *(u32 *)TLV_DATA(req_tlv_area);
+	domain = ntohl(domain);
+	if (!addr_domain_valid(domain))
+		return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+					      " (network address)");
+
+        if (!nodes)
+                return cfg_reply_none();
+
+	/* For now, get space for all other nodes 
+	   (will need to modify this when slave nodes are supported */
+
+	buf = cfg_reply_alloc(TLV_SPACE(sizeof(node_info)) *
+			    (tipc_max_nodes - 1));
+	if (!buf)
+		return NULL;
+
+	/* Add TLVs for all nodes in scope */
+
+	for (n_ptr = nodes; n_ptr; n_ptr = n_ptr->next) {
+		if (!in_scope(domain, n_ptr->addr))
+			continue;
+                node_info.addr = htonl(n_ptr->addr);
+                node_info.up = htonl(node_is_up(n_ptr));
+		cfg_append_tlv(buf, TIPC_TLV_NODE_INFO, 
+			       &node_info, sizeof(node_info));
+	}
+
+	return buf;
+}
+
+struct sk_buff *node_get_links(const void *req_tlv_area, int req_tlv_space)
+{
+	u32 domain;
+	struct sk_buff *buf;
+	struct node *n_ptr;
+        struct tipc_link_info link_info;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NET_ADDR))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	domain = *(u32 *)TLV_DATA(req_tlv_area);
+	domain = ntohl(domain);
+	if (!addr_domain_valid(domain))
+		return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+					      " (network address)");
+
+        if (!nodes)
+                return cfg_reply_none();
+
+	/* For now, get space for 2 links to all other nodes + bcast link 
+	   (will need to modify this when slave nodes are supported */
+
+	buf = cfg_reply_alloc(TLV_SPACE(sizeof(link_info)) *
+			    (2 * (tipc_max_nodes - 1) + 1));
+	if (!buf)
+		return NULL;
+
+	/* Add TLV for broadcast link */
+
+        link_info.dest = tipc_own_addr & 0xfffff00;
+	link_info.dest = htonl(link_info.dest);
+        link_info.up = htonl(1);
+        sprintf(link_info.str, bc_link_name);
+	cfg_append_tlv(buf, TIPC_TLV_LINK_INFO, &link_info, sizeof(link_info));
+
+	/* Add TLVs for any other links in scope */
+
+	for (n_ptr = nodes; n_ptr; n_ptr = n_ptr->next) {
+                u32 i;
+
+		if (!in_scope(domain, n_ptr->addr))
+			continue;
+                for (i = 0; i < MAX_BEARERS; i++) {
+                        if (!n_ptr->links[i]) 
+                                continue;
+                        link_info.dest = htonl(n_ptr->addr);
+                        link_info.up = htonl(link_is_up(n_ptr->links[i]));
+                        strcpy(link_info.str, n_ptr->links[i]->name);
+			cfg_append_tlv(buf, TIPC_TLV_LINK_INFO, 
+				       &link_info, sizeof(link_info));
+                }
+	}
+
+	return buf;
+}
diff --git a/net/tipc/node.h b/net/tipc/node.h
new file mode 100644
index 000000000000..1f616873a724
--- /dev/null
+++ b/net/tipc/node.h
@@ -0,0 +1,141 @@
+/*
+ * net/tipc/node.h: Include file for TIPC node management routines
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_NODE_H
+#define _TIPC_NODE_H
+
+#include "node_subscr.h"
+#include "addr.h"
+#include "cluster.h"
+#include "bearer.h"
+
+/**
+ * struct node - TIPC node structure
+ * @addr: network address of node
+ * @lock: spinlock governing access to structure
+ * @owner: pointer to cluster that node belongs to
+ * @next: pointer to next node in sorted list of cluster's nodes
+ * @nsub: list of "node down" subscriptions monitoring node
+ * @active_links: pointers to active links to node
+ * @links: pointers to all links to node
+ * @link_cnt: number of links to node
+ * @permit_changeover: non-zero if node has redundant links to this system
+ * @routers: bitmap (used for multicluster communication)
+ * @last_router: (used for multicluster communication)
+ * @bclink: broadcast-related info
+ *    @supported: non-zero if node supports TIPC b'cast capability
+ *    @acked: sequence # of last outbound b'cast message acknowledged by node
+ *    @last_in: sequence # of last in-sequence b'cast message received from node
+ *    @gap_after: sequence # of last message not requiring a NAK request
+ *    @gap_to: sequence # of last message requiring a NAK request
+ *    @nack_sync: counter that determines when NAK requests should be sent
+ *    @deferred_head: oldest OOS b'cast message received from node
+ *    @deferred_tail: newest OOS b'cast message received from node
+ *    @defragm: list of partially reassembled b'cast message fragments from node
+ */
+ 
+struct node {
+	u32 addr;
+	spinlock_t lock;
+	struct cluster *owner;
+	struct node *next;
+	struct list_head nsub;
+	struct link *active_links[2];
+	struct link *links[MAX_BEARERS];
+	int link_cnt;
+	int permit_changeover;
+	u32 routers[512/32];
+	int last_router;
+	struct {
+		int supported;
+		u32 acked;
+		u32 last_in;
+		u32 gap_after; 
+		u32 gap_to; 
+		u32 nack_sync;
+		struct sk_buff *deferred_head;
+		struct sk_buff *deferred_tail;
+		struct sk_buff *defragm;
+	} bclink;
+};
+
+extern struct node *nodes;
+extern u32 tipc_own_tag;
+
+struct node *node_create(u32 addr);
+void node_delete(struct node *n_ptr);
+struct node *node_attach_link(struct link *l_ptr);
+void node_detach_link(struct node *n_ptr, struct link *l_ptr);
+void node_link_down(struct node *n_ptr, struct link *l_ptr);
+void node_link_up(struct node *n_ptr, struct link *l_ptr);
+int node_has_active_links(struct node *n_ptr);
+int node_has_redundant_links(struct node *n_ptr);
+u32 node_select_router(struct node *n_ptr, u32 ref);
+struct node *node_select_next_hop(u32 addr, u32 selector);
+int node_is_up(struct node *n_ptr);
+void node_add_router(struct node *n_ptr, u32 router);
+void node_remove_router(struct node *n_ptr, u32 router);
+struct sk_buff *node_get_links(const void *req_tlv_area, int req_tlv_space);
+struct sk_buff *node_get_nodes(const void *req_tlv_area, int req_tlv_space);
+
+static inline struct node *node_find(u32 addr)
+{
+	if (likely(in_own_cluster(addr)))
+		return local_nodes[tipc_node(addr)];
+	else if (addr_domain_valid(addr)) {
+		struct cluster *c_ptr = cluster_find(addr);
+
+		if (c_ptr)
+			return c_ptr->nodes[tipc_node(addr)];
+	}
+	return 0;
+}
+
+static inline struct node *node_select(u32 addr, u32 selector)
+{
+	if (likely(in_own_cluster(addr)))
+		return local_nodes[tipc_node(addr)];
+	return node_select_next_hop(addr, selector);
+}
+
+static inline void node_lock(struct node *n_ptr)
+{
+	spin_lock_bh(&n_ptr->lock);
+}
+
+static inline void node_unlock(struct node *n_ptr)
+{
+	spin_unlock_bh(&n_ptr->lock);
+}
+
+#endif
diff --git a/net/tipc/node_subscr.c b/net/tipc/node_subscr.c
new file mode 100644
index 000000000000..c38a3b0e847d
--- /dev/null
+++ b/net/tipc/node_subscr.c
@@ -0,0 +1,76 @@
+/*
+ * net/tipc/node_subscr.c: TIPC "node down" subscription handling
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "dbg.h"
+#include "node_subscr.h"
+#include "node.h"
+#include "addr.h"
+
+/**
+ * nodesub_subscribe - create "node down" subscription for specified node
+ */
+
+void nodesub_subscribe(struct node_subscr *node_sub, u32 addr, 
+		       void *usr_handle, net_ev_handler handle_down)
+{
+	node_sub->node = 0;
+	if (addr == tipc_own_addr)
+		return;
+	if (!addr_node_valid(addr)) {
+		warn("node_subscr with illegal %x\n", addr);
+		return;
+	}
+
+	node_sub->handle_node_down = handle_down;
+	node_sub->usr_handle = usr_handle;
+	node_sub->node = node_find(addr);
+	assert(node_sub->node);
+	node_lock(node_sub->node);
+	list_add_tail(&node_sub->nodesub_list, &node_sub->node->nsub);
+	node_unlock(node_sub->node);
+}
+
+/**
+ * nodesub_unsubscribe - cancel "node down" subscription (if any)
+ */
+
+void nodesub_unsubscribe(struct node_subscr *node_sub)
+{
+	if (!node_sub->node)
+		return;
+
+	node_lock(node_sub->node);
+	list_del_init(&node_sub->nodesub_list);
+	node_unlock(node_sub->node);
+}
diff --git a/net/tipc/node_subscr.h b/net/tipc/node_subscr.h
new file mode 100644
index 000000000000..63ae92e0c82e
--- /dev/null
+++ b/net/tipc/node_subscr.h
@@ -0,0 +1,60 @@
+/*
+ * net/tipc/node_subscr.h: Include file for TIPC "node down" subscription handling
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_NODE_SUBSCR_H
+#define _TIPC_NODE_SUBSCR_H
+
+#include "addr.h"
+
+typedef void (*net_ev_handler) (void *usr_handle);
+
+/**
+ * struct node_subscr - "node down" subscription entry
+ * @node: ptr to node structure of interest (or NULL, if none)
+ * @handle_node_down: routine to invoke when node fails
+ * @usr_handle: argument to pass to routine when node fails
+ * @nodesub_list: adjacent entries in list of subscriptions for the node
+ */
+
+struct node_subscr {
+	struct node *node;
+	net_ev_handler handle_node_down;
+	void *usr_handle;
+	struct list_head nodesub_list;
+};
+
+void nodesub_subscribe(struct node_subscr *node_sub, u32 addr,
+		       void *usr_handle, net_ev_handler handle_down);
+void nodesub_unsubscribe(struct node_subscr *node_sub);
+
+#endif
diff --git a/net/tipc/port.c b/net/tipc/port.c
new file mode 100644
index 000000000000..f0f228c9589a
--- /dev/null
+++ b/net/tipc/port.c
@@ -0,0 +1,1704 @@
+/*
+ * net/tipc/port.c: TIPC port code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2004-2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "config.h"
+#include "dbg.h"
+#include "port.h"
+#include "addr.h"
+#include "link.h"
+#include "node.h"
+#include "port.h"
+#include "name_table.h"
+#include "user_reg.h"
+#include "msg.h"
+#include "bcast.h"
+
+/* Connection management: */
+#define PROBING_INTERVAL 3600000	/* [ms] => 1 h */
+#define CONFIRMED 0
+#define PROBING 1
+
+#define MAX_REJECT_SIZE 1024
+
+static struct sk_buff *msg_queue_head = 0;
+static struct sk_buff *msg_queue_tail = 0;
+
+spinlock_t port_list_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t queue_lock = SPIN_LOCK_UNLOCKED;
+
+LIST_HEAD(ports);
+static void port_handle_node_down(unsigned long ref);
+static struct sk_buff* port_build_self_abort_msg(struct port *,u32 err);
+static struct sk_buff* port_build_peer_abort_msg(struct port *,u32 err);
+static void port_timeout(unsigned long ref);
+
+
+static inline u32 port_peernode(struct port *p_ptr)
+{
+	return msg_destnode(&p_ptr->publ.phdr);
+}
+
+static inline u32 port_peerport(struct port *p_ptr)
+{
+	return msg_destport(&p_ptr->publ.phdr);
+}
+
+static inline u32 port_out_seqno(struct port *p_ptr)
+{
+	return msg_transp_seqno(&p_ptr->publ.phdr);
+}
+
+static inline void port_set_out_seqno(struct port *p_ptr, u32 seqno) 
+{
+	msg_set_transp_seqno(&p_ptr->publ.phdr,seqno);
+}
+
+static inline void port_incr_out_seqno(struct port *p_ptr)
+{
+	struct tipc_msg *m = &p_ptr->publ.phdr;
+
+	if (likely(!msg_routed(m)))
+		return;
+	msg_set_transp_seqno(m, (msg_transp_seqno(m) + 1));
+}
+
+/**
+ * tipc_multicast - send a multicast message to local and remote destinations
+ */
+
+int tipc_multicast(u32 ref, struct tipc_name_seq const *seq, u32 domain,
+		   u32 num_sect, struct iovec const *msg_sect)
+{
+	struct tipc_msg *hdr;
+	struct sk_buff *buf;
+	struct sk_buff *ibuf = NULL;
+	struct port_list dports = {0, NULL, };
+	struct port *oport = port_deref(ref);
+	int ext_targets;
+	int res;
+
+	if (unlikely(!oport))
+		return -EINVAL;
+
+	/* Create multicast message */
+
+	hdr = &oport->publ.phdr;
+	msg_set_type(hdr, TIPC_MCAST_MSG);
+	msg_set_nametype(hdr, seq->type);
+	msg_set_namelower(hdr, seq->lower);
+	msg_set_nameupper(hdr, seq->upper);
+	msg_set_hdr_sz(hdr, MCAST_H_SIZE);
+	res = msg_build(hdr, msg_sect, num_sect, MAX_MSG_SIZE,
+			!oport->user_port, &buf);
+	if (unlikely(!buf))
+		return res;
+
+	/* Figure out where to send multicast message */
+
+	ext_targets = nametbl_mc_translate(seq->type, seq->lower, seq->upper,
+					   TIPC_NODE_SCOPE, &dports);
+	
+	/* Send message to destinations (duplicate it only if necessary) */ 
+
+	if (ext_targets) {
+		if (dports.count != 0) {
+			ibuf = skb_copy(buf, GFP_ATOMIC);
+			if (ibuf == NULL) {
+				port_list_free(&dports);
+				buf_discard(buf);
+				return -ENOMEM;
+			}
+		}
+		res = bclink_send_msg(buf);
+		if ((res < 0) && (dports.count != 0)) {
+			buf_discard(ibuf);
+		}
+	} else {
+		ibuf = buf;
+	}
+
+	if (res >= 0) {
+		if (ibuf)
+			port_recv_mcast(ibuf, &dports);
+	} else {
+		port_list_free(&dports);
+	}
+	return res;
+}
+
+/**
+ * port_recv_mcast - deliver multicast message to all destination ports
+ * 
+ * If there is no port list, perform a lookup to create one
+ */
+
+void port_recv_mcast(struct sk_buff *buf, struct port_list *dp)
+{
+	struct tipc_msg* msg;
+	struct port_list dports = {0, NULL, };
+	struct port_list *item = dp;
+	int cnt = 0;
+
+	assert(buf);
+	msg = buf_msg(buf);
+
+	/* Create destination port list, if one wasn't supplied */
+
+	if (dp == NULL) {
+		nametbl_mc_translate(msg_nametype(msg),
+				     msg_namelower(msg),
+				     msg_nameupper(msg),
+				     TIPC_CLUSTER_SCOPE,
+				     &dports);
+		item = dp = &dports;
+	}
+
+	/* Deliver a copy of message to each destination port */
+
+	if (dp->count != 0) {
+		if (dp->count == 1) {
+			msg_set_destport(msg, dp->ports[0]);
+			port_recv_msg(buf);
+			port_list_free(dp);
+			return;
+		}
+		for (; cnt < dp->count; cnt++) {
+			int index = cnt % PLSIZE;
+			struct sk_buff *b = skb_clone(buf, GFP_ATOMIC);
+
+			if (b == NULL) {
+				warn("Buffer allocation failure\n");
+				msg_dbg(msg, "LOST:");
+				goto exit;
+			}
+			if ((index == 0) && (cnt != 0)) {
+				item = item->next;
+			}
+			msg_set_destport(buf_msg(b),item->ports[index]);
+			port_recv_msg(b);
+		}
+	}
+exit:
+	buf_discard(buf);
+	port_list_free(dp);
+}
+
+/**
+ * tipc_createport_raw - create a native TIPC port
+ * 
+ * Returns local port reference
+ */
+
+u32 tipc_createport_raw(void *usr_handle,
+			u32 (*dispatcher)(struct tipc_port *, struct sk_buff *),
+			void (*wakeup)(struct tipc_port *),
+			const u32 importance)
+{
+	struct port *p_ptr;
+	struct tipc_msg *msg;
+	u32 ref;
+
+	p_ptr = kmalloc(sizeof(*p_ptr), GFP_ATOMIC);
+	if (p_ptr == NULL) {
+		warn("Memory squeeze; failed to create port\n");
+		return 0;
+	}
+	memset(p_ptr, 0, sizeof(*p_ptr));
+	ref = ref_acquire(p_ptr, &p_ptr->publ.lock);
+	if (!ref) {
+		warn("Reference Table Exhausted\n");
+		kfree(p_ptr);
+		return 0;
+	}
+
+	port_lock(ref);
+	p_ptr->publ.ref = ref;
+	msg = &p_ptr->publ.phdr;
+	msg_init(msg, DATA_LOW, TIPC_NAMED_MSG, TIPC_OK, LONG_H_SIZE, 0);
+	msg_set_orignode(msg, tipc_own_addr);
+	msg_set_prevnode(msg, tipc_own_addr);
+	msg_set_origport(msg, ref);
+	msg_set_importance(msg,importance);
+	p_ptr->last_in_seqno = 41;
+	p_ptr->sent = 1;
+	p_ptr->publ.usr_handle = usr_handle;
+	INIT_LIST_HEAD(&p_ptr->wait_list);
+	INIT_LIST_HEAD(&p_ptr->subscription.nodesub_list);
+	p_ptr->congested_link = 0;
+	p_ptr->max_pkt = MAX_PKT_DEFAULT;
+	p_ptr->dispatcher = dispatcher;
+	p_ptr->wakeup = wakeup;
+	p_ptr->user_port = 0;
+	k_init_timer(&p_ptr->timer, (Handler)port_timeout, ref);
+	spin_lock_bh(&port_list_lock);
+	INIT_LIST_HEAD(&p_ptr->publications);
+	INIT_LIST_HEAD(&p_ptr->port_list);
+	list_add_tail(&p_ptr->port_list, &ports);
+	spin_unlock_bh(&port_list_lock);
+	port_unlock(p_ptr);
+	return ref;
+}
+
+int tipc_deleteport(u32 ref)
+{
+	struct port *p_ptr;
+	struct sk_buff *buf = 0;
+
+	tipc_withdraw(ref, 0, 0);
+	p_ptr = port_lock(ref);
+	if (!p_ptr) 
+		return -EINVAL;
+
+	ref_discard(ref);
+	port_unlock(p_ptr);
+
+	k_cancel_timer(&p_ptr->timer);
+	if (p_ptr->publ.connected) {
+		buf = port_build_peer_abort_msg(p_ptr, TIPC_ERR_NO_PORT);
+		nodesub_unsubscribe(&p_ptr->subscription);
+	}
+	if (p_ptr->user_port) {
+		reg_remove_port(p_ptr->user_port);
+		kfree(p_ptr->user_port);
+	}
+
+	spin_lock_bh(&port_list_lock);
+	list_del(&p_ptr->port_list);
+	list_del(&p_ptr->wait_list);
+	spin_unlock_bh(&port_list_lock);
+	k_term_timer(&p_ptr->timer);
+	kfree(p_ptr);
+	dbg("Deleted port %u\n", ref);
+	net_route_msg(buf);
+	return TIPC_OK;
+}
+
+/**
+ * tipc_get_port() - return port associated with 'ref'
+ * 
+ * Note: Port is not locked.
+ */
+
+struct tipc_port *tipc_get_port(const u32 ref)
+{
+	return (struct tipc_port *)ref_deref(ref);
+}
+
+/**
+ * tipc_get_handle - return user handle associated to port 'ref'
+ */
+
+void *tipc_get_handle(const u32 ref)
+{
+	struct port *p_ptr;
+	void * handle;
+
+	p_ptr = port_lock(ref);
+	if (!p_ptr)
+		return 0;
+	handle = p_ptr->publ.usr_handle;
+	port_unlock(p_ptr);
+	return handle;
+}
+
+static inline int port_unreliable(struct port *p_ptr)
+{
+	return msg_src_droppable(&p_ptr->publ.phdr);
+}
+
+int tipc_portunreliable(u32 ref, unsigned int *isunreliable)
+{
+	struct port *p_ptr;
+	
+	p_ptr = port_lock(ref);
+	if (!p_ptr)
+		return -EINVAL;
+	*isunreliable = port_unreliable(p_ptr);
+	spin_unlock_bh(p_ptr->publ.lock);
+	return TIPC_OK;
+}
+
+int tipc_set_portunreliable(u32 ref, unsigned int isunreliable)
+{
+	struct port *p_ptr;
+	
+	p_ptr = port_lock(ref);
+	if (!p_ptr)
+		return -EINVAL;
+	msg_set_src_droppable(&p_ptr->publ.phdr, (isunreliable != 0));
+	port_unlock(p_ptr);
+	return TIPC_OK;
+}
+
+static inline int port_unreturnable(struct port *p_ptr)
+{
+	return msg_dest_droppable(&p_ptr->publ.phdr);
+}
+
+int tipc_portunreturnable(u32 ref, unsigned int *isunrejectable)
+{
+	struct port *p_ptr;
+	
+	p_ptr = port_lock(ref);
+	if (!p_ptr)
+		return -EINVAL;
+	*isunrejectable = port_unreturnable(p_ptr);
+	spin_unlock_bh(p_ptr->publ.lock);
+	return TIPC_OK;
+}
+
+int tipc_set_portunreturnable(u32 ref, unsigned int isunrejectable)
+{
+	struct port *p_ptr;
+	
+	p_ptr = port_lock(ref);
+	if (!p_ptr)
+		return -EINVAL;
+	msg_set_dest_droppable(&p_ptr->publ.phdr, (isunrejectable != 0));
+	port_unlock(p_ptr);
+	return TIPC_OK;
+}
+
+/* 
+ * port_build_proto_msg(): build a port level protocol 
+ * or a connection abortion message. Called with 
+ * tipc_port lock on.
+ */
+static struct sk_buff *port_build_proto_msg(u32 destport, u32 destnode,
+					    u32 origport, u32 orignode,
+					    u32 usr, u32 type, u32 err, 
+					    u32 seqno, u32 ack)
+{
+	struct sk_buff *buf;
+	struct tipc_msg *msg;
+	
+	buf = buf_acquire(LONG_H_SIZE);
+	if (buf) {
+		msg = buf_msg(buf);
+		msg_init(msg, usr, type, err, LONG_H_SIZE, destnode);
+		msg_set_destport(msg, destport);
+		msg_set_origport(msg, origport);
+		msg_set_destnode(msg, destnode);
+		msg_set_orignode(msg, orignode);
+		msg_set_transp_seqno(msg, seqno);
+		msg_set_msgcnt(msg, ack);
+		msg_dbg(msg, "PORT>SEND>:");
+	}
+	return buf;
+}
+
+int tipc_set_msg_option(struct tipc_port *tp_ptr, const char *opt, const u32 sz)
+{
+	msg_expand(&tp_ptr->phdr, msg_destnode(&tp_ptr->phdr));
+	msg_set_options(&tp_ptr->phdr, opt, sz);
+	return TIPC_OK;
+}
+
+int tipc_reject_msg(struct sk_buff *buf, u32 err)
+{
+	struct tipc_msg *msg = buf_msg(buf);
+	struct sk_buff *rbuf;
+	struct tipc_msg *rmsg;
+	int hdr_sz;
+	u32 imp = msg_importance(msg);
+	u32 data_sz = msg_data_sz(msg);
+
+	if (data_sz > MAX_REJECT_SIZE)
+		data_sz = MAX_REJECT_SIZE;
+	if (msg_connected(msg) && (imp < TIPC_CRITICAL_IMPORTANCE))
+		imp++;
+	msg_dbg(msg, "port->rej: ");
+
+	/* discard rejected message if it shouldn't be returned to sender */
+	if (msg_errcode(msg) || msg_dest_droppable(msg)) {
+		buf_discard(buf);
+		return data_sz;
+	}
+
+	/* construct rejected message */
+	if (msg_mcast(msg))
+		hdr_sz = MCAST_H_SIZE;
+	else
+		hdr_sz = LONG_H_SIZE;
+	rbuf = buf_acquire(data_sz + hdr_sz);
+	if (rbuf == NULL) {
+		buf_discard(buf);
+		return data_sz;
+	}
+	rmsg = buf_msg(rbuf);
+	msg_init(rmsg, imp, msg_type(msg), err, hdr_sz, msg_orignode(msg));
+	msg_set_destport(rmsg, msg_origport(msg));
+	msg_set_prevnode(rmsg, tipc_own_addr);
+	msg_set_origport(rmsg, msg_destport(msg));
+	if (msg_short(msg))
+		msg_set_orignode(rmsg, tipc_own_addr);
+	else
+		msg_set_orignode(rmsg, msg_destnode(msg));
+	msg_set_size(rmsg, data_sz + hdr_sz); 
+	msg_set_nametype(rmsg, msg_nametype(msg));
+	msg_set_nameinst(rmsg, msg_nameinst(msg));
+	memcpy(rbuf->data + hdr_sz, msg_data(msg), data_sz);
+
+	/* send self-abort message when rejecting on a connected port */
+	if (msg_connected(msg)) {
+		struct sk_buff *abuf = 0;
+		struct port *p_ptr = port_lock(msg_destport(msg));
+
+		if (p_ptr) {
+			if (p_ptr->publ.connected)
+				abuf = port_build_self_abort_msg(p_ptr, err);
+			port_unlock(p_ptr);
+		}
+		net_route_msg(abuf);
+	}
+
+	/* send rejected message */
+	buf_discard(buf);
+	net_route_msg(rbuf);
+	return data_sz;
+}
+
+int port_reject_sections(struct port *p_ptr, struct tipc_msg *hdr,
+			 struct iovec const *msg_sect, u32 num_sect,
+			 int err)
+{
+	struct sk_buff *buf;
+	int res;
+
+	res = msg_build(hdr, msg_sect, num_sect, MAX_MSG_SIZE, 
+			!p_ptr->user_port, &buf);
+	if (!buf)
+		return res;
+
+	return tipc_reject_msg(buf, err);
+}
+
+static void port_timeout(unsigned long ref)
+{
+	struct port *p_ptr = port_lock(ref);
+	struct sk_buff *buf = 0;
+
+	if (!p_ptr || !p_ptr->publ.connected)
+		return;
+
+	/* Last probe answered ? */
+	if (p_ptr->probing_state == PROBING) {
+		buf = port_build_self_abort_msg(p_ptr, TIPC_ERR_NO_PORT);
+	} else {
+		buf = port_build_proto_msg(port_peerport(p_ptr),
+					   port_peernode(p_ptr),
+					   p_ptr->publ.ref,
+					   tipc_own_addr,
+					   CONN_MANAGER,
+					   CONN_PROBE,
+					   TIPC_OK, 
+					   port_out_seqno(p_ptr),
+					   0);
+		port_incr_out_seqno(p_ptr);
+		p_ptr->probing_state = PROBING;
+		k_start_timer(&p_ptr->timer, p_ptr->probing_interval);
+	}
+	port_unlock(p_ptr);
+	net_route_msg(buf);
+}
+
+
+static void port_handle_node_down(unsigned long ref)
+{
+	struct port *p_ptr = port_lock(ref);
+	struct sk_buff* buf = 0;
+
+	if (!p_ptr)
+		return;
+	buf = port_build_self_abort_msg(p_ptr, TIPC_ERR_NO_NODE);
+	port_unlock(p_ptr);
+	net_route_msg(buf);
+}
+
+
+static struct sk_buff *port_build_self_abort_msg(struct port *p_ptr, u32 err)
+{
+	u32 imp = msg_importance(&p_ptr->publ.phdr);
+
+	if (!p_ptr->publ.connected)
+		return 0;
+	if (imp < TIPC_CRITICAL_IMPORTANCE)
+		imp++;
+	return port_build_proto_msg(p_ptr->publ.ref,
+				    tipc_own_addr,
+				    port_peerport(p_ptr),
+				    port_peernode(p_ptr),
+				    imp,
+				    TIPC_CONN_MSG,
+				    err, 
+				    p_ptr->last_in_seqno + 1,
+				    0);
+}
+
+
+static struct sk_buff *port_build_peer_abort_msg(struct port *p_ptr, u32 err)
+{
+	u32 imp = msg_importance(&p_ptr->publ.phdr);
+
+	if (!p_ptr->publ.connected)
+		return 0;
+	if (imp < TIPC_CRITICAL_IMPORTANCE)
+		imp++;
+	return port_build_proto_msg(port_peerport(p_ptr),
+				    port_peernode(p_ptr),
+				    p_ptr->publ.ref,
+				    tipc_own_addr,
+				    imp,
+				    TIPC_CONN_MSG,
+				    err, 
+				    port_out_seqno(p_ptr),
+				    0);
+}
+
+void port_recv_proto_msg(struct sk_buff *buf)
+{
+	struct tipc_msg *msg = buf_msg(buf);
+	struct port *p_ptr = port_lock(msg_destport(msg));
+	u32 err = TIPC_OK;
+	struct sk_buff *r_buf = 0;
+	struct sk_buff *abort_buf = 0;
+
+	msg_dbg(msg, "PORT<RECV<:");
+
+	if (!p_ptr) {
+		err = TIPC_ERR_NO_PORT;
+	} else if (p_ptr->publ.connected) {
+		if (port_peernode(p_ptr) != msg_orignode(msg))
+			err = TIPC_ERR_NO_PORT;
+		if (port_peerport(p_ptr) != msg_origport(msg))
+			err = TIPC_ERR_NO_PORT;
+		if (!err && msg_routed(msg)) {
+			u32 seqno = msg_transp_seqno(msg);
+			u32 myno =  ++p_ptr->last_in_seqno;
+			if (seqno != myno) {
+				err = TIPC_ERR_NO_PORT;
+				abort_buf = port_build_self_abort_msg(p_ptr, err);
+			}
+		}
+		if (msg_type(msg) == CONN_ACK) {
+			int wakeup = port_congested(p_ptr) && 
+				     p_ptr->publ.congested &&
+				     p_ptr->wakeup;
+			p_ptr->acked += msg_msgcnt(msg);
+			if (port_congested(p_ptr))
+				goto exit;
+			p_ptr->publ.congested = 0;
+			if (!wakeup)
+				goto exit;
+			p_ptr->wakeup(&p_ptr->publ);
+			goto exit;
+		}
+	} else if (p_ptr->publ.published) {
+		err = TIPC_ERR_NO_PORT;
+	}
+	if (err) {
+		r_buf = port_build_proto_msg(msg_origport(msg),
+					     msg_orignode(msg), 
+					     msg_destport(msg), 
+					     tipc_own_addr,
+					     DATA_HIGH,
+					     TIPC_CONN_MSG,
+					     err,
+					     0,
+					     0);
+		goto exit;
+	}
+
+	/* All is fine */
+	if (msg_type(msg) == CONN_PROBE) {
+		r_buf = port_build_proto_msg(msg_origport(msg), 
+					     msg_orignode(msg), 
+					     msg_destport(msg), 
+					     tipc_own_addr, 
+					     CONN_MANAGER,
+					     CONN_PROBE_REPLY,
+					     TIPC_OK,
+					     port_out_seqno(p_ptr),
+					     0);
+	}
+	p_ptr->probing_state = CONFIRMED;
+	port_incr_out_seqno(p_ptr);
+exit:
+	if (p_ptr)
+		port_unlock(p_ptr);
+	net_route_msg(r_buf);
+	net_route_msg(abort_buf);
+	buf_discard(buf);
+}
+
+static void port_print(struct port *p_ptr, struct print_buf *buf, int full_id)
+{
+        struct publication *publ;
+
+	if (full_id)
+		tipc_printf(buf, "<%u.%u.%u:%u>:", 
+			    tipc_zone(tipc_own_addr), tipc_cluster(tipc_own_addr),
+                            tipc_node(tipc_own_addr), p_ptr->publ.ref);
+	else
+		tipc_printf(buf, "%-10u:", p_ptr->publ.ref);
+
+        if (p_ptr->publ.connected) {
+                u32 dport = port_peerport(p_ptr);
+                u32 destnode = port_peernode(p_ptr);
+
+                tipc_printf(buf, " connected to <%u.%u.%u:%u>",
+                            tipc_zone(destnode), tipc_cluster(destnode),
+                            tipc_node(destnode), dport);
+                if (p_ptr->publ.conn_type != 0)
+                        tipc_printf(buf, " via {%u,%u}",
+                                    p_ptr->publ.conn_type,
+                                    p_ptr->publ.conn_instance);
+        }
+        else if (p_ptr->publ.published) {
+                tipc_printf(buf, " bound to");
+                list_for_each_entry(publ, &p_ptr->publications, pport_list) {
+			if (publ->lower == publ->upper)
+				tipc_printf(buf, " {%u,%u}", publ->type,
+					    publ->lower);
+			else
+				tipc_printf(buf, " {%u,%u,%u}", publ->type, 
+					    publ->lower, publ->upper);
+                }
+        }
+        tipc_printf(buf, "\n");
+}
+
+#define MAX_PORT_QUERY 32768
+
+struct sk_buff *port_get_ports(void)
+{
+	struct sk_buff *buf;
+	struct tlv_desc *rep_tlv;
+	struct print_buf pb;
+	struct port *p_ptr;
+	int str_len;
+
+	buf = cfg_reply_alloc(TLV_SPACE(MAX_PORT_QUERY));
+	if (!buf)
+		return NULL;
+	rep_tlv = (struct tlv_desc *)buf->data;
+
+	printbuf_init(&pb, TLV_DATA(rep_tlv), MAX_PORT_QUERY);
+	spin_lock_bh(&port_list_lock);
+	list_for_each_entry(p_ptr, &ports, port_list) {
+		spin_lock_bh(p_ptr->publ.lock);
+		port_print(p_ptr, &pb, 0);
+		spin_unlock_bh(p_ptr->publ.lock);
+	}
+	spin_unlock_bh(&port_list_lock);
+	str_len = printbuf_validate(&pb);
+
+	skb_put(buf, TLV_SPACE(str_len));
+	TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len);
+
+	return buf;
+}
+
+#if 0
+
+#define MAX_PORT_STATS 2000
+
+struct sk_buff *port_show_stats(const void *req_tlv_area, int req_tlv_space)
+{
+	u32 ref;
+	struct port *p_ptr;
+	struct sk_buff *buf;
+	struct tlv_desc *rep_tlv;
+	struct print_buf pb;
+	int str_len;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_PORT_REF))
+		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	ref = *(u32 *)TLV_DATA(req_tlv_area);
+	ref = ntohl(ref);
+
+	p_ptr = port_lock(ref);
+	if (!p_ptr)
+		return cfg_reply_error_string("port not found");
+
+	buf = cfg_reply_alloc(TLV_SPACE(MAX_PORT_STATS));
+	if (!buf) {
+		port_unlock(p_ptr);
+		return NULL;
+	}
+	rep_tlv = (struct tlv_desc *)buf->data;
+
+	printbuf_init(&pb, TLV_DATA(rep_tlv), MAX_PORT_STATS);
+	port_print(p_ptr, &pb, 1);
+	/* NEED TO FILL IN ADDITIONAL PORT STATISTICS HERE */
+	port_unlock(p_ptr);
+	str_len = printbuf_validate(&pb);
+
+	skb_put(buf, TLV_SPACE(str_len));
+	TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len);
+
+	return buf;
+}
+
+#endif
+
+void port_reinit(void)
+{
+	struct port *p_ptr;
+	struct tipc_msg *msg;
+
+	spin_lock_bh(&port_list_lock);
+	list_for_each_entry(p_ptr, &ports, port_list) {
+		msg = &p_ptr->publ.phdr;
+		if (msg_orignode(msg) == tipc_own_addr)
+			break;
+		msg_set_orignode(msg, tipc_own_addr);
+	}
+	spin_unlock_bh(&port_list_lock);
+}
+
+
+/*
+ *  port_dispatcher_sigh(): Signal handler for messages destinated
+ *                          to the tipc_port interface.
+ */
+
+static void port_dispatcher_sigh(void *dummy)
+{
+	struct sk_buff *buf;
+
+	spin_lock_bh(&queue_lock);
+	buf = msg_queue_head;
+	msg_queue_head = 0;
+	spin_unlock_bh(&queue_lock);
+
+	while (buf) {
+		struct port *p_ptr;
+		struct user_port *up_ptr;
+		struct tipc_portid orig;
+		struct tipc_name_seq dseq;
+		void *usr_handle;
+		int connected;
+		int published;
+
+		struct sk_buff *next = buf->next;
+		struct tipc_msg *msg = buf_msg(buf);
+		u32 dref = msg_destport(msg);
+		
+		p_ptr = port_lock(dref);
+		if (!p_ptr) {
+			/* Port deleted while msg in queue */
+			tipc_reject_msg(buf, TIPC_ERR_NO_PORT);
+			buf = next;
+			continue;
+		}
+		orig.ref = msg_origport(msg);
+		orig.node = msg_orignode(msg);
+		up_ptr = p_ptr->user_port;
+		usr_handle = up_ptr->usr_handle;
+		connected = p_ptr->publ.connected;
+		published = p_ptr->publ.published;
+
+		if (unlikely(msg_errcode(msg)))
+			goto err;
+
+		switch (msg_type(msg)) {
+		
+		case TIPC_CONN_MSG:{
+				tipc_conn_msg_event cb = up_ptr->conn_msg_cb;
+				u32 peer_port = port_peerport(p_ptr);
+				u32 peer_node = port_peernode(p_ptr);
+
+				spin_unlock_bh(p_ptr->publ.lock);
+				if (unlikely(!connected)) {
+					if (unlikely(published))
+						goto reject;
+					tipc_connect2port(dref,&orig);
+				}
+				if (unlikely(msg_origport(msg) != peer_port))
+					goto reject;
+				if (unlikely(msg_orignode(msg) != peer_node))
+					goto reject;
+				if (unlikely(!cb))
+					goto reject;
+				if (unlikely(++p_ptr->publ.conn_unacked >= 
+					     TIPC_FLOW_CONTROL_WIN))
+					tipc_acknowledge(dref, 
+							 p_ptr->publ.conn_unacked);
+				skb_pull(buf, msg_hdr_sz(msg));
+				cb(usr_handle, dref, &buf, msg_data(msg),
+				   msg_data_sz(msg));
+				break;
+			}
+		case TIPC_DIRECT_MSG:{
+				tipc_msg_event cb = up_ptr->msg_cb;
+
+				spin_unlock_bh(p_ptr->publ.lock);
+				if (unlikely(connected))
+					goto reject;
+				if (unlikely(!cb))
+					goto reject;
+				skb_pull(buf, msg_hdr_sz(msg));
+				cb(usr_handle, dref, &buf, msg_data(msg), 
+				   msg_data_sz(msg), msg_importance(msg),
+				   &orig);
+				break;
+			}
+		case TIPC_NAMED_MSG:{
+				tipc_named_msg_event cb = up_ptr->named_msg_cb;
+
+				spin_unlock_bh(p_ptr->publ.lock);
+				if (unlikely(connected))
+					goto reject;
+				if (unlikely(!cb))
+					goto reject;
+				if (unlikely(!published))
+					goto reject;
+				dseq.type =  msg_nametype(msg);
+				dseq.lower = msg_nameinst(msg);
+				dseq.upper = dseq.lower;
+				skb_pull(buf, msg_hdr_sz(msg));
+				cb(usr_handle, dref, &buf, msg_data(msg), 
+				   msg_data_sz(msg), msg_importance(msg),
+				   &orig, &dseq);
+				break;
+			}
+		}
+		if (buf)
+			buf_discard(buf);
+		buf = next;
+		continue;
+err:
+		switch (msg_type(msg)) {
+		
+		case TIPC_CONN_MSG:{
+				tipc_conn_shutdown_event cb = 
+					up_ptr->conn_err_cb;
+				u32 peer_port = port_peerport(p_ptr);
+				u32 peer_node = port_peernode(p_ptr);
+
+				spin_unlock_bh(p_ptr->publ.lock);
+				if (!connected || !cb)
+					break;
+				if (msg_origport(msg) != peer_port)
+					break;
+				if (msg_orignode(msg) != peer_node)
+					break;
+				tipc_disconnect(dref);
+				skb_pull(buf, msg_hdr_sz(msg));
+				cb(usr_handle, dref, &buf, msg_data(msg),
+				   msg_data_sz(msg), msg_errcode(msg));
+				break;
+			}
+		case TIPC_DIRECT_MSG:{
+				tipc_msg_err_event cb = up_ptr->err_cb;
+
+				spin_unlock_bh(p_ptr->publ.lock);
+				if (connected || !cb)
+					break;
+				skb_pull(buf, msg_hdr_sz(msg));
+				cb(usr_handle, dref, &buf, msg_data(msg),
+				   msg_data_sz(msg), msg_errcode(msg), &orig);
+				break;
+			}
+		case TIPC_NAMED_MSG:{
+				tipc_named_msg_err_event cb = 
+					up_ptr->named_err_cb;
+
+				spin_unlock_bh(p_ptr->publ.lock);
+				if (connected || !cb)
+					break;
+				dseq.type =  msg_nametype(msg);
+				dseq.lower = msg_nameinst(msg);
+				dseq.upper = dseq.lower;
+				skb_pull(buf, msg_hdr_sz(msg));
+				cb(usr_handle, dref, &buf, msg_data(msg), 
+				   msg_data_sz(msg), msg_errcode(msg), &dseq);
+				break;
+			}
+		}
+		if (buf)
+			buf_discard(buf);
+		buf = next;
+		continue;
+reject:
+		tipc_reject_msg(buf, TIPC_ERR_NO_PORT);
+		buf = next;
+	}
+}
+
+/*
+ *  port_dispatcher(): Dispatcher for messages destinated
+ *  to the tipc_port interface. Called with port locked.
+ */
+
+static u32 port_dispatcher(struct tipc_port *dummy, struct sk_buff *buf)
+{
+	buf->next = NULL;
+	spin_lock_bh(&queue_lock);
+	if (msg_queue_head) {
+		msg_queue_tail->next = buf;
+		msg_queue_tail = buf;
+	} else {
+		msg_queue_tail = msg_queue_head = buf;
+		k_signal((Handler)port_dispatcher_sigh, 0);
+	}
+	spin_unlock_bh(&queue_lock);
+	return TIPC_OK;
+}
+
+/* 
+ * Wake up port after congestion: Called with port locked,
+ *                                
+ */
+
+static void port_wakeup_sh(unsigned long ref)
+{
+	struct port *p_ptr;
+	struct user_port *up_ptr;
+	tipc_continue_event cb = 0;
+	void *uh = 0;
+
+	p_ptr = port_lock(ref);
+	if (p_ptr) {
+		up_ptr = p_ptr->user_port;
+		if (up_ptr) {
+			cb = up_ptr->continue_event_cb;
+			uh = up_ptr->usr_handle;
+		}
+		port_unlock(p_ptr);
+	}
+	if (cb)
+		cb(uh, ref);
+}
+
+
+static void port_wakeup(struct tipc_port *p_ptr)
+{
+	k_signal((Handler)port_wakeup_sh, p_ptr->ref);
+}
+
+void tipc_acknowledge(u32 ref, u32 ack)
+{
+	struct port *p_ptr;
+	struct sk_buff *buf = 0;
+
+	p_ptr = port_lock(ref);
+	if (!p_ptr)
+		return;
+	if (p_ptr->publ.connected) {
+		p_ptr->publ.conn_unacked -= ack;
+		buf = port_build_proto_msg(port_peerport(p_ptr),
+					   port_peernode(p_ptr),
+					   ref,
+					   tipc_own_addr,
+					   CONN_MANAGER,
+					   CONN_ACK,
+					   TIPC_OK, 
+					   port_out_seqno(p_ptr),
+					   ack);
+	}
+	port_unlock(p_ptr);
+	net_route_msg(buf);
+}
+
+/*
+ * tipc_createport(): user level call. Will add port to
+ *                    registry if non-zero user_ref.
+ */
+
+int tipc_createport(u32 user_ref, 
+		    void *usr_handle, 
+		    unsigned int importance, 
+		    tipc_msg_err_event error_cb, 
+		    tipc_named_msg_err_event named_error_cb, 
+		    tipc_conn_shutdown_event conn_error_cb, 
+		    tipc_msg_event msg_cb, 
+		    tipc_named_msg_event named_msg_cb, 
+		    tipc_conn_msg_event conn_msg_cb, 
+		    tipc_continue_event continue_event_cb,/* May be zero */
+		    u32 *portref)
+{
+	struct user_port *up_ptr;
+	struct port *p_ptr; 
+	u32 ref;
+
+	up_ptr = (struct user_port *)kmalloc(sizeof(*up_ptr), GFP_ATOMIC);
+	if (up_ptr == NULL) {
+		return -ENOMEM;
+	}
+	ref = tipc_createport_raw(0, port_dispatcher, port_wakeup, importance);
+	p_ptr = port_lock(ref);
+	if (!p_ptr) {
+		kfree(up_ptr);
+		return -ENOMEM;
+	}
+
+	p_ptr->user_port = up_ptr;
+	up_ptr->user_ref = user_ref;
+	up_ptr->usr_handle = usr_handle;
+	up_ptr->ref = p_ptr->publ.ref;
+	up_ptr->err_cb = error_cb;
+	up_ptr->named_err_cb = named_error_cb;
+	up_ptr->conn_err_cb = conn_error_cb;
+	up_ptr->msg_cb = msg_cb;
+	up_ptr->named_msg_cb = named_msg_cb;
+	up_ptr->conn_msg_cb = conn_msg_cb;
+	up_ptr->continue_event_cb = continue_event_cb;
+	INIT_LIST_HEAD(&up_ptr->uport_list);
+	reg_add_port(up_ptr);
+	*portref = p_ptr->publ.ref;
+	dbg(" tipc_createport: %x with ref %u\n", p_ptr, p_ptr->publ.ref);        
+	port_unlock(p_ptr);
+	return TIPC_OK;
+}
+
+int tipc_ownidentity(u32 ref, struct tipc_portid *id)
+{
+	id->ref = ref;
+	id->node = tipc_own_addr;
+	return TIPC_OK;
+}
+
+int tipc_portimportance(u32 ref, unsigned int *importance)
+{
+	struct port *p_ptr;
+	
+	p_ptr = port_lock(ref);
+	if (!p_ptr)
+		return -EINVAL;
+	*importance = (unsigned int)msg_importance(&p_ptr->publ.phdr);
+	spin_unlock_bh(p_ptr->publ.lock);
+	return TIPC_OK;
+}
+
+int tipc_set_portimportance(u32 ref, unsigned int imp)
+{
+	struct port *p_ptr;
+
+	if (imp > TIPC_CRITICAL_IMPORTANCE)
+		return -EINVAL;
+
+	p_ptr = port_lock(ref);
+	if (!p_ptr)
+		return -EINVAL;
+	msg_set_importance(&p_ptr->publ.phdr, (u32)imp);
+	spin_unlock_bh(p_ptr->publ.lock);
+	return TIPC_OK;
+}
+
+
+int tipc_publish(u32 ref, unsigned int scope, struct tipc_name_seq const *seq)
+{
+	struct port *p_ptr;
+	struct publication *publ;
+	u32 key;
+	int res = -EINVAL;
+
+	p_ptr = port_lock(ref);
+	dbg("tipc_publ %u, p_ptr = %x, conn = %x, scope = %x, "
+	    "lower = %u, upper = %u\n",
+	    ref, p_ptr, p_ptr->publ.connected, scope, seq->lower, seq->upper);
+	if (!p_ptr)
+		return -EINVAL;
+	if (p_ptr->publ.connected)
+		goto exit;
+	if (seq->lower > seq->upper)
+		goto exit;
+	if ((scope < TIPC_ZONE_SCOPE) || (scope > TIPC_NODE_SCOPE))
+		goto exit;
+	key = ref + p_ptr->pub_count + 1;
+	if (key == ref) {
+		res = -EADDRINUSE;
+		goto exit;
+	}
+	publ = nametbl_publish(seq->type, seq->lower, seq->upper,
+			       scope, p_ptr->publ.ref, key);
+	if (publ) {
+		list_add(&publ->pport_list, &p_ptr->publications);
+		p_ptr->pub_count++;
+		p_ptr->publ.published = 1;
+		res = TIPC_OK;
+	}
+exit:
+	port_unlock(p_ptr);
+	return res;
+}
+
+int tipc_withdraw(u32 ref, unsigned int scope, struct tipc_name_seq const *seq)
+{
+	struct port *p_ptr;
+	struct publication *publ;
+	struct publication *tpubl;
+	int res = -EINVAL;
+	
+	p_ptr = port_lock(ref);
+	if (!p_ptr)
+		return -EINVAL;
+	if (!p_ptr->publ.published)
+		goto exit;
+	if (!seq) {
+		list_for_each_entry_safe(publ, tpubl, 
+					 &p_ptr->publications, pport_list) {
+			nametbl_withdraw(publ->type, publ->lower, 
+					 publ->ref, publ->key);
+		}
+		res = TIPC_OK;
+	} else {
+		list_for_each_entry_safe(publ, tpubl, 
+					 &p_ptr->publications, pport_list) {
+			if (publ->scope != scope)
+				continue;
+			if (publ->type != seq->type)
+				continue;
+			if (publ->lower != seq->lower)
+				continue;
+			if (publ->upper != seq->upper)
+				break;
+			nametbl_withdraw(publ->type, publ->lower, 
+					 publ->ref, publ->key);
+			res = TIPC_OK;
+			break;
+		}
+	}
+	if (list_empty(&p_ptr->publications))
+		p_ptr->publ.published = 0;
+exit:
+	port_unlock(p_ptr);
+	return res;
+}
+
+int tipc_connect2port(u32 ref, struct tipc_portid const *peer)
+{
+	struct port *p_ptr;
+	struct tipc_msg *msg;
+	int res = -EINVAL;
+
+	p_ptr = port_lock(ref);
+	if (!p_ptr)
+		return -EINVAL;
+	if (p_ptr->publ.published || p_ptr->publ.connected)
+		goto exit;
+	if (!peer->ref)
+		goto exit;
+
+	msg = &p_ptr->publ.phdr;
+	msg_set_destnode(msg, peer->node);
+	msg_set_destport(msg, peer->ref);
+	msg_set_orignode(msg, tipc_own_addr);
+	msg_set_origport(msg, p_ptr->publ.ref);
+	msg_set_transp_seqno(msg, 42);
+	msg_set_type(msg, TIPC_CONN_MSG);
+	if (!may_route(peer->node))
+		msg_set_hdr_sz(msg, SHORT_H_SIZE);
+	else
+		msg_set_hdr_sz(msg, LONG_H_SIZE);
+
+	p_ptr->probing_interval = PROBING_INTERVAL;
+	p_ptr->probing_state = CONFIRMED;
+	p_ptr->publ.connected = 1;
+	k_start_timer(&p_ptr->timer, p_ptr->probing_interval);
+
+	nodesub_subscribe(&p_ptr->subscription,peer->node, (void *)ref,
+			  (net_ev_handler)port_handle_node_down);
+	res = TIPC_OK;
+exit:
+	port_unlock(p_ptr);
+	p_ptr->max_pkt = link_get_max_pkt(peer->node, ref);
+	return res;
+}
+
+/*
+ * tipc_disconnect(): Disconnect port form peer.
+ *                    This is a node local operation.
+ */
+
+int tipc_disconnect(u32 ref)
+{
+	struct port *p_ptr;
+	int res = -ENOTCONN;
+
+	p_ptr = port_lock(ref);
+	if (!p_ptr)
+		return -EINVAL;
+	if (p_ptr->publ.connected) {
+		p_ptr->publ.connected = 0;
+		/* let timer expire on it's own to avoid deadlock! */
+		nodesub_unsubscribe(&p_ptr->subscription);
+		res = TIPC_OK;
+	}
+	port_unlock(p_ptr);
+	return res;
+}
+
+/*
+ * tipc_shutdown(): Send a SHUTDOWN msg to peer and disconnect
+ */
+int tipc_shutdown(u32 ref)
+{
+	struct port *p_ptr;
+	struct sk_buff *buf = 0;
+
+	p_ptr = port_lock(ref);
+	if (!p_ptr)
+		return -EINVAL;
+
+	if (p_ptr->publ.connected) {
+		u32 imp = msg_importance(&p_ptr->publ.phdr);
+		if (imp < TIPC_CRITICAL_IMPORTANCE)
+			imp++;
+		buf = port_build_proto_msg(port_peerport(p_ptr),
+					   port_peernode(p_ptr),
+					   ref,
+					   tipc_own_addr,
+					   imp,
+					   TIPC_CONN_MSG,
+					   TIPC_CONN_SHUTDOWN, 
+					   port_out_seqno(p_ptr),
+					   0);
+	}
+	port_unlock(p_ptr);
+	net_route_msg(buf);
+	return tipc_disconnect(ref);
+}
+
+int tipc_isconnected(u32 ref, int *isconnected)
+{
+	struct port *p_ptr;
+	
+	p_ptr = port_lock(ref);
+	if (!p_ptr)
+		return -EINVAL;
+	*isconnected = p_ptr->publ.connected;
+	port_unlock(p_ptr);
+	return TIPC_OK;
+}
+
+int tipc_peer(u32 ref, struct tipc_portid *peer)
+{
+	struct port *p_ptr;
+	int res;
+	 
+	p_ptr = port_lock(ref);
+	if (!p_ptr)
+		return -EINVAL;
+	if (p_ptr->publ.connected) {
+		peer->ref = port_peerport(p_ptr);
+		peer->node = port_peernode(p_ptr);
+		res = TIPC_OK;
+	} else
+		res = -ENOTCONN;
+	port_unlock(p_ptr);
+	return res;
+}
+
+int tipc_ref_valid(u32 ref)
+{
+	/* Works irrespective of type */
+	return !!ref_deref(ref);
+}
+
+
+/*
+ *  port_recv_sections(): Concatenate and deliver sectioned
+ *                        message for this node.
+ */
+
+int port_recv_sections(struct port *sender, unsigned int num_sect,
+		       struct iovec const *msg_sect)
+{
+	struct sk_buff *buf;
+	int res;
+	 
+	res = msg_build(&sender->publ.phdr, msg_sect, num_sect,
+			MAX_MSG_SIZE, !sender->user_port, &buf);
+	if (likely(buf))
+		port_recv_msg(buf);
+	return res;
+}
+
+/**
+ * tipc_send - send message sections on connection
+ */
+
+int tipc_send(u32 ref, unsigned int num_sect, struct iovec const *msg_sect)
+{
+	struct port *p_ptr;
+	u32 destnode;
+	int res;
+
+	p_ptr = port_deref(ref);
+	if (!p_ptr || !p_ptr->publ.connected)
+		return -EINVAL;
+
+	p_ptr->publ.congested = 1;
+	if (!port_congested(p_ptr)) {
+		destnode = port_peernode(p_ptr);
+		if (likely(destnode != tipc_own_addr))
+			res = link_send_sections_fast(p_ptr, msg_sect, num_sect,
+						      destnode);
+		else
+			res = port_recv_sections(p_ptr, num_sect, msg_sect);
+
+		if (likely(res != -ELINKCONG)) {
+			port_incr_out_seqno(p_ptr);
+			p_ptr->publ.congested = 0;
+			p_ptr->sent++;
+			return res;
+		}
+	}
+	if (port_unreliable(p_ptr)) {
+		p_ptr->publ.congested = 0;
+		/* Just calculate msg length and return */
+		return msg_calc_data_size(msg_sect, num_sect);
+	}
+	return -ELINKCONG;
+}
+
+/** 
+ * tipc_send_buf - send message buffer on connection
+ */
+
+int tipc_send_buf(u32 ref, struct sk_buff *buf, unsigned int dsz)
+{
+	struct port *p_ptr;
+	struct tipc_msg *msg;
+	u32 destnode;
+	u32 hsz;
+	u32 sz;
+	u32 res;
+	 
+	p_ptr = port_deref(ref);
+	if (!p_ptr || !p_ptr->publ.connected)
+		return -EINVAL;
+
+	msg = &p_ptr->publ.phdr;
+	hsz = msg_hdr_sz(msg);
+	sz = hsz + dsz;
+	msg_set_size(msg, sz);
+	if (skb_cow(buf, hsz))
+		return -ENOMEM;
+
+	skb_push(buf, hsz);
+	memcpy(buf->data, (unchar *)msg, hsz);
+	destnode = msg_destnode(msg);
+	p_ptr->publ.congested = 1;
+	if (!port_congested(p_ptr)) {
+		if (likely(destnode != tipc_own_addr))
+			res = tipc_send_buf_fast(buf, destnode);
+		else {
+			port_recv_msg(buf);
+			res = sz;
+		}
+		if (likely(res != -ELINKCONG)) {
+			port_incr_out_seqno(p_ptr);
+			p_ptr->sent++;
+			p_ptr->publ.congested = 0;
+			return res;
+		}
+	}
+	if (port_unreliable(p_ptr)) {
+		p_ptr->publ.congested = 0;
+		return dsz;
+	}
+	return -ELINKCONG;
+}
+
+/**
+ * tipc_forward2name - forward message sections to port name
+ */
+
+int tipc_forward2name(u32 ref, 
+		      struct tipc_name const *name, 
+		      u32 domain,
+		      u32 num_sect, 
+		      struct iovec const *msg_sect,
+		      struct tipc_portid const *orig, 
+		      unsigned int importance)
+{
+	struct port *p_ptr;
+	struct tipc_msg *msg;
+	u32 destnode = domain;
+	u32 destport = 0;
+	int res;
+
+	p_ptr = port_deref(ref);
+	if (!p_ptr || p_ptr->publ.connected)
+		return -EINVAL;
+
+	msg = &p_ptr->publ.phdr;
+	msg_set_type(msg, TIPC_NAMED_MSG);
+	msg_set_orignode(msg, orig->node);
+	msg_set_origport(msg, orig->ref);
+	msg_set_hdr_sz(msg, LONG_H_SIZE);
+	msg_set_nametype(msg, name->type);
+	msg_set_nameinst(msg, name->instance);
+	msg_set_lookup_scope(msg, addr_scope(domain));
+	if (importance <= TIPC_CRITICAL_IMPORTANCE)
+		msg_set_importance(msg,importance);
+	destport = nametbl_translate(name->type, name->instance, &destnode);
+	msg_set_destnode(msg, destnode);
+	msg_set_destport(msg, destport);
+
+	if (likely(destport || destnode)) {
+		p_ptr->sent++;
+		if (likely(destnode == tipc_own_addr))
+			return port_recv_sections(p_ptr, num_sect, msg_sect);
+		res = link_send_sections_fast(p_ptr, msg_sect, num_sect, 
+					      destnode);
+		if (likely(res != -ELINKCONG))
+			return res;
+		if (port_unreliable(p_ptr)) {
+			/* Just calculate msg length and return */
+			return msg_calc_data_size(msg_sect, num_sect);
+		}
+		return -ELINKCONG;
+	}
+	return port_reject_sections(p_ptr, msg, msg_sect, num_sect, 
+				    TIPC_ERR_NO_NAME);
+}
+
+/**
+ * tipc_send2name - send message sections to port name
+ */
+
+int tipc_send2name(u32 ref, 
+		   struct tipc_name const *name,
+		   unsigned int domain, 
+		   unsigned int num_sect, 
+		   struct iovec const *msg_sect)
+{
+	struct tipc_portid orig;
+
+	orig.ref = ref;
+	orig.node = tipc_own_addr;
+	return tipc_forward2name(ref, name, domain, num_sect, msg_sect, &orig,
+				 TIPC_PORT_IMPORTANCE);
+}
+
+/** 
+ * tipc_forward_buf2name - forward message buffer to port name
+ */
+
+int tipc_forward_buf2name(u32 ref,
+			  struct tipc_name const *name,
+			  u32 domain,
+			  struct sk_buff *buf,
+			  unsigned int dsz,
+			  struct tipc_portid const *orig,
+			  unsigned int importance)
+{
+	struct port *p_ptr;
+	struct tipc_msg *msg;
+	u32 destnode = domain;
+	u32 destport = 0;
+	int res;
+
+	p_ptr = (struct port *)ref_deref(ref);
+	if (!p_ptr || p_ptr->publ.connected)
+		return -EINVAL;
+
+	msg = &p_ptr->publ.phdr;
+	if (importance <= TIPC_CRITICAL_IMPORTANCE)
+		msg_set_importance(msg, importance);
+	msg_set_type(msg, TIPC_NAMED_MSG);
+	msg_set_orignode(msg, orig->node);
+	msg_set_origport(msg, orig->ref);
+	msg_set_nametype(msg, name->type);
+	msg_set_nameinst(msg, name->instance);
+	msg_set_lookup_scope(msg, addr_scope(domain));
+	msg_set_hdr_sz(msg, LONG_H_SIZE);
+	msg_set_size(msg, LONG_H_SIZE + dsz);
+	destport = nametbl_translate(name->type, name->instance, &destnode);
+	msg_set_destnode(msg, destnode);
+	msg_set_destport(msg, destport);
+	msg_dbg(msg, "forw2name ==> ");
+	if (skb_cow(buf, LONG_H_SIZE))
+		return -ENOMEM;
+	skb_push(buf, LONG_H_SIZE);
+	memcpy(buf->data, (unchar *)msg, LONG_H_SIZE);
+	msg_dbg(buf_msg(buf),"PREP:");
+	if (likely(destport || destnode)) {
+		p_ptr->sent++;
+		if (destnode == tipc_own_addr)
+			return port_recv_msg(buf);
+		res = tipc_send_buf_fast(buf, destnode);
+		if (likely(res != -ELINKCONG))
+			return res;
+		if (port_unreliable(p_ptr))
+			return dsz;
+		return -ELINKCONG;
+	}
+	return tipc_reject_msg(buf, TIPC_ERR_NO_NAME);
+}
+
+/** 
+ * tipc_send_buf2name - send message buffer to port name
+ */
+
+int tipc_send_buf2name(u32 ref, 
+		       struct tipc_name const *dest, 
+		       u32 domain,
+		       struct sk_buff *buf, 
+		       unsigned int dsz)
+{
+	struct tipc_portid orig;
+
+	orig.ref = ref;
+	orig.node = tipc_own_addr;
+	return tipc_forward_buf2name(ref, dest, domain, buf, dsz, &orig,
+				     TIPC_PORT_IMPORTANCE);
+}
+
+/** 
+ * tipc_forward2port - forward message sections to port identity
+ */
+
+int tipc_forward2port(u32 ref,
+		      struct tipc_portid const *dest,
+		      unsigned int num_sect, 
+		      struct iovec const *msg_sect,
+		      struct tipc_portid const *orig, 
+		      unsigned int importance)
+{
+	struct port *p_ptr;
+	struct tipc_msg *msg;
+	int res;
+
+	p_ptr = port_deref(ref);
+	if (!p_ptr || p_ptr->publ.connected)
+		return -EINVAL;
+
+	msg = &p_ptr->publ.phdr;
+	msg_set_type(msg, TIPC_DIRECT_MSG);
+	msg_set_orignode(msg, orig->node);
+	msg_set_origport(msg, orig->ref);
+	msg_set_destnode(msg, dest->node);
+	msg_set_destport(msg, dest->ref);
+	msg_set_hdr_sz(msg, DIR_MSG_H_SIZE);
+	if (importance <= TIPC_CRITICAL_IMPORTANCE)
+		msg_set_importance(msg, importance);
+	p_ptr->sent++;
+	if (dest->node == tipc_own_addr)
+		return port_recv_sections(p_ptr, num_sect, msg_sect);
+	res = link_send_sections_fast(p_ptr, msg_sect, num_sect, dest->node);
+	if (likely(res != -ELINKCONG))
+		return res;
+	if (port_unreliable(p_ptr)) {
+		/* Just calculate msg length and return */
+		return msg_calc_data_size(msg_sect, num_sect);
+	}
+	return -ELINKCONG;
+}
+
+/** 
+ * tipc_send2port - send message sections to port identity 
+ */
+
+int tipc_send2port(u32 ref, 
+		   struct tipc_portid const *dest,
+		   unsigned int num_sect, 
+		   struct iovec const *msg_sect)
+{
+	struct tipc_portid orig;
+
+	orig.ref = ref;
+	orig.node = tipc_own_addr;
+	return tipc_forward2port(ref, dest, num_sect, msg_sect, &orig, 
+				 TIPC_PORT_IMPORTANCE);
+}
+
+/** 
+ * tipc_forward_buf2port - forward message buffer to port identity
+ */
+int tipc_forward_buf2port(u32 ref,
+			  struct tipc_portid const *dest,
+			  struct sk_buff *buf,
+			  unsigned int dsz,
+			  struct tipc_portid const *orig,
+			  unsigned int importance)
+{
+	struct port *p_ptr;
+	struct tipc_msg *msg;
+	int res;
+
+	p_ptr = (struct port *)ref_deref(ref);
+	if (!p_ptr || p_ptr->publ.connected)
+		return -EINVAL;
+
+	msg = &p_ptr->publ.phdr;
+	msg_set_type(msg, TIPC_DIRECT_MSG);
+	msg_set_orignode(msg, orig->node);
+	msg_set_origport(msg, orig->ref);
+	msg_set_destnode(msg, dest->node);
+	msg_set_destport(msg, dest->ref);
+	msg_set_hdr_sz(msg, DIR_MSG_H_SIZE);
+	if (importance <= TIPC_CRITICAL_IMPORTANCE)
+		msg_set_importance(msg, importance);
+	msg_set_size(msg, DIR_MSG_H_SIZE + dsz);
+	if (skb_cow(buf, DIR_MSG_H_SIZE))
+		return -ENOMEM;
+
+	skb_push(buf, DIR_MSG_H_SIZE);
+	memcpy(buf->data, (unchar *)msg, DIR_MSG_H_SIZE);
+	msg_dbg(msg, "buf2port: ");
+	p_ptr->sent++;
+	if (dest->node == tipc_own_addr)
+		return port_recv_msg(buf);
+	res = tipc_send_buf_fast(buf, dest->node);
+	if (likely(res != -ELINKCONG))
+		return res;
+	if (port_unreliable(p_ptr))
+		return dsz;
+	return -ELINKCONG;
+}
+
+/** 
+ * tipc_send_buf2port - send message buffer to port identity
+ */
+
+int tipc_send_buf2port(u32 ref, 
+		       struct tipc_portid const *dest,
+		       struct sk_buff *buf, 
+		       unsigned int dsz)
+{
+	struct tipc_portid orig;
+
+	orig.ref = ref;
+	orig.node = tipc_own_addr;
+	return tipc_forward_buf2port(ref, dest, buf, dsz, &orig, 
+				     TIPC_PORT_IMPORTANCE);
+}
+
diff --git a/net/tipc/port.h b/net/tipc/port.h
new file mode 100644
index 000000000000..e40235ef92fe
--- /dev/null
+++ b/net/tipc/port.h
@@ -0,0 +1,206 @@
+/*
+ * net/tipc/port.h: Include file for TIPC port code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2004-2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_PORT_H
+#define _TIPC_PORT_H
+
+#include <net/tipc/tipc_port.h>
+#include "ref.h"
+#include "net.h"
+#include "msg.h"
+#include "dbg.h"
+#include "node_subscr.h"
+
+/**
+ * struct user_port - TIPC user port (used with native API)
+ * @user_ref: id of user who created user port
+ * @usr_handle: user-specified field
+ * @ref: object reference to associated TIPC port
+ * <various callback routines>
+ * @uport_list: adjacent user ports in list of ports held by user
+ */
+ 
+struct user_port {
+	u32 user_ref;
+	void *usr_handle; 
+	u32 ref;
+	tipc_msg_err_event err_cb; 
+	tipc_named_msg_err_event named_err_cb; 
+	tipc_conn_shutdown_event conn_err_cb; 
+	tipc_msg_event msg_cb; 
+	tipc_named_msg_event named_msg_cb; 
+	tipc_conn_msg_event conn_msg_cb; 
+	tipc_continue_event continue_event_cb;
+	struct list_head uport_list;
+};
+
+/**
+ * struct port - TIPC port structure
+ * @publ: TIPC port info available to privileged users
+ * @port_list: adjacent ports in TIPC's global list of ports
+ * @dispatcher: ptr to routine which handles received messages
+ * @wakeup: ptr to routine to call when port is no longer congested
+ * @user_port: ptr to user port associated with port (if any)
+ * @wait_list: adjacent ports in list of ports waiting on link congestion
+ * @congested_link: ptr to congested link port is waiting on
+ * @waiting_pkts:
+ * @sent:
+ * @acked:
+ * @publications: list of publications for port
+ * @pub_count: total # of publications port has made during its lifetime
+ * @max_pkt: maximum packet size "hint" used when building messages sent by port
+ * @probing_state:
+ * @probing_interval:
+ * @last_in_seqno:
+ * @timer_ref:
+ * @subscription: "node down" subscription used to terminate failed connections
+ */
+
+struct port {
+	struct tipc_port publ;
+	struct list_head port_list;
+	u32 (*dispatcher)(struct tipc_port *, struct sk_buff *);
+	void (*wakeup)(struct tipc_port *);
+	struct user_port *user_port;
+	struct list_head wait_list;
+	struct link *congested_link;
+	u32 waiting_pkts;
+	u32 sent;
+	u32 acked;
+	struct list_head publications;
+	u32 pub_count;
+	u32 max_pkt;
+	u32 probing_state;
+	u32 probing_interval;
+	u32 last_in_seqno;
+	struct timer_list timer;
+	struct node_subscr subscription;
+};
+
+extern spinlock_t port_list_lock;
+struct port_list;
+
+int port_recv_sections(struct port *p_ptr, u32 num_sect, 
+		       struct iovec const *msg_sect);
+int port_reject_sections(struct port *p_ptr, struct tipc_msg *hdr,
+			 struct iovec const *msg_sect, u32 num_sect,
+			 int err);
+struct sk_buff *port_get_ports(void);
+struct sk_buff *port_show_stats(const void *req_tlv_area, int req_tlv_space);
+void port_recv_proto_msg(struct sk_buff *buf);
+void port_recv_mcast(struct sk_buff *buf, struct port_list *dp);
+void port_reinit(void);
+
+/**
+ * port_lock - lock port instance referred to and return its pointer
+ */
+
+static inline struct port *port_lock(u32 ref)
+{
+	return (struct port *)ref_lock(ref);
+}
+
+/** 
+ * port_unlock - unlock a port instance
+ * 
+ * Can use pointer instead of ref_unlock() since port is already locked.
+ */
+
+static inline void port_unlock(struct port *p_ptr)
+{
+	spin_unlock_bh(p_ptr->publ.lock);
+}
+
+static inline struct port* port_deref(u32 ref)
+{
+	return (struct port *)ref_deref(ref);
+}
+
+static inline u32 peer_port(struct port *p_ptr)
+{
+	return msg_destport(&p_ptr->publ.phdr);
+}
+
+static inline u32 peer_node(struct port *p_ptr)
+{
+	return msg_destnode(&p_ptr->publ.phdr);
+}
+
+static inline int port_congested(struct port *p_ptr)
+{
+	return((p_ptr->sent - p_ptr->acked) >= (TIPC_FLOW_CONTROL_WIN * 2));
+}
+
+/** 
+ * port_recv_msg - receive message from lower layer and deliver to port user
+ */
+
+static inline int port_recv_msg(struct sk_buff *buf)
+{
+	struct port *p_ptr;
+	struct tipc_msg *msg = buf_msg(buf);
+	u32 destport = msg_destport(msg);
+	u32 dsz = msg_data_sz(msg);
+	u32 err;
+	
+	/* forward unresolved named message */
+	if (unlikely(!destport)) {
+		net_route_msg(buf);
+		return dsz;
+	}
+
+	/* validate destination & pass to port, otherwise reject message */
+	p_ptr = port_lock(destport);
+	if (likely(p_ptr)) {
+		if (likely(p_ptr->publ.connected)) {
+			if ((unlikely(msg_origport(msg) != peer_port(p_ptr))) ||
+			    (unlikely(msg_orignode(msg) != peer_node(p_ptr))) ||
+			    (unlikely(!msg_connected(msg)))) {
+				err = TIPC_ERR_NO_PORT;
+				port_unlock(p_ptr);
+				goto reject;
+			}
+		}
+		err = p_ptr->dispatcher(&p_ptr->publ, buf);
+		port_unlock(p_ptr);
+		if (likely(!err))
+			return dsz;
+	} else {
+		err = TIPC_ERR_NO_PORT;
+	}
+reject:
+	dbg("port->rejecting, err = %x..\n",err);
+	return tipc_reject_msg(buf, err);
+}
+
+#endif
diff --git a/net/tipc/ref.c b/net/tipc/ref.c
new file mode 100644
index 000000000000..aee48c797b1e
--- /dev/null
+++ b/net/tipc/ref.c
@@ -0,0 +1,186 @@
+/*
+ * net/tipc/ref.c: TIPC object registry code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2004-2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "ref.h"
+#include "port.h"
+#include "subscr.h"
+#include "name_distr.h"
+#include "name_table.h"
+#include "config.h"
+#include "discover.h"
+#include "bearer.h"
+#include "node.h"
+#include "bcast.h"
+
+/*
+ * Object reference table consists of 2**N entries.
+ *
+ * A used entry has object ptr != 0, reference == XXXX|own index
+ *				     (XXXX changes each time entry is acquired) 
+ * A free entry has object ptr == 0, reference == YYYY|next free index
+ *				     (YYYY is one more than last used XXXX)
+ *
+ * Free list is initially chained from entry (2**N)-1 to entry 1. 
+ * Entry 0 is not used to allow index 0 to indicate the end of the free list.
+ *
+ * Note: Any accidental reference of the form XXXX|0--0 won't match entry 0
+ * because entry 0's reference field has the form XXXX|1--1.
+ */
+
+struct ref_table ref_table = { 0 };
+
+rwlock_t reftbl_lock = RW_LOCK_UNLOCKED;
+
+/**
+ * ref_table_init - create reference table for objects
+ */
+
+int ref_table_init(u32 requested_size, u32 start)
+{
+	struct reference *table;
+	u32 sz = 1 << 4;
+	u32 index_mask;
+	int i;
+
+	while (sz < requested_size) {
+		sz <<= 1;
+	}
+	table = (struct reference *)vmalloc(sz * sizeof(struct reference));
+	if (table == NULL)
+		return -ENOMEM;
+
+	write_lock_bh(&reftbl_lock);
+	index_mask = sz - 1;
+	for (i = sz - 1; i >= 0; i--) {
+		table[i].object = 0;
+		table[i].lock = SPIN_LOCK_UNLOCKED;
+		table[i].data.next_plus_upper = (start & ~index_mask) + i - 1;
+	}
+	ref_table.entries = table;
+	ref_table.index_mask = index_mask;
+	ref_table.first_free = sz - 1;
+	ref_table.last_free = 1;
+	write_unlock_bh(&reftbl_lock);
+	return TIPC_OK;
+}
+
+/**
+ * ref_table_stop - destroy reference table for objects
+ */
+
+void ref_table_stop(void)
+{
+	if (!ref_table.entries)
+		return;
+
+	vfree(ref_table.entries);
+	ref_table.entries = 0;
+}
+
+/**
+ * ref_acquire - create reference to an object
+ * 
+ * Return a unique reference value which can be translated back to the pointer
+ * 'object' at a later time.  Also, pass back a pointer to the lock protecting 
+ * the object, but without locking it.
+ */
+
+u32 ref_acquire(void *object, spinlock_t **lock)
+{
+	struct reference *entry;
+	u32 index;
+	u32 index_mask;
+	u32 next_plus_upper;
+	u32 reference = 0;
+
+	assert(ref_table.entries && object);
+
+	write_lock_bh(&reftbl_lock);
+	if (ref_table.first_free) {
+		index = ref_table.first_free;
+		entry = &(ref_table.entries[index]);
+		index_mask = ref_table.index_mask;
+		/* take lock in case a previous user of entry still holds it */ 
+		spin_lock_bh(&entry->lock);  
+		next_plus_upper = entry->data.next_plus_upper;
+		ref_table.first_free = next_plus_upper & index_mask;
+		reference = (next_plus_upper & ~index_mask) + index;
+		entry->data.reference = reference;
+		entry->object = object;
+                if (lock != 0)
+                        *lock = &entry->lock;
+		spin_unlock_bh(&entry->lock);
+	}
+	write_unlock_bh(&reftbl_lock);
+	return reference;
+}
+
+/**
+ * ref_discard - invalidate references to an object
+ * 
+ * Disallow future references to an object and free up the entry for re-use.
+ * Note: The entry's spin_lock may still be busy after discard
+ */
+
+void ref_discard(u32 ref)
+{
+	struct reference *entry;
+	u32 index; 
+	u32 index_mask;
+
+	assert(ref_table.entries);
+	assert(ref != 0);
+
+	write_lock_bh(&reftbl_lock);
+	index_mask = ref_table.index_mask;
+	index = ref & index_mask;
+	entry = &(ref_table.entries[index]);
+	assert(entry->object != 0);
+	assert(entry->data.reference == ref);
+
+	/* mark entry as unused */
+	entry->object = 0;
+	if (ref_table.first_free == 0)
+		ref_table.first_free = index;
+	else
+		/* next_plus_upper is always XXXX|0--0 for last free entry */
+		ref_table.entries[ref_table.last_free].data.next_plus_upper 
+			|= index;
+	ref_table.last_free = index;
+
+	/* increment upper bits of entry to invalidate subsequent references */
+	entry->data.next_plus_upper = (ref & ~index_mask) + (index_mask + 1);
+	write_unlock_bh(&reftbl_lock);
+}
+
diff --git a/net/tipc/ref.h b/net/tipc/ref.h
new file mode 100644
index 000000000000..e6cd5bb3dc29
--- /dev/null
+++ b/net/tipc/ref.h
@@ -0,0 +1,128 @@
+/*
+ * net/tipc/ref.h: Include file for TIPC object registry code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_REF_H
+#define _TIPC_REF_H
+
+/**
+ * struct reference - TIPC object reference entry
+ * @object: pointer to object associated with reference entry
+ * @lock: spinlock controlling access to object
+ * @data: reference value associated with object (or link to next unused entry)
+ */
+ 
+struct reference {
+	void *object;
+	spinlock_t lock;
+	union {
+		u32 next_plus_upper;
+		u32 reference;
+	} data;
+};
+
+/**
+ * struct ref_table - table of TIPC object reference entries
+ * @entries: pointer to array of reference entries
+ * @index_mask: bitmask for array index portion of reference values
+ * @first_free: array index of first unused object reference entry
+ * @last_free: array index of last unused object reference entry
+ */
+
+struct ref_table {
+	struct reference *entries;
+	u32 index_mask;
+	u32 first_free;
+	u32 last_free;
+};
+
+extern struct ref_table ref_table;
+
+int ref_table_init(u32 requested_size, u32 start);
+void ref_table_stop(void);
+
+u32 ref_acquire(void *object, spinlock_t **lock);
+void ref_discard(u32 ref);
+
+
+/**
+ * ref_lock - lock referenced object and return pointer to it
+ */
+
+static inline void *ref_lock(u32 ref)
+{
+	if (likely(ref_table.entries)) {
+		struct reference *r =
+			&ref_table.entries[ref & ref_table.index_mask];
+
+		spin_lock_bh(&r->lock);
+		if (likely(r->data.reference == ref))
+			return r->object;
+		spin_unlock_bh(&r->lock);
+	}
+	return 0;
+}
+
+/**
+ * ref_unlock - unlock referenced object 
+ */
+
+static inline void ref_unlock(u32 ref)
+{
+	if (likely(ref_table.entries)) {
+		struct reference *r =
+			&ref_table.entries[ref & ref_table.index_mask];
+
+		if (likely(r->data.reference == ref))
+			spin_unlock_bh(&r->lock);
+		else
+			err("ref_unlock() invoked using obsolete reference\n");
+	}
+}
+
+/**
+ * ref_deref - return pointer referenced object (without locking it)
+ */
+
+static inline void *ref_deref(u32 ref)
+{
+	if (likely(ref_table.entries)) {
+		struct reference *r = 
+			&ref_table.entries[ref & ref_table.index_mask];
+
+		if (likely(r->data.reference == ref))
+			return r->object;
+	}
+	return 0;
+}
+
+#endif
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
new file mode 100644
index 000000000000..22dcb724e760
--- /dev/null
+++ b/net/tipc/socket.c
@@ -0,0 +1,1722 @@
+/*
+ * net/tipc/socket.c: TIPC socket API
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2004-2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/version.h>
+#include <linux/fcntl.h>
+#include <linux/version.h>
+#include <asm/semaphore.h>
+#include <asm/string.h>
+#include <asm/atomic.h>
+#include <net/sock.h>
+
+#include <linux/tipc.h>
+#include <net/tipc/tipc_msg.h>
+#include <net/tipc/tipc_port.h>
+
+#include "core.h"
+
+#define SS_LISTENING	-1	/* socket is listening */
+#define SS_READY	-2	/* socket is connectionless */
+
+#define OVERLOAD_LIMIT_BASE    5000
+
+struct tipc_sock {
+	struct sock sk;
+	struct tipc_port *p;
+	struct semaphore sem;
+};
+
+#define tipc_sk(sk) ((struct tipc_sock*)sk)
+
+static u32 dispatch(struct tipc_port *tport, struct sk_buff *buf);
+static void wakeupdispatch(struct tipc_port *tport);
+
+static struct proto_ops packet_ops;
+static struct proto_ops stream_ops;
+static struct proto_ops msg_ops;
+
+static struct proto tipc_proto;
+
+static int sockets_enabled = 0;
+
+static atomic_t tipc_queue_size = ATOMIC_INIT(0);
+
+
+/* 
+ * sock_lock(): Lock a port/socket pair. lock_sock() can 
+ * not be used here, since the same lock must protect ports 
+ * with non-socket interfaces.
+ * See net.c for description of locking policy.
+ */
+static inline void sock_lock(struct tipc_sock* tsock)
+{
+        spin_lock_bh(tsock->p->lock);       
+}
+
+/* 
+ * sock_unlock(): Unlock a port/socket pair
+ */
+static inline void sock_unlock(struct tipc_sock* tsock)
+{
+        spin_unlock_bh(tsock->p->lock);
+}
+
+/**
+ * pollmask - determine the current set of poll() events for a socket
+ * @sock: socket structure
+ * 
+ * TIPC sets the returned events as follows:
+ * a) POLLRDNORM and POLLIN are set if the socket's receive queue is non-empty
+ *    or if a connection-oriented socket is does not have an active connection
+ *    (i.e. a read operation will not block).
+ * b) POLLOUT is set except when a socket's connection has been terminated
+ *    (i.e. a write operation will not block).
+ * c) POLLHUP is set when a socket's connection has been terminated.
+ *
+ * IMPORTANT: The fact that a read or write operation will not block does NOT
+ * imply that the operation will succeed!
+ * 
+ * Returns pollmask value
+ */
+
+static inline u32 pollmask(struct socket *sock)
+{
+	u32 mask;
+
+	if ((skb_queue_len(&sock->sk->sk_receive_queue) != 0) ||
+	    (sock->state == SS_UNCONNECTED) ||
+	    (sock->state == SS_DISCONNECTING))
+		mask = (POLLRDNORM | POLLIN);
+	else
+		mask = 0;
+
+	if (sock->state == SS_DISCONNECTING) 
+		mask |= POLLHUP;
+	else
+		mask |= POLLOUT;
+
+	return mask;
+}
+
+
+/**
+ * advance_queue - discard first buffer in queue
+ * @tsock: TIPC socket
+ */
+
+static inline void advance_queue(struct tipc_sock *tsock)
+{
+        sock_lock(tsock);
+	buf_discard(skb_dequeue(&tsock->sk.sk_receive_queue));
+        sock_unlock(tsock);
+	atomic_dec(&tipc_queue_size);
+}
+
+/**
+ * tipc_create - create a TIPC socket
+ * @sock: pre-allocated socket structure
+ * @protocol: protocol indicator (must be 0)
+ * 
+ * This routine creates and attaches a 'struct sock' to the 'struct socket',
+ * then create and attaches a TIPC port to the 'struct sock' part.
+ *
+ * Returns 0 on success, errno otherwise
+ */
+static int tipc_create(struct socket *sock, int protocol)
+{
+	struct tipc_sock *tsock;
+	struct tipc_port *port;
+	struct sock *sk;
+        u32 ref;
+
+	if ((sock->type != SOCK_STREAM) && 
+	    (sock->type != SOCK_SEQPACKET) &&
+	    (sock->type != SOCK_DGRAM) &&
+	    (sock->type != SOCK_RDM))
+		return -EPROTOTYPE;
+
+	if (unlikely(protocol != 0))
+		return -EPROTONOSUPPORT;
+
+	ref = tipc_createport_raw(0, &dispatch, &wakeupdispatch, TIPC_LOW_IMPORTANCE);
+	if (unlikely(!ref))
+		return -ENOMEM;
+
+	sock->state = SS_UNCONNECTED;
+
+	switch (sock->type) {
+	case SOCK_STREAM:
+		sock->ops = &stream_ops;
+		break;
+	case SOCK_SEQPACKET:
+		sock->ops = &packet_ops;
+		break;
+	case SOCK_DGRAM:
+		tipc_set_portunreliable(ref, 1);
+		/* fall through */
+	case SOCK_RDM:
+		tipc_set_portunreturnable(ref, 1);
+		sock->ops = &msg_ops;
+		sock->state = SS_READY;
+		break;
+	}
+
+	sk = sk_alloc(AF_TIPC, GFP_KERNEL, &tipc_proto, 1);
+	if (!sk) {
+		tipc_deleteport(ref);
+		return -ENOMEM;
+	}
+
+	sock_init_data(sock, sk);
+	init_waitqueue_head(sk->sk_sleep);
+	sk->sk_rcvtimeo = 8 * HZ;   /* default connect timeout = 8s */
+
+	tsock = tipc_sk(sk);
+	port = tipc_get_port(ref);
+
+	tsock->p = port;
+	port->usr_handle = tsock;
+
+	init_MUTEX(&tsock->sem);
+
+	dbg("sock_create: %x\n",tsock);
+
+	atomic_inc(&tipc_user_count);
+
+	return 0;
+}
+
+/**
+ * release - destroy a TIPC socket
+ * @sock: socket to destroy
+ *
+ * This routine cleans up any messages that are still queued on the socket.
+ * For DGRAM and RDM socket types, all queued messages are rejected.
+ * For SEQPACKET and STREAM socket types, the first message is rejected
+ * and any others are discarded.  (If the first message on a STREAM socket
+ * is partially-read, it is discarded and the next one is rejected instead.)
+ * 
+ * NOTE: Rejected messages are not necessarily returned to the sender!  They
+ * are returned or discarded according to the "destination droppable" setting
+ * specified for the message by the sender.
+ *
+ * Returns 0 on success, errno otherwise
+ */
+
+static int release(struct socket *sock)
+{
+	struct tipc_sock *tsock = tipc_sk(sock->sk);
+	struct sock *sk = sock->sk;
+	int res = TIPC_OK;
+	struct sk_buff *buf;
+
+        dbg("sock_delete: %x\n",tsock);
+	if (!tsock)
+		return 0;
+	down_interruptible(&tsock->sem);
+	if (!sock->sk) {
+		up(&tsock->sem);
+		return 0;
+	}
+	
+	/* Reject unreceived messages, unless no longer connected */
+
+	while (sock->state != SS_DISCONNECTING) {
+		sock_lock(tsock);
+		buf = skb_dequeue(&sk->sk_receive_queue);
+		if (!buf)
+			tsock->p->usr_handle = 0;
+		sock_unlock(tsock);
+		if (!buf)
+			break;
+		if (TIPC_SKB_CB(buf)->handle != msg_data(buf_msg(buf)))
+			buf_discard(buf);
+		else
+			tipc_reject_msg(buf, TIPC_ERR_NO_PORT);
+		atomic_dec(&tipc_queue_size);
+	}
+
+	/* Delete TIPC port */
+
+	res = tipc_deleteport(tsock->p->ref);
+	sock->sk = NULL;
+
+	/* Discard any remaining messages */
+
+	while ((buf = skb_dequeue(&sk->sk_receive_queue))) {
+		buf_discard(buf);
+		atomic_dec(&tipc_queue_size);
+	}
+
+	up(&tsock->sem);
+
+	sock_put(sk);
+
+        atomic_dec(&tipc_user_count);
+	return res;
+}
+
+/**
+ * bind - associate or disassocate TIPC name(s) with a socket
+ * @sock: socket structure
+ * @uaddr: socket address describing name(s) and desired operation
+ * @uaddr_len: size of socket address data structure
+ * 
+ * Name and name sequence binding is indicated using a positive scope value;
+ * a negative scope value unbinds the specified name.  Specifying no name
+ * (i.e. a socket address length of 0) unbinds all names from the socket.
+ * 
+ * Returns 0 on success, errno otherwise
+ */
+
+static int bind(struct socket *sock, struct sockaddr *uaddr, int uaddr_len)
+{
+	struct tipc_sock *tsock = tipc_sk(sock->sk);
+	struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr;
+	int res;
+
+	if (down_interruptible(&tsock->sem))
+		return -ERESTARTSYS;
+	
+	if (unlikely(!uaddr_len)) {
+		res = tipc_withdraw(tsock->p->ref, 0, 0);
+		goto exit;
+	}
+
+	if (uaddr_len < sizeof(struct sockaddr_tipc)) {
+		res = -EINVAL;
+		goto exit;
+	}
+
+	if (addr->family != AF_TIPC) {
+		res = -EAFNOSUPPORT;
+		goto exit;
+	}
+	if (addr->addrtype == TIPC_ADDR_NAME)
+		addr->addr.nameseq.upper = addr->addr.nameseq.lower;
+	else if (addr->addrtype != TIPC_ADDR_NAMESEQ) {
+		res = -EAFNOSUPPORT;
+		goto exit;
+	}
+        
+       	if (addr->scope > 0)
+		res = tipc_publish(tsock->p->ref, addr->scope,
+				   &addr->addr.nameseq);
+	else
+		res = tipc_withdraw(tsock->p->ref, -addr->scope,
+				    &addr->addr.nameseq);
+exit:
+	up(&tsock->sem);
+	return res;
+}
+
+/** 
+ * get_name - get port ID of socket or peer socket
+ * @sock: socket structure
+ * @uaddr: area for returned socket address
+ * @uaddr_len: area for returned length of socket address
+ * @peer: 0 to obtain socket name, 1 to obtain peer socket name
+ * 
+ * Returns 0 on success, errno otherwise
+ */
+
+static int get_name(struct socket *sock, struct sockaddr *uaddr, 
+		    int *uaddr_len, int peer)
+{
+	struct tipc_sock *tsock = tipc_sk(sock->sk);
+	struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr;
+	u32 res;
+
+	if (down_interruptible(&tsock->sem))
+		return -ERESTARTSYS;
+
+	*uaddr_len = sizeof(*addr);
+	addr->addrtype = TIPC_ADDR_ID;
+	addr->family = AF_TIPC;
+	addr->scope = 0;
+	if (peer)
+		res = tipc_peer(tsock->p->ref, &addr->addr.id);
+	else
+		res = tipc_ownidentity(tsock->p->ref, &addr->addr.id);
+	addr->addr.name.domain = 0;
+
+	up(&tsock->sem);
+	return res;
+}
+
+/**
+ * poll - read and possibly block on pollmask
+ * @file: file structure associated with the socket
+ * @sock: socket for which to calculate the poll bits
+ * @wait: ???
+ *
+ * Returns the pollmask
+ */
+
+static unsigned int poll(struct file *file, struct socket *sock, 
+			 poll_table *wait)
+{
+	poll_wait(file, sock->sk->sk_sleep, wait);
+	/* NEED LOCK HERE? */
+	return pollmask(sock);
+}
+
+/** 
+ * dest_name_check - verify user is permitted to send to specified port name
+ * @dest: destination address
+ * @m: descriptor for message to be sent
+ * 
+ * Prevents restricted configuration commands from being issued by
+ * unauthorized users.
+ * 
+ * Returns 0 if permission is granted, otherwise errno
+ */
+
+static inline int dest_name_check(struct sockaddr_tipc *dest, struct msghdr *m)
+{
+	struct tipc_cfg_msg_hdr hdr;
+
+        if (likely(dest->addr.name.name.type >= TIPC_RESERVED_TYPES))
+                return 0;
+        if (likely(dest->addr.name.name.type == TIPC_TOP_SRV))
+                return 0;
+
+        if (likely(dest->addr.name.name.type != TIPC_CFG_SRV))
+                return -EACCES;
+
+        if (copy_from_user(&hdr, m->msg_iov[0].iov_base, sizeof(hdr)))
+		return -EFAULT;
+	if ((ntohs(hdr.tcm_type) & 0xC000) & (!capable(CAP_NET_ADMIN)))
+		return -EACCES;
+        
+	return 0;
+}
+
+/**
+ * send_msg - send message in connectionless manner
+ * @iocb: (unused)
+ * @sock: socket structure
+ * @m: message to send
+ * @total_len: (unused)
+ * 
+ * Message must have an destination specified explicitly.
+ * Used for SOCK_RDM and SOCK_DGRAM messages, 
+ * and for 'SYN' messages on SOCK_SEQPACKET and SOCK_STREAM connections.
+ * (Note: 'SYN+' is prohibited on SOCK_STREAM.)
+ * 
+ * Returns the number of bytes sent on success, or errno otherwise
+ */
+
+static int send_msg(struct kiocb *iocb, struct socket *sock,
+		    struct msghdr *m, size_t total_len)
+{
+	struct tipc_sock *tsock = tipc_sk(sock->sk);
+        struct sockaddr_tipc *dest = (struct sockaddr_tipc *)m->msg_name;
+	struct sk_buff *buf;
+	int needs_conn;
+	int res = -EINVAL;
+
+	if (unlikely(!dest))
+		return -EDESTADDRREQ;
+	if (unlikely(dest->family != AF_TIPC))
+		return -EINVAL;
+
+	needs_conn = (sock->state != SS_READY);
+	if (unlikely(needs_conn)) {
+		if (sock->state == SS_LISTENING)
+			return -EPIPE;
+		if (sock->state != SS_UNCONNECTED)
+			return -EISCONN;
+		if ((tsock->p->published) ||
+		    ((sock->type == SOCK_STREAM) && (total_len != 0)))
+			return -EOPNOTSUPP;
+	}
+
+	if (down_interruptible(&tsock->sem))
+		return -ERESTARTSYS;
+
+	if (needs_conn) {
+
+		/* Abort any pending connection attempts (very unlikely) */
+
+		while ((buf = skb_dequeue(&sock->sk->sk_receive_queue))) {
+			tipc_reject_msg(buf, TIPC_ERR_NO_PORT);
+			atomic_dec(&tipc_queue_size);
+		}
+
+		sock->state = SS_CONNECTING;
+	}
+
+        do {
+                if (dest->addrtype == TIPC_ADDR_NAME) {
+                        if ((res = dest_name_check(dest, m)))
+                                goto exit;
+                        res = tipc_send2name(tsock->p->ref,
+                                             &dest->addr.name.name,
+                                             dest->addr.name.domain, 
+                                             m->msg_iovlen,
+                                             m->msg_iov);
+                }
+                else if (dest->addrtype == TIPC_ADDR_ID) {
+                        res = tipc_send2port(tsock->p->ref,
+                                             &dest->addr.id,
+                                             m->msg_iovlen,
+                                             m->msg_iov);
+                }
+                else if (dest->addrtype == TIPC_ADDR_MCAST) {
+			if (needs_conn) {
+				res = -EOPNOTSUPP;
+				goto exit;
+			}
+                        if ((res = dest_name_check(dest, m)))
+                                goto exit;
+                        res = tipc_multicast(tsock->p->ref,
+                                             &dest->addr.nameseq,
+                                             0,
+                                             m->msg_iovlen,
+                                             m->msg_iov);
+                }
+                if (likely(res != -ELINKCONG)) {
+exit:                                
+                        up(&tsock->sem);
+                        return res;
+                }
+		if (m->msg_flags & MSG_DONTWAIT) {
+			res = -EWOULDBLOCK;
+			goto exit;
+		}
+                if (wait_event_interruptible(*sock->sk->sk_sleep,
+                                             !tsock->p->congested)) {
+                    res = -ERESTARTSYS;
+                    goto exit;
+                }
+        } while (1);
+}
+
+/** 
+ * send_packet - send a connection-oriented message
+ * @iocb: (unused)
+ * @sock: socket structure
+ * @m: message to send
+ * @total_len: (unused)
+ * 
+ * Used for SOCK_SEQPACKET messages and SOCK_STREAM data.
+ * 
+ * Returns the number of bytes sent on success, or errno otherwise
+ */
+
+static int send_packet(struct kiocb *iocb, struct socket *sock,
+		       struct msghdr *m, size_t total_len)
+{
+	struct tipc_sock *tsock = tipc_sk(sock->sk);
+        struct sockaddr_tipc *dest = (struct sockaddr_tipc *)m->msg_name;
+	int res;
+
+	/* Handle implied connection establishment */
+
+	if (unlikely(dest))
+		return send_msg(iocb, sock, m, total_len);
+
+	if (down_interruptible(&tsock->sem)) {
+		return -ERESTARTSYS;
+        }
+
+        if (unlikely(sock->state != SS_CONNECTED)) {
+                if (sock->state == SS_DISCONNECTING)
+                        res = -EPIPE;   
+                else
+                        res = -ENOTCONN;
+                goto exit;
+        }
+
+        do {
+                res = tipc_send(tsock->p->ref, m->msg_iovlen, m->msg_iov);
+                if (likely(res != -ELINKCONG)) {
+exit:
+                        up(&tsock->sem);
+                        return res;
+                }
+		if (m->msg_flags & MSG_DONTWAIT) {
+			res = -EWOULDBLOCK;
+			goto exit;
+		}
+                if (wait_event_interruptible(*sock->sk->sk_sleep,
+                                             !tsock->p->congested)) {
+                    res = -ERESTARTSYS;
+                    goto exit;
+                }
+        } while (1);
+}
+
+/** 
+ * send_stream - send stream-oriented data
+ * @iocb: (unused)
+ * @sock: socket structure
+ * @m: data to send
+ * @total_len: total length of data to be sent
+ * 
+ * Used for SOCK_STREAM data.
+ * 
+ * Returns the number of bytes sent on success, or errno otherwise
+ */
+
+
+static int send_stream(struct kiocb *iocb, struct socket *sock,
+		       struct msghdr *m, size_t total_len)
+{
+	struct msghdr my_msg;
+	struct iovec my_iov;
+	struct iovec *curr_iov;
+	int curr_iovlen;
+	char __user *curr_start;
+	int curr_left;
+	int bytes_to_send;
+	int res;
+	
+	if (likely(total_len <= TIPC_MAX_USER_MSG_SIZE))
+		return send_packet(iocb, sock, m, total_len);
+
+	/* Can only send large data streams if already connected */
+
+        if (unlikely(sock->state != SS_CONNECTED)) {
+                if (sock->state == SS_DISCONNECTING)
+                        return -EPIPE;   
+                else
+                        return -ENOTCONN;
+        }
+
+	/* 
+	 * Send each iovec entry using one or more messages
+	 *
+	 * Note: This algorithm is good for the most likely case 
+	 * (i.e. one large iovec entry), but could be improved to pass sets
+	 * of small iovec entries into send_packet().
+	 */
+
+	my_msg = *m;
+	curr_iov = my_msg.msg_iov;
+	curr_iovlen = my_msg.msg_iovlen;
+	my_msg.msg_iov = &my_iov;
+	my_msg.msg_iovlen = 1;
+
+	while (curr_iovlen--) {
+		curr_start = curr_iov->iov_base;
+		curr_left = curr_iov->iov_len;
+
+		while (curr_left) {
+			bytes_to_send = (curr_left < TIPC_MAX_USER_MSG_SIZE)
+				? curr_left : TIPC_MAX_USER_MSG_SIZE;
+			my_iov.iov_base = curr_start;
+			my_iov.iov_len = bytes_to_send;
+                        if ((res = send_packet(iocb, sock, &my_msg, 0)) < 0)
+                                return res;
+			curr_left -= bytes_to_send;
+			curr_start += bytes_to_send;
+		}
+
+		curr_iov++;
+	}
+
+	return total_len;
+}
+
+/**
+ * auto_connect - complete connection setup to a remote port
+ * @sock: socket structure
+ * @tsock: TIPC-specific socket structure
+ * @msg: peer's response message
+ * 
+ * Returns 0 on success, errno otherwise
+ */
+
+static int auto_connect(struct socket *sock, struct tipc_sock *tsock, 
+			struct tipc_msg *msg)
+{
+	struct tipc_portid peer;
+
+	if (msg_errcode(msg)) {
+		sock->state = SS_DISCONNECTING;
+		return -ECONNREFUSED;
+	}
+
+	peer.ref = msg_origport(msg);
+	peer.node = msg_orignode(msg);
+	tipc_connect2port(tsock->p->ref, &peer);
+	tipc_set_portimportance(tsock->p->ref, msg_importance(msg));
+	sock->state = SS_CONNECTED;
+	return 0;
+}
+
+/**
+ * set_orig_addr - capture sender's address for received message
+ * @m: descriptor for message info
+ * @msg: received message header
+ * 
+ * Note: Address is not captured if not requested by receiver.
+ */
+
+static inline void set_orig_addr(struct msghdr *m, struct tipc_msg *msg)
+{
+        struct sockaddr_tipc *addr = (struct sockaddr_tipc *)m->msg_name;
+
+        if (addr) {
+		addr->family = AF_TIPC;
+		addr->addrtype = TIPC_ADDR_ID;
+		addr->addr.id.ref = msg_origport(msg);
+		addr->addr.id.node = msg_orignode(msg);
+		addr->addr.name.domain = 0;   	/* could leave uninitialized */
+		addr->scope = 0;   		/* could leave uninitialized */
+		m->msg_namelen = sizeof(struct sockaddr_tipc);
+	}
+}
+
+/**
+ * anc_data_recv - optionally capture ancillary data for received message 
+ * @m: descriptor for message info
+ * @msg: received message header
+ * @tport: TIPC port associated with message
+ * 
+ * Note: Ancillary data is not captured if not requested by receiver.
+ * 
+ * Returns 0 if successful, otherwise errno
+ */
+
+static inline int anc_data_recv(struct msghdr *m, struct tipc_msg *msg, 
+				struct tipc_port *tport)
+{
+	u32 anc_data[3];
+	u32 err;
+	u32 dest_type;
+	int res;
+
+	if (likely(m->msg_controllen == 0))
+		return 0;
+
+	/* Optionally capture errored message object(s) */
+
+	err = msg ? msg_errcode(msg) : 0;
+	if (unlikely(err)) {
+		anc_data[0] = err;
+		anc_data[1] = msg_data_sz(msg);
+		if ((res = put_cmsg(m, SOL_SOCKET, TIPC_ERRINFO, 8, anc_data)))
+			return res;
+		if (anc_data[1] &&
+		    (res = put_cmsg(m, SOL_SOCKET, TIPC_RETDATA, anc_data[1], 
+				    msg_data(msg))))
+			return res;
+	}
+
+	/* Optionally capture message destination object */
+
+	dest_type = msg ? msg_type(msg) : TIPC_DIRECT_MSG;
+	switch (dest_type) {
+	case TIPC_NAMED_MSG:
+		anc_data[0] = msg_nametype(msg);
+		anc_data[1] = msg_namelower(msg);
+		anc_data[2] = msg_namelower(msg);
+		break;
+	case TIPC_MCAST_MSG:
+		anc_data[0] = msg_nametype(msg);
+		anc_data[1] = msg_namelower(msg);
+		anc_data[2] = msg_nameupper(msg);
+		break;
+	case TIPC_CONN_MSG:
+		anc_data[0] = tport->conn_type;
+		anc_data[1] = tport->conn_instance;
+		anc_data[2] = tport->conn_instance;
+		break;
+	default:
+		anc_data[0] = 0;
+	}
+	if (anc_data[0] &&
+	    (res = put_cmsg(m, SOL_SOCKET, TIPC_DESTNAME, 12, anc_data)))
+		return res;
+
+	return 0;
+}
+
+/** 
+ * recv_msg - receive packet-oriented message
+ * @iocb: (unused)
+ * @m: descriptor for message info
+ * @buf_len: total size of user buffer area
+ * @flags: receive flags
+ * 
+ * Used for SOCK_DGRAM, SOCK_RDM, and SOCK_SEQPACKET messages.
+ * If the complete message doesn't fit in user area, truncate it.
+ *
+ * Returns size of returned message data, errno otherwise
+ */
+
+static int recv_msg(struct kiocb *iocb, struct socket *sock,
+		    struct msghdr *m, size_t buf_len, int flags)
+{
+	struct tipc_sock *tsock = tipc_sk(sock->sk);
+	struct sk_buff *buf;
+	struct tipc_msg *msg;
+	unsigned int q_len;
+	unsigned int sz;
+	u32 err;
+	int res;
+
+	/* Currently doesn't support receiving into multiple iovec entries */
+
+	if (m->msg_iovlen != 1)
+		return -EOPNOTSUPP;
+
+	/* Catch invalid receive attempts */
+
+	if (unlikely(!buf_len))
+		return -EINVAL;
+
+	if (sock->type == SOCK_SEQPACKET) {
+		if (unlikely(sock->state == SS_UNCONNECTED))
+			return -ENOTCONN;
+		if (unlikely((sock->state == SS_DISCONNECTING) && 
+			     (skb_queue_len(&sock->sk->sk_receive_queue) == 0)))
+		       	return -ENOTCONN;
+	}
+
+	/* Look for a message in receive queue; wait if necessary */
+
+	if (unlikely(down_interruptible(&tsock->sem)))
+		return -ERESTARTSYS;
+
+restart:
+	if (unlikely((skb_queue_len(&sock->sk->sk_receive_queue) == 0) &&
+		     (flags & MSG_DONTWAIT))) {
+		res = -EWOULDBLOCK;
+		goto exit;
+	}
+
+	if ((res = wait_event_interruptible(
+		*sock->sk->sk_sleep, 
+		((q_len = skb_queue_len(&sock->sk->sk_receive_queue)) ||
+		 (sock->state == SS_DISCONNECTING))) )) {
+		goto exit;
+	}
+
+	/* Catch attempt to receive on an already terminated connection */
+	/* [THIS CHECK MAY OVERLAP WITH AN EARLIER CHECK] */
+
+	if (!q_len) {
+		res = -ENOTCONN;
+		goto exit;
+	}
+
+	/* Get access to first message in receive queue */
+
+	buf = skb_peek(&sock->sk->sk_receive_queue);
+	msg = buf_msg(buf);
+	sz = msg_data_sz(msg);
+	err = msg_errcode(msg);
+
+	/* Complete connection setup for an implied connect */
+
+	if (unlikely(sock->state == SS_CONNECTING)) {
+		if ((res = auto_connect(sock, tsock, msg)))
+			goto exit;
+	}
+
+	/* Discard an empty non-errored message & try again */
+
+	if ((!sz) && (!err)) {
+		advance_queue(tsock);
+		goto restart;
+	}
+
+	/* Capture sender's address (optional) */
+
+	set_orig_addr(m, msg);
+
+	/* Capture ancillary data (optional) */
+
+	if ((res = anc_data_recv(m, msg, tsock->p)))
+		goto exit;
+
+	/* Capture message data (if valid) & compute return value (always) */
+	
+	if (!err) {
+		if (unlikely(buf_len < sz)) {
+			sz = buf_len;
+			m->msg_flags |= MSG_TRUNC;
+		}
+		if (unlikely(copy_to_user(m->msg_iov->iov_base, msg_data(msg),
+					  sz))) {
+			res = -EFAULT;
+			goto exit;
+		}
+		res = sz;
+	} else {
+		if ((sock->state == SS_READY) ||
+		    ((err == TIPC_CONN_SHUTDOWN) || m->msg_control))
+			res = 0;
+		else
+			res = -ECONNRESET;
+	}
+
+	/* Consume received message (optional) */
+
+	if (likely(!(flags & MSG_PEEK))) {
+                if (unlikely(++tsock->p->conn_unacked >= TIPC_FLOW_CONTROL_WIN))
+                        tipc_acknowledge(tsock->p->ref, tsock->p->conn_unacked);
+		advance_queue(tsock);
+        }
+exit:
+	up(&tsock->sem);
+	return res;
+}
+
+/** 
+ * recv_stream - receive stream-oriented data
+ * @iocb: (unused)
+ * @m: descriptor for message info
+ * @buf_len: total size of user buffer area
+ * @flags: receive flags
+ * 
+ * Used for SOCK_STREAM messages only.  If not enough data is available 
+ * will optionally wait for more; never truncates data.
+ *
+ * Returns size of returned message data, errno otherwise
+ */
+
+static int recv_stream(struct kiocb *iocb, struct socket *sock,
+		       struct msghdr *m, size_t buf_len, int flags)
+{
+	struct tipc_sock *tsock = tipc_sk(sock->sk);
+	struct sk_buff *buf;
+	struct tipc_msg *msg;
+	unsigned int q_len;
+	unsigned int sz;
+	int sz_to_copy;
+	int sz_copied = 0;
+	int needed;
+	char *crs = m->msg_iov->iov_base;
+	unsigned char *buf_crs;
+	u32 err;
+	int res;
+
+	/* Currently doesn't support receiving into multiple iovec entries */
+
+	if (m->msg_iovlen != 1)
+		return -EOPNOTSUPP;
+
+	/* Catch invalid receive attempts */
+
+	if (unlikely(!buf_len))
+		return -EINVAL;
+
+	if (unlikely(sock->state == SS_DISCONNECTING)) {
+		if (skb_queue_len(&sock->sk->sk_receive_queue) == 0)
+			return -ENOTCONN;
+	} else if (unlikely(sock->state != SS_CONNECTED))
+		return -ENOTCONN;
+
+	/* Look for a message in receive queue; wait if necessary */
+
+	if (unlikely(down_interruptible(&tsock->sem)))
+		return -ERESTARTSYS;
+
+restart:
+	if (unlikely((skb_queue_len(&sock->sk->sk_receive_queue) == 0) &&
+		     (flags & MSG_DONTWAIT))) {
+		res = (sz_copied == 0) ? -EWOULDBLOCK : 0;
+		goto exit;
+	}
+
+	if ((res = wait_event_interruptible(
+		*sock->sk->sk_sleep, 
+		((q_len = skb_queue_len(&sock->sk->sk_receive_queue)) ||
+		 (sock->state == SS_DISCONNECTING))) )) {
+		goto exit;
+	}
+
+	/* Catch attempt to receive on an already terminated connection */
+	/* [THIS CHECK MAY OVERLAP WITH AN EARLIER CHECK] */
+
+	if (!q_len) {
+		res = -ENOTCONN;
+		goto exit;
+	}
+
+	/* Get access to first message in receive queue */
+
+	buf = skb_peek(&sock->sk->sk_receive_queue);
+	msg = buf_msg(buf);
+	sz = msg_data_sz(msg);
+	err = msg_errcode(msg);
+
+	/* Discard an empty non-errored message & try again */
+
+	if ((!sz) && (!err)) {
+		advance_queue(tsock);
+		goto restart;
+	}
+
+	/* Optionally capture sender's address & ancillary data of first msg */
+
+	if (sz_copied == 0) {
+		set_orig_addr(m, msg);
+		if ((res = anc_data_recv(m, msg, tsock->p)))
+			goto exit;
+	}
+
+	/* Capture message data (if valid) & compute return value (always) */
+	
+	if (!err) {
+		buf_crs = (unsigned char *)(TIPC_SKB_CB(buf)->handle);
+		sz = buf->tail - buf_crs;
+
+		needed = (buf_len - sz_copied);
+		sz_to_copy = (sz <= needed) ? sz : needed;
+		if (unlikely(copy_to_user(crs, buf_crs, sz_to_copy))) {
+			res = -EFAULT;
+			goto exit;
+		}
+		sz_copied += sz_to_copy;
+
+		if (sz_to_copy < sz) {
+			if (!(flags & MSG_PEEK))
+				TIPC_SKB_CB(buf)->handle = buf_crs + sz_to_copy;
+			goto exit;
+		}
+
+		crs += sz_to_copy;
+	} else {
+		if (sz_copied != 0)
+			goto exit; /* can't add error msg to valid data */
+
+		if ((err == TIPC_CONN_SHUTDOWN) || m->msg_control)
+			res = 0;
+		else
+			res = -ECONNRESET;
+	}
+
+	/* Consume received message (optional) */
+
+	if (likely(!(flags & MSG_PEEK))) {
+                if (unlikely(++tsock->p->conn_unacked >= TIPC_FLOW_CONTROL_WIN))
+                        tipc_acknowledge(tsock->p->ref, tsock->p->conn_unacked);
+		advance_queue(tsock);
+        }
+
+	/* Loop around if more data is required */
+
+	if ((sz_copied < buf_len)    /* didn't get all requested data */ 
+	    && (flags & MSG_WAITALL) /* ... and need to wait for more */
+	    && (!(flags & MSG_PEEK)) /* ... and aren't just peeking at data */
+	    && (!err)                /* ... and haven't reached a FIN */
+	    )
+		goto restart;
+
+exit:
+	up(&tsock->sem);
+	return res ? res : sz_copied;
+}
+
+/**
+ * queue_overloaded - test if queue overload condition exists
+ * @queue_size: current size of queue
+ * @base: nominal maximum size of queue
+ * @msg: message to be added to queue
+ * 
+ * Returns 1 if queue is currently overloaded, 0 otherwise
+ */
+
+static int queue_overloaded(u32 queue_size, u32 base, struct tipc_msg *msg)
+{
+	u32 threshold;
+	u32 imp = msg_importance(msg);
+
+	if (imp == TIPC_LOW_IMPORTANCE)
+		threshold = base;
+	else if (imp == TIPC_MEDIUM_IMPORTANCE)
+		threshold = base * 2;
+	else if (imp == TIPC_HIGH_IMPORTANCE)
+		threshold = base * 100;
+	else
+		return 0;
+
+	if (msg_connected(msg))
+		threshold *= 4;
+
+	return (queue_size > threshold);
+}
+
+/** 
+ * async_disconnect - wrapper function used to disconnect port
+ * @portref: TIPC port reference (passed as pointer-sized value)
+ */
+
+static void async_disconnect(unsigned long portref)
+{
+	tipc_disconnect((u32)portref);
+}
+
+/** 
+ * dispatch - handle arriving message
+ * @tport: TIPC port that received message
+ * @buf: message
+ * 
+ * Called with port locked.  Must not take socket lock to avoid deadlock risk.
+ * 
+ * Returns TIPC error status code (TIPC_OK if message is not to be rejected)
+ */
+
+static u32 dispatch(struct tipc_port *tport, struct sk_buff *buf)
+{
+	struct tipc_msg *msg = buf_msg(buf);
+	struct tipc_sock *tsock = (struct tipc_sock *)tport->usr_handle;
+	struct socket *sock;
+	u32 recv_q_len;
+
+	/* Reject message if socket is closing */
+
+	if (!tsock)
+		return TIPC_ERR_NO_PORT;
+
+	/* Reject message if it is wrong sort of message for socket */
+
+	/*
+	 * WOULD IT BE BETTER TO JUST DISCARD THESE MESSAGES INSTEAD?
+	 * "NO PORT" ISN'T REALLY THE RIGHT ERROR CODE, AND THERE MAY
+	 * BE SECURITY IMPLICATIONS INHERENT IN REJECTING INVALID TRAFFIC
+	 */
+	sock = tsock->sk.sk_socket;
+	if (sock->state == SS_READY) {
+		if (msg_connected(msg)) {
+			msg_dbg(msg, "dispatch filter 1\n");
+			return TIPC_ERR_NO_PORT;
+		}
+	} else {
+		if (msg_mcast(msg)) {
+			msg_dbg(msg, "dispatch filter 2\n");
+			return TIPC_ERR_NO_PORT;
+		}
+		if (sock->state == SS_CONNECTED) {
+			if (!msg_connected(msg)) {
+				msg_dbg(msg, "dispatch filter 3\n");
+				return TIPC_ERR_NO_PORT;
+			}
+		}
+		else if (sock->state == SS_CONNECTING) {
+			if (!msg_connected(msg) && (msg_errcode(msg) == 0)) {
+				msg_dbg(msg, "dispatch filter 4\n");
+				return TIPC_ERR_NO_PORT;
+			}
+		} 
+		else if (sock->state == SS_LISTENING) {
+			if (msg_connected(msg) || msg_errcode(msg)) {
+				msg_dbg(msg, "dispatch filter 5\n");
+				return TIPC_ERR_NO_PORT;
+			}
+		} 
+		else if (sock->state == SS_DISCONNECTING) {
+			msg_dbg(msg, "dispatch filter 6\n");
+			return TIPC_ERR_NO_PORT;
+		}
+		else /* (sock->state == SS_UNCONNECTED) */ {
+			if (msg_connected(msg) || msg_errcode(msg)) {
+				msg_dbg(msg, "dispatch filter 7\n");
+				return TIPC_ERR_NO_PORT;
+			}
+		}
+	}
+
+	/* Reject message if there isn't room to queue it */
+
+	if (unlikely((u32)atomic_read(&tipc_queue_size) > 
+		     OVERLOAD_LIMIT_BASE)) {
+		if (queue_overloaded(atomic_read(&tipc_queue_size), 
+				     OVERLOAD_LIMIT_BASE, msg))
+			return TIPC_ERR_OVERLOAD;
+        }
+	recv_q_len = skb_queue_len(&tsock->sk.sk_receive_queue);
+	if (unlikely(recv_q_len > (OVERLOAD_LIMIT_BASE / 2))) {
+		if (queue_overloaded(recv_q_len, 
+				     OVERLOAD_LIMIT_BASE / 2, msg)) 
+			return TIPC_ERR_OVERLOAD;
+        }
+
+	/* Initiate connection termination for an incoming 'FIN' */
+
+	if (unlikely(msg_errcode(msg) && (sock->state == SS_CONNECTED))) {
+		sock->state = SS_DISCONNECTING;
+		/* Note: Use signal since port lock is already taken! */
+		k_signal((Handler)async_disconnect, tport->ref);
+	}
+
+	/* Enqueue message (finally!) */
+
+	msg_dbg(msg,"<DISP<: ");
+	TIPC_SKB_CB(buf)->handle = msg_data(msg);
+	atomic_inc(&tipc_queue_size);
+	skb_queue_tail(&sock->sk->sk_receive_queue, buf);
+
+        wake_up_interruptible(sock->sk->sk_sleep);
+	return TIPC_OK;
+}
+
+/** 
+ * wakeupdispatch - wake up port after congestion
+ * @tport: port to wakeup
+ * 
+ * Called with port lock on.
+ */
+
+static void wakeupdispatch(struct tipc_port *tport)
+{
+	struct tipc_sock *tsock = (struct tipc_sock *)tport->usr_handle;
+
+        wake_up_interruptible(tsock->sk.sk_sleep);
+}
+
+/**
+ * connect - establish a connection to another TIPC port
+ * @sock: socket structure
+ * @dest: socket address for destination port
+ * @destlen: size of socket address data structure
+ * @flags: (unused)
+ *
+ * Returns 0 on success, errno otherwise
+ */
+
+static int connect(struct socket *sock, struct sockaddr *dest, int destlen, 
+		   int flags)
+{
+   struct tipc_sock *tsock = tipc_sk(sock->sk);
+   struct sockaddr_tipc *dst = (struct sockaddr_tipc *)dest;
+   struct msghdr m = {0,};
+   struct sk_buff *buf;
+   struct tipc_msg *msg;
+   int res;
+
+   /* For now, TIPC does not allow use of connect() with DGRAM or RDM types */
+
+   if (sock->state == SS_READY)
+	   return -EOPNOTSUPP;
+
+   /* MOVE THE REST OF THIS ERROR CHECKING TO send_msg()? */
+   if (sock->state == SS_LISTENING)
+	   return -EOPNOTSUPP;
+   if (sock->state == SS_CONNECTING)
+	   return -EALREADY;
+   if (sock->state != SS_UNCONNECTED)
+           return -EISCONN;
+
+   if ((dst->family != AF_TIPC) ||
+       ((dst->addrtype != TIPC_ADDR_NAME) && (dst->addrtype != TIPC_ADDR_ID)))
+           return -EINVAL;
+
+   /* Send a 'SYN-' to destination */
+
+   m.msg_name = dest;
+   if ((res = send_msg(0, sock, &m, 0)) < 0) {
+	   sock->state = SS_DISCONNECTING;
+	   return res;
+   }
+
+   if (down_interruptible(&tsock->sem)) 
+           return -ERESTARTSYS;
+	
+   /* Wait for destination's 'ACK' response */
+
+   res = wait_event_interruptible_timeout(*sock->sk->sk_sleep,
+                                          skb_queue_len(&sock->sk->sk_receive_queue),
+					  sock->sk->sk_rcvtimeo);
+   buf = skb_peek(&sock->sk->sk_receive_queue);
+   if (res > 0) {
+	   msg = buf_msg(buf);
+           res = auto_connect(sock, tsock, msg);
+           if (!res) {
+		   if (dst->addrtype == TIPC_ADDR_NAME) {
+			   tsock->p->conn_type = dst->addr.name.name.type;
+			   tsock->p->conn_instance = dst->addr.name.name.instance;
+		   }
+		   if (!msg_data_sz(msg))
+			   advance_queue(tsock);
+	   }
+   } else {
+	   if (res == 0) {
+		   res = -ETIMEDOUT;
+	   } else
+	           { /* leave "res" unchanged */ }
+	   sock->state = SS_DISCONNECTING;
+   }
+
+   up(&tsock->sem);
+   return res;
+}
+
+/** 
+ * listen - allow socket to listen for incoming connections
+ * @sock: socket structure
+ * @len: (unused)
+ * 
+ * Returns 0 on success, errno otherwise
+ */
+
+static int listen(struct socket *sock, int len)
+{
+	/* REQUIRES SOCKET LOCKING OF SOME SORT? */
+
+	if (sock->state == SS_READY)
+		return -EOPNOTSUPP;
+	if (sock->state != SS_UNCONNECTED)
+		return -EINVAL;
+	sock->state = SS_LISTENING;
+        return 0;
+}
+
+/** 
+ * accept - wait for connection request
+ * @sock: listening socket
+ * @newsock: new socket that is to be connected
+ * @flags: file-related flags associated with socket
+ * 
+ * Returns 0 on success, errno otherwise
+ */
+
+static int accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	struct tipc_sock *tsock = tipc_sk(sock->sk);
+	struct sk_buff *buf;
+	int res = -EFAULT;
+
+	if (sock->state == SS_READY)
+		return -EOPNOTSUPP;
+	if (sock->state != SS_LISTENING)
+		return -EINVAL;
+	
+	if (unlikely((skb_queue_len(&sock->sk->sk_receive_queue) == 0) && 
+		     (flags & O_NONBLOCK)))
+		return -EWOULDBLOCK;
+
+	if (down_interruptible(&tsock->sem))
+		return -ERESTARTSYS;
+
+	if (wait_event_interruptible(*sock->sk->sk_sleep, 
+				     skb_queue_len(&sock->sk->sk_receive_queue))) {
+		res = -ERESTARTSYS;
+		goto exit;
+	}
+	buf = skb_peek(&sock->sk->sk_receive_queue);
+
+	res = tipc_create(newsock, 0);
+	if (!res) {
+		struct tipc_sock *new_tsock = tipc_sk(newsock->sk);
+		struct tipc_portid id;
+		struct tipc_msg *msg = buf_msg(buf);
+		u32 new_ref = new_tsock->p->ref;
+
+		id.ref = msg_origport(msg);
+		id.node = msg_orignode(msg);
+		tipc_connect2port(new_ref, &id);
+		newsock->state = SS_CONNECTED;
+
+		tipc_set_portimportance(new_ref, msg_importance(msg));
+		if (msg_named(msg)) {
+			new_tsock->p->conn_type = msg_nametype(msg);
+			new_tsock->p->conn_instance = msg_nameinst(msg);
+		}
+
+               /* 
+		 * Respond to 'SYN-' by discarding it & returning 'ACK'-.
+		 * Respond to 'SYN+' by queuing it on new socket.
+		 */
+
+		msg_dbg(msg,"<ACC<: ");
+                if (!msg_data_sz(msg)) {
+                        struct msghdr m = {0,};
+
+                        send_packet(0, newsock, &m, 0);      
+                        advance_queue(tsock);
+                } else {
+			sock_lock(tsock);
+			skb_dequeue(&sock->sk->sk_receive_queue);
+			sock_unlock(tsock);
+			skb_queue_head(&newsock->sk->sk_receive_queue, buf);
+		}
+	}
+exit:
+	up(&tsock->sem);
+	return res;
+}
+
+/**
+ * shutdown - shutdown socket connection
+ * @sock: socket structure
+ * @how: direction to close (always treated as read + write)
+ *
+ * Terminates connection (if necessary), then purges socket's receive queue.
+ * 
+ * Returns 0 on success, errno otherwise
+ */
+
+static int shutdown(struct socket *sock, int how)
+{
+	struct tipc_sock* tsock = tipc_sk(sock->sk);
+	struct sk_buff *buf;
+	int res;
+
+	/* Could return -EINVAL for an invalid "how", but why bother? */
+
+	if (down_interruptible(&tsock->sem))
+		return -ERESTARTSYS;
+
+	sock_lock(tsock);
+
+	switch (sock->state) {
+	case SS_CONNECTED:
+
+		/* Send 'FIN+' or 'FIN-' message to peer */
+
+		sock_unlock(tsock);
+restart:
+		if ((buf = skb_dequeue(&sock->sk->sk_receive_queue))) {
+			atomic_dec(&tipc_queue_size);
+			if (TIPC_SKB_CB(buf)->handle != msg_data(buf_msg(buf))) {
+				buf_discard(buf);
+				goto restart;
+			}
+			tipc_reject_msg(buf, TIPC_CONN_SHUTDOWN);
+		}
+		else {
+			tipc_shutdown(tsock->p->ref);
+		}
+		sock_lock(tsock);
+
+		/* fall through */
+
+	case SS_DISCONNECTING:
+
+		/* Discard any unreceived messages */
+
+		while ((buf = skb_dequeue(&sock->sk->sk_receive_queue))) {
+			atomic_dec(&tipc_queue_size);
+			buf_discard(buf);
+		}
+		tsock->p->conn_unacked = 0;
+
+		/* fall through */
+
+	case SS_CONNECTING:
+		sock->state = SS_DISCONNECTING;
+		res = 0;
+		break;
+
+	default:
+		res = -ENOTCONN;
+	}
+
+	sock_unlock(tsock);
+
+	up(&tsock->sem);
+	return res;
+}
+
+/**
+ * setsockopt - set socket option
+ * @sock: socket structure
+ * @lvl: option level
+ * @opt: option identifier
+ * @ov: pointer to new option value
+ * @ol: length of option value
+ * 
+ * For stream sockets only, accepts and ignores all IPPROTO_TCP options 
+ * (to ease compatibility).
+ * 
+ * Returns 0 on success, errno otherwise
+ */
+
+static int setsockopt(struct socket *sock, int lvl, int opt, char *ov, int ol)
+{
+	struct tipc_sock *tsock = tipc_sk(sock->sk);
+	u32 value;
+	int res;
+
+        if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM))
+                return 0;
+	if (lvl != SOL_TIPC)
+		return -ENOPROTOOPT;
+	if (ol < sizeof(value))
+		return -EINVAL;
+        if ((res = get_user(value, (u32 *)ov)))
+		return res;
+
+	if (down_interruptible(&tsock->sem)) 
+		return -ERESTARTSYS;
+	
+	switch (opt) {
+	case TIPC_IMPORTANCE:
+		res = tipc_set_portimportance(tsock->p->ref, value);
+		break;
+	case TIPC_SRC_DROPPABLE:
+		if (sock->type != SOCK_STREAM)
+			res = tipc_set_portunreliable(tsock->p->ref, value);
+		else 
+			res = -ENOPROTOOPT;
+		break;
+	case TIPC_DEST_DROPPABLE:
+		res = tipc_set_portunreturnable(tsock->p->ref, value);
+		break;
+	case TIPC_CONN_TIMEOUT:
+		sock->sk->sk_rcvtimeo = (value * HZ / 1000);
+		break;
+	default:
+		res = -EINVAL;
+	}
+
+	up(&tsock->sem);
+	return res;
+}
+
+/**
+ * getsockopt - get socket option
+ * @sock: socket structure
+ * @lvl: option level
+ * @opt: option identifier
+ * @ov: receptacle for option value
+ * @ol: receptacle for length of option value
+ * 
+ * For stream sockets only, returns 0 length result for all IPPROTO_TCP options 
+ * (to ease compatibility).
+ * 
+ * Returns 0 on success, errno otherwise
+ */
+
+static int getsockopt(struct socket *sock, int lvl, int opt, char *ov, int *ol)
+{
+	struct tipc_sock *tsock = tipc_sk(sock->sk);
+        int len;
+	u32 value;
+        int res;
+
+        if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM))
+                return put_user(0, ol);
+	if (lvl != SOL_TIPC)
+		return -ENOPROTOOPT;
+        if ((res = get_user(len, ol)))
+                return res;
+
+	if (down_interruptible(&tsock->sem)) 
+		return -ERESTARTSYS;
+
+	switch (opt) {
+	case TIPC_IMPORTANCE:
+		res = tipc_portimportance(tsock->p->ref, &value);
+		break;
+	case TIPC_SRC_DROPPABLE:
+		res = tipc_portunreliable(tsock->p->ref, &value);
+		break;
+	case TIPC_DEST_DROPPABLE:
+		res = tipc_portunreturnable(tsock->p->ref, &value);
+		break;
+	case TIPC_CONN_TIMEOUT:
+		value = (sock->sk->sk_rcvtimeo * 1000) / HZ;
+		break;
+	default:
+		res = -EINVAL;
+	}
+
+	if (res) {
+		/* "get" failed */
+	}
+	else if (len < sizeof(value)) {
+		res = -EINVAL;
+	}
+	else if ((res = copy_to_user(ov, &value, sizeof(value)))) {
+		/* couldn't return value */
+	}
+	else {
+		res = put_user(sizeof(value), ol);
+	}
+
+        up(&tsock->sem);
+	return res;
+}
+
+/**
+ * Placeholders for non-implemented functionality
+ * 
+ * Returns error code (POSIX-compliant where defined)
+ */
+
+static int ioctl(struct socket *s, u32 cmd, unsigned long arg)
+{
+        return -EINVAL;
+}
+
+static int no_mmap(struct file *file, struct socket *sock,
+                   struct vm_area_struct *vma)
+{
+        return -EINVAL;
+}
+static ssize_t no_sendpage(struct socket *sock, struct page *page,
+                           int offset, size_t size, int flags)
+{
+        return -EINVAL;
+}
+
+static int no_skpair(struct socket *s1, struct socket *s2)
+{
+	return -EOPNOTSUPP;
+}
+
+/**
+ * Protocol switches for the various types of TIPC sockets
+ */
+
+static struct proto_ops msg_ops = {
+	.owner 		= THIS_MODULE,
+	.family		= AF_TIPC,
+	.release	= release,
+	.bind		= bind,
+	.connect	= connect,
+	.socketpair	= no_skpair,
+	.accept		= accept,
+	.getname	= get_name,
+	.poll		= poll,
+	.ioctl		= ioctl,
+	.listen		= listen,
+	.shutdown	= shutdown,
+	.setsockopt	= setsockopt,
+	.getsockopt	= getsockopt,
+	.sendmsg	= send_msg,
+	.recvmsg	= recv_msg,
+        .mmap		= no_mmap,
+        .sendpage	= no_sendpage
+};
+
+static struct proto_ops packet_ops = {
+	.owner 		= THIS_MODULE,
+	.family		= AF_TIPC,
+	.release	= release,
+	.bind		= bind,
+	.connect	= connect,
+	.socketpair	= no_skpair,
+	.accept		= accept,
+	.getname	= get_name,
+	.poll		= poll,
+	.ioctl		= ioctl,
+	.listen		= listen,
+	.shutdown	= shutdown,
+	.setsockopt	= setsockopt,
+	.getsockopt	= getsockopt,
+	.sendmsg	= send_packet,
+	.recvmsg	= recv_msg,
+        .mmap		= no_mmap,
+        .sendpage	= no_sendpage
+};
+
+static struct proto_ops stream_ops = {
+	.owner 		= THIS_MODULE,
+	.family		= AF_TIPC,
+	.release	= release,
+	.bind		= bind,
+	.connect	= connect,
+	.socketpair	= no_skpair,
+	.accept		= accept,
+	.getname	= get_name,
+	.poll		= poll,
+	.ioctl		= ioctl,
+	.listen		= listen,
+	.shutdown	= shutdown,
+	.setsockopt	= setsockopt,
+	.getsockopt	= getsockopt,
+	.sendmsg	= send_stream,
+	.recvmsg	= recv_stream,
+        .mmap		= no_mmap,
+        .sendpage	= no_sendpage
+};
+
+static struct net_proto_family tipc_family_ops = {
+	.owner 		= THIS_MODULE,
+	.family		= AF_TIPC,
+	.create		= tipc_create
+};
+
+static struct proto tipc_proto = {
+	.name		= "TIPC",
+	.owner		= THIS_MODULE,
+	.obj_size	= sizeof(struct tipc_sock)
+};
+
+/**
+ * socket_init - initialize TIPC socket interface
+ * 
+ * Returns 0 on success, errno otherwise
+ */
+int socket_init(void)
+{
+	int res;
+
+        res = proto_register(&tipc_proto, 1);
+	if (res) {
+		err("Unable to register TIPC protocol type\n");
+		goto out;
+	}
+
+	res = sock_register(&tipc_family_ops);
+	if (res) {
+		err("Unable to register TIPC socket type\n");
+		proto_unregister(&tipc_proto);
+		goto out;
+	}
+
+	sockets_enabled = 1;
+ out:
+	return res;
+}
+
+/**
+ * sock_stop - stop TIPC socket interface
+ */
+void socket_stop(void)
+{
+	if (!sockets_enabled)
+		return;
+
+	sockets_enabled = 0;
+	sock_unregister(tipc_family_ops.family);
+	proto_unregister(&tipc_proto);
+}
+
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
new file mode 100644
index 000000000000..6f651a2c477e
--- /dev/null
+++ b/net/tipc/subscr.c
@@ -0,0 +1,522 @@
+/*
+ * net/tipc/subscr.c: TIPC subscription service
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "dbg.h"
+#include "subscr.h"
+#include "name_table.h"
+#include "ref.h"
+
+/**
+ * struct subscriber - TIPC network topology subscriber
+ * @ref: object reference to subscriber object itself
+ * @lock: pointer to spinlock controlling access to subscriber object
+ * @subscriber_list: adjacent subscribers in top. server's list of subscribers
+ * @subscription_list: list of subscription objects for this subscriber
+ * @port_ref: object reference to port used to communicate with subscriber
+ * @swap: indicates if subscriber uses opposite endianness in its messages
+ */
+ 
+struct subscriber {
+	u32 ref;
+        spinlock_t *lock;
+	struct list_head subscriber_list;
+	struct list_head subscription_list;
+	u32 port_ref;
+	int swap;
+};
+
+/**
+ * struct top_srv - TIPC network topology subscription service
+ * @user_ref: TIPC userid of subscription service
+ * @setup_port: reference to TIPC port that handles subscription requests
+ * @subscription_count: number of active subscriptions (not subscribers!)
+ * @subscriber_list: list of ports subscribing to service
+ * @lock: spinlock govering access to subscriber list
+ */
+
+struct top_srv {
+	u32 user_ref;
+	u32 setup_port;
+	atomic_t subscription_count;
+	struct list_head subscriber_list;
+	spinlock_t lock;
+};
+
+static struct top_srv topsrv = { 0 };
+
+/**
+ * htohl - convert value to endianness used by destination
+ * @in: value to convert
+ * @swap: non-zero if endianness must be reversed
+ * 
+ * Returns converted value
+ */
+
+static inline u32 htohl(u32 in, int swap)
+{
+	char *c = (char *)&in;
+
+	return swap ? ((c[3] << 3) + (c[2] << 2) + (c[1] << 1) + c[0]) : in;
+}
+
+/**
+ * subscr_send_event - send a message containing a tipc_event to the subscriber
+ */
+
+static void subscr_send_event(struct subscription *sub, 
+			      u32 found_lower, 
+			      u32 found_upper,
+			      u32 event, 
+			      u32 port_ref, 
+			      u32 node)
+{
+	struct iovec msg_sect;
+
+	msg_sect.iov_base = (void *)&sub->evt;
+	msg_sect.iov_len = sizeof(struct tipc_event);
+
+	sub->evt.event = htohl(event, sub->owner->swap);
+	sub->evt.found_lower = htohl(found_lower, sub->owner->swap);
+	sub->evt.found_upper = htohl(found_upper, sub->owner->swap);
+	sub->evt.port.ref = htohl(port_ref, sub->owner->swap);
+	sub->evt.port.node = htohl(node, sub->owner->swap);
+	tipc_send(sub->owner->port_ref, 1, &msg_sect);
+}
+
+/**
+ * subscr_overlap - test for subscription overlap with the given values
+ *
+ * Returns 1 if there is overlap, otherwise 0.
+ */
+
+int subscr_overlap(struct subscription *sub, 
+		   u32 found_lower, 
+		   u32 found_upper)
+
+{
+	if (found_lower < sub->seq.lower)
+		found_lower = sub->seq.lower;
+	if (found_upper > sub->seq.upper)
+		found_upper = sub->seq.upper;
+	if (found_lower > found_upper)
+		return 0;
+	return 1;
+}
+
+/**
+ * subscr_report_overlap - issue event if there is subscription overlap
+ * 
+ * Protected by nameseq.lock in name_table.c
+ */
+
+void subscr_report_overlap(struct subscription *sub, 
+			   u32 found_lower, 
+			   u32 found_upper,
+			   u32 event, 
+			   u32 port_ref, 
+			   u32 node,
+			   int must)
+{
+	dbg("Rep overlap %u:%u,%u<->%u,%u\n", sub->seq.type, sub->seq.lower,
+	    sub->seq.upper, found_lower, found_upper);
+	if (!subscr_overlap(sub, found_lower, found_upper))
+		return;
+	if (!must && (sub->filter != TIPC_SUB_PORTS))
+		return;
+	subscr_send_event(sub, found_lower, found_upper, event, port_ref, node);
+}
+
+/**
+ * subscr_timeout - subscription timeout has occurred
+ */
+
+static void subscr_timeout(struct subscription *sub)
+{
+	struct subscriber *subscriber;
+	u32 subscriber_ref;
+
+	/* Validate subscriber reference (in case subscriber is terminating) */
+
+	subscriber_ref = sub->owner->ref;
+	subscriber = (struct subscriber *)ref_lock(subscriber_ref);
+	if (subscriber == NULL)
+		return;
+
+	/* Unlink subscription from name table */
+
+	nametbl_unsubscribe(sub);
+
+	/* Notify subscriber of timeout, then unlink subscription */
+
+	subscr_send_event(sub, 
+			  sub->evt.s.seq.lower, 
+			  sub->evt.s.seq.upper,
+			  TIPC_SUBSCR_TIMEOUT, 
+			  0, 
+			  0);
+	list_del(&sub->subscription_list);
+
+	/* Now destroy subscription */
+
+	ref_unlock(subscriber_ref);
+	k_term_timer(&sub->timer);
+	kfree(sub);
+	atomic_dec(&topsrv.subscription_count);
+}
+
+/**
+ * subscr_terminate - terminate communication with a subscriber
+ * 
+ * Called with subscriber locked.  Routine must temporarily release this lock
+ * to enable subscription timeout routine(s) to finish without deadlocking; 
+ * the lock is then reclaimed to allow caller to release it upon return.
+ * (This should work even in the unlikely event some other thread creates 
+ * a new object reference in the interim that uses this lock; this routine will
+ * simply wait for it to be released, then claim it.)
+ */
+
+static void subscr_terminate(struct subscriber *subscriber)
+{
+	struct subscription *sub;
+	struct subscription *sub_temp;
+
+	/* Invalidate subscriber reference */
+
+	ref_discard(subscriber->ref);
+	spin_unlock_bh(subscriber->lock);
+
+	/* Destroy any existing subscriptions for subscriber */
+	
+	list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list,
+				 subscription_list) {
+		if (sub->timeout != TIPC_WAIT_FOREVER) {
+			k_cancel_timer(&sub->timer);
+			k_term_timer(&sub->timer);
+		}
+		nametbl_unsubscribe(sub);
+		list_del(&sub->subscription_list);
+		dbg("Term: Removed sub %u,%u,%u from subscriber %x list\n",
+		    sub->seq.type, sub->seq.lower, sub->seq.upper, subscriber);
+		kfree(sub);
+		atomic_dec(&topsrv.subscription_count);
+	}
+
+	/* Sever connection to subscriber */
+
+	tipc_shutdown(subscriber->port_ref);
+	tipc_deleteport(subscriber->port_ref);
+
+	/* Remove subscriber from topology server's subscriber list */
+
+	spin_lock_bh(&topsrv.lock);
+	list_del(&subscriber->subscriber_list);
+	spin_unlock_bh(&topsrv.lock);
+
+	/* Now destroy subscriber */
+
+	spin_lock_bh(subscriber->lock);
+	kfree(subscriber);
+}
+
+/**
+ * subscr_subscribe - create subscription for subscriber
+ * 
+ * Called with subscriber locked
+ */
+
+static void subscr_subscribe(struct tipc_subscr *s,
+			     struct subscriber *subscriber)
+{
+	struct subscription *sub;
+
+	/* Refuse subscription if global limit exceeded */
+
+	if (atomic_read(&topsrv.subscription_count) >= tipc_max_subscriptions) {
+		warn("Failed: max %u subscriptions\n", tipc_max_subscriptions);
+		subscr_terminate(subscriber);
+		return;
+	}
+
+	/* Allocate subscription object */
+
+	sub = kmalloc(sizeof(*sub), GFP_ATOMIC);
+	if (sub == NULL) {
+		warn("Memory squeeze; ignoring subscription\n");
+		subscr_terminate(subscriber);
+		return;
+	}
+
+	/* Determine/update subscriber's endianness */
+
+	if ((s->filter == TIPC_SUB_PORTS) || (s->filter == TIPC_SUB_SERVICE))
+		subscriber->swap = 0;
+	else
+		subscriber->swap = 1;
+
+	/* Initialize subscription object */
+
+	memset(sub, 0, sizeof(*sub));
+	sub->seq.type = htohl(s->seq.type, subscriber->swap);
+	sub->seq.lower = htohl(s->seq.lower, subscriber->swap);
+	sub->seq.upper = htohl(s->seq.upper, subscriber->swap);
+	sub->timeout = htohl(s->timeout, subscriber->swap);
+	sub->filter = htohl(s->filter, subscriber->swap);
+	if ((((sub->filter != TIPC_SUB_PORTS) 
+	      && (sub->filter != TIPC_SUB_SERVICE)))
+	    || (sub->seq.lower > sub->seq.upper)) {
+		warn("Rejecting illegal subscription %u,%u,%u\n",
+		     sub->seq.type, sub->seq.lower, sub->seq.upper);
+		kfree(sub);
+		subscr_terminate(subscriber);
+		return;
+	}
+	memcpy(&sub->evt.s, s, sizeof(struct tipc_subscr));
+	INIT_LIST_HEAD(&sub->subscription_list);
+	INIT_LIST_HEAD(&sub->nameseq_list);
+	list_add(&sub->subscription_list, &subscriber->subscription_list);
+	atomic_inc(&topsrv.subscription_count);
+	if (sub->timeout != TIPC_WAIT_FOREVER) {
+		k_init_timer(&sub->timer,
+			     (Handler)subscr_timeout, (unsigned long)sub);
+		k_start_timer(&sub->timer, sub->timeout);
+	}
+	sub->owner = subscriber;
+	nametbl_subscribe(sub);
+}
+
+/**
+ * subscr_conn_shutdown_event - handle termination request from subscriber
+ */
+
+static void subscr_conn_shutdown_event(void *usr_handle,
+				       u32 portref,
+				       struct sk_buff **buf,
+				       unsigned char const *data,
+				       unsigned int size,
+				       int reason)
+{
+	struct subscriber *subscriber = ref_lock((u32)usr_handle);
+	spinlock_t *subscriber_lock;
+
+	if (subscriber == NULL)
+		return;
+
+	subscriber_lock = subscriber->lock;
+	subscr_terminate(subscriber);
+	spin_unlock_bh(subscriber_lock);
+}
+
+/**
+ * subscr_conn_msg_event - handle new subscription request from subscriber
+ */
+
+static void subscr_conn_msg_event(void *usr_handle,
+				  u32 port_ref,
+				  struct sk_buff **buf,
+				  const unchar *data,
+				  u32 size)
+{
+	struct subscriber *subscriber = ref_lock((u32)usr_handle);
+	spinlock_t *subscriber_lock;
+
+	if (subscriber == NULL)
+		return;
+
+	subscriber_lock = subscriber->lock;
+	if (size != sizeof(struct tipc_subscr))
+		subscr_terminate(subscriber);
+	else
+		subscr_subscribe((struct tipc_subscr *)data, subscriber);
+	
+	spin_unlock_bh(subscriber_lock);
+}
+
+/**
+ * subscr_named_msg_event - handle request to establish a new subscriber
+ */
+
+static void subscr_named_msg_event(void *usr_handle,
+				   u32 port_ref,
+				   struct sk_buff **buf,
+				   const unchar *data,
+				   u32 size,
+				   u32 importance, 
+				   struct tipc_portid const *orig,
+				   struct tipc_name_seq const *dest)
+{
+	struct subscriber *subscriber;
+	struct iovec msg_sect = {0, 0};
+	spinlock_t *subscriber_lock;
+
+	dbg("subscr_named_msg_event: orig = %x own = %x,\n",
+	    orig->node, tipc_own_addr);
+	if (size && (size != sizeof(struct tipc_subscr))) {
+		warn("Received tipc_subscr of invalid size\n");
+		return;
+	}
+
+	/* Create subscriber object */
+
+	subscriber = kmalloc(sizeof(struct subscriber), GFP_ATOMIC);
+	if (subscriber == NULL) {
+		warn("Memory squeeze; ignoring subscriber setup\n");
+		return;
+	}
+	memset(subscriber, 0, sizeof(struct subscriber));
+	INIT_LIST_HEAD(&subscriber->subscription_list);
+	INIT_LIST_HEAD(&subscriber->subscriber_list);
+	subscriber->ref = ref_acquire(subscriber, &subscriber->lock);
+	if (subscriber->ref == 0) {
+		warn("Failed to acquire subscriber reference\n");
+		kfree(subscriber);
+		return;
+	}
+
+	/* Establish a connection to subscriber */
+
+	tipc_createport(topsrv.user_ref,
+			(void *)subscriber->ref,
+			importance,
+			0,
+			0,
+			subscr_conn_shutdown_event,
+			0,
+			0,
+			subscr_conn_msg_event,
+			0,
+			&subscriber->port_ref);
+	if (subscriber->port_ref == 0) {
+		warn("Memory squeeze; failed to create subscription port\n");
+		ref_discard(subscriber->ref);
+		kfree(subscriber);
+		return;
+	}
+	tipc_connect2port(subscriber->port_ref, orig);
+
+
+	/* Add subscriber to topology server's subscriber list */
+
+	ref_lock(subscriber->ref);
+	spin_lock_bh(&topsrv.lock);
+	list_add(&subscriber->subscriber_list, &topsrv.subscriber_list);
+	spin_unlock_bh(&topsrv.lock);
+
+	/*
+	 * Subscribe now if message contains a subscription,
+	 * otherwise send an empty response to complete connection handshaking
+	 */
+
+	subscriber_lock = subscriber->lock;
+	if (size)
+		subscr_subscribe((struct tipc_subscr *)data, subscriber);
+	else
+		tipc_send(subscriber->port_ref, 1, &msg_sect);
+
+	spin_unlock_bh(subscriber_lock);
+}
+
+int subscr_start(void)
+{
+	struct tipc_name_seq seq = {TIPC_TOP_SRV, TIPC_TOP_SRV, TIPC_TOP_SRV};
+	int res = -1;
+
+	memset(&topsrv, 0, sizeof (topsrv));
+	topsrv.lock = SPIN_LOCK_UNLOCKED;
+	INIT_LIST_HEAD(&topsrv.subscriber_list);
+
+	spin_lock_bh(&topsrv.lock);
+	res = tipc_attach(&topsrv.user_ref, 0, 0);
+	if (res) {
+		spin_unlock_bh(&topsrv.lock);
+		return res;
+	}
+
+ 	res = tipc_createport(topsrv.user_ref,
+ 			      0,
+ 			      TIPC_CRITICAL_IMPORTANCE,
+ 			      0,
+ 			      0,
+ 			      0,
+ 			      0,
+ 			      subscr_named_msg_event,
+ 			      0,
+ 			      0,
+ 			      &topsrv.setup_port);
+ 	if (res)
+		goto failed;
+
+ 	res = nametbl_publish_rsv(topsrv.setup_port, TIPC_NODE_SCOPE, &seq);
+ 	if (res)
+		goto failed;
+
+	spin_unlock_bh(&topsrv.lock);
+	return 0;
+
+failed:
+	err("Unable to create subscription service\n");
+	tipc_detach(topsrv.user_ref);
+	topsrv.user_ref = 0;
+	spin_unlock_bh(&topsrv.lock);
+	return res;
+}
+
+void subscr_stop(void)
+{
+	struct subscriber *subscriber;
+	struct subscriber *subscriber_temp;
+	spinlock_t *subscriber_lock;
+
+	if (topsrv.user_ref) {
+		tipc_deleteport(topsrv.setup_port);
+		list_for_each_entry_safe(subscriber, subscriber_temp, 
+					 &topsrv.subscriber_list,
+					 subscriber_list) {
+			ref_lock(subscriber->ref);
+			subscriber_lock = subscriber->lock;
+			subscr_terminate(subscriber);
+			spin_unlock_bh(subscriber_lock);
+		}
+		tipc_detach(topsrv.user_ref);
+		topsrv.user_ref = 0;
+	}
+}
+
+
+int tipc_ispublished(struct tipc_name const *name)
+{
+	u32 domain = 0;
+
+	return(nametbl_translate(name->type, name->instance,&domain) != 0);
+}
+
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
new file mode 100644
index 000000000000..e6cb9c3ad870
--- /dev/null
+++ b/net/tipc/subscr.h
@@ -0,0 +1,77 @@
+/*
+ * net/tipc/subscr.h: Include file for TIPC subscription service
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_SUBSCR_H
+#define _TIPC_SUBSCR_H
+
+/**
+ * struct subscription - TIPC network topology subscription object
+ * @seq: name sequence associated with subscription
+ * @timeout: duration of subscription (in ms)
+ * @filter: event filtering to be done for subscription
+ * @evt: template for events generated by subscription
+ * @subscription_list: adjacent subscriptions in subscriber's subscription list
+ * @nameseq_list: adjacent subscriptions in name sequence's subscription list
+ * @timer_ref: reference to timer governing subscription duration (may be NULL)
+ * @owner: pointer to subscriber object associated with this subscription
+ */
+ 
+struct subscription {
+	struct tipc_name_seq seq;
+	u32 timeout;
+	u32 filter;
+	struct tipc_event evt;
+	struct list_head subscription_list;
+	struct list_head nameseq_list;
+	struct timer_list timer;
+	struct subscriber *owner;
+};
+
+int subscr_overlap(struct subscription * sub, 
+		   u32 found_lower, 
+		   u32 found_upper);
+
+void subscr_report_overlap(struct subscription * sub, 
+			   u32 found_lower, 
+			   u32 found_upper,
+			   u32 event, 
+			   u32 port_ref, 
+			   u32 node,
+			   int must_report);
+
+int subscr_start(void);
+
+void subscr_stop(void);
+
+
+#endif
diff --git a/net/tipc/user_reg.c b/net/tipc/user_reg.c
new file mode 100644
index 000000000000..e4da5becc342
--- /dev/null
+++ b/net/tipc/user_reg.c
@@ -0,0 +1,262 @@
+/*
+ * net/tipc/user_reg.c: TIPC user registry code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2004-2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "user_reg.h"
+
+/*
+ * TIPC user registry keeps track of users of the tipc_port interface.
+ *
+ * The registry utilizes an array of "TIPC user" entries; 
+ * a user's ID is the index of their associated array entry.
+ * Array entry 0 is not used, so userid 0 is not valid;
+ * TIPC sometimes uses this value to denote an anonymous user.
+ * The list of free entries is initially chained from last entry to entry 1.
+ */
+
+/**
+ * struct tipc_user - registered TIPC user info
+ * @next: index of next free registry entry (or -1 for an allocated entry)
+ * @callback: ptr to routine to call when TIPC mode changes (NULL if none)
+ * @usr_handle: user-defined value passed to callback routine 
+ * @ports: list of user ports owned by the user
+ */
+
+struct tipc_user {
+	int next;
+	tipc_mode_event callback;
+	void *usr_handle;
+	struct list_head ports;
+};
+
+#define MAX_USERID 64
+#define USER_LIST_SIZE ((MAX_USERID + 1) * sizeof(struct tipc_user))
+
+static struct tipc_user *users = 0;
+static u32 next_free_user = MAX_USERID + 1;
+static spinlock_t reg_lock = SPIN_LOCK_UNLOCKED;
+
+/**
+ * reg_init - create TIPC user registry (but don't activate it)
+ * 
+ * If registry has been pre-initialized it is left "as is".
+ * NOTE: This routine may be called when TIPC is inactive.
+ */
+
+static int reg_init(void)
+{
+	u32 i;
+	
+	spin_lock_bh(&reg_lock);
+	if (!users) {
+		users = (struct tipc_user *)kmalloc(USER_LIST_SIZE, GFP_ATOMIC);
+		if (users) {
+			memset(users, 0, USER_LIST_SIZE);
+			for (i = 1; i <= MAX_USERID; i++) {
+				users[i].next = i - 1;
+			}
+			next_free_user = MAX_USERID;
+		}
+	}
+	spin_unlock_bh(&reg_lock);
+	return users ? TIPC_OK : -ENOMEM;
+}
+
+/**
+ * reg_callback - inform TIPC user about current operating mode
+ */
+
+static void reg_callback(struct tipc_user *user_ptr)
+{
+	tipc_mode_event cb;
+	void *arg;
+
+	spin_lock_bh(&reg_lock);
+	cb = user_ptr->callback;
+	arg = user_ptr->usr_handle;
+	spin_unlock_bh(&reg_lock);
+
+	if (cb)
+		cb(arg, tipc_mode, tipc_own_addr);
+}
+
+/**
+ * reg_start - activate TIPC user registry
+ */
+
+int reg_start(void)
+{
+	u32 u;
+	int res;
+
+	if ((res = reg_init()))
+		return res;
+
+	for (u = 1; u <= MAX_USERID; u++) {
+		if (users[u].callback)
+			k_signal((Handler)reg_callback,
+				 (unsigned long)&users[u]);
+	}
+	return TIPC_OK;
+}
+
+/**
+ * reg_stop - shut down & delete TIPC user registry
+ */
+
+void reg_stop(void)
+{               
+	int id;
+
+	if (!users)
+		return;
+
+	for (id = 1; id <= MAX_USERID; id++) {
+		if (users[id].callback)
+			reg_callback(&users[id]);
+	}
+	kfree(users);
+	users = 0;
+}
+
+/**
+ * tipc_attach - register a TIPC user
+ *
+ * NOTE: This routine may be called when TIPC is inactive.
+ */
+
+int tipc_attach(u32 *userid, tipc_mode_event cb, void *usr_handle)
+{
+	struct tipc_user *user_ptr;
+
+	if ((tipc_mode == TIPC_NOT_RUNNING) && !cb)
+		return -ENOPROTOOPT;
+	if (!users)
+		reg_init();
+
+	spin_lock_bh(&reg_lock);
+	if (!next_free_user) {
+		spin_unlock_bh(&reg_lock);
+		return -EBUSY;
+	}
+	user_ptr = &users[next_free_user];
+	*userid = next_free_user;
+	next_free_user = user_ptr->next;
+	user_ptr->next = -1; 
+	spin_unlock_bh(&reg_lock);
+
+	user_ptr->callback = cb;
+	user_ptr->usr_handle = usr_handle;
+	INIT_LIST_HEAD(&user_ptr->ports);
+	atomic_inc(&tipc_user_count);
+	
+	if (cb && (tipc_mode != TIPC_NOT_RUNNING))
+		k_signal((Handler)reg_callback, (unsigned long)user_ptr);
+	return TIPC_OK;
+}
+
+/**
+ * tipc_detach - deregister a TIPC user
+ */
+
+void tipc_detach(u32 userid)
+{
+	struct tipc_user *user_ptr;
+	struct list_head ports_temp;
+	struct user_port *up_ptr, *temp_up_ptr;
+
+	if ((userid == 0) || (userid > MAX_USERID))
+		return;
+
+	spin_lock_bh(&reg_lock);
+	if ((!users) || (users[userid].next >= 0)) {
+		spin_unlock_bh(&reg_lock);
+		return;
+	}
+
+	user_ptr = &users[userid];
+        user_ptr->callback = NULL;              
+	INIT_LIST_HEAD(&ports_temp);
+        list_splice(&user_ptr->ports, &ports_temp);
+	user_ptr->next = next_free_user;
+	next_free_user = userid;
+	spin_unlock_bh(&reg_lock);
+
+	atomic_dec(&tipc_user_count);
+
+        list_for_each_entry_safe(up_ptr, temp_up_ptr, &ports_temp, uport_list) {
+		tipc_deleteport(up_ptr->ref);
+	}
+}
+
+/**
+ * reg_add_port - register a user's driver port
+ */
+
+int reg_add_port(struct user_port *up_ptr)
+{
+	struct tipc_user *user_ptr;
+
+	if (up_ptr->user_ref == 0)
+		return TIPC_OK;
+	if (up_ptr->user_ref > MAX_USERID)
+		return -EINVAL;
+	if ((tipc_mode == TIPC_NOT_RUNNING) || !users )
+		return -ENOPROTOOPT;
+
+	spin_lock_bh(&reg_lock);
+	user_ptr = &users[up_ptr->user_ref];
+	list_add(&up_ptr->uport_list, &user_ptr->ports);
+	spin_unlock_bh(&reg_lock);
+	return TIPC_OK;
+}
+
+/**
+ * reg_remove_port - deregister a user's driver port
+ */
+
+int reg_remove_port(struct user_port *up_ptr)
+{
+	if (up_ptr->user_ref == 0)
+		return TIPC_OK;
+	if (up_ptr->user_ref > MAX_USERID)
+		return -EINVAL;
+	if (!users )
+		return -ENOPROTOOPT;
+
+	spin_lock_bh(&reg_lock);
+	list_del_init(&up_ptr->uport_list);
+	spin_unlock_bh(&reg_lock);
+	return TIPC_OK;
+}
+
diff --git a/net/tipc/user_reg.h b/net/tipc/user_reg.h
new file mode 100644
index 000000000000..44f219458a39
--- /dev/null
+++ b/net/tipc/user_reg.h
@@ -0,0 +1,45 @@
+/*
+ * net/tipc/user_reg.h: Include file for TIPC user registry code
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_USER_REG_H
+#define _TIPC_USER_REG_H
+
+#include "port.h"
+
+int reg_start(void);
+void reg_stop(void);
+
+int reg_add_port(struct user_port *up_ptr);
+int reg_remove_port(struct user_port *up_ptr);
+
+#endif
diff --git a/net/tipc/zone.c b/net/tipc/zone.c
new file mode 100644
index 000000000000..336eebb7bf54
--- /dev/null
+++ b/net/tipc/zone.c
@@ -0,0 +1,166 @@
+/*
+ * net/tipc/zone.c: TIPC zone management routines
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "zone.h"
+#include "net.h"
+#include "addr.h"
+#include "node_subscr.h"
+#include "cluster.h"
+#include "node.h"
+
+struct _zone *zone_create(u32 addr)
+{
+	struct _zone *z_ptr = 0;
+	u32 z_num;
+
+	if (!addr_domain_valid(addr))
+		return 0;
+
+	z_ptr = (struct _zone *)kmalloc(sizeof(*z_ptr), GFP_ATOMIC);
+	if (z_ptr != NULL) {
+		memset(z_ptr, 0, sizeof(*z_ptr));
+		z_num = tipc_zone(addr);
+		z_ptr->addr = tipc_addr(z_num, 0, 0);
+		net.zones[z_num] = z_ptr;
+	}
+	return z_ptr;
+}
+
+void zone_delete(struct _zone *z_ptr)
+{
+	u32 c_num;
+
+	if (!z_ptr)
+		return;
+	for (c_num = 1; c_num <= tipc_max_clusters; c_num++) {
+		cluster_delete(z_ptr->clusters[c_num]);
+	}
+	kfree(z_ptr);
+}
+
+void zone_attach_cluster(struct _zone *z_ptr, struct cluster *c_ptr)
+{
+	u32 c_num = tipc_cluster(c_ptr->addr);
+
+	assert(c_ptr->addr);
+	assert(c_num <= tipc_max_clusters);
+	assert(z_ptr->clusters[c_num] == 0);
+	z_ptr->clusters[c_num] = c_ptr;
+}
+
+void zone_remove_as_router(struct _zone *z_ptr, u32 router)
+{
+	u32 c_num;
+
+	for (c_num = 1; c_num <= tipc_max_clusters; c_num++) {
+		if (z_ptr->clusters[c_num]) {
+			cluster_remove_as_router(z_ptr->clusters[c_num], 
+						 router);
+		}
+	}
+}
+
+void zone_send_external_routes(struct _zone *z_ptr, u32 dest)
+{
+	u32 c_num;
+
+	for (c_num = 1; c_num <= tipc_max_clusters; c_num++) {
+		if (z_ptr->clusters[c_num]) {
+			if (in_own_cluster(z_ptr->addr))
+				continue;
+			cluster_send_ext_routes(z_ptr->clusters[c_num], dest);
+		}
+	}
+}
+
+struct node *zone_select_remote_node(struct _zone *z_ptr, u32 addr, u32 ref)
+{
+	struct cluster *c_ptr;
+	struct node *n_ptr;
+	u32 c_num;
+
+	if (!z_ptr)
+		return 0;
+	c_ptr = z_ptr->clusters[tipc_cluster(addr)];
+	if (!c_ptr)
+		return 0;
+	n_ptr = cluster_select_node(c_ptr, ref);
+	if (n_ptr)
+		return n_ptr;
+
+	/* Links to any other clusters within this zone ? */
+	for (c_num = 1; c_num <= tipc_max_clusters; c_num++) {
+		c_ptr = z_ptr->clusters[c_num];
+		if (!c_ptr)
+			return 0;
+		n_ptr = cluster_select_node(c_ptr, ref);
+		if (n_ptr)
+			return n_ptr;
+	}
+	return 0;
+}
+
+u32 zone_select_router(struct _zone *z_ptr, u32 addr, u32 ref)
+{
+	struct cluster *c_ptr;
+	u32 c_num;
+	u32 router;
+
+	if (!z_ptr)
+		return 0;
+	c_ptr = z_ptr->clusters[tipc_cluster(addr)];
+	router = c_ptr ? cluster_select_router(c_ptr, ref) : 0;
+	if (router)
+		return router;
+
+	/* Links to any other clusters within the zone? */
+	for (c_num = 1; c_num <= tipc_max_clusters; c_num++) {
+		c_ptr = z_ptr->clusters[c_num];
+		router = c_ptr ? cluster_select_router(c_ptr, ref) : 0;
+		if (router)
+			return router;
+	}
+	return 0;
+}
+
+
+u32 zone_next_node(u32 addr)
+{
+	struct cluster *c_ptr = cluster_find(addr);
+
+	if (c_ptr)
+		return cluster_next_node(c_ptr, addr);
+	return 0;
+}
+
diff --git a/net/tipc/zone.h b/net/tipc/zone.h
new file mode 100644
index 000000000000..19fd51a8bd2b
--- /dev/null
+++ b/net/tipc/zone.h
@@ -0,0 +1,68 @@
+/*
+ * net/tipc/zone.h: Include file for TIPC zone management routines
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_ZONE_H
+#define _TIPC_ZONE_H
+
+#include "node_subscr.h"
+#include "net.h"
+
+
+/**
+ * struct _zone - TIPC zone structure
+ * @addr: network address of zone
+ * @clusters: array of pointers to all clusters within zone
+ * @links: (used for inter-zone communication)
+ */
+ 
+struct _zone {
+	u32 addr;
+	struct cluster *clusters[2]; /* currently limited to just 1 cluster */
+	u32 links;
+};
+
+struct node *zone_select_remote_node(struct _zone *z_ptr, u32 addr, u32 ref);
+u32 zone_select_router(struct _zone *z_ptr, u32 addr, u32 ref);
+void zone_remove_as_router(struct _zone *z_ptr, u32 router);
+void zone_send_external_routes(struct _zone *z_ptr, u32 dest);
+struct _zone *zone_create(u32 addr);
+void zone_delete(struct _zone *z_ptr);
+void zone_attach_cluster(struct _zone *z_ptr, struct cluster *c_ptr);
+u32 zone_next_node(u32 addr);
+
+static inline struct _zone *zone_find(u32 addr)
+{
+	return net.zones[tipc_zone(addr)];
+}
+
+#endif
-- 
cgit v1.2.3-71-gd317


From 1dba9743337cabacea79e47ed6d709e636c5ed47 Mon Sep 17 00:00:00 2001
From: Per Liden <per.liden@nospam.ericsson.com>
Date: Thu, 5 Jan 2006 16:34:00 +0100
Subject: [TIPC] Use dynamically allocated family id with NETLINK_GENERIC

Signed-off-by: Per Liden <per.liden@nospam.ericsson.com>
---
 include/linux/tipc.h | 1 -
 net/tipc/netlink.c   | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tipc.h b/include/linux/tipc.h
index ca754f3317f2..1faab77626f2 100644
--- a/include/linux/tipc.h
+++ b/include/linux/tipc.h
@@ -539,7 +539,6 @@ static inline void TLV_LIST_STEP(struct tlv_list_desc *list)
  * Configuration messages exchanged via NETLINK_GENERIC use the following
  * family id, name, version and command.
  */
-#define TIPC_GENL_FAMILY	0x222
 #define TIPC_GENL_NAME		"TIPC"
 #define TIPC_GENL_VERSION	0x1
 #define TIPC_GENL_CMD		0x1
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 35fbc7933604..08b974ed9109 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -40,7 +40,7 @@ static int handle_cmd(struct sk_buff *skb, struct genl_info *info)
 	struct nlmsghdr *rep_nlh;
 	struct nlmsghdr *req_nlh = info->nlhdr;
 	struct tipc_genlmsghdr *req_userhdr = info->userhdr;
-	int hdr_space = NLMSG_SPACE(0);
+	int hdr_space = NLMSG_SPACE(GENL_HDRLEN + TIPC_GENL_HDRLEN);
 
 	if ((req_userhdr->cmd & 0xC000) && (!capable(CAP_NET_ADMIN)))
 		rep_buf = cfg_reply_error_string(TIPC_CFG_NOT_NET_ADMIN);
@@ -63,7 +63,7 @@ static int handle_cmd(struct sk_buff *skb, struct genl_info *info)
 }
 
 static struct genl_family family = {
-        .id		= TIPC_GENL_FAMILY,
+        .id		= GENL_ID_GENERATE,
         .name		= TIPC_GENL_NAME,
         .version	= TIPC_GENL_VERSION,
         .hdrsize	= TIPC_GENL_HDRLEN,
-- 
cgit v1.2.3-71-gd317


From ea714ccda5d5858ee677a77cf33dbaf34a0060c3 Mon Sep 17 00:00:00 2001
From: Per Liden <per.liden@nospam.ericsson.com>
Date: Wed, 11 Jan 2006 12:28:47 +0100
Subject: [TIPC] Moved configuration interface into tipc_config.h

Restored the old tipc_config.h to get a cleaner division between the
interfaces used by normal TIPC users and TIPC administration utilities.

Signed-off-by: Per Liden <per.liden@nospam.ericsson.com>
---
 include/linux/tipc.h           | 434 +++--------------------------------------
 include/linux/tipc_config.h    | 404 ++++++++++++++++++++++++++++++++++++++
 include/net/tipc/tipc_bearer.h |  28 ++-
 net/tipc/config.h              |   1 +
 net/tipc/socket.c              |   1 +
 5 files changed, 456 insertions(+), 412 deletions(-)
 create mode 100644 include/linux/tipc_config.h

(limited to 'include/linux')

diff --git a/include/linux/tipc.h b/include/linux/tipc.h
index 1faab77626f2..4ecd0047550b 100644
--- a/include/linux/tipc.h
+++ b/include/linux/tipc.h
@@ -1,5 +1,5 @@
 /*
- * include/linux/tipc.h: Include file for TIPC users
+ * include/linux/tipc.h: Include file for TIPC socket interface
  * 
  * Copyright (c) 2003-2005, Ericsson Research Canada
  * Copyright (c) 2005, Wind River Systems
@@ -35,8 +35,6 @@
 #define _LINUX_TIPC_H_
 
 #include <linux/types.h>
-#include <linux/string.h>
-#include <asm/byteorder.h>
 
 /*
  * TIPC addressing primitives
@@ -62,51 +60,51 @@ static inline __u32 tipc_addr(unsigned int zone,
 			      unsigned int cluster,
 			      unsigned int node)
 {
-	return(zone << 24) | (cluster << 12) | node;
+	return (zone << 24) | (cluster << 12) | node;
 }
 
 static inline unsigned int tipc_zone(__u32 addr)
 {
-	return  addr >> 24;
+	return addr >> 24;
 }
 
 static inline unsigned int tipc_cluster(__u32 addr)
 {
-	return(addr >> 12) & 0xfff;
+	return (addr >> 12) & 0xfff;
 }
 
 static inline unsigned int tipc_node(__u32 addr)
 {
-	return  addr & 0xfff;
+	return addr & 0xfff;
 }
 
 /*
  * Application-accessible port name types
  */
 
-#define TIPC_NET_EVENTS		0	/* network event subscription name type */
-#define TIPC_TOP_SRV         	1  	/* topology service name type */
+#define TIPC_CFG_SRV		0	/* configuration service name type */
+#define TIPC_TOP_SRV		1	/* topology service name type */
 #define TIPC_RESERVED_TYPES	64	/* lowest user-publishable name type */
 
 /* 
  * Publication scopes when binding port names and port name sequences
  */
 
-#define TIPC_ZONE_SCOPE         1
-#define TIPC_CLUSTER_SCOPE      2     
-#define TIPC_NODE_SCOPE         3
+#define TIPC_ZONE_SCOPE		1
+#define TIPC_CLUSTER_SCOPE	2
+#define TIPC_NODE_SCOPE		3
 
 /*
  * Limiting values for messages
  */
 
-#define TIPC_MAX_USER_MSG_SIZE 66000
+#define TIPC_MAX_USER_MSG_SIZE	66000
 
-/* 
+/*
  * Message importance levels
  */
 
-#define TIPC_LOW_IMPORTANCE     	0  /* default */
+#define TIPC_LOW_IMPORTANCE		0  /* default */
 #define TIPC_MEDIUM_IMPORTANCE		1
 #define TIPC_HIGH_IMPORTANCE		2
 #define TIPC_CRITICAL_IMPORTANCE	3
@@ -135,7 +133,7 @@ static inline unsigned int tipc_node(__u32 addr)
 #define TIPC_SUB_SINGLE_EVT	0x10	/* expire after first event */
 #endif
 
-#define TIPC_WAIT_FOREVER   	~0	/* timeout for permanent subscription */
+#define TIPC_WAIT_FOREVER	~0	/* timeout for permanent subscription */
 
 struct tipc_subscr {
 	struct tipc_name_seq seq;	/* name sequence of interest */
@@ -144,16 +142,15 @@ struct tipc_subscr {
 	char usr_handle[8];		/* available for subscriber use */
 };
 
-
-#define TIPC_PUBLISHED          1	/* publication event */
-#define TIPC_WITHDRAWN          2	/* withdraw event */
-#define TIPC_SUBSCR_TIMEOUT     3	/* subscription timeout event */
+#define TIPC_PUBLISHED		1	/* publication event */
+#define TIPC_WITHDRAWN		2	/* withdraw event */
+#define TIPC_SUBSCR_TIMEOUT	3	/* subscription timeout event */
 
 struct tipc_event {
 	__u32 event;			/* event type */
 	__u32 found_lower;		/* matching name seq instances */
 	__u32 found_upper;		/*    "      "    "     "      */
-        struct tipc_portid port;	/* associated port */
+	struct tipc_portid port;	/* associated port */
 	struct tipc_subscr s;		/* associated subscription */
 };
 
@@ -173,10 +170,10 @@ struct tipc_event {
 #define SOL_TIPC	271
 #endif
 
-#define TIPC_ADDR_NAMESEQ   	1
-#define TIPC_ADDR_MCAST         1
-#define TIPC_ADDR_NAME      	2
-#define TIPC_ADDR_ID        	3
+#define TIPC_ADDR_NAMESEQ	1
+#define TIPC_ADDR_MCAST		1
+#define TIPC_ADDR_NAME		2
+#define TIPC_ADDR_ID		3
 
 struct sockaddr_tipc {
 	unsigned short family;
@@ -207,391 +204,6 @@ struct sockaddr_tipc {
 #define TIPC_IMPORTANCE		127	/* Default: TIPC_LOW_IMPORTANCE */
 #define TIPC_SRC_DROPPABLE	128	/* Default: 0 (resend congested msg) */
 #define TIPC_DEST_DROPPABLE	129	/* Default: based on socket type */
-#define TIPC_CONN_TIMEOUT     	130	/* Default: 8000 (ms)  */
-
-/*
- * Bearer
- */
-
-/* Identifiers of supported TIPC media types */
-
-#define TIPC_MEDIA_TYPE_ETH	1
-
-/* Maximum sizes of TIPC bearer-related names (including terminating NUL) */ 
-
-#define TIPC_MAX_MEDIA_NAME	16	/* format = media */
-#define TIPC_MAX_IF_NAME	16	/* format = interface */
-#define TIPC_MAX_BEARER_NAME	32	/* format = media:interface */
-#define TIPC_MAX_LINK_NAME	60	/* format = Z.C.N:interface-Z.C.N:interface */
-
-struct tipc_media_addr {
-	__u32  type;
-	union {
-		__u8   eth_addr[6];	/* Ethernet bearer */ 
-#if 0
-		/* Prototypes for other possible bearer types */
-
-		struct {
-			__u16 sin_family;
-			__u16 sin_port;
-			struct {
-				__u32 s_addr;
-			} sin_addr;
-			char pad[4];
-		} addr_in;		/* IP-based bearer */
-		__u16  sock_descr;	/* generic socket bearer */
-#endif
-	} dev_addr;
-};
-
-
-/* Link priority limits (range from 0 to # priorities - 1) */
-
-#define TIPC_NUM_LINK_PRI 32
-
-/* Link tolerance limits (min, default, max), in ms */
-
-#define TIPC_MIN_LINK_TOL 50
-#define TIPC_DEF_LINK_TOL 1500
-#define TIPC_MAX_LINK_TOL 30000
-
-/* Link window limits (min, default, max), in packets */
-
-#define TIPC_MIN_LINK_WIN 16
-#define TIPC_DEF_LINK_WIN 50
-#define TIPC_MAX_LINK_WIN 150
-
-/*
- * Configuration
- *
- * All configuration management messaging involves sending a request message
- * to the TIPC configuration service on a node, which sends a reply message
- * back.  (In the future multi-message replies may be supported.)
- *
- * Both request and reply messages consist of a transport header and payload.
- * The transport header contains info about the desired operation;
- * the payload consists of zero or more type/length/value (TLV) items
- * which specify parameters or results for the operation.
- *
- * For many operations, the request and reply messages have a fixed number
- * of TLVs (usually zero or one); however, some reply messages may return 
- * a variable number of TLVs.  A failed request is denoted by the presence
- * of an "error string" TLV in the reply message instead of the TLV(s) the
- * reply should contain if the request succeeds.
- */
- 
-#define TIPC_CFG_SRV	0		/* configuration service name type */
-
-/* 
- * Public commands:
- * May be issued by any process.
- * Accepted by own node, or by remote node only if remote management enabled.                       
- */
- 
-#define  TIPC_CMD_NOOP   	    0x0000    /* tx none, rx none */
-#define  TIPC_CMD_GET_NODES         0x0001    /* tx net_addr, rx node_info(s) */
-#define  TIPC_CMD_GET_MEDIA_NAMES   0x0002    /* tx none, rx media_name(s) */
-#define  TIPC_CMD_GET_BEARER_NAMES  0x0003    /* tx none, rx bearer_name(s) */
-#define  TIPC_CMD_GET_LINKS         0x0004    /* tx net_addr, rx link_info(s) */
-#define  TIPC_CMD_SHOW_NAME_TABLE   0x0005    /* tx name_tbl_query, rx ultra_string */
-#define  TIPC_CMD_SHOW_PORTS        0x0006    /* tx none, rx ultra_string */
-#define  TIPC_CMD_SHOW_LINK_STATS   0x000B    /* tx link_name, rx ultra_string */
-
-#if 0
-#define  TIPC_CMD_SHOW_PORT_STATS   0x0008    /* tx port_ref, rx ultra_string */
-#define  TIPC_CMD_RESET_PORT_STATS  0x0009    /* tx port_ref, rx none */
-#define  TIPC_CMD_GET_ROUTES        0x000A    /* tx ?, rx ? */
-#define  TIPC_CMD_GET_LINK_PEER     0x000D    /* tx link_name, rx ? */
-#endif
-
-/* 
- * Protected commands:
- * May only be issued by "network administration capable" process.
- * Accepted by own node, or by remote node only if remote management enabled
- * and this node is zone manager.                       
- */
-
-#define  TIPC_CMD_GET_REMOTE_MNG    0x4003    /* tx none, rx unsigned */
-#define  TIPC_CMD_GET_MAX_PORTS     0x4004    /* tx none, rx unsigned */
-#define  TIPC_CMD_GET_MAX_PUBL      0x4005    /* tx none, rx unsigned */
-#define  TIPC_CMD_GET_MAX_SUBSCR    0x4006    /* tx none, rx unsigned */
-#define  TIPC_CMD_GET_MAX_ZONES     0x4007    /* tx none, rx unsigned */
-#define  TIPC_CMD_GET_MAX_CLUSTERS  0x4008    /* tx none, rx unsigned */
-#define  TIPC_CMD_GET_MAX_NODES     0x4009    /* tx none, rx unsigned */
-#define  TIPC_CMD_GET_MAX_SLAVES    0x400A    /* tx none, rx unsigned */
-#define  TIPC_CMD_GET_NETID         0x400B    /* tx none, rx unsigned */
-
-#define  TIPC_CMD_ENABLE_BEARER     0x4101    /* tx bearer_config, rx none */
-#define  TIPC_CMD_DISABLE_BEARER    0x4102    /* tx bearer_name, rx none */
-#define  TIPC_CMD_SET_LINK_TOL      0x4107    /* tx link_config, rx none */
-#define  TIPC_CMD_SET_LINK_PRI      0x4108    /* tx link_config, rx none */
-#define  TIPC_CMD_SET_LINK_WINDOW   0x4109    /* tx link_config, rx none */
-#define  TIPC_CMD_SET_LOG_SIZE      0x410A    /* tx unsigned, rx none */
-#define  TIPC_CMD_DUMP_LOG          0x410B    /* tx none, rx ultra_string */
-#define  TIPC_CMD_RESET_LINK_STATS  0x410C    /* tx link_name, rx none */
-
-#if 0
-#define  TIPC_CMD_CREATE_LINK       0x4103    /* tx link_create, rx none */
-#define  TIPC_CMD_REMOVE_LINK       0x4104    /* tx link_name, rx none */
-#define  TIPC_CMD_BLOCK_LINK        0x4105    /* tx link_name, rx none */
-#define  TIPC_CMD_UNBLOCK_LINK      0x4106    /* tx link_name, rx none */
-#endif
-
-/* 
- * Private commands:
- * May only be issued by "network administration capable" process.
- * Accepted by own node only; cannot be used on a remote node.                       
- */
-
-#define  TIPC_CMD_SET_NODE_ADDR     0x8001    /* tx net_addr, rx none */
-#if 0
-#define  TIPC_CMD_SET_ZONE_MASTER   0x8002    /* tx none, rx none */
-#endif
-#define  TIPC_CMD_SET_REMOTE_MNG    0x8003    /* tx unsigned, rx none */
-#define  TIPC_CMD_SET_MAX_PORTS     0x8004    /* tx unsigned, rx none */
-#define  TIPC_CMD_SET_MAX_PUBL      0x8005    /* tx unsigned, rx none */
-#define  TIPC_CMD_SET_MAX_SUBSCR    0x8006    /* tx unsigned, rx none */
-#define  TIPC_CMD_SET_MAX_ZONES     0x8007    /* tx unsigned, rx none */
-#define  TIPC_CMD_SET_MAX_CLUSTERS  0x8008    /* tx unsigned, rx none */
-#define  TIPC_CMD_SET_MAX_NODES     0x8009    /* tx unsigned, rx none */
-#define  TIPC_CMD_SET_MAX_SLAVES    0x800A    /* tx unsigned, rx none */
-#define  TIPC_CMD_SET_NETID         0x800B    /* tx unsigned, rx none */
-
-/*
- * TLV types defined for TIPC
- */
-
-#define TIPC_TLV_NONE		0	/* no TLV present */
-#define TIPC_TLV_VOID		1	/* empty TLV (0 data bytes)*/
-#define TIPC_TLV_UNSIGNED	2	/* 32-bit integer */
-#define TIPC_TLV_STRING		3	/* char[128] (max) */
-#define TIPC_TLV_LARGE_STRING	4	/* char[2048] (max) */
-#define TIPC_TLV_ULTRA_STRING	5	/* char[32768] (max) */
-
-#define TIPC_TLV_ERROR_STRING	16	/* char[128] containing "error code" */
-#define TIPC_TLV_NET_ADDR   	17	/* 32-bit integer denoting <Z.C.N> */
-#define TIPC_TLV_MEDIA_NAME	18	/* char[MAX_MEDIA_NAME] */
-#define TIPC_TLV_BEARER_NAME	19	/* char[MAX_BEARER_NAME] */
-#define TIPC_TLV_LINK_NAME	20	/* char[MAX_LINK_NAME] */
-#define TIPC_TLV_NODE_INFO	21	/* struct tipc_node_info */
-#define TIPC_TLV_LINK_INFO	22	/* struct tipc_link_info */
-#define TIPC_TLV_BEARER_CONFIG  23	/* struct tipc_bearer_config */
-#define TIPC_TLV_LINK_CONFIG    24	/* struct tipc_link_config */
-#define TIPC_TLV_NAME_TBL_QUERY	25	/* struct tipc_name_table_query */
-#define TIPC_TLV_PORT_REF   	26	/* 32-bit port reference */
-
-struct tipc_node_info {
-	__u32 addr;			/* network address of node */
-	__u32 up;			/* 0=down, 1= up */
-};
-
-struct tipc_link_info {
-	__u32 dest;			/* network address of peer node */
-	__u32 up;			/* 0=down, 1=up */
-	char str[TIPC_MAX_LINK_NAME];	/* link name */
-};
-
-struct tipc_bearer_config {
-	__u32 priority;			/* Range [1,31]. Override per link  */
-	__u32 detect_scope;     
-	char name[TIPC_MAX_BEARER_NAME];
-};
-
-struct tipc_link_config {
-	__u32 value;
-	char name[TIPC_MAX_LINK_NAME];
-};
-
-#define TIPC_NTQ_ALLTYPES 0x80000000
-
-struct tipc_name_table_query {
-	__u32 depth;	/* 1:type, 2:+name info, 3:+port info, 4+:+debug info */
-	__u32 type;	/* {t,l,u} info ignored if high bit of "depth" is set */
-	__u32 lowbound; /* (i.e. displays all entries of name table) */
-	__u32 upbound;
-};
-
-/*
- * The error string TLV is a null-terminated string describing the cause 
- * of the request failure.  To simplify error processing (and to save space)
- * the first character of the string can be a special error code character
- * (lying by the range 0x80 to 0xFF) which represents a pre-defined reason.
- */
-
-#define TIPC_CFG_TLV_ERROR      "\x80"  /* request contains incorrect TLV(s) */
-#define TIPC_CFG_NOT_NET_ADMIN  "\x81"	/* must be network administrator */
-#define TIPC_CFG_NOT_ZONE_MSTR	"\x82"	/* must be zone master */
-#define TIPC_CFG_NO_REMOTE	"\x83"	/* remote management not enabled */
-#define TIPC_CFG_NOT_SUPPORTED  "\x84"	/* request is not supported by TIPC */
-#define TIPC_CFG_INVALID_VALUE  "\x85"  /* request has invalid argument value */
-
-#if 0
-/* prototypes TLV structures for proposed commands */
-struct tipc_link_create {
-	__u32   domain;
-	struct tipc_media_addr peer_addr;
-	char bearer_name[MAX_BEARER_NAME];
-};
-
-struct tipc_route_info {
-	__u32 dest;
-	__u32 router;
-};
-#endif
-
-/*
- * A TLV consists of a descriptor, followed by the TLV value.
- * TLV descriptor fields are stored in network byte order; 
- * TLV values must also be stored in network byte order (where applicable).
- * TLV descriptors must be aligned to addresses which are multiple of 4,
- * so up to 3 bytes of padding may exist at the end of the TLV value area.
- * There must not be any padding between the TLV descriptor and its value.
- */
-
-struct tlv_desc {
-	__u16 tlv_len;		/* TLV length (descriptor + value) */
-	__u16 tlv_type;		/* TLV identifier */
-};
-
-#define TLV_ALIGNTO 4
-
-#define TLV_ALIGN(datalen) (((datalen)+(TLV_ALIGNTO-1)) & ~(TLV_ALIGNTO-1))
-#define TLV_LENGTH(datalen) (sizeof(struct tlv_desc) + (datalen))
-#define TLV_SPACE(datalen) (TLV_ALIGN(TLV_LENGTH(datalen)))
-#define TLV_DATA(tlv) ((void *)((char *)(tlv) + TLV_LENGTH(0)))
-
-static inline int TLV_OK(const void *tlv, __u16 space)
-{
-	/*
-	 * Would also like to check that "tlv" is a multiple of 4,
-	 * but don't know how to do this in a portable way.
-	 * - Tried doing (!(tlv & (TLV_ALIGNTO-1))), but GCC compiler
-	 *   won't allow binary "&" with a pointer.
-	 * - Tried casting "tlv" to integer type, but causes warning about size
-	 *   mismatch when pointer is bigger than chosen type (int, long, ...).
-	 */
-
-	return (space >= TLV_SPACE(0)) &&
-		(ntohs(((struct tlv_desc *)tlv)->tlv_len) <= space);
-}
-
-static inline int TLV_CHECK(const void *tlv, __u16 space, __u16 exp_type)
-{
-	return TLV_OK(tlv, space) && 
-		(ntohs(((struct tlv_desc *)tlv)->tlv_type) == exp_type);
-}
-
-static inline int TLV_SET(void *tlv, __u16 type, void *data, __u16 len)
-{
-	struct tlv_desc *tlv_ptr;
-	int tlv_len;
-
-	tlv_len = TLV_LENGTH(len);
-	tlv_ptr = (struct tlv_desc *)tlv;
-	tlv_ptr->tlv_type = htons(type);
-	tlv_ptr->tlv_len  = htons(tlv_len);
-	if (len && data)
-		memcpy(TLV_DATA(tlv_ptr), data, tlv_len);
-	return TLV_SPACE(len);
-}
-
-/*
- * A TLV list descriptor simplifies processing of messages 
- * containing multiple TLVs.
- */
-
-struct tlv_list_desc {
-	struct tlv_desc *tlv_ptr;	/* ptr to current TLV */
-	__u32 tlv_space;		/* # bytes from curr TLV to list end */
-};
-
-static inline void TLV_LIST_INIT(struct tlv_list_desc *list, 
-				 void *data, __u32 space)
-{
-	list->tlv_ptr = (struct tlv_desc *)data;
-	list->tlv_space = space;
-}
-	     
-static inline int TLV_LIST_EMPTY(struct tlv_list_desc *list)
-{ 
-	return (list->tlv_space == 0);
-}
-
-static inline int TLV_LIST_CHECK(struct tlv_list_desc *list, __u16 exp_type)
-{
-	return TLV_CHECK(list->tlv_ptr, list->tlv_space, exp_type);
-}
-
-static inline void *TLV_LIST_DATA(struct tlv_list_desc *list)
-{
-	return TLV_DATA(list->tlv_ptr);
-}
-
-static inline void TLV_LIST_STEP(struct tlv_list_desc *list)
-{
-	__u16 tlv_space = TLV_ALIGN(ntohs(list->tlv_ptr->tlv_len));
-
-        list->tlv_ptr = (struct tlv_desc *)((char *)list->tlv_ptr + tlv_space);
-	list->tlv_space -= tlv_space;
-}
-
-/*
- * Configuration messages exchanged via NETLINK_GENERIC use the following
- * family id, name, version and command.
- */
-#define TIPC_GENL_NAME		"TIPC"
-#define TIPC_GENL_VERSION	0x1
-#define TIPC_GENL_CMD		0x1
-
-/*
- * TIPC specific header used in NETLINK_GENERIC requests.
- */
-struct tipc_genlmsghdr {
-	__u32 dest;		/* Destination address */
-	__u16 cmd;		/* Command */
-	__u16 reserved;		/* Unused */
-};
-
-#define TIPC_GENL_HDRLEN	NLMSG_ALIGN(sizeof(struct tipc_genlmsghdr))
-
-/*
- * Configuration messages exchanged via TIPC sockets use the TIPC configuration 
- * message header, which is defined below.  This structure is analogous 
- * to the Netlink message header, but fields are stored in network byte order 
- * and no padding is permitted between the header and the message data 
- * that follows.
- */
-
-struct tipc_cfg_msg_hdr
-{
-	__u32 tcm_len;		/* Message length (including header) */
-	__u16 tcm_type;		/* Command type */
-	__u16 tcm_flags;	/* Additional flags */
-	char  tcm_reserved[8];	/* Unused */
-};
-
-#define TCM_F_REQUEST	0x1	/* Flag: Request message */
-#define TCM_F_MORE	0x2	/* Flag: Message to be continued */
-
-#define TCM_ALIGN(datalen)  (((datalen)+3) & ~3)
-#define TCM_LENGTH(datalen) (sizeof(struct tipc_cfg_msg_hdr) + datalen)
-#define TCM_SPACE(datalen)  (TCM_ALIGN(TCM_LENGTH(datalen)))
-#define TCM_DATA(tcm_hdr)   ((void *)((char *)(tcm_hdr) + TCM_LENGTH(0)))
-
-static inline int TCM_SET(void *msg, __u16 cmd, __u16 flags,
-			  void *data, __u16 data_len)
-{
-	struct tipc_cfg_msg_hdr *tcm_hdr;
-	int msg_len;
-
-	msg_len = TCM_LENGTH(data_len);
-	tcm_hdr = (struct tipc_cfg_msg_hdr *)msg;
-	tcm_hdr->tcm_len   = htonl(msg_len);
-	tcm_hdr->tcm_type  = htons(cmd);
-	tcm_hdr->tcm_flags = htons(flags);
-	if (data_len && data)
-		memcpy(TCM_DATA(msg), data, data_len);
-	return TCM_SPACE(data_len);
-}
+#define TIPC_CONN_TIMEOUT	130	/* Default: 8000 (ms)  */
 
 #endif
diff --git a/include/linux/tipc_config.h b/include/linux/tipc_config.h
new file mode 100644
index 000000000000..97ead313882e
--- /dev/null
+++ b/include/linux/tipc_config.h
@@ -0,0 +1,404 @@
+/*
+ * include/linux/tipc_config.h: Include file for TIPC configuration interface
+ * 
+ * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005-2006, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation 
+ * and/or other materials provided with the distribution.
+ * Neither the names of the copyright holders nor the names of its 
+ * contributors may be used to endorse or promote products derived from this 
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_TIPC_CONFIG_H_
+#define _LINUX_TIPC_CONFIG_H_
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <asm/byteorder.h>
+
+/*
+ * Configuration
+ *
+ * All configuration management messaging involves sending a request message
+ * to the TIPC configuration service on a node, which sends a reply message
+ * back.  (In the future multi-message replies may be supported.)
+ *
+ * Both request and reply messages consist of a transport header and payload.
+ * The transport header contains info about the desired operation;
+ * the payload consists of zero or more type/length/value (TLV) items
+ * which specify parameters or results for the operation.
+ *
+ * For many operations, the request and reply messages have a fixed number
+ * of TLVs (usually zero or one); however, some reply messages may return 
+ * a variable number of TLVs.  A failed request is denoted by the presence
+ * of an "error string" TLV in the reply message instead of the TLV(s) the
+ * reply should contain if the request succeeds.
+ */
+ 
+/* 
+ * Public commands:
+ * May be issued by any process.
+ * Accepted by own node, or by remote node only if remote management enabled.                       
+ */
+ 
+#define  TIPC_CMD_NOOP   	    0x0000    /* tx none, rx none */
+#define  TIPC_CMD_GET_NODES         0x0001    /* tx net_addr, rx node_info(s) */
+#define  TIPC_CMD_GET_MEDIA_NAMES   0x0002    /* tx none, rx media_name(s) */
+#define  TIPC_CMD_GET_BEARER_NAMES  0x0003    /* tx none, rx bearer_name(s) */
+#define  TIPC_CMD_GET_LINKS         0x0004    /* tx net_addr, rx link_info(s) */
+#define  TIPC_CMD_SHOW_NAME_TABLE   0x0005    /* tx name_tbl_query, rx ultra_string */
+#define  TIPC_CMD_SHOW_PORTS        0x0006    /* tx none, rx ultra_string */
+#define  TIPC_CMD_SHOW_LINK_STATS   0x000B    /* tx link_name, rx ultra_string */
+
+#if 0
+#define  TIPC_CMD_SHOW_PORT_STATS   0x0008    /* tx port_ref, rx ultra_string */
+#define  TIPC_CMD_RESET_PORT_STATS  0x0009    /* tx port_ref, rx none */
+#define  TIPC_CMD_GET_ROUTES        0x000A    /* tx ?, rx ? */
+#define  TIPC_CMD_GET_LINK_PEER     0x000D    /* tx link_name, rx ? */
+#endif
+
+/* 
+ * Protected commands:
+ * May only be issued by "network administration capable" process.
+ * Accepted by own node, or by remote node only if remote management enabled
+ * and this node is zone manager.                       
+ */
+
+#define  TIPC_CMD_GET_REMOTE_MNG    0x4003    /* tx none, rx unsigned */
+#define  TIPC_CMD_GET_MAX_PORTS     0x4004    /* tx none, rx unsigned */
+#define  TIPC_CMD_GET_MAX_PUBL      0x4005    /* tx none, rx unsigned */
+#define  TIPC_CMD_GET_MAX_SUBSCR    0x4006    /* tx none, rx unsigned */
+#define  TIPC_CMD_GET_MAX_ZONES     0x4007    /* tx none, rx unsigned */
+#define  TIPC_CMD_GET_MAX_CLUSTERS  0x4008    /* tx none, rx unsigned */
+#define  TIPC_CMD_GET_MAX_NODES     0x4009    /* tx none, rx unsigned */
+#define  TIPC_CMD_GET_MAX_SLAVES    0x400A    /* tx none, rx unsigned */
+#define  TIPC_CMD_GET_NETID         0x400B    /* tx none, rx unsigned */
+
+#define  TIPC_CMD_ENABLE_BEARER     0x4101    /* tx bearer_config, rx none */
+#define  TIPC_CMD_DISABLE_BEARER    0x4102    /* tx bearer_name, rx none */
+#define  TIPC_CMD_SET_LINK_TOL      0x4107    /* tx link_config, rx none */
+#define  TIPC_CMD_SET_LINK_PRI      0x4108    /* tx link_config, rx none */
+#define  TIPC_CMD_SET_LINK_WINDOW   0x4109    /* tx link_config, rx none */
+#define  TIPC_CMD_SET_LOG_SIZE      0x410A    /* tx unsigned, rx none */
+#define  TIPC_CMD_DUMP_LOG          0x410B    /* tx none, rx ultra_string */
+#define  TIPC_CMD_RESET_LINK_STATS  0x410C    /* tx link_name, rx none */
+
+#if 0
+#define  TIPC_CMD_CREATE_LINK       0x4103    /* tx link_create, rx none */
+#define  TIPC_CMD_REMOVE_LINK       0x4104    /* tx link_name, rx none */
+#define  TIPC_CMD_BLOCK_LINK        0x4105    /* tx link_name, rx none */
+#define  TIPC_CMD_UNBLOCK_LINK      0x4106    /* tx link_name, rx none */
+#endif
+
+/* 
+ * Private commands:
+ * May only be issued by "network administration capable" process.
+ * Accepted by own node only; cannot be used on a remote node.                       
+ */
+
+#define  TIPC_CMD_SET_NODE_ADDR     0x8001    /* tx net_addr, rx none */
+#if 0
+#define  TIPC_CMD_SET_ZONE_MASTER   0x8002    /* tx none, rx none */
+#endif
+#define  TIPC_CMD_SET_REMOTE_MNG    0x8003    /* tx unsigned, rx none */
+#define  TIPC_CMD_SET_MAX_PORTS     0x8004    /* tx unsigned, rx none */
+#define  TIPC_CMD_SET_MAX_PUBL      0x8005    /* tx unsigned, rx none */
+#define  TIPC_CMD_SET_MAX_SUBSCR    0x8006    /* tx unsigned, rx none */
+#define  TIPC_CMD_SET_MAX_ZONES     0x8007    /* tx unsigned, rx none */
+#define  TIPC_CMD_SET_MAX_CLUSTERS  0x8008    /* tx unsigned, rx none */
+#define  TIPC_CMD_SET_MAX_NODES     0x8009    /* tx unsigned, rx none */
+#define  TIPC_CMD_SET_MAX_SLAVES    0x800A    /* tx unsigned, rx none */
+#define  TIPC_CMD_SET_NETID         0x800B    /* tx unsigned, rx none */
+
+/*
+ * TLV types defined for TIPC
+ */
+
+#define TIPC_TLV_NONE		0	/* no TLV present */
+#define TIPC_TLV_VOID		1	/* empty TLV (0 data bytes)*/
+#define TIPC_TLV_UNSIGNED	2	/* 32-bit integer */
+#define TIPC_TLV_STRING		3	/* char[128] (max) */
+#define TIPC_TLV_LARGE_STRING	4	/* char[2048] (max) */
+#define TIPC_TLV_ULTRA_STRING	5	/* char[32768] (max) */
+
+#define TIPC_TLV_ERROR_STRING	16	/* char[128] containing "error code" */
+#define TIPC_TLV_NET_ADDR   	17	/* 32-bit integer denoting <Z.C.N> */
+#define TIPC_TLV_MEDIA_NAME	18	/* char[TIPC_MAX_MEDIA_NAME] */
+#define TIPC_TLV_BEARER_NAME	19	/* char[TIPC_MAX_BEARER_NAME] */
+#define TIPC_TLV_LINK_NAME	20	/* char[TIPC_MAX_LINK_NAME] */
+#define TIPC_TLV_NODE_INFO	21	/* struct tipc_node_info */
+#define TIPC_TLV_LINK_INFO	22	/* struct tipc_link_info */
+#define TIPC_TLV_BEARER_CONFIG  23	/* struct tipc_bearer_config */
+#define TIPC_TLV_LINK_CONFIG    24	/* struct tipc_link_config */
+#define TIPC_TLV_NAME_TBL_QUERY	25	/* struct tipc_name_table_query */
+#define TIPC_TLV_PORT_REF   	26	/* 32-bit port reference */
+
+/*
+ * Maximum sizes of TIPC bearer-related names (including terminating NUL)
+ */ 
+
+#define TIPC_MAX_MEDIA_NAME	16	/* format = media */
+#define TIPC_MAX_IF_NAME	16	/* format = interface */
+#define TIPC_MAX_BEARER_NAME	32	/* format = media:interface */
+#define TIPC_MAX_LINK_NAME	60	/* format = Z.C.N:interface-Z.C.N:interface */
+
+/*
+ * Link priority limits (range from 0 to # priorities - 1)
+ */
+
+#define TIPC_NUM_LINK_PRI 32
+
+/*
+ * Link tolerance limits (min, default, max), in ms
+ */
+
+#define TIPC_MIN_LINK_TOL 50
+#define TIPC_DEF_LINK_TOL 1500
+#define TIPC_MAX_LINK_TOL 30000
+
+/*
+ * Link window limits (min, default, max), in packets
+ */
+
+#define TIPC_MIN_LINK_WIN 16
+#define TIPC_DEF_LINK_WIN 50
+#define TIPC_MAX_LINK_WIN 150
+
+
+struct tipc_node_info {
+	__u32 addr;			/* network address of node */
+	__u32 up;			/* 0=down, 1= up */
+};
+
+struct tipc_link_info {
+	__u32 dest;			/* network address of peer node */
+	__u32 up;			/* 0=down, 1=up */
+	char str[TIPC_MAX_LINK_NAME];	/* link name */
+};
+
+struct tipc_bearer_config {
+	__u32 priority;			/* Range [1,31]. Override per link  */
+	__u32 detect_scope;     
+	char name[TIPC_MAX_BEARER_NAME];
+};
+
+struct tipc_link_config {
+	__u32 value;
+	char name[TIPC_MAX_LINK_NAME];
+};
+
+#define TIPC_NTQ_ALLTYPES 0x80000000
+
+struct tipc_name_table_query {
+	__u32 depth;	/* 1:type, 2:+name info, 3:+port info, 4+:+debug info */
+	__u32 type;	/* {t,l,u} info ignored if high bit of "depth" is set */
+	__u32 lowbound; /* (i.e. displays all entries of name table) */
+	__u32 upbound;
+};
+
+/*
+ * The error string TLV is a null-terminated string describing the cause 
+ * of the request failure.  To simplify error processing (and to save space)
+ * the first character of the string can be a special error code character
+ * (lying by the range 0x80 to 0xFF) which represents a pre-defined reason.
+ */
+
+#define TIPC_CFG_TLV_ERROR      "\x80"  /* request contains incorrect TLV(s) */
+#define TIPC_CFG_NOT_NET_ADMIN  "\x81"	/* must be network administrator */
+#define TIPC_CFG_NOT_ZONE_MSTR	"\x82"	/* must be zone master */
+#define TIPC_CFG_NO_REMOTE	"\x83"	/* remote management not enabled */
+#define TIPC_CFG_NOT_SUPPORTED  "\x84"	/* request is not supported by TIPC */
+#define TIPC_CFG_INVALID_VALUE  "\x85"  /* request has invalid argument value */
+
+#if 0
+/* prototypes TLV structures for proposed commands */
+struct tipc_link_create {
+	__u32   domain;
+	struct tipc_media_addr peer_addr;
+	char bearer_name[TIPC_MAX_BEARER_NAME];
+};
+
+struct tipc_route_info {
+	__u32 dest;
+	__u32 router;
+};
+#endif
+
+/*
+ * A TLV consists of a descriptor, followed by the TLV value.
+ * TLV descriptor fields are stored in network byte order; 
+ * TLV values must also be stored in network byte order (where applicable).
+ * TLV descriptors must be aligned to addresses which are multiple of 4,
+ * so up to 3 bytes of padding may exist at the end of the TLV value area.
+ * There must not be any padding between the TLV descriptor and its value.
+ */
+
+struct tlv_desc {
+	__u16 tlv_len;		/* TLV length (descriptor + value) */
+	__u16 tlv_type;		/* TLV identifier */
+};
+
+#define TLV_ALIGNTO 4
+
+#define TLV_ALIGN(datalen) (((datalen)+(TLV_ALIGNTO-1)) & ~(TLV_ALIGNTO-1))
+#define TLV_LENGTH(datalen) (sizeof(struct tlv_desc) + (datalen))
+#define TLV_SPACE(datalen) (TLV_ALIGN(TLV_LENGTH(datalen)))
+#define TLV_DATA(tlv) ((void *)((char *)(tlv) + TLV_LENGTH(0)))
+
+static inline int TLV_OK(const void *tlv, __u16 space)
+{
+	/*
+	 * Would also like to check that "tlv" is a multiple of 4,
+	 * but don't know how to do this in a portable way.
+	 * - Tried doing (!(tlv & (TLV_ALIGNTO-1))), but GCC compiler
+	 *   won't allow binary "&" with a pointer.
+	 * - Tried casting "tlv" to integer type, but causes warning about size
+	 *   mismatch when pointer is bigger than chosen type (int, long, ...).
+	 */
+
+	return (space >= TLV_SPACE(0)) &&
+		(ntohs(((struct tlv_desc *)tlv)->tlv_len) <= space);
+}
+
+static inline int TLV_CHECK(const void *tlv, __u16 space, __u16 exp_type)
+{
+	return TLV_OK(tlv, space) && 
+		(ntohs(((struct tlv_desc *)tlv)->tlv_type) == exp_type);
+}
+
+static inline int TLV_SET(void *tlv, __u16 type, void *data, __u16 len)
+{
+	struct tlv_desc *tlv_ptr;
+	int tlv_len;
+
+	tlv_len = TLV_LENGTH(len);
+	tlv_ptr = (struct tlv_desc *)tlv;
+	tlv_ptr->tlv_type = htons(type);
+	tlv_ptr->tlv_len  = htons(tlv_len);
+	if (len && data)
+		memcpy(TLV_DATA(tlv_ptr), data, tlv_len);
+	return TLV_SPACE(len);
+}
+
+/*
+ * A TLV list descriptor simplifies processing of messages 
+ * containing multiple TLVs.
+ */
+
+struct tlv_list_desc {
+	struct tlv_desc *tlv_ptr;	/* ptr to current TLV */
+	__u32 tlv_space;		/* # bytes from curr TLV to list end */
+};
+
+static inline void TLV_LIST_INIT(struct tlv_list_desc *list, 
+				 void *data, __u32 space)
+{
+	list->tlv_ptr = (struct tlv_desc *)data;
+	list->tlv_space = space;
+}
+	     
+static inline int TLV_LIST_EMPTY(struct tlv_list_desc *list)
+{ 
+	return (list->tlv_space == 0);
+}
+
+static inline int TLV_LIST_CHECK(struct tlv_list_desc *list, __u16 exp_type)
+{
+	return TLV_CHECK(list->tlv_ptr, list->tlv_space, exp_type);
+}
+
+static inline void *TLV_LIST_DATA(struct tlv_list_desc *list)
+{
+	return TLV_DATA(list->tlv_ptr);
+}
+
+static inline void TLV_LIST_STEP(struct tlv_list_desc *list)
+{
+	__u16 tlv_space = TLV_ALIGN(ntohs(list->tlv_ptr->tlv_len));
+
+        list->tlv_ptr = (struct tlv_desc *)((char *)list->tlv_ptr + tlv_space);
+	list->tlv_space -= tlv_space;
+}
+
+/*
+ * Configuration messages exchanged via NETLINK_GENERIC use the following
+ * family id, name, version and command.
+ */
+#define TIPC_GENL_NAME		"TIPC"
+#define TIPC_GENL_VERSION	0x1
+#define TIPC_GENL_CMD		0x1
+
+/*
+ * TIPC specific header used in NETLINK_GENERIC requests.
+ */
+struct tipc_genlmsghdr {
+	__u32 dest;		/* Destination address */
+	__u16 cmd;		/* Command */
+	__u16 reserved;		/* Unused */
+};
+
+#define TIPC_GENL_HDRLEN	NLMSG_ALIGN(sizeof(struct tipc_genlmsghdr))
+
+/*
+ * Configuration messages exchanged via TIPC sockets use the TIPC configuration 
+ * message header, which is defined below.  This structure is analogous 
+ * to the Netlink message header, but fields are stored in network byte order 
+ * and no padding is permitted between the header and the message data 
+ * that follows.
+ */
+
+struct tipc_cfg_msg_hdr
+{
+	__u32 tcm_len;		/* Message length (including header) */
+	__u16 tcm_type;		/* Command type */
+	__u16 tcm_flags;	/* Additional flags */
+	char  tcm_reserved[8];	/* Unused */
+};
+
+#define TCM_F_REQUEST	0x1	/* Flag: Request message */
+#define TCM_F_MORE	0x2	/* Flag: Message to be continued */
+
+#define TCM_ALIGN(datalen)  (((datalen)+3) & ~3)
+#define TCM_LENGTH(datalen) (sizeof(struct tipc_cfg_msg_hdr) + datalen)
+#define TCM_SPACE(datalen)  (TCM_ALIGN(TCM_LENGTH(datalen)))
+#define TCM_DATA(tcm_hdr)   ((void *)((char *)(tcm_hdr) + TCM_LENGTH(0)))
+
+static inline int TCM_SET(void *msg, __u16 cmd, __u16 flags,
+			  void *data, __u16 data_len)
+{
+	struct tipc_cfg_msg_hdr *tcm_hdr;
+	int msg_len;
+
+	msg_len = TCM_LENGTH(data_len);
+	tcm_hdr = (struct tipc_cfg_msg_hdr *)msg;
+	tcm_hdr->tcm_len   = htonl(msg_len);
+	tcm_hdr->tcm_type  = htons(cmd);
+	tcm_hdr->tcm_flags = htons(flags);
+	if (data_len && data)
+		memcpy(TCM_DATA(msg), data, data_len);
+	return TCM_SPACE(data_len);
+}
+
+#endif
diff --git a/include/net/tipc/tipc_bearer.h b/include/net/tipc/tipc_bearer.h
index a3daf697b6b6..df7b9145a599 100644
--- a/include/net/tipc/tipc_bearer.h
+++ b/include/net/tipc/tipc_bearer.h
@@ -36,10 +36,36 @@
 
 #ifdef __KERNEL__
 
-#include <linux/tipc.h>
+#include <linux/tipc_config.h>
 #include <linux/skbuff.h>
 #include <linux/spinlock.h>
 
+/*
+ * Identifiers of supported TIPC media types
+ */
+
+#define TIPC_MEDIA_TYPE_ETH	1
+
+struct tipc_media_addr {
+	__u32  type;
+	union {
+		__u8   eth_addr[6];	/* Ethernet bearer */ 
+#if 0
+		/* Prototypes for other possible bearer types */
+
+		struct {
+			__u16 sin_family;
+			__u16 sin_port;
+			struct {
+				__u32 s_addr;
+			} sin_addr;
+			char pad[4];
+		} addr_in;		/* IP-based bearer */
+		__u16  sock_descr;	/* generic socket bearer */
+#endif
+	} dev_addr;
+};
+
 /**
  * struct tipc_bearer - TIPC bearer info available to privileged users
  * @usr_handle: pointer to additional user-defined information about bearer
diff --git a/net/tipc/config.h b/net/tipc/config.h
index 7ac0af57bf57..ca76cb85a3b7 100644
--- a/net/tipc/config.h
+++ b/net/tipc/config.h
@@ -37,6 +37,7 @@
 /* ---------------------------------------------------------------------- */
 
 #include <linux/tipc.h>
+#include <linux/tipc_config.h>
 #include "link.h"
 
 struct sk_buff *cfg_reply_alloc(int payload_size);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 22dcb724e760..688de0bf72d5 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -48,6 +48,7 @@
 #include <net/sock.h>
 
 #include <linux/tipc.h>
+#include <linux/tipc_config.h>
 #include <net/tipc/tipc_msg.h>
 #include <net/tipc/tipc_port.h>
 
-- 
cgit v1.2.3-71-gd317


From 9ea1fd3c1a15c620d1e3d0aa269d34b705477003 Mon Sep 17 00:00:00 2001
From: Per Liden <per.liden@nospam.ericsson.com>
Date: Wed, 11 Jan 2006 13:30:43 +0100
Subject: [TIPC] License header update

The license header in each file now more clearly state that this
code is licensed under a dual BSD/GPL. Before this was only
evident if you looked at the MODULE_LICENSE line in core.c.

Signed-off-by: Per Liden <per.liden@nospam.ericsson.com>
---
 include/linux/tipc.h           | 42 +++++++++++++++++++++++-------------------
 include/linux/tipc_config.h    | 42 +++++++++++++++++++++++-------------------
 include/net/tipc/tipc.h        | 42 +++++++++++++++++++++++-------------------
 include/net/tipc/tipc_bearer.h | 42 +++++++++++++++++++++++-------------------
 include/net/tipc/tipc_msg.h    | 42 +++++++++++++++++++++++-------------------
 include/net/tipc/tipc_port.h   | 42 +++++++++++++++++++++++-------------------
 net/tipc/addr.c                | 42 +++++++++++++++++++++++-------------------
 net/tipc/addr.h                | 42 +++++++++++++++++++++++-------------------
 net/tipc/bcast.c               | 42 +++++++++++++++++++++++-------------------
 net/tipc/bcast.h               | 42 +++++++++++++++++++++++-------------------
 net/tipc/bearer.c              | 42 +++++++++++++++++++++++-------------------
 net/tipc/bearer.h              | 42 +++++++++++++++++++++++-------------------
 net/tipc/cluster.c             | 20 ++++++++++++--------
 net/tipc/cluster.h             | 42 +++++++++++++++++++++++-------------------
 net/tipc/config.c              | 42 +++++++++++++++++++++++-------------------
 net/tipc/config.h              | 42 +++++++++++++++++++++++-------------------
 net/tipc/core.c                | 42 +++++++++++++++++++++++-------------------
 net/tipc/core.h                | 42 +++++++++++++++++++++++-------------------
 net/tipc/dbg.c                 | 42 +++++++++++++++++++++++-------------------
 net/tipc/dbg.h                 | 42 +++++++++++++++++++++++-------------------
 net/tipc/discover.c            | 42 +++++++++++++++++++++++-------------------
 net/tipc/discover.h            | 42 +++++++++++++++++++++++-------------------
 net/tipc/eth_media.c           | 42 +++++++++++++++++++++++-------------------
 net/tipc/handler.c             | 42 +++++++++++++++++++++++-------------------
 net/tipc/link.c                | 42 +++++++++++++++++++++++-------------------
 net/tipc/link.h                | 42 +++++++++++++++++++++++-------------------
 net/tipc/msg.c                 | 42 +++++++++++++++++++++++-------------------
 net/tipc/msg.h                 | 42 +++++++++++++++++++++++-------------------
 net/tipc/name_distr.c          | 42 +++++++++++++++++++++++-------------------
 net/tipc/name_distr.h          | 42 +++++++++++++++++++++++-------------------
 net/tipc/name_table.c          | 42 +++++++++++++++++++++++-------------------
 net/tipc/name_table.h          | 42 +++++++++++++++++++++++-------------------
 net/tipc/net.c                 | 20 ++++++++++++--------
 net/tipc/net.h                 | 22 +++++++++++++---------
 net/tipc/netlink.c             | 42 +++++++++++++++++++++++-------------------
 net/tipc/node.c                | 42 +++++++++++++++++++++++-------------------
 net/tipc/node.h                | 42 +++++++++++++++++++++++-------------------
 net/tipc/node_subscr.c         | 42 +++++++++++++++++++++++-------------------
 net/tipc/node_subscr.h         | 42 +++++++++++++++++++++++-------------------
 net/tipc/port.c                | 42 +++++++++++++++++++++++-------------------
 net/tipc/port.h                | 42 +++++++++++++++++++++++-------------------
 net/tipc/ref.c                 | 22 +++++++++++++---------
 net/tipc/ref.h                 | 42 +++++++++++++++++++++++-------------------
 net/tipc/socket.c              | 42 +++++++++++++++++++++++-------------------
 net/tipc/subscr.c              | 42 +++++++++++++++++++++++-------------------
 net/tipc/subscr.h              | 42 +++++++++++++++++++++++-------------------
 net/tipc/user_reg.c            | 22 +++++++++++++---------
 net/tipc/user_reg.h            | 42 +++++++++++++++++++++++-------------------
 net/tipc/zone.c                | 20 ++++++++++++--------
 net/tipc/zone.h                | 20 ++++++++++++--------
 50 files changed, 1076 insertions(+), 876 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tipc.h b/include/linux/tipc.h
index 4ecd0047550b..ff109c1c1414 100644
--- a/include/linux/tipc.h
+++ b/include/linux/tipc.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/include/linux/tipc_config.h b/include/linux/tipc_config.h
index 97ead313882e..6e5a3af0d212 100644
--- a/include/linux/tipc_config.h
+++ b/include/linux/tipc_config.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/include/net/tipc/tipc.h b/include/net/tipc/tipc.h
index 1d4d8d0701e3..b763d556355d 100644
--- a/include/net/tipc/tipc.h
+++ b/include/net/tipc/tipc.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/include/net/tipc/tipc_bearer.h b/include/net/tipc/tipc_bearer.h
index df7b9145a599..0cfb9bbd9ac6 100644
--- a/include/net/tipc/tipc_bearer.h
+++ b/include/net/tipc/tipc_bearer.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/include/net/tipc/tipc_msg.h b/include/net/tipc/tipc_msg.h
index 78513f5236bb..a5c7ed598fc1 100644
--- a/include/net/tipc/tipc_msg.h
+++ b/include/net/tipc/tipc_msg.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/include/net/tipc/tipc_port.h b/include/net/tipc/tipc_port.h
index ec0f0de7f0e2..db947ff12656 100644
--- a/include/net/tipc/tipc_port.h
+++ b/include/net/tipc/tipc_port.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/addr.c b/net/tipc/addr.c
index bc353639562a..89a25a2d03be 100644
--- a/net/tipc/addr.c
+++ b/net/tipc/addr.c
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/addr.h b/net/tipc/addr.h
index 9dabebcba6de..b65a0ab2647b 100644
--- a/net/tipc/addr.h
+++ b/net/tipc/addr.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 35ca90667a59..180d80b37a8d 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -7,28 +7,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index cc2ede15c4fd..27d1bba5c945 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index c19465c5f035..579b17ded6c5 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 02a16f86defa..96d4bd9e767f 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/cluster.c b/net/tipc/cluster.c
index d7d0f5f17e00..4e86b2a6fc7f 100644
--- a/net/tipc/cluster.c
+++ b/net/tipc/cluster.c
@@ -9,14 +9,18 @@
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
diff --git a/net/tipc/cluster.h b/net/tipc/cluster.h
index c12875ab46be..a066b5e518b4 100644
--- a/net/tipc/cluster.h
+++ b/net/tipc/cluster.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/config.c b/net/tipc/config.c
index 0f31c342d11c..9e55cf8caec7 100644
--- a/net/tipc/config.c
+++ b/net/tipc/config.c
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/config.h b/net/tipc/config.h
index ca76cb85a3b7..a12c6482c9c2 100644
--- a/net/tipc/config.h
+++ b/net/tipc/config.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/core.c b/net/tipc/core.c
index 17c723f49185..6d45d71bdae0 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/core.h b/net/tipc/core.h
index d92898d6c093..4b29fb63c8fe 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c
index efd6d652d052..22fa030150f5 100644
--- a/net/tipc/dbg.c
+++ b/net/tipc/dbg.c
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/dbg.h b/net/tipc/dbg.h
index af4217a993cf..07a42ba6846b 100644
--- a/net/tipc/dbg.h
+++ b/net/tipc/dbg.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index 73e151cf2a51..31e242f39e7f 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/discover.h b/net/tipc/discover.h
index 90c1de905e78..17c150b18dcd 100644
--- a/net/tipc/discover.h
+++ b/net/tipc/discover.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c
index b634d7a5640e..8acbef8e93a8 100644
--- a/net/tipc/eth_media.c
+++ b/net/tipc/eth_media.c
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/handler.c b/net/tipc/handler.c
index c8fbb2071dfc..8f5dcbd4cd3a 100644
--- a/net/tipc/handler.c
+++ b/net/tipc/handler.c
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 92acb80bb24d..c9261002ea68 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/link.h b/net/tipc/link.h
index e7db3476d84e..a67868606eff 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index c6e90efd59f2..acc6e354290c 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 3dcd23f51883..6323de97434b 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 5e0604cb3aeb..31448272a13a 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h
index 7c6616aecc03..3f9c28d87cc0 100644
--- a/net/tipc/name_distr.h
+++ b/net/tipc/name_distr.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 9626c5b30678..e152d6dddfab 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 5036b5bac360..8e3749cf9b71 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/net.c b/net/tipc/net.c
index eba88033b90e..73c6aa8fa1a3 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -9,14 +9,18 @@
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
diff --git a/net/tipc/net.h b/net/tipc/net.h
index 50098455a898..97650ff63593 100644
--- a/net/tipc/net.h
+++ b/net/tipc/net.h
@@ -8,15 +8,19 @@
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
- * 
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 2d778dfc4492..de3eb45a6683 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -5,28 +5,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/node.c b/net/tipc/node.c
index e311638b9b3d..e86614494c10 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 1f616873a724..8919c873b0e0 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/node_subscr.c b/net/tipc/node_subscr.c
index c38a3b0e847d..ecdb98216687 100644
--- a/net/tipc/node_subscr.c
+++ b/net/tipc/node_subscr.c
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/node_subscr.h b/net/tipc/node_subscr.h
index 63ae92e0c82e..3d2d8f546ef0 100644
--- a/net/tipc/node_subscr.h
+++ b/net/tipc/node_subscr.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/port.c b/net/tipc/port.c
index f0f228c9589a..e69e8b42a190 100644
--- a/net/tipc/port.c
+++ b/net/tipc/port.c
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/port.h b/net/tipc/port.h
index e40235ef92fe..e10d85a4bf9b 100644
--- a/net/tipc/port.h
+++ b/net/tipc/port.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/ref.c b/net/tipc/ref.c
index aee48c797b1e..b6f60822ec36 100644
--- a/net/tipc/ref.c
+++ b/net/tipc/ref.c
@@ -8,15 +8,19 @@
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
- * 
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
diff --git a/net/tipc/ref.h b/net/tipc/ref.h
index e6cd5bb3dc29..08ba83fd5611 100644
--- a/net/tipc/ref.h
+++ b/net/tipc/ref.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 688de0bf72d5..1120d0340310 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 6f651a2c477e..9486d8ee98b8 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index e6cb9c3ad870..6b91ca9e2419 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/user_reg.c b/net/tipc/user_reg.c
index e4da5becc342..64165ec46043 100644
--- a/net/tipc/user_reg.c
+++ b/net/tipc/user_reg.c
@@ -8,15 +8,19 @@
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
- * 
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
diff --git a/net/tipc/user_reg.h b/net/tipc/user_reg.h
index 44f219458a39..0f96004ebdcc 100644
--- a/net/tipc/user_reg.h
+++ b/net/tipc/user_reg.h
@@ -6,28 +6,32 @@
  * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
+ * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation 
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this 
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
diff --git a/net/tipc/zone.c b/net/tipc/zone.c
index 336eebb7bf54..b47deecb73f5 100644
--- a/net/tipc/zone.c
+++ b/net/tipc/zone.c
@@ -9,14 +9,18 @@
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
diff --git a/net/tipc/zone.h b/net/tipc/zone.h
index 19fd51a8bd2b..e05cc6102043 100644
--- a/net/tipc/zone.h
+++ b/net/tipc/zone.h
@@ -9,14 +9,18 @@
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * Neither the names of the copyright holders nor the names of its 
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-- 
cgit v1.2.3-71-gd317


From 9da1c8b694f8e72a16f259614caaae50cbcdaf10 Mon Sep 17 00:00:00 2001
From: Per Liden <per.liden@nospam.ericsson.com>
Date: Wed, 11 Jan 2006 18:40:41 +0100
Subject: [TIPC] Update of file headers

The copyright statements from different parts of Ericsson
have been merged into one.

Signed-off-by: Per Liden <per.liden@nospam.ericsson.com>
---
 include/linux/tipc.h           | 3 +--
 include/linux/tipc_config.h    | 3 +--
 include/net/tipc/tipc.h        | 3 +--
 include/net/tipc/tipc_bearer.h | 3 +--
 include/net/tipc/tipc_msg.h    | 3 +--
 include/net/tipc/tipc_port.h   | 3 +--
 net/tipc/addr.c                | 3 +--
 net/tipc/addr.h                | 3 +--
 net/tipc/bcast.c               | 3 +--
 net/tipc/bcast.h               | 3 +--
 net/tipc/bearer.c              | 3 +--
 net/tipc/bearer.h              | 3 +--
 net/tipc/cluster.c             | 3 +--
 net/tipc/cluster.h             | 3 +--
 net/tipc/config.c              | 3 +--
 net/tipc/config.h              | 3 +--
 net/tipc/core.c                | 3 +--
 net/tipc/core.h                | 3 +--
 net/tipc/dbg.c                 | 3 +--
 net/tipc/dbg.h                 | 3 +--
 net/tipc/discover.c            | 3 +--
 net/tipc/discover.h            | 3 +--
 net/tipc/eth_media.c           | 3 +--
 net/tipc/handler.c             | 3 +--
 net/tipc/link.c                | 3 +--
 net/tipc/link.h                | 3 +--
 net/tipc/msg.c                 | 3 +--
 net/tipc/msg.h                 | 3 +--
 net/tipc/name_distr.c          | 3 +--
 net/tipc/name_distr.h          | 3 +--
 net/tipc/name_table.c          | 3 +--
 net/tipc/name_table.h          | 3 +--
 net/tipc/net.c                 | 3 +--
 net/tipc/net.h                 | 3 +--
 net/tipc/netlink.c             | 1 -
 net/tipc/node.c                | 3 +--
 net/tipc/node.h                | 3 +--
 net/tipc/node_subscr.c         | 3 +--
 net/tipc/node_subscr.h         | 3 +--
 net/tipc/port.c                | 3 +--
 net/tipc/port.h                | 3 +--
 net/tipc/ref.c                 | 3 +--
 net/tipc/ref.h                 | 3 +--
 net/tipc/socket.c              | 3 +--
 net/tipc/subscr.c              | 3 +--
 net/tipc/subscr.h              | 3 +--
 net/tipc/user_reg.c            | 3 +--
 net/tipc/user_reg.h            | 3 +--
 net/tipc/zone.c                | 3 +--
 net/tipc/zone.h                | 3 +--
 50 files changed, 49 insertions(+), 99 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tipc.h b/include/linux/tipc.h
index ff109c1c1414..243a15f54002 100644
--- a/include/linux/tipc.h
+++ b/include/linux/tipc.h
@@ -1,9 +1,8 @@
 /*
  * include/linux/tipc.h: Include file for TIPC socket interface
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/linux/tipc_config.h b/include/linux/tipc_config.h
index 6e5a3af0d212..a52c8c64a5a3 100644
--- a/include/linux/tipc_config.h
+++ b/include/linux/tipc_config.h
@@ -1,9 +1,8 @@
 /*
  * include/linux/tipc_config.h: Include file for TIPC configuration interface
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/net/tipc/tipc.h b/include/net/tipc/tipc.h
index b763d556355d..560a329660fd 100644
--- a/include/net/tipc/tipc.h
+++ b/include/net/tipc/tipc.h
@@ -1,9 +1,8 @@
 /*
  * include/net/tipc/tipc.h: Main include file for TIPC users
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/net/tipc/tipc_bearer.h b/include/net/tipc/tipc_bearer.h
index 0cfb9bbd9ac6..098607cd4b78 100644
--- a/include/net/tipc/tipc_bearer.h
+++ b/include/net/tipc/tipc_bearer.h
@@ -1,9 +1,8 @@
 /*
  * include/net/tipc/tipc_bearer.h: Include file for privileged access to TIPC bearers
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/net/tipc/tipc_msg.h b/include/net/tipc/tipc_msg.h
index a5c7ed598fc1..4d096eebc93f 100644
--- a/include/net/tipc/tipc_msg.h
+++ b/include/net/tipc/tipc_msg.h
@@ -1,9 +1,8 @@
 /*
  * include/net/tipc/tipc_msg.h: Include file for privileged access to TIPC message headers
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/net/tipc/tipc_port.h b/include/net/tipc/tipc_port.h
index db947ff12656..3f602744164e 100644
--- a/include/net/tipc/tipc_port.h
+++ b/include/net/tipc/tipc_port.h
@@ -1,9 +1,8 @@
 /*
  * include/net/tipc/tipc_port.h: Include file for privileged access to TIPC ports
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/addr.c b/net/tipc/addr.c
index 89a25a2d03be..6d5709deee68 100644
--- a/net/tipc/addr.c
+++ b/net/tipc/addr.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/addr.c: TIPC address utility routines
  *     
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2004-2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/addr.h b/net/tipc/addr.h
index b65a0ab2647b..cdf95486043e 100644
--- a/net/tipc/addr.h
+++ b/net/tipc/addr.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/addr.h: Include file for TIPC address utility routines
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2004-2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 180d80b37a8d..d7d06f29f194 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -1,10 +1,9 @@
 /*
  * net/tipc/bcast.c: TIPC broadcast code
  *     
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2004, Intel Corporation.
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index 27d1bba5c945..5430e524b4f9 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/bcast.h: Include file for TIPC broadcast code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 579b17ded6c5..9c8ab26301d7 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/bearer.c: TIPC bearer code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2004-2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 96d4bd9e767f..55197ccf1a1a 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/bearer.h: Include file for TIPC bearer code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/cluster.c b/net/tipc/cluster.c
index 4e86b2a6fc7f..7c31f690c386 100644
--- a/net/tipc/cluster.c
+++ b/net/tipc/cluster.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/cluster.c: TIPC cluster management routines
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/cluster.h b/net/tipc/cluster.h
index a066b5e518b4..a9a46fa4fc2a 100644
--- a/net/tipc/cluster.h
+++ b/net/tipc/cluster.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/cluster.h: Include file for TIPC cluster management routines
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/config.c b/net/tipc/config.c
index 9e55cf8caec7..6c45771bd993 100644
--- a/net/tipc/config.c
+++ b/net/tipc/config.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/config.c: TIPC configuration management code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2004-2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/config.h b/net/tipc/config.h
index a12c6482c9c2..646377d40454 100644
--- a/net/tipc/config.h
+++ b/net/tipc/config.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/config.h: Include file for TIPC configuration service code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/core.c b/net/tipc/core.c
index 6d45d71bdae0..e83ac06e31ba 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/core.c: TIPC module code
  *
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 4f71fde023e1..deee5de5fb33 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/core.h: Include file for TIPC global declarations
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c
index 22fa030150f5..3264ccb74484 100644
--- a/net/tipc/dbg.c
+++ b/net/tipc/dbg.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/dbg.c: TIPC print buffer routines for debuggign
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/dbg.h b/net/tipc/dbg.h
index 07a42ba6846b..264c741f95b9 100644
--- a/net/tipc/dbg.h
+++ b/net/tipc/dbg.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/dbg.h: Include file for TIPC print buffer routines
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index 31e242f39e7f..b106ef1621cc 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/discover.c
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/discover.h b/net/tipc/discover.h
index 17c150b18dcd..2a6114d91626 100644
--- a/net/tipc/discover.h
+++ b/net/tipc/discover.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/discover.h
  *
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c
index 8acbef8e93a8..fe0a4286e48b 100644
--- a/net/tipc/eth_media.c
+++ b/net/tipc/eth_media.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/eth_media.c: Ethernet bearer support for TIPC
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/handler.c b/net/tipc/handler.c
index 8f5dcbd4cd3a..cab6f6150460 100644
--- a/net/tipc/handler.c
+++ b/net/tipc/handler.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/handler.c: TIPC signal handling
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/link.c b/net/tipc/link.c
index c9261002ea68..48d0483297ea 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/link.c: TIPC link code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2004-2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/link.h b/net/tipc/link.h
index a67868606eff..686eb7d684e0 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/link.h: Include file for TIPC link code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2004-2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index acc6e354290c..4ffbb9d44429 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/msg.c: TIPC message header routines
  *     
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 6323de97434b..71e272face60 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/msg.h: Include file for TIPC message header routines
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 31448272a13a..93ac2b25220e 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/name_distr.c: TIPC name distribution code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h
index 3f9c28d87cc0..41c1b1c065d4 100644
--- a/net/tipc/name_distr.h
+++ b/net/tipc/name_distr.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/name_distr.h: Include file for TIPC name distribution code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index e152d6dddfab..8568df3cc6b2 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/name_table.c: TIPC name table code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2004-2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 8e3749cf9b71..c771b093e120 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/name_table.h: Include file for TIPC name table code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2004-2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 73c6aa8fa1a3..a6989946e235 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/net.c: TIPC network routing code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/net.h b/net/tipc/net.h
index 97650ff63593..6f0758e894c3 100644
--- a/net/tipc/net.h
+++ b/net/tipc/net.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/net.h: Include file for TIPC network routing code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 7b963833964a..c19c2fa2ce55 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -2,7 +2,6 @@
  * net/tipc/netlink.c: TIPC configuration handling
  * 
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/node.c b/net/tipc/node.c
index e86614494c10..565c35b54c0d 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/node.c: TIPC node management routines
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 8919c873b0e0..26c04ba2d49e 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/node.h: Include file for TIPC node management routines
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/node_subscr.c b/net/tipc/node_subscr.c
index ecdb98216687..f271b61ecc2e 100644
--- a/net/tipc/node_subscr.c
+++ b/net/tipc/node_subscr.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/node_subscr.c: TIPC "node down" subscription handling
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/node_subscr.h b/net/tipc/node_subscr.h
index 3d2d8f546ef0..3145339b55b9 100644
--- a/net/tipc/node_subscr.h
+++ b/net/tipc/node_subscr.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/node_subscr.h: Include file for TIPC "node down" subscription handling
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/port.c b/net/tipc/port.c
index e69e8b42a190..ea45b17b045b 100644
--- a/net/tipc/port.c
+++ b/net/tipc/port.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/port.c: TIPC port code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2004-2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/port.h b/net/tipc/port.h
index e10d85a4bf9b..c15f687bd69b 100644
--- a/net/tipc/port.h
+++ b/net/tipc/port.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/port.h: Include file for TIPC port code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2004-2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/ref.c b/net/tipc/ref.c
index b6f60822ec36..eb5f2b2799d9 100644
--- a/net/tipc/ref.c
+++ b/net/tipc/ref.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/ref.c: TIPC object registry code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2004-2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/ref.h b/net/tipc/ref.h
index 08ba83fd5611..12dd827be304 100644
--- a/net/tipc/ref.h
+++ b/net/tipc/ref.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/ref.h: Include file for TIPC object registry code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index e0b429c20baf..cb90d8a99882 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/socket.c: TIPC socket API
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2004-2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index b52959fca5e6..728f9f073a50 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/subscr.c: TIPC subscription service
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index 6b91ca9e2419..ccff4efcb755 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/subscr.h: Include file for TIPC subscription service
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/user_reg.c b/net/tipc/user_reg.c
index 64165ec46043..37b73e3ba071 100644
--- a/net/tipc/user_reg.c
+++ b/net/tipc/user_reg.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/user_reg.c: TIPC user registry code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2004-2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/user_reg.h b/net/tipc/user_reg.h
index 0f96004ebdcc..3a55c755f1fd 100644
--- a/net/tipc/user_reg.h
+++ b/net/tipc/user_reg.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/user_reg.h: Include file for TIPC user registry code
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/zone.c b/net/tipc/zone.c
index b47deecb73f5..cb5528699a64 100644
--- a/net/tipc/zone.c
+++ b/net/tipc/zone.c
@@ -1,9 +1,8 @@
 /*
  * net/tipc/zone.c: TIPC zone management routines
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/zone.h b/net/tipc/zone.h
index e05cc6102043..2fc5ed71b6dd 100644
--- a/net/tipc/zone.h
+++ b/net/tipc/zone.h
@@ -1,9 +1,8 @@
 /*
  * net/tipc/zone.h: Include file for TIPC zone management routines
  * 
- * Copyright (c) 2003-2005, Ericsson Research Canada
+ * Copyright (c) 2003-2006, Ericsson AB
  * Copyright (c) 2005, Wind River Systems
- * Copyright (c) 2005-2006, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
-- 
cgit v1.2.3-71-gd317


From 2e4e6a17af35be359cc8f1c924f8f198fbd478cc Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Thu, 12 Jan 2006 13:30:04 -0800
Subject: [NETFILTER] x_tables: Abstraction layer for {ip,ip6,arp}_tables

This monster-patch tries to do the best job for unifying the data
structures and backend interfaces for the three evil clones ip_tables,
ip6_tables and arp_tables.  In an ideal world we would never have
allowed this kind of copy+paste programming... but well, our world
isn't (yet?) ideal.

o introduce a new x_tables module
o {ip,arp,ip6}_tables depend on this x_tables module
o registration functions for tables, matches and targets are only
  wrappers around x_tables provided functions
o all matches/targets that are used from ip_tables and ip6_tables
  are now implemented as xt_FOOBAR.c files and provide module aliases
  to ipt_FOOBAR and ip6t_FOOBAR
o header files for xt_matches are in include/linux/netfilter/,
  include/linux/netfilter_{ipv4,ipv6} contains compatibility wrappers
  around the xt_FOOBAR.h headers

Based on this patchset we're going to further unify the code,
gradually getting rid of all the layer 3 specific assumptions.

Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nf_conntrack_common.h  |   3 +
 include/linux/netfilter/x_tables.h             | 224 +++++++
 include/linux/netfilter/xt_CLASSIFY.h          |   8 +
 include/linux/netfilter/xt_CONNMARK.h          |  25 +
 include/linux/netfilter/xt_MARK.h              |  21 +
 include/linux/netfilter/xt_NFQUEUE.h           |  16 +
 include/linux/netfilter/xt_comment.h           |  10 +
 include/linux/netfilter/xt_connbytes.h         |  25 +
 include/linux/netfilter/xt_connmark.h          |  18 +
 include/linux/netfilter/xt_conntrack.h         |  63 ++
 include/linux/netfilter/xt_dccp.h              |  23 +
 include/linux/netfilter/xt_helper.h            |   8 +
 include/linux/netfilter/xt_length.h            |   9 +
 include/linux/netfilter/xt_limit.h             |  21 +
 include/linux/netfilter/xt_mac.h               |   8 +
 include/linux/netfilter/xt_mark.h              |   9 +
 include/linux/netfilter/xt_physdev.h           |  24 +
 include/linux/netfilter/xt_pkttype.h           |   8 +
 include/linux/netfilter/xt_realm.h             |  10 +
 include/linux/netfilter/xt_sctp.h              | 107 ++++
 include/linux/netfilter/xt_state.h             |  13 +
 include/linux/netfilter/xt_string.h            |  18 +
 include/linux/netfilter/xt_tcpmss.h            |   9 +
 include/linux/netfilter/xt_tcpudp.h            |  36 ++
 include/linux/netfilter_arp/arp_tables.h       | 123 +---
 include/linux/netfilter_ipv4/ip_conntrack.h    |   3 -
 include/linux/netfilter_ipv4/ip_tables.h       | 217 ++-----
 include/linux/netfilter_ipv4/ipt_CLASSIFY.h    |   5 +-
 include/linux/netfilter_ipv4/ipt_CONNMARK.h    |  16 +-
 include/linux/netfilter_ipv4/ipt_MARK.h        |  22 +-
 include/linux/netfilter_ipv4/ipt_NFQUEUE.h     |   8 +-
 include/linux/netfilter_ipv4/ipt_comment.h     |   8 +-
 include/linux/netfilter_ipv4/ipt_connbytes.h   |  31 +-
 include/linux/netfilter_ipv4/ipt_connmark.h    |  15 +-
 include/linux/netfilter_ipv4/ipt_conntrack.h   |  66 +-
 include/linux/netfilter_ipv4/ipt_dccp.h        |  22 +-
 include/linux/netfilter_ipv4/ipt_helper.h      |   7 +-
 include/linux/netfilter_ipv4/ipt_length.h      |   6 +-
 include/linux/netfilter_ipv4/ipt_limit.h       |  19 +-
 include/linux/netfilter_ipv4/ipt_mac.h         |   7 +-
 include/linux/netfilter_ipv4/ipt_mark.h        |   8 +-
 include/linux/netfilter_ipv4/ipt_physdev.h     |  27 +-
 include/linux/netfilter_ipv4/ipt_pkttype.h     |   7 +-
 include/linux/netfilter_ipv4/ipt_realm.h       |   7 +-
 include/linux/netfilter_ipv4/ipt_state.h       |  16 +-
 include/linux/netfilter_ipv4/ipt_string.h      |  16 +-
 include/linux/netfilter_ipv4/ipt_tcpmss.h      |   6 +-
 include/linux/netfilter_ipv6/ip6_tables.h      | 208 ++----
 include/linux/netfilter_ipv6/ip6t_MARK.h       |   9 +-
 include/linux/netfilter_ipv6/ip6t_length.h     |   6 +-
 include/linux/netfilter_ipv6/ip6t_limit.h      |  21 +-
 include/linux/netfilter_ipv6/ip6t_mac.h        |   9 +-
 include/linux/netfilter_ipv6/ip6t_mark.h       |   8 +-
 include/linux/netfilter_ipv6/ip6t_physdev.h    |  27 +-
 include/net/netfilter/ipv4/nf_conntrack_ipv4.h |   3 -
 include/net/netfilter/nf_conntrack.h           |   3 -
 net/bridge/netfilter/ebt_log.c                 |   1 +
 net/ipv4/netfilter/Kconfig                     | 250 +-------
 net/ipv4/netfilter/Makefile                    |  21 -
 net/ipv4/netfilter/arp_tables.c                | 444 +++----------
 net/ipv4/netfilter/arpt_mangle.c               |   7 +-
 net/ipv4/netfilter/arptable_filter.c           |   1 +
 net/ipv4/netfilter/ip_conntrack_standalone.c   |   4 +-
 net/ipv4/netfilter/ip_nat_rule.c               |   5 +-
 net/ipv4/netfilter/ip_nat_standalone.c         |   2 +-
 net/ipv4/netfilter/ip_tables.c                 | 842 +++----------------------
 net/ipv4/netfilter/ipt_CLASSIFY.c              |  90 ---
 net/ipv4/netfilter/ipt_CLUSTERIP.c             |   3 +-
 net/ipv4/netfilter/ipt_CONNMARK.c              | 122 ----
 net/ipv4/netfilter/ipt_DSCP.c                  |   2 +-
 net/ipv4/netfilter/ipt_ECN.c                   |   3 +-
 net/ipv4/netfilter/ipt_LOG.c                   |   2 +-
 net/ipv4/netfilter/ipt_MARK.c                  | 172 -----
 net/ipv4/netfilter/ipt_MASQUERADE.c            |   2 +-
 net/ipv4/netfilter/ipt_NETMAP.c                |   2 +-
 net/ipv4/netfilter/ipt_NFQUEUE.c               |  70 --
 net/ipv4/netfilter/ipt_NOTRACK.c               |  76 ---
 net/ipv4/netfilter/ipt_REDIRECT.c              |   2 +-
 net/ipv4/netfilter/ipt_REJECT.c                |   3 +-
 net/ipv4/netfilter/ipt_SAME.c                  |   2 +-
 net/ipv4/netfilter/ipt_TCPMSS.c                |   3 +-
 net/ipv4/netfilter/ipt_TOS.c                   |   2 +-
 net/ipv4/netfilter/ipt_TTL.c                   |   2 +-
 net/ipv4/netfilter/ipt_ULOG.c                  |   2 +-
 net/ipv4/netfilter/ipt_addrtype.c              |   4 +-
 net/ipv4/netfilter/ipt_ah.c                    |   6 +-
 net/ipv4/netfilter/ipt_comment.c               |  59 --
 net/ipv4/netfilter/ipt_connbytes.c             | 161 -----
 net/ipv4/netfilter/ipt_connmark.c              |  88 ---
 net/ipv4/netfilter/ipt_conntrack.c             | 232 -------
 net/ipv4/netfilter/ipt_dccp.c                  | 176 ------
 net/ipv4/netfilter/ipt_dscp.c                  |   4 +-
 net/ipv4/netfilter/ipt_ecn.c                   |   5 +-
 net/ipv4/netfilter/ipt_esp.c                   |   6 +-
 net/ipv4/netfilter/ipt_hashlimit.c             |   3 +-
 net/ipv4/netfilter/ipt_helper.c                | 168 -----
 net/ipv4/netfilter/ipt_iprange.c               |   4 +-
 net/ipv4/netfilter/ipt_length.c                |  64 --
 net/ipv4/netfilter/ipt_limit.c                 | 157 -----
 net/ipv4/netfilter/ipt_mac.c                   |  80 ---
 net/ipv4/netfilter/ipt_mark.c                  |  71 ---
 net/ipv4/netfilter/ipt_multiport.c             |  10 +-
 net/ipv4/netfilter/ipt_owner.c                 |   3 +-
 net/ipv4/netfilter/ipt_physdev.c               | 135 ----
 net/ipv4/netfilter/ipt_pkttype.c               |  70 --
 net/ipv4/netfilter/ipt_realm.c                 |  76 ---
 net/ipv4/netfilter/ipt_recent.c                |   6 +-
 net/ipv4/netfilter/ipt_sctp.c                  | 203 ------
 net/ipv4/netfilter/ipt_state.c                 |  74 ---
 net/ipv4/netfilter/ipt_string.c                |  91 ---
 net/ipv4/netfilter/ipt_tcpmss.c                | 127 ----
 net/ipv4/netfilter/ipt_tos.c                   |   3 +-
 net/ipv4/netfilter/ipt_ttl.c                   |   4 +-
 net/ipv4/netfilter/iptable_filter.c            |   3 +-
 net/ipv4/netfilter/iptable_mangle.c            |   1 +
 net/ipv4/netfilter/iptable_raw.c               |   3 +-
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |   7 +-
 net/ipv6/netfilter/Kconfig                     |  72 +--
 net/ipv6/netfilter/Makefile                    |   6 -
 net/ipv6/netfilter/ip6_tables.c                | 828 +++---------------------
 net/ipv6/netfilter/ip6t_HL.c                   |   2 +-
 net/ipv6/netfilter/ip6t_LOG.c                  |   2 +-
 net/ipv6/netfilter/ip6t_MARK.c                 |  81 ---
 net/ipv6/netfilter/ip6t_NFQUEUE.c              |  70 --
 net/ipv6/netfilter/ip6t_REJECT.c               |   3 +-
 net/ipv6/netfilter/ip6t_ah.c                   |   2 +-
 net/ipv6/netfilter/ip6t_dst.c                  |   2 +-
 net/ipv6/netfilter/ip6t_esp.c                  |   2 +-
 net/ipv6/netfilter/ip6t_eui64.c                |   2 +-
 net/ipv6/netfilter/ip6t_frag.c                 |   2 +-
 net/ipv6/netfilter/ip6t_hbh.c                  |   2 +-
 net/ipv6/netfilter/ip6t_hl.c                   |   2 +-
 net/ipv6/netfilter/ip6t_ipv6header.c           |   2 +-
 net/ipv6/netfilter/ip6t_length.c               |  66 --
 net/ipv6/netfilter/ip6t_limit.c                | 147 -----
 net/ipv6/netfilter/ip6t_mac.c                  |  81 ---
 net/ipv6/netfilter/ip6t_mark.c                 |  66 --
 net/ipv6/netfilter/ip6t_multiport.c            |   3 +-
 net/ipv6/netfilter/ip6t_owner.c                |   2 +-
 net/ipv6/netfilter/ip6t_physdev.c              | 135 ----
 net/ipv6/netfilter/ip6t_rt.c                   |   2 +-
 net/ipv6/netfilter/ip6table_filter.c           |   1 +
 net/ipv6/netfilter/ip6table_mangle.c           |   1 +
 net/ipv6/netfilter/ip6table_raw.c              |   5 +-
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |   8 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c        |  45 +-
 net/netfilter/Kconfig                          | 258 ++++++++
 net/netfilter/Makefile                         |  37 +-
 net/netfilter/nf_conntrack_standalone.c        |   4 +-
 net/netfilter/x_tables.c                       | 624 ++++++++++++++++++
 net/netfilter/xt_CLASSIFY.c                    | 109 ++++
 net/netfilter/xt_CONNMARK.c                    | 141 +++++
 net/netfilter/xt_MARK.c                        | 191 ++++++
 net/netfilter/xt_NFQUEUE.c                     | 107 ++++
 net/netfilter/xt_NOTRACK.c                     |  92 +++
 net/netfilter/xt_comment.c                     |  80 +++
 net/netfilter/xt_connbytes.c                   | 180 ++++++
 net/netfilter/xt_connmark.c                    | 109 ++++
 net/netfilter/xt_conntrack.c                   | 238 +++++++
 net/netfilter/xt_dccp.c                        | 221 +++++++
 net/netfilter/xt_helper.c                      | 188 ++++++
 net/netfilter/xt_length.c                      |  98 +++
 net/netfilter/xt_limit.c                       | 175 +++++
 net/netfilter/xt_mac.c                         | 100 +++
 net/netfilter/xt_mark.c                        |  91 +++
 net/netfilter/xt_physdev.c                     | 155 +++++
 net/netfilter/xt_pkttype.c                     |  82 +++
 net/netfilter/xt_realm.c                       |  79 +++
 net/netfilter/xt_sctp.c                        | 250 ++++++++
 net/netfilter/xt_state.c                       |  96 +++
 net/netfilter/xt_string.c                      | 111 ++++
 net/netfilter/xt_tcpmss.c                      | 172 +++++
 net/netfilter/xt_tcpudp.c                      | 333 ++++++++++
 net/sched/act_ipt.c                            |   2 +-
 174 files changed, 5672 insertions(+), 6206 deletions(-)
 create mode 100644 include/linux/netfilter/x_tables.h
 create mode 100644 include/linux/netfilter/xt_CLASSIFY.h
 create mode 100644 include/linux/netfilter/xt_CONNMARK.h
 create mode 100644 include/linux/netfilter/xt_MARK.h
 create mode 100644 include/linux/netfilter/xt_NFQUEUE.h
 create mode 100644 include/linux/netfilter/xt_comment.h
 create mode 100644 include/linux/netfilter/xt_connbytes.h
 create mode 100644 include/linux/netfilter/xt_connmark.h
 create mode 100644 include/linux/netfilter/xt_conntrack.h
 create mode 100644 include/linux/netfilter/xt_dccp.h
 create mode 100644 include/linux/netfilter/xt_helper.h
 create mode 100644 include/linux/netfilter/xt_length.h
 create mode 100644 include/linux/netfilter/xt_limit.h
 create mode 100644 include/linux/netfilter/xt_mac.h
 create mode 100644 include/linux/netfilter/xt_mark.h
 create mode 100644 include/linux/netfilter/xt_physdev.h
 create mode 100644 include/linux/netfilter/xt_pkttype.h
 create mode 100644 include/linux/netfilter/xt_realm.h
 create mode 100644 include/linux/netfilter/xt_sctp.h
 create mode 100644 include/linux/netfilter/xt_state.h
 create mode 100644 include/linux/netfilter/xt_string.h
 create mode 100644 include/linux/netfilter/xt_tcpmss.h
 create mode 100644 include/linux/netfilter/xt_tcpudp.h
 delete mode 100644 net/ipv4/netfilter/ipt_CLASSIFY.c
 delete mode 100644 net/ipv4/netfilter/ipt_CONNMARK.c
 delete mode 100644 net/ipv4/netfilter/ipt_MARK.c
 delete mode 100644 net/ipv4/netfilter/ipt_NFQUEUE.c
 delete mode 100644 net/ipv4/netfilter/ipt_NOTRACK.c
 delete mode 100644 net/ipv4/netfilter/ipt_comment.c
 delete mode 100644 net/ipv4/netfilter/ipt_connbytes.c
 delete mode 100644 net/ipv4/netfilter/ipt_connmark.c
 delete mode 100644 net/ipv4/netfilter/ipt_conntrack.c
 delete mode 100644 net/ipv4/netfilter/ipt_dccp.c
 delete mode 100644 net/ipv4/netfilter/ipt_helper.c
 delete mode 100644 net/ipv4/netfilter/ipt_length.c
 delete mode 100644 net/ipv4/netfilter/ipt_limit.c
 delete mode 100644 net/ipv4/netfilter/ipt_mac.c
 delete mode 100644 net/ipv4/netfilter/ipt_mark.c
 delete mode 100644 net/ipv4/netfilter/ipt_physdev.c
 delete mode 100644 net/ipv4/netfilter/ipt_pkttype.c
 delete mode 100644 net/ipv4/netfilter/ipt_realm.c
 delete mode 100644 net/ipv4/netfilter/ipt_sctp.c
 delete mode 100644 net/ipv4/netfilter/ipt_state.c
 delete mode 100644 net/ipv4/netfilter/ipt_string.c
 delete mode 100644 net/ipv4/netfilter/ipt_tcpmss.c
 delete mode 100644 net/ipv6/netfilter/ip6t_MARK.c
 delete mode 100644 net/ipv6/netfilter/ip6t_NFQUEUE.c
 delete mode 100644 net/ipv6/netfilter/ip6t_length.c
 delete mode 100644 net/ipv6/netfilter/ip6t_limit.c
 delete mode 100644 net/ipv6/netfilter/ip6t_mac.c
 delete mode 100644 net/ipv6/netfilter/ip6t_mark.c
 delete mode 100644 net/ipv6/netfilter/ip6t_physdev.c
 create mode 100644 net/netfilter/x_tables.c
 create mode 100644 net/netfilter/xt_CLASSIFY.c
 create mode 100644 net/netfilter/xt_CONNMARK.c
 create mode 100644 net/netfilter/xt_MARK.c
 create mode 100644 net/netfilter/xt_NFQUEUE.c
 create mode 100644 net/netfilter/xt_NOTRACK.c
 create mode 100644 net/netfilter/xt_comment.c
 create mode 100644 net/netfilter/xt_connbytes.c
 create mode 100644 net/netfilter/xt_connmark.c
 create mode 100644 net/netfilter/xt_conntrack.c
 create mode 100644 net/netfilter/xt_dccp.c
 create mode 100644 net/netfilter/xt_helper.c
 create mode 100644 net/netfilter/xt_length.c
 create mode 100644 net/netfilter/xt_limit.c
 create mode 100644 net/netfilter/xt_mac.c
 create mode 100644 net/netfilter/xt_mark.c
 create mode 100644 net/netfilter/xt_physdev.c
 create mode 100644 net/netfilter/xt_pkttype.c
 create mode 100644 net/netfilter/xt_realm.c
 create mode 100644 net/netfilter/xt_sctp.c
 create mode 100644 net/netfilter/xt_state.c
 create mode 100644 net/netfilter/xt_string.c
 create mode 100644 net/netfilter/xt_tcpmss.c
 create mode 100644 net/netfilter/xt_tcpudp.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
index 6d39b518486b..3ff88c878308 100644
--- a/include/linux/netfilter/nf_conntrack_common.h
+++ b/include/linux/netfilter/nf_conntrack_common.h
@@ -154,6 +154,9 @@ struct ip_conntrack_stat
 	unsigned int expect_delete;
 };
 
+/* call to create an explicit dependency on nf_conntrack. */
+extern void need_conntrack(void);
+
 #endif /* __KERNEL__ */
 
 #endif /* _NF_CONNTRACK_COMMON_H */
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
new file mode 100644
index 000000000000..472f04834809
--- /dev/null
+++ b/include/linux/netfilter/x_tables.h
@@ -0,0 +1,224 @@
+#ifndef _X_TABLES_H
+#define _X_TABLES_H
+
+#define XT_FUNCTION_MAXNAMELEN 30
+#define XT_TABLE_MAXNAMELEN 32
+
+/* The argument to IPT_SO_GET_REVISION_*.  Returns highest revision
+ * kernel supports, if >= revision. */
+struct xt_get_revision
+{
+	char name[XT_FUNCTION_MAXNAMELEN-1];
+
+	u_int8_t revision;
+};
+
+/* CONTINUE verdict for targets */
+#define XT_CONTINUE 0xFFFFFFFF
+
+/* For standard target */
+#define XT_RETURN (-NF_REPEAT - 1)
+
+#define XT_ALIGN(s) (((s) + (__alignof__(void *)-1)) & ~(__alignof__(void *)-1))
+
+/* Standard return verdict, or do jump. */
+#define XT_STANDARD_TARGET ""
+/* Error verdict. */
+#define XT_ERROR_TARGET "ERROR"
+
+/*
+ * New IP firewall options for [gs]etsockopt at the RAW IP level.
+ * Unlike BSD Linux inherits IP options so you don't have to use a raw
+ * socket for this. Instead we check rights in the calls. */
+#define XT_BASE_CTL		64	/* base for firewall socket options */
+
+#define XT_SO_SET_REPLACE	(XT_BASE_CTL)
+#define XT_SO_SET_ADD_COUNTERS	(XT_BASE_CTL + 1)
+#define XT_SO_SET_MAX		XT_SO_SET_ADD_COUNTERS
+
+#define XT_SO_GET_INFO			(XT_BASE_CTL)
+#define XT_SO_GET_ENTRIES		(XT_BASE_CTL + 1)
+#define XT_SO_GET_REVISION_MATCH	(XT_BASE_CTL + 2)
+#define XT_SO_GET_REVISION_TARGET	(XT_BASE_CTL + 3)
+#define XT_SO_GET_MAX			XT_SO_GET_REVISION_TARGET
+
+#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
+#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
+
+struct xt_counters
+{
+	u_int64_t pcnt, bcnt;			/* Packet and byte counters */
+};
+
+/* The argument to IPT_SO_ADD_COUNTERS. */
+struct xt_counters_info
+{
+	/* Which table. */
+	char name[XT_TABLE_MAXNAMELEN];
+
+	unsigned int num_counters;
+
+	/* The counters (actually `number' of these). */
+	struct xt_counters counters[0];
+};
+
+#define XT_INV_PROTO		0x40	/* Invert the sense of PROTO. */
+
+#ifdef __KERNEL__
+
+#include <linux/netdevice.h>
+
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
+#include <linux/netfilter_ipv4/listhelp.h>
+
+struct xt_match
+{
+	struct list_head list;
+
+	const char name[XT_FUNCTION_MAXNAMELEN-1];
+
+	u_int8_t revision;
+
+	/* Return true or false: return FALSE and set *hotdrop = 1 to
+           force immediate packet drop. */
+	/* Arguments changed since 2.6.9, as this must now handle
+	   non-linear skb, using skb_header_pointer and
+	   skb_ip_make_writable. */
+	int (*match)(const struct sk_buff *skb,
+		     const struct net_device *in,
+		     const struct net_device *out,
+		     const void *matchinfo,
+		     int offset,
+		     unsigned int protoff,
+		     int *hotdrop);
+
+	/* Called when user tries to insert an entry of this type. */
+	/* Should return true or false. */
+	int (*checkentry)(const char *tablename,
+			  const void *ip,
+			  void *matchinfo,
+			  unsigned int matchinfosize,
+			  unsigned int hook_mask);
+
+	/* Called when entry of this type deleted. */
+	void (*destroy)(void *matchinfo, unsigned int matchinfosize);
+
+	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
+	struct module *me;
+};
+
+/* Registration hooks for targets. */
+struct xt_target
+{
+	struct list_head list;
+
+	const char name[XT_FUNCTION_MAXNAMELEN-1];
+
+	u_int8_t revision;
+
+	/* Returns verdict. Argument order changed since 2.6.9, as this
+	   must now handle non-linear skbs, using skb_copy_bits and
+	   skb_ip_make_writable. */
+	unsigned int (*target)(struct sk_buff **pskb,
+			       const struct net_device *in,
+			       const struct net_device *out,
+			       unsigned int hooknum,
+			       const void *targinfo,
+			       void *userdata);
+
+	/* Called when user tries to insert an entry of this type:
+           hook_mask is a bitmask of hooks from which it can be
+           called. */
+	/* Should return true or false. */
+	int (*checkentry)(const char *tablename,
+			  const void *entry,
+			  void *targinfo,
+			  unsigned int targinfosize,
+			  unsigned int hook_mask);
+
+	/* Called when entry of this type deleted. */
+	void (*destroy)(void *targinfo, unsigned int targinfosize);
+
+	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
+	struct module *me;
+};
+
+/* Furniture shopping... */
+struct xt_table
+{
+	struct list_head list;
+
+	/* A unique name... */
+	char name[XT_TABLE_MAXNAMELEN];
+
+	/* What hooks you will enter on */
+	unsigned int valid_hooks;
+
+	/* Lock for the curtain */
+	rwlock_t lock;
+
+	/* Man behind the curtain... */
+	//struct ip6t_table_info *private;
+	void *private;
+
+	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
+	struct module *me;
+
+	int af;		/* address/protocol family */
+};
+
+#include <linux/netfilter_ipv4.h>
+
+/* The table itself */
+struct xt_table_info
+{
+	/* Size per table */
+	unsigned int size;
+	/* Number of entries: FIXME. --RR */
+	unsigned int number;
+	/* Initial number of entries. Needed for module usage count */
+	unsigned int initial_entries;
+
+	/* Entry points and underflows */
+	unsigned int hook_entry[NF_IP_NUMHOOKS];
+	unsigned int underflow[NF_IP_NUMHOOKS];
+
+	/* ipt_entry tables: one per CPU */
+	char *entries[NR_CPUS];
+};
+
+extern int xt_register_target(int af, struct xt_target *target);
+extern void xt_unregister_target(int af, struct xt_target *target);
+extern int xt_register_match(int af, struct xt_match *target);
+extern void xt_unregister_match(int af, struct xt_match *target);
+
+extern int xt_register_table(struct xt_table *table,
+			     struct xt_table_info *bootstrap,
+			     struct xt_table_info *newinfo);
+extern void *xt_unregister_table(struct xt_table *table);
+
+extern struct xt_table_info *xt_replace_table(struct xt_table *table,
+					      unsigned int num_counters,
+					      struct xt_table_info *newinfo,
+					      int *error);
+
+extern struct xt_match *xt_find_match(int af, const char *name, u8 revision);
+extern struct xt_target *xt_find_target(int af, const char *name, u8 revision);
+extern struct xt_target *xt_request_find_target(int af, const char *name, 
+						u8 revision);
+extern int xt_find_revision(int af, const char *name, u8 revision, int target,
+			    int *err);
+
+extern struct xt_table *xt_find_table_lock(int af, const char *name);
+extern void xt_table_unlock(struct xt_table *t);
+
+extern int xt_proto_init(int af);
+extern void xt_proto_fini(int af);
+
+extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
+extern void xt_free_table_info(struct xt_table_info *info);
+
+#endif /* __KERNEL__ */
+
+#endif /* _X_TABLES_H */
diff --git a/include/linux/netfilter/xt_CLASSIFY.h b/include/linux/netfilter/xt_CLASSIFY.h
new file mode 100644
index 000000000000..58111355255d
--- /dev/null
+++ b/include/linux/netfilter/xt_CLASSIFY.h
@@ -0,0 +1,8 @@
+#ifndef _XT_CLASSIFY_H
+#define _XT_CLASSIFY_H
+
+struct xt_classify_target_info {
+	u_int32_t priority;
+};
+
+#endif /*_XT_CLASSIFY_H */
diff --git a/include/linux/netfilter/xt_CONNMARK.h b/include/linux/netfilter/xt_CONNMARK.h
new file mode 100644
index 000000000000..9f744689fffc
--- /dev/null
+++ b/include/linux/netfilter/xt_CONNMARK.h
@@ -0,0 +1,25 @@
+#ifndef _XT_CONNMARK_H_target
+#define _XT_CONNMARK_H_target
+
+/* Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ * by Henrik Nordstrom <hno@marasystems.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+enum {
+	XT_CONNMARK_SET = 0,
+	XT_CONNMARK_SAVE,
+	XT_CONNMARK_RESTORE
+};
+
+struct xt_connmark_target_info {
+	unsigned long mark;
+	unsigned long mask;
+	u_int8_t mode;
+};
+
+#endif /*_XT_CONNMARK_H_target*/
diff --git a/include/linux/netfilter/xt_MARK.h b/include/linux/netfilter/xt_MARK.h
new file mode 100644
index 000000000000..b021e93ee5d6
--- /dev/null
+++ b/include/linux/netfilter/xt_MARK.h
@@ -0,0 +1,21 @@
+#ifndef _XT_MARK_H_target
+#define _XT_MARK_H_target
+
+/* Version 0 */
+struct xt_mark_target_info {
+	unsigned long mark;
+};
+
+/* Version 1 */
+enum {
+	XT_MARK_SET=0,
+	XT_MARK_AND,
+	XT_MARK_OR,
+};
+
+struct xt_mark_target_info_v1 {
+	unsigned long mark;
+	u_int8_t mode;
+};
+
+#endif /*_XT_MARK_H_target */
diff --git a/include/linux/netfilter/xt_NFQUEUE.h b/include/linux/netfilter/xt_NFQUEUE.h
new file mode 100644
index 000000000000..9a9af79f74d2
--- /dev/null
+++ b/include/linux/netfilter/xt_NFQUEUE.h
@@ -0,0 +1,16 @@
+/* iptables module for using NFQUEUE mechanism
+ *
+ * (C) 2005 Harald Welte <laforge@netfilter.org>
+ *
+ * This software is distributed under GNU GPL v2, 1991
+ * 
+*/
+#ifndef _XT_NFQ_TARGET_H
+#define _XT_NFQ_TARGET_H
+
+/* target info */
+struct xt_NFQ_info {
+	u_int16_t queuenum;
+};
+
+#endif /* _XT_NFQ_TARGET_H */
diff --git a/include/linux/netfilter/xt_comment.h b/include/linux/netfilter/xt_comment.h
new file mode 100644
index 000000000000..eacfedc6b5d0
--- /dev/null
+++ b/include/linux/netfilter/xt_comment.h
@@ -0,0 +1,10 @@
+#ifndef _XT_COMMENT_H
+#define _XT_COMMENT_H
+
+#define XT_MAX_COMMENT_LEN 256
+
+struct xt_comment_info {
+	unsigned char comment[XT_MAX_COMMENT_LEN];
+};
+
+#endif /* XT_COMMENT_H */
diff --git a/include/linux/netfilter/xt_connbytes.h b/include/linux/netfilter/xt_connbytes.h
new file mode 100644
index 000000000000..c022c989754d
--- /dev/null
+++ b/include/linux/netfilter/xt_connbytes.h
@@ -0,0 +1,25 @@
+#ifndef _XT_CONNBYTES_H
+#define _XT_CONNBYTES_H
+
+enum xt_connbytes_what {
+	XT_CONNBYTES_PKTS,
+	XT_CONNBYTES_BYTES,
+	XT_CONNBYTES_AVGPKT,
+};
+
+enum xt_connbytes_direction {
+	XT_CONNBYTES_DIR_ORIGINAL,
+	XT_CONNBYTES_DIR_REPLY,
+	XT_CONNBYTES_DIR_BOTH,
+};
+
+struct xt_connbytes_info
+{
+	struct {
+		aligned_u64 from;	/* count to be matched */
+		aligned_u64 to;		/* count to be matched */
+	} count;
+	u_int8_t what;		/* ipt_connbytes_what */
+	u_int8_t direction;	/* ipt_connbytes_direction */
+};
+#endif
diff --git a/include/linux/netfilter/xt_connmark.h b/include/linux/netfilter/xt_connmark.h
new file mode 100644
index 000000000000..c592f6ae0883
--- /dev/null
+++ b/include/linux/netfilter/xt_connmark.h
@@ -0,0 +1,18 @@
+#ifndef _XT_CONNMARK_H
+#define _XT_CONNMARK_H
+
+/* Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ * by Henrik Nordstrom <hno@marasystems.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+struct xt_connmark_info {
+	unsigned long mark, mask;
+	u_int8_t invert;
+};
+
+#endif /*_XT_CONNMARK_H*/
diff --git a/include/linux/netfilter/xt_conntrack.h b/include/linux/netfilter/xt_conntrack.h
new file mode 100644
index 000000000000..34f63cf2e293
--- /dev/null
+++ b/include/linux/netfilter/xt_conntrack.h
@@ -0,0 +1,63 @@
+/* Header file for kernel module to match connection tracking information.
+ * GPL (C) 2001  Marc Boucher (marc@mbsi.ca).
+ */
+
+#ifndef _XT_CONNTRACK_H
+#define _XT_CONNTRACK_H
+
+#include <linux/netfilter/nf_conntrack_tuple_common.h>
+#include <linux/in.h>
+
+#define XT_CONNTRACK_STATE_BIT(ctinfo) (1 << ((ctinfo)%IP_CT_IS_REPLY+1))
+#define XT_CONNTRACK_STATE_INVALID (1 << 0)
+
+#define XT_CONNTRACK_STATE_SNAT (1 << (IP_CT_NUMBER + 1))
+#define XT_CONNTRACK_STATE_DNAT (1 << (IP_CT_NUMBER + 2))
+#define XT_CONNTRACK_STATE_UNTRACKED (1 << (IP_CT_NUMBER + 3))
+
+/* flags, invflags: */
+#define XT_CONNTRACK_STATE	0x01
+#define XT_CONNTRACK_PROTO	0x02
+#define XT_CONNTRACK_ORIGSRC	0x04
+#define XT_CONNTRACK_ORIGDST	0x08
+#define XT_CONNTRACK_REPLSRC	0x10
+#define XT_CONNTRACK_REPLDST	0x20
+#define XT_CONNTRACK_STATUS	0x40
+#define XT_CONNTRACK_EXPIRES	0x80
+
+/* This is exposed to userspace, so remains frozen in time. */
+struct ip_conntrack_old_tuple
+{
+	struct {
+		__u32 ip;
+		union {
+			__u16 all;
+		} u;
+	} src;
+
+	struct {
+		__u32 ip;
+		union {
+			__u16 all;
+		} u;
+
+		/* The protocol. */
+		u16 protonum;
+	} dst;
+};
+
+struct xt_conntrack_info
+{
+	unsigned int statemask, statusmask;
+
+	struct ip_conntrack_old_tuple tuple[IP_CT_DIR_MAX];
+	struct in_addr sipmsk[IP_CT_DIR_MAX], dipmsk[IP_CT_DIR_MAX];
+
+	unsigned long expires_min, expires_max;
+
+	/* Flags word */
+	u_int8_t flags;
+	/* Inverse flags */
+	u_int8_t invflags;
+};
+#endif /*_XT_CONNTRACK_H*/
diff --git a/include/linux/netfilter/xt_dccp.h b/include/linux/netfilter/xt_dccp.h
new file mode 100644
index 000000000000..e0221b9d32cb
--- /dev/null
+++ b/include/linux/netfilter/xt_dccp.h
@@ -0,0 +1,23 @@
+#ifndef _XT_DCCP_H_
+#define _XT_DCCP_H_
+
+#define XT_DCCP_SRC_PORTS	        0x01
+#define XT_DCCP_DEST_PORTS	        0x02
+#define XT_DCCP_TYPE			0x04
+#define XT_DCCP_OPTION			0x08
+
+#define XT_DCCP_VALID_FLAGS		0x0f
+
+struct xt_dccp_info {
+	u_int16_t dpts[2];  /* Min, Max */
+	u_int16_t spts[2];  /* Min, Max */
+
+	u_int16_t flags;
+	u_int16_t invflags;
+
+	u_int16_t typemask;
+	u_int8_t option;
+};
+
+#endif /* _XT_DCCP_H_ */
+
diff --git a/include/linux/netfilter/xt_helper.h b/include/linux/netfilter/xt_helper.h
new file mode 100644
index 000000000000..6b42763f999d
--- /dev/null
+++ b/include/linux/netfilter/xt_helper.h
@@ -0,0 +1,8 @@
+#ifndef _XT_HELPER_H
+#define _XT_HELPER_H
+
+struct xt_helper_info {
+	int invert;
+	char name[30];
+};
+#endif /* _XT_HELPER_H */
diff --git a/include/linux/netfilter/xt_length.h b/include/linux/netfilter/xt_length.h
new file mode 100644
index 000000000000..7c2b439f73fe
--- /dev/null
+++ b/include/linux/netfilter/xt_length.h
@@ -0,0 +1,9 @@
+#ifndef _XT_LENGTH_H
+#define _XT_LENGTH_H
+
+struct xt_length_info {
+    u_int16_t	min, max;
+    u_int8_t	invert;
+};
+
+#endif /*_XT_LENGTH_H*/
diff --git a/include/linux/netfilter/xt_limit.h b/include/linux/netfilter/xt_limit.h
new file mode 100644
index 000000000000..b3ce65375ecb
--- /dev/null
+++ b/include/linux/netfilter/xt_limit.h
@@ -0,0 +1,21 @@
+#ifndef _XT_RATE_H
+#define _XT_RATE_H
+
+/* timings are in milliseconds. */
+#define XT_LIMIT_SCALE 10000
+
+/* 1/10,000 sec period => max of 10,000/sec.  Min rate is then 429490
+   seconds, or one every 59 hours. */
+struct xt_rateinfo {
+	u_int32_t avg;    /* Average secs between packets * scale */
+	u_int32_t burst;  /* Period multiplier for upper limit. */
+
+	/* Used internally by the kernel */
+	unsigned long prev;
+	u_int32_t credit;
+	u_int32_t credit_cap, cost;
+
+	/* Ugly, ugly fucker. */
+	struct xt_rateinfo *master;
+};
+#endif /*_XT_RATE_H*/
diff --git a/include/linux/netfilter/xt_mac.h b/include/linux/netfilter/xt_mac.h
new file mode 100644
index 000000000000..b892cdc67e06
--- /dev/null
+++ b/include/linux/netfilter/xt_mac.h
@@ -0,0 +1,8 @@
+#ifndef _XT_MAC_H
+#define _XT_MAC_H
+
+struct xt_mac_info {
+    unsigned char srcaddr[ETH_ALEN];
+    int invert;
+};
+#endif /*_XT_MAC_H*/
diff --git a/include/linux/netfilter/xt_mark.h b/include/linux/netfilter/xt_mark.h
new file mode 100644
index 000000000000..802dd4842caf
--- /dev/null
+++ b/include/linux/netfilter/xt_mark.h
@@ -0,0 +1,9 @@
+#ifndef _XT_MARK_H
+#define _XT_MARK_H
+
+struct xt_mark_info {
+    unsigned long mark, mask;
+    u_int8_t invert;
+};
+
+#endif /*_XT_MARK_H*/
diff --git a/include/linux/netfilter/xt_physdev.h b/include/linux/netfilter/xt_physdev.h
new file mode 100644
index 000000000000..25a7a1815b5b
--- /dev/null
+++ b/include/linux/netfilter/xt_physdev.h
@@ -0,0 +1,24 @@
+#ifndef _XT_PHYSDEV_H
+#define _XT_PHYSDEV_H
+
+#ifdef __KERNEL__
+#include <linux/if.h>
+#endif
+
+#define XT_PHYSDEV_OP_IN		0x01
+#define XT_PHYSDEV_OP_OUT		0x02
+#define XT_PHYSDEV_OP_BRIDGED		0x04
+#define XT_PHYSDEV_OP_ISIN		0x08
+#define XT_PHYSDEV_OP_ISOUT		0x10
+#define XT_PHYSDEV_OP_MASK		(0x20 - 1)
+
+struct xt_physdev_info {
+	char physindev[IFNAMSIZ];
+	char in_mask[IFNAMSIZ];
+	char physoutdev[IFNAMSIZ];
+	char out_mask[IFNAMSIZ];
+	u_int8_t invert;
+	u_int8_t bitmask;
+};
+
+#endif /*_XT_PHYSDEV_H*/
diff --git a/include/linux/netfilter/xt_pkttype.h b/include/linux/netfilter/xt_pkttype.h
new file mode 100644
index 000000000000..f265cf52faea
--- /dev/null
+++ b/include/linux/netfilter/xt_pkttype.h
@@ -0,0 +1,8 @@
+#ifndef _XT_PKTTYPE_H
+#define _XT_PKTTYPE_H
+
+struct xt_pkttype_info {
+	int	pkttype;
+	int	invert;
+};
+#endif /*_XT_PKTTYPE_H*/
diff --git a/include/linux/netfilter/xt_realm.h b/include/linux/netfilter/xt_realm.h
new file mode 100644
index 000000000000..220e87245716
--- /dev/null
+++ b/include/linux/netfilter/xt_realm.h
@@ -0,0 +1,10 @@
+#ifndef _XT_REALM_H
+#define _XT_REALM_H
+
+struct xt_realm_info {
+	u_int32_t id;
+	u_int32_t mask;
+	u_int8_t invert;
+};
+
+#endif /* _XT_REALM_H */
diff --git a/include/linux/netfilter/xt_sctp.h b/include/linux/netfilter/xt_sctp.h
new file mode 100644
index 000000000000..b157897e7792
--- /dev/null
+++ b/include/linux/netfilter/xt_sctp.h
@@ -0,0 +1,107 @@
+#ifndef _XT_SCTP_H_
+#define _XT_SCTP_H_
+
+#define XT_SCTP_SRC_PORTS	        0x01
+#define XT_SCTP_DEST_PORTS	        0x02
+#define XT_SCTP_CHUNK_TYPES		0x04
+
+#define XT_SCTP_VALID_FLAGS		0x07
+
+#define ELEMCOUNT(x) (sizeof(x)/sizeof(x[0]))
+
+
+struct xt_sctp_flag_info {
+	u_int8_t chunktype;
+	u_int8_t flag;
+	u_int8_t flag_mask;
+};
+
+#define XT_NUM_SCTP_FLAGS	4
+
+struct xt_sctp_info {
+	u_int16_t dpts[2];  /* Min, Max */
+	u_int16_t spts[2];  /* Min, Max */
+
+	u_int32_t chunkmap[256 / sizeof (u_int32_t)];  /* Bit mask of chunks to be matched according to RFC 2960 */
+
+#define SCTP_CHUNK_MATCH_ANY   0x01  /* Match if any of the chunk types are present */
+#define SCTP_CHUNK_MATCH_ALL   0x02  /* Match if all of the chunk types are present */
+#define SCTP_CHUNK_MATCH_ONLY  0x04  /* Match if these are the only chunk types present */
+
+	u_int32_t chunk_match_type;
+	struct xt_sctp_flag_info flag_info[XT_NUM_SCTP_FLAGS];
+	int flag_count;
+
+	u_int32_t flags;
+	u_int32_t invflags;
+};
+
+#define bytes(type) (sizeof(type) * 8)
+
+#define SCTP_CHUNKMAP_SET(chunkmap, type) 		\
+	do { 						\
+		chunkmap[type / bytes(u_int32_t)] |= 	\
+			1 << (type % bytes(u_int32_t));	\
+	} while (0)
+
+#define SCTP_CHUNKMAP_CLEAR(chunkmap, type)		 	\
+	do {							\
+		chunkmap[type / bytes(u_int32_t)] &= 		\
+			~(1 << (type % bytes(u_int32_t)));	\
+	} while (0)
+
+#define SCTP_CHUNKMAP_IS_SET(chunkmap, type) 			\
+({								\
+	(chunkmap[type / bytes (u_int32_t)] & 			\
+		(1 << (type % bytes (u_int32_t)))) ? 1: 0;	\
+})
+
+#define SCTP_CHUNKMAP_RESET(chunkmap) 				\
+	do {							\
+		int i; 						\
+		for (i = 0; i < ELEMCOUNT(chunkmap); i++)	\
+			chunkmap[i] = 0;			\
+	} while (0)
+
+#define SCTP_CHUNKMAP_SET_ALL(chunkmap) 			\
+	do {							\
+		int i; 						\
+		for (i = 0; i < ELEMCOUNT(chunkmap); i++) 	\
+			chunkmap[i] = ~0;			\
+	} while (0)
+
+#define SCTP_CHUNKMAP_COPY(destmap, srcmap) 			\
+	do {							\
+		int i; 						\
+		for (i = 0; i < ELEMCOUNT(chunkmap); i++) 	\
+			destmap[i] = srcmap[i];			\
+	} while (0)
+
+#define SCTP_CHUNKMAP_IS_CLEAR(chunkmap) 		\
+({							\
+	int i; 						\
+	int flag = 1;					\
+	for (i = 0; i < ELEMCOUNT(chunkmap); i++) {	\
+		if (chunkmap[i]) {			\
+			flag = 0;			\
+			break;				\
+		}					\
+	}						\
+        flag;						\
+})
+
+#define SCTP_CHUNKMAP_IS_ALL_SET(chunkmap) 		\
+({							\
+	int i; 						\
+	int flag = 1;					\
+	for (i = 0; i < ELEMCOUNT(chunkmap); i++) {	\
+		if (chunkmap[i] != ~0) {		\
+			flag = 0;			\
+				break;			\
+		}					\
+	}						\
+        flag;						\
+})
+
+#endif /* _XT_SCTP_H_ */
+
diff --git a/include/linux/netfilter/xt_state.h b/include/linux/netfilter/xt_state.h
new file mode 100644
index 000000000000..c06f32edee07
--- /dev/null
+++ b/include/linux/netfilter/xt_state.h
@@ -0,0 +1,13 @@
+#ifndef _XT_STATE_H
+#define _XT_STATE_H
+
+#define XT_STATE_BIT(ctinfo) (1 << ((ctinfo)%IP_CT_IS_REPLY+1))
+#define XT_STATE_INVALID (1 << 0)
+
+#define XT_STATE_UNTRACKED (1 << (IP_CT_NUMBER + 1))
+
+struct xt_state_info
+{
+	unsigned int statemask;
+};
+#endif /*_XT_STATE_H*/
diff --git a/include/linux/netfilter/xt_string.h b/include/linux/netfilter/xt_string.h
new file mode 100644
index 000000000000..3b3419f2637d
--- /dev/null
+++ b/include/linux/netfilter/xt_string.h
@@ -0,0 +1,18 @@
+#ifndef _XT_STRING_H
+#define _XT_STRING_H
+
+#define XT_STRING_MAX_PATTERN_SIZE 128
+#define XT_STRING_MAX_ALGO_NAME_SIZE 16
+
+struct xt_string_info
+{
+	u_int16_t from_offset;
+	u_int16_t to_offset;
+	char	  algo[XT_STRING_MAX_ALGO_NAME_SIZE];
+	char 	  pattern[XT_STRING_MAX_PATTERN_SIZE];
+	u_int8_t  patlen;
+	u_int8_t  invert;
+	struct ts_config __attribute__((aligned(8))) *config;
+};
+
+#endif /*_XT_STRING_H*/
diff --git a/include/linux/netfilter/xt_tcpmss.h b/include/linux/netfilter/xt_tcpmss.h
new file mode 100644
index 000000000000..e03274c4c790
--- /dev/null
+++ b/include/linux/netfilter/xt_tcpmss.h
@@ -0,0 +1,9 @@
+#ifndef _XT_TCPMSS_MATCH_H
+#define _XT_TCPMSS_MATCH_H
+
+struct xt_tcpmss_match_info {
+    u_int16_t mss_min, mss_max;
+    u_int8_t invert;
+};
+
+#endif /*_XT_TCPMSS_MATCH_H*/
diff --git a/include/linux/netfilter/xt_tcpudp.h b/include/linux/netfilter/xt_tcpudp.h
new file mode 100644
index 000000000000..78bc65f11adf
--- /dev/null
+++ b/include/linux/netfilter/xt_tcpudp.h
@@ -0,0 +1,36 @@
+#ifndef _XT_TCPUDP_H
+#define _XT_TCPUDP_H
+
+/* TCP matching stuff */
+struct xt_tcp
+{
+	u_int16_t spts[2];			/* Source port range. */
+	u_int16_t dpts[2];			/* Destination port range. */
+	u_int8_t option;			/* TCP Option iff non-zero*/
+	u_int8_t flg_mask;			/* TCP flags mask byte */
+	u_int8_t flg_cmp;			/* TCP flags compare byte */
+	u_int8_t invflags;			/* Inverse flags */
+};
+
+/* Values for "inv" field in struct ipt_tcp. */
+#define XT_TCP_INV_SRCPT	0x01	/* Invert the sense of source ports. */
+#define XT_TCP_INV_DSTPT	0x02	/* Invert the sense of dest ports. */
+#define XT_TCP_INV_FLAGS	0x04	/* Invert the sense of TCP flags. */
+#define XT_TCP_INV_OPTION	0x08	/* Invert the sense of option test. */
+#define XT_TCP_INV_MASK		0x0F	/* All possible flags. */
+
+/* UDP matching stuff */
+struct xt_udp
+{
+	u_int16_t spts[2];			/* Source port range. */
+	u_int16_t dpts[2];			/* Destination port range. */
+	u_int8_t invflags;			/* Inverse flags */
+};
+
+/* Values for "invflags" field in struct ipt_udp. */
+#define XT_UDP_INV_SRCPT	0x01	/* Invert the sense of source ports. */
+#define XT_UDP_INV_DSTPT	0x02	/* Invert the sense of dest ports. */
+#define XT_UDP_INV_MASK	0x03	/* All possible flags. */
+
+
+#endif
diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
index e98a870a20be..fd21796e5131 100644
--- a/include/linux/netfilter_arp/arp_tables.h
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -19,8 +19,12 @@
 #include <linux/compiler.h>
 #include <linux/netfilter_arp.h>
 
-#define ARPT_FUNCTION_MAXNAMELEN 30
-#define ARPT_TABLE_MAXNAMELEN 32
+#include <linux/netfilter/x_tables.h>
+
+#define ARPT_FUNCTION_MAXNAMELEN XT_FUNCTION_MAXNAMELEN
+#define ARPT_TABLE_MAXNAMELEN XT_TABLE_MAXNAMELEN
+#define arpt_target xt_target
+#define arpt_table xt_table
 
 #define ARPT_DEV_ADDR_LEN_MAX 16
 
@@ -91,11 +95,6 @@ struct arpt_standard_target
 	int verdict;
 };
 
-struct arpt_counters
-{
-	u_int64_t pcnt, bcnt;			/* Packet and byte counters */
-};
-
 /* Values for "flag" field in struct arpt_ip (general arp structure).
  * No flags defined yet.
  */
@@ -130,7 +129,7 @@ struct arpt_entry
 	unsigned int comefrom;
 
 	/* Packet and byte counters. */
-	struct arpt_counters counters;
+	struct xt_counters counters;
 
 	/* The matches (if any), then the target. */
 	unsigned char elems[0];
@@ -141,23 +140,24 @@ struct arpt_entry
  * Unlike BSD Linux inherits IP options so you don't have to use a raw
  * socket for this. Instead we check rights in the calls.
  */
-#define ARPT_BASE_CTL		96	/* base for firewall socket options */
+#define ARPT_CTL_OFFSET		32
+#define ARPT_BASE_CTL		(XT_BASE_CTL+ARPT_CTL_OFFSET)
 
-#define ARPT_SO_SET_REPLACE		(ARPT_BASE_CTL)
-#define ARPT_SO_SET_ADD_COUNTERS	(ARPT_BASE_CTL + 1)
-#define ARPT_SO_SET_MAX			ARPT_SO_SET_ADD_COUNTERS
+#define ARPT_SO_SET_REPLACE		(XT_SO_SET_REPLACE+ARPT_CTL_OFFSET)
+#define ARPT_SO_SET_ADD_COUNTERS	(XT_SO_SET_ADD_COUNTERS+ARPT_CTL_OFFSET)
+#define ARPT_SO_SET_MAX			(XT_SO_SET_MAX+ARPT_CTL_OFFSET)
 
-#define ARPT_SO_GET_INFO		(ARPT_BASE_CTL)
-#define ARPT_SO_GET_ENTRIES		(ARPT_BASE_CTL + 1)
-/* #define ARPT_SO_GET_REVISION_MATCH	(ARPT_BASE_CTL + 2)*/
-#define ARPT_SO_GET_REVISION_TARGET	(ARPT_BASE_CTL + 3)
-#define ARPT_SO_GET_MAX			ARPT_SO_GET_REVISION_TARGET
+#define ARPT_SO_GET_INFO		(XT_SO_GET_INFO+ARPT_CTL_OFFSET)
+#define ARPT_SO_GET_ENTRIES		(XT_SO_GET_ENTRIES+ARPT_CTL_OFFSET)
+/* #define ARPT_SO_GET_REVISION_MATCH	XT_SO_GET_REVISION_MATCH  */
+#define ARPT_SO_GET_REVISION_TARGET	(XT_SO_GET_REVISION_TARGET+ARPT_CTL_OFFSET)
+#define ARPT_SO_GET_MAX			(XT_SO_GET_REVISION_TARGET+ARPT_CTL_OFFSET)
 
 /* CONTINUE verdict for targets */
-#define ARPT_CONTINUE 0xFFFFFFFF
+#define ARPT_CONTINUE XT_CONTINUE
 
 /* For standard target */
-#define ARPT_RETURN (-NF_REPEAT - 1)
+#define ARPT_RETURN XT_RETURN
 
 /* The argument to ARPT_SO_GET_INFO */
 struct arpt_getinfo
@@ -208,23 +208,14 @@ struct arpt_replace
 	/* Number of counters (must be equal to current number of entries). */
 	unsigned int num_counters;
 	/* The old entries' counters. */
-	struct arpt_counters __user *counters;
+	struct xt_counters __user *counters;
 
 	/* The entries (hang off end: not really an array). */
 	struct arpt_entry entries[0];
 };
 
 /* The argument to ARPT_SO_ADD_COUNTERS. */
-struct arpt_counters_info
-{
-	/* Which table. */
-	char name[ARPT_TABLE_MAXNAMELEN];
-
-	unsigned int num_counters;
-
-	/* The counters (actually `number' of these). */
-	struct arpt_counters counters[0];
-};
+#define arpt_counters_info xt_counters_info
 
 /* The argument to ARPT_SO_GET_ENTRIES. */
 struct arpt_get_entries
@@ -239,19 +230,10 @@ struct arpt_get_entries
 	struct arpt_entry entrytable[0];
 };
 
-/* The argument to ARPT_SO_GET_REVISION_*.  Returns highest revision
- * kernel supports, if >= revision. */
-struct arpt_get_revision
-{
-	char name[ARPT_FUNCTION_MAXNAMELEN-1];
-
-	u_int8_t revision;
-};
-
 /* Standard return verdict, or do jump. */
-#define ARPT_STANDARD_TARGET ""
+#define ARPT_STANDARD_TARGET XT_STANDARD_TARGET
 /* Error verdict. */
-#define ARPT_ERROR_TARGET "ERROR"
+#define ARPT_ERROR_TARGET XT_ERROR_TARGET
 
 /* Helper functions */
 static __inline__ struct arpt_entry_target *arpt_get_target(struct arpt_entry *e)
@@ -281,63 +263,8 @@ static __inline__ struct arpt_entry_target *arpt_get_target(struct arpt_entry *e
  */
 #ifdef __KERNEL__
 
-/* Registration hooks for targets. */
-struct arpt_target
-{
-	struct list_head list;
-
-	const char name[ARPT_FUNCTION_MAXNAMELEN-1];
-
-	u_int8_t revision;
-
-	/* Returns verdict. */
-	unsigned int (*target)(struct sk_buff **pskb,
-			       unsigned int hooknum,
-			       const struct net_device *in,
-			       const struct net_device *out,
-			       const void *targinfo,
-			       void *userdata);
-
-	/* Called when user tries to insert an entry of this type:
-           hook_mask is a bitmask of hooks from which it can be
-           called. */
-	/* Should return true or false. */
-	int (*checkentry)(const char *tablename,
-			  const struct arpt_entry *e,
-			  void *targinfo,
-			  unsigned int targinfosize,
-			  unsigned int hook_mask);
-
-	/* Called when entry of this type deleted. */
-	void (*destroy)(void *targinfo, unsigned int targinfosize);
-
-	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
-	struct module *me;
-};
-
-extern int arpt_register_target(struct arpt_target *target);
-extern void arpt_unregister_target(struct arpt_target *target);
-
-/* Furniture shopping... */
-struct arpt_table
-{
-	struct list_head list;
-
-	/* A unique name... */
-	char name[ARPT_TABLE_MAXNAMELEN];
-
-	/* What hooks you will enter on */
-	unsigned int valid_hooks;
-
-	/* Lock for the curtain */
-	rwlock_t lock;
-
-	/* Man behind the curtain... */
-	struct arpt_table_info *private;
-
-	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
-	struct module *me;
-};
+#define arpt_register_target(tgt) xt_register_target(NF_ARP, tgt)
+#define arpt_unregister_target(tgt) xt_unregister_target(NF_ARP, tgt)
 
 extern int arpt_register_table(struct arpt_table *table,
 			       const struct arpt_replace *repl);
diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index b3432ab59a17..215765f043e6 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -199,9 +199,6 @@ ip_conntrack_put(struct ip_conntrack *ct)
 	nf_conntrack_put(&ct->ct_general);
 }
 
-/* call to create an explicit dependency on ip_conntrack. */
-extern void need_ip_conntrack(void);
-
 extern int invert_tuplepr(struct ip_conntrack_tuple *inverse,
 			  const struct ip_conntrack_tuple *orig);
 
diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
index d19d65cf4530..76ba24b68515 100644
--- a/include/linux/netfilter_ipv4/ip_tables.h
+++ b/include/linux/netfilter_ipv4/ip_tables.h
@@ -25,8 +25,14 @@
 #include <linux/compiler.h>
 #include <linux/netfilter_ipv4.h>
 
-#define IPT_FUNCTION_MAXNAMELEN 30
-#define IPT_TABLE_MAXNAMELEN 32
+#include <linux/netfilter/x_tables.h>
+
+#define IPT_FUNCTION_MAXNAMELEN XT_FUNCTION_MAXNAMELEN
+#define IPT_TABLE_MAXNAMELEN XT_FUNCTION_MAXNAMELEN
+#define ipt_match xt_match
+#define ipt_target xt_target
+#define ipt_table xt_table
+#define ipt_get_revision xt_get_revision
 
 /* Yes, Virginia, you have to zero the padding. */
 struct ipt_ip {
@@ -102,10 +108,7 @@ struct ipt_standard_target
 	int verdict;
 };
 
-struct ipt_counters
-{
-	u_int64_t pcnt, bcnt;			/* Packet and byte counters */
-};
+#define ipt_counters xt_counters
 
 /* Values for "flag" field in struct ipt_ip (general ip structure). */
 #define IPT_F_FRAG		0x01	/* Set if rule is a fragment rule */
@@ -119,7 +122,7 @@ struct ipt_counters
 #define IPT_INV_SRCIP		0x08	/* Invert the sense of SRC IP. */
 #define IPT_INV_DSTIP		0x10	/* Invert the sense of DST OP. */
 #define IPT_INV_FRAG		0x20	/* Invert the sense of FRAG. */
-#define IPT_INV_PROTO		0x40	/* Invert the sense of PROTO. */
+#define IPT_INV_PROTO		XT_INV_PROTO
 #define IPT_INV_MASK		0x7F	/* All possible flag bits mask. */
 
 /* This structure defines each of the firewall rules.  Consists of 3
@@ -141,7 +144,7 @@ struct ipt_entry
 	unsigned int comefrom;
 
 	/* Packet and byte counters. */
-	struct ipt_counters counters;
+	struct xt_counters counters;
 
 	/* The matches (if any), then the target. */
 	unsigned char elems[0];
@@ -151,54 +154,34 @@ struct ipt_entry
  * New IP firewall options for [gs]etsockopt at the RAW IP level.
  * Unlike BSD Linux inherits IP options so you don't have to use a raw
  * socket for this. Instead we check rights in the calls. */
-#define IPT_BASE_CTL		64	/* base for firewall socket options */
+#define IPT_BASE_CTL		XT_BASE_CTL
 
-#define IPT_SO_SET_REPLACE	(IPT_BASE_CTL)
-#define IPT_SO_SET_ADD_COUNTERS	(IPT_BASE_CTL + 1)
-#define IPT_SO_SET_MAX		IPT_SO_SET_ADD_COUNTERS
+#define IPT_SO_SET_REPLACE	XT_SO_SET_REPLACE
+#define IPT_SO_SET_ADD_COUNTERS	XT_SO_SET_ADD_COUNTERS
+#define IPT_SO_SET_MAX		XT_SO_SET_MAX
 
-#define IPT_SO_GET_INFO			(IPT_BASE_CTL)
-#define IPT_SO_GET_ENTRIES		(IPT_BASE_CTL + 1)
-#define IPT_SO_GET_REVISION_MATCH	(IPT_BASE_CTL + 2)
-#define IPT_SO_GET_REVISION_TARGET	(IPT_BASE_CTL + 3)
-#define IPT_SO_GET_MAX			IPT_SO_GET_REVISION_TARGET
+#define IPT_SO_GET_INFO			XT_SO_GET_INFO
+#define IPT_SO_GET_ENTRIES		XT_SO_GET_ENTRIES
+#define IPT_SO_GET_REVISION_MATCH	XT_SO_GET_REVISION_MATCH
+#define IPT_SO_GET_REVISION_TARGET	XT_SO_GET_REVISION_TARGET
+#define IPT_SO_GET_MAX			XT_SO_GET_REVISION_TARGET
 
-/* CONTINUE verdict for targets */
-#define IPT_CONTINUE 0xFFFFFFFF
+#define IPT_CONTINUE XT_CONTINUE
+#define IPT_RETURN XT_RETURN
 
-/* For standard target */
-#define IPT_RETURN (-NF_REPEAT - 1)
+#include <linux/netfilter/xt_tcpudp.h>
+#define ipt_udp xt_udp
+#define ipt_tcp xt_tcp
 
-/* TCP matching stuff */
-struct ipt_tcp
-{
-	u_int16_t spts[2];			/* Source port range. */
-	u_int16_t dpts[2];			/* Destination port range. */
-	u_int8_t option;			/* TCP Option iff non-zero*/
-	u_int8_t flg_mask;			/* TCP flags mask byte */
-	u_int8_t flg_cmp;			/* TCP flags compare byte */
-	u_int8_t invflags;			/* Inverse flags */
-};
-
-/* Values for "inv" field in struct ipt_tcp. */
-#define IPT_TCP_INV_SRCPT	0x01	/* Invert the sense of source ports. */
-#define IPT_TCP_INV_DSTPT	0x02	/* Invert the sense of dest ports. */
-#define IPT_TCP_INV_FLAGS	0x04	/* Invert the sense of TCP flags. */
-#define IPT_TCP_INV_OPTION	0x08	/* Invert the sense of option test. */
-#define IPT_TCP_INV_MASK	0x0F	/* All possible flags. */
-
-/* UDP matching stuff */
-struct ipt_udp
-{
-	u_int16_t spts[2];			/* Source port range. */
-	u_int16_t dpts[2];			/* Destination port range. */
-	u_int8_t invflags;			/* Inverse flags */
-};
+#define IPT_TCP_INV_SRCPT	XT_TCP_INV_SRCPT
+#define IPT_TCP_INV_DSTPT	XT_TCP_INV_DSTPT
+#define IPT_TCP_INV_FLAGS	XT_TCP_INV_FLAGS
+#define IPT_TCP_INV_OPTION	XT_TCP_INV_OPTION
+#define IPT_TCP_INV_MASK	XT_TCP_INV_MASK
 
-/* Values for "invflags" field in struct ipt_udp. */
-#define IPT_UDP_INV_SRCPT	0x01	/* Invert the sense of source ports. */
-#define IPT_UDP_INV_DSTPT	0x02	/* Invert the sense of dest ports. */
-#define IPT_UDP_INV_MASK	0x03	/* All possible flags. */
+#define IPT_UDP_INV_SRCPT	XT_UDP_INV_SRCPT
+#define IPT_UDP_INV_DSTPT	XT_UDP_INV_DSTPT
+#define IPT_UDP_INV_MASK	XT_UDP_INV_MASK
 
 /* ICMP matching stuff */
 struct ipt_icmp
@@ -260,23 +243,14 @@ struct ipt_replace
 	/* Number of counters (must be equal to current number of entries). */
 	unsigned int num_counters;
 	/* The old entries' counters. */
-	struct ipt_counters __user *counters;
+	struct xt_counters __user *counters;
 
 	/* The entries (hang off end: not really an array). */
 	struct ipt_entry entries[0];
 };
 
 /* The argument to IPT_SO_ADD_COUNTERS. */
-struct ipt_counters_info
-{
-	/* Which table. */
-	char name[IPT_TABLE_MAXNAMELEN];
-
-	unsigned int num_counters;
-
-	/* The counters (actually `number' of these). */
-	struct ipt_counters counters[0];
-};
+#define ipt_counters_info xt_counters_info
 
 /* The argument to IPT_SO_GET_ENTRIES. */
 struct ipt_get_entries
@@ -291,19 +265,10 @@ struct ipt_get_entries
 	struct ipt_entry entrytable[0];
 };
 
-/* The argument to IPT_SO_GET_REVISION_*.  Returns highest revision
- * kernel supports, if >= revision. */
-struct ipt_get_revision
-{
-	char name[IPT_FUNCTION_MAXNAMELEN-1];
-
-	u_int8_t revision;
-};
-
 /* Standard return verdict, or do jump. */
-#define IPT_STANDARD_TARGET ""
+#define IPT_STANDARD_TARGET XT_STANDARD_TARGET
 /* Error verdict. */
-#define IPT_ERROR_TARGET "ERROR"
+#define IPT_ERROR_TARGET XT_ERROR_TARGET
 
 /* Helper functions */
 static __inline__ struct ipt_entry_target *
@@ -356,103 +321,18 @@ ipt_get_target(struct ipt_entry *e)
 #include <linux/init.h>
 extern void ipt_init(void) __init;
 
-struct ipt_match
-{
-	struct list_head list;
-
-	const char name[IPT_FUNCTION_MAXNAMELEN-1];
-
-	u_int8_t revision;
-
-	/* Return true or false: return FALSE and set *hotdrop = 1 to
-           force immediate packet drop. */
-	/* Arguments changed since 2.4, as this must now handle
-           non-linear skbs, using skb_copy_bits and
-           skb_ip_make_writable. */
-	int (*match)(const struct sk_buff *skb,
-		     const struct net_device *in,
-		     const struct net_device *out,
-		     const void *matchinfo,
-		     int offset,
-		     int *hotdrop);
-
-	/* Called when user tries to insert an entry of this type. */
-	/* Should return true or false. */
-	int (*checkentry)(const char *tablename,
-			  const struct ipt_ip *ip,
-			  void *matchinfo,
-			  unsigned int matchinfosize,
-			  unsigned int hook_mask);
-
-	/* Called when entry of this type deleted. */
-	void (*destroy)(void *matchinfo, unsigned int matchinfosize);
-
-	/* Set this to THIS_MODULE. */
-	struct module *me;
-};
-
-/* Registration hooks for targets. */
-struct ipt_target
-{
-	struct list_head list;
-
-	const char name[IPT_FUNCTION_MAXNAMELEN-1];
-
-	u_int8_t revision;
-
-	/* Called when user tries to insert an entry of this type:
-           hook_mask is a bitmask of hooks from which it can be
-           called. */
-	/* Should return true or false. */
-	int (*checkentry)(const char *tablename,
-			  const struct ipt_entry *e,
-			  void *targinfo,
-			  unsigned int targinfosize,
-			  unsigned int hook_mask);
-
-	/* Called when entry of this type deleted. */
-	void (*destroy)(void *targinfo, unsigned int targinfosize);
-
-	/* Returns verdict.  Argument order changed since 2.4, as this
-           must now handle non-linear skbs, using skb_copy_bits and
-           skb_ip_make_writable. */
-	unsigned int (*target)(struct sk_buff **pskb,
-			       const struct net_device *in,
-			       const struct net_device *out,
-			       unsigned int hooknum,
-			       const void *targinfo,
-			       void *userdata);
-
-	/* Set this to THIS_MODULE. */
-	struct module *me;
-};
+#define ipt_register_target(tgt) xt_register_target(AF_INET, tgt)
+#define ipt_unregister_target(tgt) xt_unregister_target(AF_INET, tgt)
 
-extern int ipt_register_target(struct ipt_target *target);
-extern void ipt_unregister_target(struct ipt_target *target);
+#define ipt_register_match(mtch) xt_register_match(AF_INET, mtch)
+#define ipt_unregister_match(mtch) xt_unregister_match(AF_INET, mtch)
 
-extern int ipt_register_match(struct ipt_match *match);
-extern void ipt_unregister_match(struct ipt_match *match);
+//#define ipt_register_table(tbl, repl) xt_register_table(AF_INET, tbl, repl)
+//#define ipt_unregister_table(tbl) xt_unregister_table(AF_INET, tbl)
 
-/* Furniture shopping... */
-struct ipt_table
-{
-	struct list_head list;
-
-	/* A unique name... */
-	char name[IPT_TABLE_MAXNAMELEN];
-
-	/* What hooks you will enter on */
-	unsigned int valid_hooks;
-
-	/* Lock for the curtain */
-	rwlock_t lock;
-
-	/* Man behind the curtain... */
-	struct ipt_table_info *private;
-
-	/* Set to THIS_MODULE. */
-	struct module *me;
-};
+extern int ipt_register_table(struct ipt_table *table,
+			      const struct ipt_replace *repl);
+extern void ipt_unregister_table(struct ipt_table *table);
 
 /* net/sched/ipt.c: Gimme access to your targets!  Gets target->me. */
 extern struct ipt_target *ipt_find_target(const char *name, u8 revision);
@@ -476,9 +356,6 @@ struct ipt_error
 	struct ipt_error_target target;
 };
 
-extern int ipt_register_table(struct ipt_table *table,
-			      const struct ipt_replace *repl);
-extern void ipt_unregister_table(struct ipt_table *table);
 extern unsigned int ipt_do_table(struct sk_buff **pskb,
 				 unsigned int hook,
 				 const struct net_device *in,
@@ -486,6 +363,6 @@ extern unsigned int ipt_do_table(struct sk_buff **pskb,
 				 struct ipt_table *table,
 				 void *userdata);
 
-#define IPT_ALIGN(s) (((s) + (__alignof__(struct ipt_entry)-1)) & ~(__alignof__(struct ipt_entry)-1))
+#define IPT_ALIGN(s) XT_ALIGN(s)
 #endif /*__KERNEL__*/
 #endif /* _IPTABLES_H */
diff --git a/include/linux/netfilter_ipv4/ipt_CLASSIFY.h b/include/linux/netfilter_ipv4/ipt_CLASSIFY.h
index 7596e3dd00ca..a46d511b5c36 100644
--- a/include/linux/netfilter_ipv4/ipt_CLASSIFY.h
+++ b/include/linux/netfilter_ipv4/ipt_CLASSIFY.h
@@ -1,8 +1,7 @@
 #ifndef _IPT_CLASSIFY_H
 #define _IPT_CLASSIFY_H
 
-struct ipt_classify_target_info {
-	u_int32_t priority;
-};
+#include <linux/netfilter/xt_CLASSIFY.h>
+#define ipt_classify_target_info xt_classify_target_info
 
 #endif /*_IPT_CLASSIFY_H */
diff --git a/include/linux/netfilter_ipv4/ipt_CONNMARK.h b/include/linux/netfilter_ipv4/ipt_CONNMARK.h
index d3c02536fc4c..9ecfee0a9e33 100644
--- a/include/linux/netfilter_ipv4/ipt_CONNMARK.h
+++ b/include/linux/netfilter_ipv4/ipt_CONNMARK.h
@@ -9,17 +9,11 @@
  * the Free Software Foundation; either version 2 of the License, or
  * (at your option) any later version.
  */
+#include <linux/netfilter/xt_CONNMARK.h>
+#define IPT_CONNMARK_SET	XT_CONNMARK_SET
+#define IPT_CONNMARK_SAVE	XT_CONNMARK_SAVE
+#define	IPT_CONNMARK_RESTORE	XT_CONNMARK_RESTORE
 
-enum {
-	IPT_CONNMARK_SET = 0,
-	IPT_CONNMARK_SAVE,
-	IPT_CONNMARK_RESTORE
-};
-
-struct ipt_connmark_target_info {
-	unsigned long mark;
-	unsigned long mask;
-	u_int8_t mode;
-};
+#define ipt_connmark_target_info xt_connmark_target_info
 
 #endif /*_IPT_CONNMARK_H_target*/
diff --git a/include/linux/netfilter_ipv4/ipt_MARK.h b/include/linux/netfilter_ipv4/ipt_MARK.h
index f47485790ed4..697a486a96d3 100644
--- a/include/linux/netfilter_ipv4/ipt_MARK.h
+++ b/include/linux/netfilter_ipv4/ipt_MARK.h
@@ -1,20 +1,18 @@
 #ifndef _IPT_MARK_H_target
 #define _IPT_MARK_H_target
 
+/* Backwards compatibility for old userspace */
+
+#include <linux/netfilter/xt_MARK.h>
+
 /* Version 0 */
-struct ipt_mark_target_info {
-	unsigned long mark;
-};
+#define ipt_mark_target_info xt_mark_target_info
 
 /* Version 1 */
-enum {
-	IPT_MARK_SET=0,
-	IPT_MARK_AND,
-	IPT_MARK_OR
-};
+#define IPT_MARK_SET	XT_MARK_SET
+#define IPT_MARK_AND	XT_MARK_AND
+#define	IPT_MARK_OR	XT_MARK_OR
+
+#define ipt_mark_target_info_v1 xt_mark_target_info_v1
 
-struct ipt_mark_target_info_v1 {
-	unsigned long mark;
-	u_int8_t mode;
-};
 #endif /*_IPT_MARK_H_target*/
diff --git a/include/linux/netfilter_ipv4/ipt_NFQUEUE.h b/include/linux/netfilter_ipv4/ipt_NFQUEUE.h
index b5b2943b0c66..97a2a7557cb9 100644
--- a/include/linux/netfilter_ipv4/ipt_NFQUEUE.h
+++ b/include/linux/netfilter_ipv4/ipt_NFQUEUE.h
@@ -8,9 +8,9 @@
 #ifndef _IPT_NFQ_TARGET_H
 #define _IPT_NFQ_TARGET_H
 
-/* target info */
-struct ipt_NFQ_info {
-	u_int16_t queuenum;
-};
+/* Backwards compatibility for old userspace */
+#include <linux/netfilter/xt_NFQUEUE.h>
+
+#define ipt_NFQ_info xt_NFQ_info
 
 #endif /* _IPT_DSCP_TARGET_H */
diff --git a/include/linux/netfilter_ipv4/ipt_comment.h b/include/linux/netfilter_ipv4/ipt_comment.h
index 85c1123c29ce..ae2afc2f7481 100644
--- a/include/linux/netfilter_ipv4/ipt_comment.h
+++ b/include/linux/netfilter_ipv4/ipt_comment.h
@@ -1,10 +1,10 @@
 #ifndef _IPT_COMMENT_H
 #define _IPT_COMMENT_H
 
-#define IPT_MAX_COMMENT_LEN 256
+#include <linux/netfilter/xt_comment.h>
 
-struct ipt_comment_info {
-	unsigned char comment[IPT_MAX_COMMENT_LEN];
-};
+#define IPT_MAX_COMMENT_LEN XT_MAX_COMMENT_LEN
+
+#define ipt_comment_info xt_comment_info
 
 #endif /* _IPT_COMMENT_H */
diff --git a/include/linux/netfilter_ipv4/ipt_connbytes.h b/include/linux/netfilter_ipv4/ipt_connbytes.h
index 9e5532f8d8ac..b04dfa3083c9 100644
--- a/include/linux/netfilter_ipv4/ipt_connbytes.h
+++ b/include/linux/netfilter_ipv4/ipt_connbytes.h
@@ -1,25 +1,18 @@
 #ifndef _IPT_CONNBYTES_H
 #define _IPT_CONNBYTES_H
 
-enum ipt_connbytes_what {
-	IPT_CONNBYTES_PKTS,
-	IPT_CONNBYTES_BYTES,
-	IPT_CONNBYTES_AVGPKT,
-};
+#include <net/netfilter/xt_connbytes.h>
+#define ipt_connbytes_what xt_connbytes_what
 
-enum ipt_connbytes_direction {
-	IPT_CONNBYTES_DIR_ORIGINAL,
-	IPT_CONNBYTES_DIR_REPLY,
-	IPT_CONNBYTES_DIR_BOTH,
-};
+#define IPT_CONNBYTES_PKTS	XT_CONNBYTES_PACKETS
+#define IPT_CONNBYTES_BYTES	XT_CONNBYTES_BYTES
+#define IPT_CONNBYTES_AVGPKT	XT_CONNBYTES_AVGPKT
+
+#define ipt_connbytes_direction 	xt_connbytes_direction
+#define IPT_CONNBYTES_DIR_ORIGINAL 	XT_CONNBYTES_DIR_ORIGINAL
+#define IPT_CONNBYTES_DIR_REPLY 	XT_CONNBYTES_DIR_REPLY
+#define IPT_CONNBYTES_DIR_BOTH		XT_CONNBYTES_DIR_BOTH
+
+#define ipt_connbytes_info xt_connbytes_info
 
-struct ipt_connbytes_info
-{
-	struct {
-		aligned_u64 from;	/* count to be matched */
-		aligned_u64 to;		/* count to be matched */
-	} count;
-	u_int8_t what;		/* ipt_connbytes_what */
-	u_int8_t direction;	/* ipt_connbytes_direction */
-};
 #endif
diff --git a/include/linux/netfilter_ipv4/ipt_connmark.h b/include/linux/netfilter_ipv4/ipt_connmark.h
index 46573270d9aa..c7ba6560d44c 100644
--- a/include/linux/netfilter_ipv4/ipt_connmark.h
+++ b/include/linux/netfilter_ipv4/ipt_connmark.h
@@ -1,18 +1,7 @@
 #ifndef _IPT_CONNMARK_H
 #define _IPT_CONNMARK_H
 
-/* Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
- * by Henrik Nordstrom <hno@marasystems.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-struct ipt_connmark_info {
-	unsigned long mark, mask;
-	u_int8_t invert;
-};
+#include <linux/netfilter/xt_connmark.h>
+#define ipt_connmark_info xt_connmark_info
 
 #endif /*_IPT_CONNMARK_H*/
diff --git a/include/linux/netfilter_ipv4/ipt_conntrack.h b/include/linux/netfilter_ipv4/ipt_conntrack.h
index 413c5658bd3a..cde6762949c5 100644
--- a/include/linux/netfilter_ipv4/ipt_conntrack.h
+++ b/include/linux/netfilter_ipv4/ipt_conntrack.h
@@ -5,56 +5,24 @@
 #ifndef _IPT_CONNTRACK_H
 #define _IPT_CONNTRACK_H
 
-#define IPT_CONNTRACK_STATE_BIT(ctinfo) (1 << ((ctinfo)%IP_CT_IS_REPLY+1))
-#define IPT_CONNTRACK_STATE_INVALID (1 << 0)
+#include <linux/netfilter/xt_conntrack.h>
 
-#define IPT_CONNTRACK_STATE_SNAT (1 << (IP_CT_NUMBER + 1))
-#define IPT_CONNTRACK_STATE_DNAT (1 << (IP_CT_NUMBER + 2))
-#define IPT_CONNTRACK_STATE_UNTRACKED (1 << (IP_CT_NUMBER + 3))
+#define IPT_CONNTRACK_STATE_BIT(ctinfo) XT_CONNTRACK_STATE_BIT(ctinfo)
+#define IPT_CONNTRACK_STATE_INVALID 	XT_CONNTRACK_STATE_INVALID
 
-/* flags, invflags: */
-#define IPT_CONNTRACK_STATE	0x01
-#define IPT_CONNTRACK_PROTO	0x02
-#define IPT_CONNTRACK_ORIGSRC	0x04
-#define IPT_CONNTRACK_ORIGDST	0x08
-#define IPT_CONNTRACK_REPLSRC	0x10
-#define IPT_CONNTRACK_REPLDST	0x20
-#define IPT_CONNTRACK_STATUS	0x40
-#define IPT_CONNTRACK_EXPIRES	0x80
-
-/* This is exposed to userspace, so remains frozen in time. */
-struct ip_conntrack_old_tuple
-{
-	struct {
-		__u32 ip;
-		union {
-			__u16 all;
-		} u;
-	} src;
-
-	struct {
-		__u32 ip;
-		union {
-			__u16 all;
-		} u;
-
-		/* The protocol. */
-		u16 protonum;
-	} dst;
-};
+#define IPT_CONNTRACK_STATE_SNAT 	XT_CONNTRACK_STATE_SNAT
+#define IPT_CONNTRACK_STATE_DNAT	XT_CONNTRACK_STATE_DNAT
+#define IPT_CONNTRACK_STATE_UNTRACKED	XT_CONNTRACK_STATE_UNTRACKED
 
-struct ipt_conntrack_info
-{
-	unsigned int statemask, statusmask;
-
-	struct ip_conntrack_old_tuple tuple[IP_CT_DIR_MAX];
-	struct in_addr sipmsk[IP_CT_DIR_MAX], dipmsk[IP_CT_DIR_MAX];
-
-	unsigned long expires_min, expires_max;
-
-	/* Flags word */
-	u_int8_t flags;
-	/* Inverse flags */
-	u_int8_t invflags;
-};
+/* flags, invflags: */
+#define IPT_CONNTRACK_STATE		XT_CONNTRACK_STATE
+#define IPT_CONNTRACK_PROTO		XT_CONNTRACK_PROTO
+#define IPT_CONNTRACK_ORIGSRC		XT_CONNTRACK_ORIGSRC
+#define IPT_CONNTRACK_ORIGDST		XT_CONNTRACK_ORIGDST
+#define IPT_CONNTRACK_REPLSRC		XT_CONNTRACK_REPLSRC
+#define IPT_CONNTRACK_REPLDST		XT_CONNTRACK_REPLDST
+#define IPT_CONNTRACK_STATUS		XT_CONNTRACK_STATUS
+#define IPT_CONNTRACK_EXPIRES		XT_CONNTRACK_EXPIRES
+
+#define ipt_conntrack_info		xt_conntrack_info
 #endif /*_IPT_CONNTRACK_H*/
diff --git a/include/linux/netfilter_ipv4/ipt_dccp.h b/include/linux/netfilter_ipv4/ipt_dccp.h
index 3cb3a522e62b..e70d11e1f53c 100644
--- a/include/linux/netfilter_ipv4/ipt_dccp.h
+++ b/include/linux/netfilter_ipv4/ipt_dccp.h
@@ -1,23 +1,15 @@
 #ifndef _IPT_DCCP_H_
 #define _IPT_DCCP_H_
 
-#define IPT_DCCP_SRC_PORTS	        0x01
-#define IPT_DCCP_DEST_PORTS	        0x02
-#define IPT_DCCP_TYPE			0x04
-#define IPT_DCCP_OPTION			0x08
+#include <linux/netfilter/xt_dccp.h>
+#define IPT_DCCP_SRC_PORTS	XT_DCCP_SRC_PORTS
+#define IPT_DCCP_DEST_PORTS	XT_DCCP_DEST_PORTS
+#define IPT_DCCP_TYPE		XT_DCCP_TYPE
+#define IPT_DCCP_OPTION		XT_DCCP_OPTION
 
-#define IPT_DCCP_VALID_FLAGS		0x0f
+#define IPT_DCCP_VALID_FLAGS 	XT_DCCP_VALID_FLAGS
 
-struct ipt_dccp_info {
-	u_int16_t dpts[2];  /* Min, Max */
-	u_int16_t spts[2];  /* Min, Max */
-
-	u_int16_t flags;
-	u_int16_t invflags;
-
-	u_int16_t typemask;
-	u_int8_t option;
-};
+#define ipt_dccp_info xt_dccp_info
 
 #endif /* _IPT_DCCP_H_ */
 
diff --git a/include/linux/netfilter_ipv4/ipt_helper.h b/include/linux/netfilter_ipv4/ipt_helper.h
index 6f12ecb8c93d..80452c218551 100644
--- a/include/linux/netfilter_ipv4/ipt_helper.h
+++ b/include/linux/netfilter_ipv4/ipt_helper.h
@@ -1,8 +1,7 @@
 #ifndef _IPT_HELPER_H
 #define _IPT_HELPER_H
 
-struct ipt_helper_info {
-	int invert;
-	char name[30];
-};
+#include <linux/netfilter/xt_helper.h>
+#define ipt_helper_info xt_helper_info
+
 #endif /* _IPT_HELPER_H */
diff --git a/include/linux/netfilter_ipv4/ipt_length.h b/include/linux/netfilter_ipv4/ipt_length.h
index 6e0885229615..9b45206ffcef 100644
--- a/include/linux/netfilter_ipv4/ipt_length.h
+++ b/include/linux/netfilter_ipv4/ipt_length.h
@@ -1,9 +1,7 @@
 #ifndef _IPT_LENGTH_H
 #define _IPT_LENGTH_H
 
-struct ipt_length_info {
-    u_int16_t	min, max;
-    u_int8_t	invert;
-};
+#include <linux/netfilter/xt_length.h>
+#define ipt_length_info xt_length_info
 
 #endif /*_IPT_LENGTH_H*/
diff --git a/include/linux/netfilter_ipv4/ipt_limit.h b/include/linux/netfilter_ipv4/ipt_limit.h
index 256453409e21..92f5cd07bbc4 100644
--- a/include/linux/netfilter_ipv4/ipt_limit.h
+++ b/include/linux/netfilter_ipv4/ipt_limit.h
@@ -1,21 +1,8 @@
 #ifndef _IPT_RATE_H
 #define _IPT_RATE_H
 
-/* timings are in milliseconds. */
-#define IPT_LIMIT_SCALE 10000
+#include <linux/netfilter/xt_limit.h>
+#define IPT_LIMIT_SCALE XT_LIMIT_SCALE
+#define ipt_rateinfo xt_rateinfo
 
-/* 1/10,000 sec period => max of 10,000/sec.  Min rate is then 429490
-   seconds, or one every 59 hours. */
-struct ipt_rateinfo {
-	u_int32_t avg;    /* Average secs between packets * scale */
-	u_int32_t burst;  /* Period multiplier for upper limit. */
-
-	/* Used internally by the kernel */
-	unsigned long prev;
-	u_int32_t credit;
-	u_int32_t credit_cap, cost;
-
-	/* Ugly, ugly fucker. */
-	struct ipt_rateinfo *master;
-};
 #endif /*_IPT_RATE_H*/
diff --git a/include/linux/netfilter_ipv4/ipt_mac.h b/include/linux/netfilter_ipv4/ipt_mac.h
index f8d5b8e7ccdb..b186008a3c47 100644
--- a/include/linux/netfilter_ipv4/ipt_mac.h
+++ b/include/linux/netfilter_ipv4/ipt_mac.h
@@ -1,8 +1,7 @@
 #ifndef _IPT_MAC_H
 #define _IPT_MAC_H
 
-struct ipt_mac_info {
-    unsigned char srcaddr[ETH_ALEN];
-    int invert;
-};
+#include <linux/netfilter/xt_mac.h>
+#define ipt_mac_info xt_mac_info
+
 #endif /*_IPT_MAC_H*/
diff --git a/include/linux/netfilter_ipv4/ipt_mark.h b/include/linux/netfilter_ipv4/ipt_mark.h
index f3952b563d4c..bfde67c61224 100644
--- a/include/linux/netfilter_ipv4/ipt_mark.h
+++ b/include/linux/netfilter_ipv4/ipt_mark.h
@@ -1,9 +1,9 @@
 #ifndef _IPT_MARK_H
 #define _IPT_MARK_H
 
-struct ipt_mark_info {
-    unsigned long mark, mask;
-    u_int8_t invert;
-};
+/* Backwards compatibility for old userspace */
+#include <linux/netfilter/xt_mark.h>
+
+#define ipt_mark_info xt_mark_info
 
 #endif /*_IPT_MARK_H*/
diff --git a/include/linux/netfilter_ipv4/ipt_physdev.h b/include/linux/netfilter_ipv4/ipt_physdev.h
index 7538c8655ec0..2400e7140f26 100644
--- a/include/linux/netfilter_ipv4/ipt_physdev.h
+++ b/include/linux/netfilter_ipv4/ipt_physdev.h
@@ -1,24 +1,17 @@
 #ifndef _IPT_PHYSDEV_H
 #define _IPT_PHYSDEV_H
 
-#ifdef __KERNEL__
-#include <linux/if.h>
-#endif
+/* Backwards compatibility for old userspace */
 
-#define IPT_PHYSDEV_OP_IN		0x01
-#define IPT_PHYSDEV_OP_OUT		0x02
-#define IPT_PHYSDEV_OP_BRIDGED		0x04
-#define IPT_PHYSDEV_OP_ISIN		0x08
-#define IPT_PHYSDEV_OP_ISOUT		0x10
-#define IPT_PHYSDEV_OP_MASK		(0x20 - 1)
+#include <linux/netfilter/xt_physdev.h>
 
-struct ipt_physdev_info {
-	char physindev[IFNAMSIZ];
-	char in_mask[IFNAMSIZ];
-	char physoutdev[IFNAMSIZ];
-	char out_mask[IFNAMSIZ];
-	u_int8_t invert;
-	u_int8_t bitmask;
-};
+#define IPT_PHYSDEV_OP_IN		XT_PHYSDEV_OP_IN
+#define IPT_PHYSDEV_OP_OUT		XT_PHYSDEV_OP_OUT
+#define IPT_PHYSDEV_OP_BRIDGED		XT_PHYSDEV_OP_BRIDGED
+#define IPT_PHYSDEV_OP_ISIN		XT_PHYSDEV_OP_ISIN
+#define IPT_PHYSDEV_OP_ISOUT		XT_PHYSDEV_OP_ISOUT
+#define IPT_PHYSDEV_OP_MASK		XT_PHYSDEV_OP_MASK
+
+#define ipt_physdev_info xt_physdev_info
 
 #endif /*_IPT_PHYSDEV_H*/
diff --git a/include/linux/netfilter_ipv4/ipt_pkttype.h b/include/linux/netfilter_ipv4/ipt_pkttype.h
index d53a65848683..ff1fbc949a0c 100644
--- a/include/linux/netfilter_ipv4/ipt_pkttype.h
+++ b/include/linux/netfilter_ipv4/ipt_pkttype.h
@@ -1,8 +1,7 @@
 #ifndef _IPT_PKTTYPE_H
 #define _IPT_PKTTYPE_H
 
-struct ipt_pkttype_info {
-	int	pkttype;
-	int	invert;
-};
+#include <linux/netfilter/xt_pkttype.h>
+#define ipt_pkttype_info xt_pkttype_info
+
 #endif /*_IPT_PKTTYPE_H*/
diff --git a/include/linux/netfilter_ipv4/ipt_realm.h b/include/linux/netfilter_ipv4/ipt_realm.h
index a4d6698723ac..b3996eaa0188 100644
--- a/include/linux/netfilter_ipv4/ipt_realm.h
+++ b/include/linux/netfilter_ipv4/ipt_realm.h
@@ -1,10 +1,7 @@
 #ifndef _IPT_REALM_H
 #define _IPT_REALM_H
 
-struct ipt_realm_info {
-	u_int32_t id;
-	u_int32_t mask;
-	u_int8_t invert;
-};
+#include <linux/netfilter/xt_realm.h>
+#define ipt_realm_info xt_realm_info
 
 #endif /* _IPT_REALM_H */
diff --git a/include/linux/netfilter_ipv4/ipt_state.h b/include/linux/netfilter_ipv4/ipt_state.h
index 5df37868933d..a44a99cc28cc 100644
--- a/include/linux/netfilter_ipv4/ipt_state.h
+++ b/include/linux/netfilter_ipv4/ipt_state.h
@@ -1,13 +1,15 @@
 #ifndef _IPT_STATE_H
 #define _IPT_STATE_H
 
-#define IPT_STATE_BIT(ctinfo) (1 << ((ctinfo)%IP_CT_IS_REPLY+1))
-#define IPT_STATE_INVALID (1 << 0)
+/* Backwards compatibility for old userspace */
 
-#define IPT_STATE_UNTRACKED (1 << (IP_CT_NUMBER + 1))
+#include <linux/netfilter/xt_state.h>
+
+#define IPT_STATE_BIT		XT_STATE_BIT
+#define IPT_STATE_INVALID	XT_STATE_INVALID
+
+#define IPT_STATE_UNTRACKED	XT_STATE_UNTRACKED
+
+#define ipt_state_info		xt_state_info
 
-struct ipt_state_info
-{
-	unsigned int statemask;
-};
 #endif /*_IPT_STATE_H*/
diff --git a/include/linux/netfilter_ipv4/ipt_string.h b/include/linux/netfilter_ipv4/ipt_string.h
index a265f6e44eab..c26de3059903 100644
--- a/include/linux/netfilter_ipv4/ipt_string.h
+++ b/include/linux/netfilter_ipv4/ipt_string.h
@@ -1,18 +1,10 @@
 #ifndef _IPT_STRING_H
 #define _IPT_STRING_H
 
-#define IPT_STRING_MAX_PATTERN_SIZE 128
-#define IPT_STRING_MAX_ALGO_NAME_SIZE 16
+#include <linux/netfilter/xt_string.h>
 
-struct ipt_string_info
-{
-	u_int16_t from_offset;
-	u_int16_t to_offset;
-	char	  algo[IPT_STRING_MAX_ALGO_NAME_SIZE];
-	char 	  pattern[IPT_STRING_MAX_PATTERN_SIZE];
-	u_int8_t  patlen;
-	u_int8_t  invert;
-	struct ts_config __attribute__((aligned(8))) *config;
-};
+#define IPT_STRING_MAX_PATTERN_SIZE XT_STRING_MAX_PATTERN_SIZE
+#define IPT_STRING_MAX_ALGO_NAME_SIZE XT_STRING_MAX_ALGO_NAME_SIZE
+#define ipt_string_info xt_string_info
 
 #endif /*_IPT_STRING_H*/
diff --git a/include/linux/netfilter_ipv4/ipt_tcpmss.h b/include/linux/netfilter_ipv4/ipt_tcpmss.h
index e2b14397f701..18bbc8e8e009 100644
--- a/include/linux/netfilter_ipv4/ipt_tcpmss.h
+++ b/include/linux/netfilter_ipv4/ipt_tcpmss.h
@@ -1,9 +1,7 @@
 #ifndef _IPT_TCPMSS_MATCH_H
 #define _IPT_TCPMSS_MATCH_H
 
-struct ipt_tcpmss_match_info {
-    u_int16_t mss_min, mss_max;
-    u_int8_t invert;
-};
+#include <linux/netfilter/xt_tcpmss.h>
+#define ipt_tcpmss_match_info xt_tcpmss_match_info
 
 #endif /*_IPT_TCPMSS_MATCH_H*/
diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index c163ba31aab7..f249b574f0fa 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -25,8 +25,15 @@
 #include <linux/compiler.h>
 #include <linux/netfilter_ipv6.h>
 
-#define IP6T_FUNCTION_MAXNAMELEN 30
-#define IP6T_TABLE_MAXNAMELEN 32
+#include <linux/netfilter/x_tables.h>
+
+#define IP6T_FUNCTION_MAXNAMELEN XT_FUNCTION_MAXNAMELEN
+#define IP6T_TABLE_MAXNAMELEN XT_TABLE_MAXNAMELEN
+
+#define ip6t_match xt_match
+#define ip6t_target xt_target
+#define ip6t_table xt_table
+#define ip6t_get_revision xt_get_revision
 
 /* Yes, Virginia, you have to zero the padding. */
 struct ip6t_ip6 {
@@ -104,10 +111,7 @@ struct ip6t_standard_target
 	int verdict;
 };
 
-struct ip6t_counters
-{
-	u_int64_t pcnt, bcnt;			/* Packet and byte counters */
-};
+#define ip6t_counters	xt_counters
 
 /* Values for "flag" field in struct ip6t_ip6 (general ip6 structure). */
 #define IP6T_F_PROTO		0x01	/* Set if rule cares about upper 
@@ -123,7 +127,7 @@ struct ip6t_counters
 #define IP6T_INV_SRCIP		0x08	/* Invert the sense of SRC IP. */
 #define IP6T_INV_DSTIP		0x10	/* Invert the sense of DST OP. */
 #define IP6T_INV_FRAG		0x20	/* Invert the sense of FRAG. */
-#define IP6T_INV_PROTO		0x40	/* Invert the sense of PROTO. */
+#define IP6T_INV_PROTO		XT_INV_PROTO
 #define IP6T_INV_MASK		0x7F	/* All possible flag bits mask. */
 
 /* This structure defines each of the firewall rules.  Consists of 3
@@ -145,7 +149,7 @@ struct ip6t_entry
 	unsigned int comefrom;
 
 	/* Packet and byte counters. */
-	struct ip6t_counters counters;
+	struct xt_counters counters;
 
 	/* The matches (if any), then the target. */
 	unsigned char elems[0];
@@ -155,54 +159,41 @@ struct ip6t_entry
  * New IP firewall options for [gs]etsockopt at the RAW IP level.
  * Unlike BSD Linux inherits IP options so you don't have to use
  * a raw socket for this. Instead we check rights in the calls. */
-#define IP6T_BASE_CTL			64	/* base for firewall socket options */
+#define IP6T_BASE_CTL			XT_BASE_CTL
 
-#define IP6T_SO_SET_REPLACE		(IP6T_BASE_CTL)
-#define IP6T_SO_SET_ADD_COUNTERS	(IP6T_BASE_CTL + 1)
-#define IP6T_SO_SET_MAX			IP6T_SO_SET_ADD_COUNTERS
+#define IP6T_SO_SET_REPLACE		XT_SO_SET_REPLACE
+#define IP6T_SO_SET_ADD_COUNTERS	XT_SO_SET_ADD_COUNTERS
+#define IP6T_SO_SET_MAX			XT_SO_SET_MAX
 
-#define IP6T_SO_GET_INFO		(IP6T_BASE_CTL)
-#define IP6T_SO_GET_ENTRIES		(IP6T_BASE_CTL + 1)
-#define	IP6T_SO_GET_REVISION_MATCH	(IP6T_BASE_CTL + 2)
-#define	IP6T_SO_GET_REVISION_TARGET	(IP6T_BASE_CTL + 3)
-#define IP6T_SO_GET_MAX			IP6T_SO_GET_REVISION_TARGET
+#define IP6T_SO_GET_INFO		XT_SO_GET_INFO
+#define IP6T_SO_GET_ENTRIES		XT_SO_GET_ENTRIES
+#define	IP6T_SO_GET_REVISION_MATCH	XT_SO_GET_REVISION_MATCH
+#define	IP6T_SO_GET_REVISION_TARGET	XT_SO_GET_REVISION_TARGET
+#define IP6T_SO_GET_MAX			XT_SO_GET_REVISION_TARGET
 
 /* CONTINUE verdict for targets */
-#define IP6T_CONTINUE 0xFFFFFFFF
+#define IP6T_CONTINUE XT_CONTINUE
 
 /* For standard target */
-#define IP6T_RETURN (-NF_REPEAT - 1)
+#define IP6T_RETURN XT_RETURN
 
-/* TCP matching stuff */
-struct ip6t_tcp
-{
-	u_int16_t spts[2];			/* Source port range. */
-	u_int16_t dpts[2];			/* Destination port range. */
-	u_int8_t option;			/* TCP Option iff non-zero*/
-	u_int8_t flg_mask;			/* TCP flags mask byte */
-	u_int8_t flg_cmp;			/* TCP flags compare byte */
-	u_int8_t invflags;			/* Inverse flags */
-};
+/* TCP/UDP matching stuff */
+#include <linux/netfilter/xt_tcpudp.h>
+
+#define ip6t_tcp xt_tcp
+#define ip6t_udp xt_udp
 
 /* Values for "inv" field in struct ipt_tcp. */
-#define IP6T_TCP_INV_SRCPT	0x01	/* Invert the sense of source ports. */
-#define IP6T_TCP_INV_DSTPT	0x02	/* Invert the sense of dest ports. */
-#define IP6T_TCP_INV_FLAGS	0x04	/* Invert the sense of TCP flags. */
-#define IP6T_TCP_INV_OPTION	0x08	/* Invert the sense of option test. */
-#define IP6T_TCP_INV_MASK	0x0F	/* All possible flags. */
-
-/* UDP matching stuff */
-struct ip6t_udp
-{
-	u_int16_t spts[2];			/* Source port range. */
-	u_int16_t dpts[2];			/* Destination port range. */
-	u_int8_t invflags;			/* Inverse flags */
-};
+#define IP6T_TCP_INV_SRCPT	XT_TCP_INV_SRCPT
+#define IP6T_TCP_INV_DSTPT	XT_TCP_INV_DSTPT
+#define IP6T_TCP_INV_FLAGS	XT_TCP_INV_FLAGS
+#define IP6T_TCP_INV_OPTION	XT_TCP_INV_OPTION
+#define IP6T_TCP_INV_MASK	XT_TCP_INV_MASK
 
 /* Values for "invflags" field in struct ipt_udp. */
-#define IP6T_UDP_INV_SRCPT	0x01	/* Invert the sense of source ports. */
-#define IP6T_UDP_INV_DSTPT	0x02	/* Invert the sense of dest ports. */
-#define IP6T_UDP_INV_MASK	0x03	/* All possible flags. */
+#define IP6T_UDP_INV_SRCPT	XT_UDP_INV_SRCPT
+#define IP6T_UDP_INV_DSTPT	XT_UDP_INV_DSTPT
+#define IP6T_UDP_INV_MASK	XT_UDP_INV_MASK
 
 /* ICMP matching stuff */
 struct ip6t_icmp
@@ -264,23 +255,14 @@ struct ip6t_replace
 	/* Number of counters (must be equal to current number of entries). */
 	unsigned int num_counters;
 	/* The old entries' counters. */
-	struct ip6t_counters __user *counters;
+	struct xt_counters __user *counters;
 
 	/* The entries (hang off end: not really an array). */
 	struct ip6t_entry entries[0];
 };
 
 /* The argument to IP6T_SO_ADD_COUNTERS. */
-struct ip6t_counters_info
-{
-	/* Which table. */
-	char name[IP6T_TABLE_MAXNAMELEN];
-
-	unsigned int num_counters;
-
-	/* The counters (actually `number' of these). */
-	struct ip6t_counters counters[0];
-};
+#define ip6t_counters_info xt_counters_info
 
 /* The argument to IP6T_SO_GET_ENTRIES. */
 struct ip6t_get_entries
@@ -295,19 +277,10 @@ struct ip6t_get_entries
 	struct ip6t_entry entrytable[0];
 };
 
-/* The argument to IP6T_SO_GET_REVISION_*.  Returns highest revision
- * kernel supports, if >= revision. */
-struct ip6t_get_revision
-{
-	char name[IP6T_FUNCTION_MAXNAMELEN-1];
-
-	u_int8_t revision;
-};
-
 /* Standard return verdict, or do jump. */
-#define IP6T_STANDARD_TARGET ""
+#define IP6T_STANDARD_TARGET XT_STANDARD_TARGET
 /* Error verdict. */
-#define IP6T_ERROR_TARGET "ERROR"
+#define IP6T_ERROR_TARGET XT_ERROR_TARGET
 
 /* Helper functions */
 static __inline__ struct ip6t_entry_target *
@@ -361,104 +334,11 @@ ip6t_get_target(struct ip6t_entry *e)
 #include <linux/init.h>
 extern void ip6t_init(void) __init;
 
-struct ip6t_match
-{
-	struct list_head list;
-
-	const char name[IP6T_FUNCTION_MAXNAMELEN-1];
-
-	u_int8_t revision;
-
-	/* Return true or false: return FALSE and set *hotdrop = 1 to
-           force immediate packet drop. */
-	/* Arguments changed since 2.6.9, as this must now handle
-	   non-linear skb, using skb_header_pointer and
-	   skb_ip_make_writable. */
-	int (*match)(const struct sk_buff *skb,
-		     const struct net_device *in,
-		     const struct net_device *out,
-		     const void *matchinfo,
-		     int offset,
-		     unsigned int protoff,
-		     int *hotdrop);
-
-	/* Called when user tries to insert an entry of this type. */
-	/* Should return true or false. */
-	int (*checkentry)(const char *tablename,
-			  const struct ip6t_ip6 *ip,
-			  void *matchinfo,
-			  unsigned int matchinfosize,
-			  unsigned int hook_mask);
-
-	/* Called when entry of this type deleted. */
-	void (*destroy)(void *matchinfo, unsigned int matchinfosize);
-
-	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
-	struct module *me;
-};
-
-/* Registration hooks for targets. */
-struct ip6t_target
-{
-	struct list_head list;
-
-	const char name[IP6T_FUNCTION_MAXNAMELEN-1];
-
-	u_int8_t revision;
-
-	/* Returns verdict. Argument order changed since 2.6.9, as this
-	   must now handle non-linear skbs, using skb_copy_bits and
-	   skb_ip_make_writable. */
-	unsigned int (*target)(struct sk_buff **pskb,
-			       const struct net_device *in,
-			       const struct net_device *out,
-			       unsigned int hooknum,
-			       const void *targinfo,
-			       void *userdata);
-
-	/* Called when user tries to insert an entry of this type:
-           hook_mask is a bitmask of hooks from which it can be
-           called. */
-	/* Should return true or false. */
-	int (*checkentry)(const char *tablename,
-			  const struct ip6t_entry *e,
-			  void *targinfo,
-			  unsigned int targinfosize,
-			  unsigned int hook_mask);
-
-	/* Called when entry of this type deleted. */
-	void (*destroy)(void *targinfo, unsigned int targinfosize);
-
-	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
-	struct module *me;
-};
-
-extern int ip6t_register_target(struct ip6t_target *target);
-extern void ip6t_unregister_target(struct ip6t_target *target);
-
-extern int ip6t_register_match(struct ip6t_match *match);
-extern void ip6t_unregister_match(struct ip6t_match *match);
+#define ip6t_register_target(tgt) xt_register_target(AF_INET6, tgt)
+#define ip6t_unregister_target(tgt) xt_unregister_target(AF_INET6, tgt)
 
-/* Furniture shopping... */
-struct ip6t_table
-{
-	struct list_head list;
-
-	/* A unique name... */
-	char name[IP6T_TABLE_MAXNAMELEN];
-
-	/* What hooks you will enter on */
-	unsigned int valid_hooks;
-
-	/* Lock for the curtain */
-	rwlock_t lock;
-
-	/* Man behind the curtain... */
-	struct ip6t_table_info *private;
-
-	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
-	struct module *me;
-};
+#define ip6t_register_match(match) xt_register_match(AF_INET6, match)
+#define ip6t_unregister_match(match) xt_unregister_match(AF_INET6, match)
 
 extern int ip6t_register_table(struct ip6t_table *table,
 			       const struct ip6t_replace *repl);
diff --git a/include/linux/netfilter_ipv6/ip6t_MARK.h b/include/linux/netfilter_ipv6/ip6t_MARK.h
index 7ade8d8f5246..7cf629a8ab92 100644
--- a/include/linux/netfilter_ipv6/ip6t_MARK.h
+++ b/include/linux/netfilter_ipv6/ip6t_MARK.h
@@ -1,8 +1,9 @@
 #ifndef _IP6T_MARK_H_target
 #define _IP6T_MARK_H_target
 
-struct ip6t_mark_target_info {
-	unsigned long mark;
-};
+/* Backwards compatibility for old userspace */
+#include <linux/netfilter/xt_MARK.h>
 
-#endif /*_IPT_MARK_H_target*/
+#define ip6t_mark_target_info xt_mark_target_info
+
+#endif /*_IP6T_MARK_H_target*/
diff --git a/include/linux/netfilter_ipv6/ip6t_length.h b/include/linux/netfilter_ipv6/ip6t_length.h
index 7fc09f9f9d63..9e9689d03ed7 100644
--- a/include/linux/netfilter_ipv6/ip6t_length.h
+++ b/include/linux/netfilter_ipv6/ip6t_length.h
@@ -1,10 +1,8 @@
 #ifndef _IP6T_LENGTH_H
 #define _IP6T_LENGTH_H
 
-struct ip6t_length_info {
-	u_int16_t  min, max;
-	u_int8_t   invert;
-};
+#include <linux/netfilter/xt_length.h>
+#define ip6t_length_info xt_length_info
 
 #endif /*_IP6T_LENGTH_H*/
 	
diff --git a/include/linux/netfilter_ipv6/ip6t_limit.h b/include/linux/netfilter_ipv6/ip6t_limit.h
index f2866e50f3b4..487e5ea342c6 100644
--- a/include/linux/netfilter_ipv6/ip6t_limit.h
+++ b/include/linux/netfilter_ipv6/ip6t_limit.h
@@ -1,21 +1,8 @@
 #ifndef _IP6T_RATE_H
 #define _IP6T_RATE_H
 
-/* timings are in milliseconds. */
-#define IP6T_LIMIT_SCALE 10000
+#include <linux/netfilter/xt_limit.h>
+#define IP6T_LIMIT_SCALE XT_LIMIT_SCALE
+#define ip6t_rateinfo xt_rateinfo
 
-/* 1/10,000 sec period => max of 10,000/sec.  Min rate is then 429490
-   seconds, or one every 59 hours. */
-struct ip6t_rateinfo {
-	u_int32_t avg;    /* Average secs between packets * scale */
-	u_int32_t burst;  /* Period multiplier for upper limit. */
-
-	/* Used internally by the kernel */
-	unsigned long prev;
-	u_int32_t credit;
-	u_int32_t credit_cap, cost;
-
-	/* Ugly, ugly fucker. */
-	struct ip6t_rateinfo *master;
-};
-#endif /*_IPT_RATE_H*/
+#endif /*_IP6T_RATE_H*/
diff --git a/include/linux/netfilter_ipv6/ip6t_mac.h b/include/linux/netfilter_ipv6/ip6t_mac.h
index 87c088c21848..ac58e83e9423 100644
--- a/include/linux/netfilter_ipv6/ip6t_mac.h
+++ b/include/linux/netfilter_ipv6/ip6t_mac.h
@@ -1,8 +1,7 @@
 #ifndef _IP6T_MAC_H
 #define _IP6T_MAC_H
 
-struct ip6t_mac_info {
-    unsigned char srcaddr[ETH_ALEN];
-    int invert;
-};
-#endif /*_IPT_MAC_H*/
+#include <linux/netfilter/xt_mac.h>
+#define ip6t_mac_info xt_mac_info
+
+#endif /*_IP6T_MAC_H*/
diff --git a/include/linux/netfilter_ipv6/ip6t_mark.h b/include/linux/netfilter_ipv6/ip6t_mark.h
index a734441e1c19..ff204951ddc3 100644
--- a/include/linux/netfilter_ipv6/ip6t_mark.h
+++ b/include/linux/netfilter_ipv6/ip6t_mark.h
@@ -1,9 +1,9 @@
 #ifndef _IP6T_MARK_H
 #define _IP6T_MARK_H
 
-struct ip6t_mark_info {
-    unsigned long mark, mask;
-    u_int8_t invert;
-};
+/* Backwards compatibility for old userspace */
+#include <linux/netfilter/xt_mark.h>
+
+#define ip6t_mark_info xt_mark_info
 
 #endif /*_IPT_MARK_H*/
diff --git a/include/linux/netfilter_ipv6/ip6t_physdev.h b/include/linux/netfilter_ipv6/ip6t_physdev.h
index c234731cd66b..c161c0a81b55 100644
--- a/include/linux/netfilter_ipv6/ip6t_physdev.h
+++ b/include/linux/netfilter_ipv6/ip6t_physdev.h
@@ -1,24 +1,17 @@
 #ifndef _IP6T_PHYSDEV_H
 #define _IP6T_PHYSDEV_H
 
-#ifdef __KERNEL__
-#include <linux/if.h>
-#endif
+/* Backwards compatibility for old userspace */
 
-#define IP6T_PHYSDEV_OP_IN		0x01
-#define IP6T_PHYSDEV_OP_OUT		0x02
-#define IP6T_PHYSDEV_OP_BRIDGED		0x04
-#define IP6T_PHYSDEV_OP_ISIN		0x08
-#define IP6T_PHYSDEV_OP_ISOUT		0x10
-#define IP6T_PHYSDEV_OP_MASK		(0x20 - 1)
+#include <linux/netfilter/xt_physdev.h>
 
-struct ip6t_physdev_info {
-	char physindev[IFNAMSIZ];
-	char in_mask[IFNAMSIZ];
-	char physoutdev[IFNAMSIZ];
-	char out_mask[IFNAMSIZ];
-	u_int8_t invert;
-	u_int8_t bitmask;
-};
+#define IP6T_PHYSDEV_OP_IN		XT_PHYSDEV_OP_IN
+#define IP6T_PHYSDEV_OP_OUT		XT_PHYSDEV_OP_OUT
+#define IP6T_PHYSDEV_OP_BRIDGED		XT_PHYSDEV_OP_BRIDGED
+#define IP6T_PHYSDEV_OP_ISIN		XT_PHYSDEV_OP_ISIN
+#define IP6T_PHYSDEV_OP_ISOUT		XT_PHYSDEV_OP_ISOUT
+#define IP6T_PHYSDEV_OP_MASK		XT_PHYSDEV_OP_MASK
+
+#define ip6t_physdev_info xt_physdev_info
 
 #endif /*_IP6T_PHYSDEV_H*/
diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index 25b081a730e6..91684436af8e 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -37,7 +37,4 @@ struct nf_conntrack_ipv4 {
 struct sk_buff *
 nf_ct_ipv4_ct_gather_frags(struct sk_buff *skb);
 
-/* call to create an explicit dependency on nf_conntrack_l3proto_ipv4. */
-extern void need_ip_conntrack(void);
-
 #endif /*_NF_CONNTRACK_IPV4_H*/
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 64b82b74a650..6d075ca16e6e 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -221,9 +221,6 @@ extern void nf_ct_helper_put(struct nf_conntrack_helper *helper);
 extern struct nf_conntrack_helper *
 __nf_conntrack_helper_find_byname(const char *name);
 
-/* call to create an explicit dependency on nf_conntrack. */
-extern void need_nf_conntrack(void);
-
 extern int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
 				const struct nf_conntrack_tuple *orig);
 
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 9f6e0193ae10..a29c1232c420 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -15,6 +15,7 @@
 #include <linux/netfilter.h>
 #include <linux/module.h>
 #include <linux/ip.h>
+#include <linux/in.h>
 #include <linux/if_arp.h>
 #include <linux/spinlock.h>
 
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index a9893ec03e02..db783036e4d8 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -182,6 +182,7 @@ config IP_NF_QUEUE
 
 config IP_NF_IPTABLES
 	tristate "IP tables support (required for filtering/masq/NAT)"
+	depends on NETFILTER_XTABLES
 	help
 	  iptables is a general, extensible packet identification framework.
 	  The packet filtering and full NAT (masquerading, port forwarding,
@@ -191,16 +192,6 @@ config IP_NF_IPTABLES
 	  To compile it as a module, choose M here.  If unsure, say N.
 
 # The matches.
-config IP_NF_MATCH_LIMIT
-	tristate "limit match support"
-	depends on IP_NF_IPTABLES
-	help
-	  limit matching allows you to control the rate at which a rule can be
-	  matched: mainly useful in combination with the LOG target ("LOG
-	  target support", below) and to avoid some Denial of Service attacks.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config IP_NF_MATCH_IPRANGE
 	tristate "IP range match support"
 	depends on IP_NF_IPTABLES
@@ -210,37 +201,6 @@ config IP_NF_MATCH_IPRANGE
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP_NF_MATCH_MAC
-	tristate "MAC address match support"
-	depends on IP_NF_IPTABLES
-	help
-	  MAC matching allows you to match packets based on the source
-	  Ethernet address of the packet.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
-config IP_NF_MATCH_PKTTYPE
-	tristate "Packet type match support"
-	depends on IP_NF_IPTABLES
-	help
-	  Packet type matching allows you to match a packet by
-	  its "class", eg. BROADCAST, MULTICAST, ...
-
-	  Typical usage:
-	  iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
-config IP_NF_MATCH_MARK
-	tristate "netfilter MARK match support"
-	depends on IP_NF_IPTABLES
-	help
-	  Netfilter mark matching allows you to match packets based on the
-	  `nfmark' value in the packet.  This can be set by the MARK target
-	  (see below).
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config IP_NF_MATCH_MULTIPORT
 	tristate "Multiple port match support"
 	depends on IP_NF_IPTABLES
@@ -301,15 +261,6 @@ config IP_NF_MATCH_AH_ESP
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP_NF_MATCH_LENGTH
-	tristate "LENGTH match support"
-	depends on IP_NF_IPTABLES
-	help
-	  This option allows you to match the length of a packet against a
-	  specific value or range of values.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config IP_NF_MATCH_TTL
 	tristate "TTL match support"
 	depends on IP_NF_IPTABLES
@@ -319,50 +270,6 @@ config IP_NF_MATCH_TTL
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP_NF_MATCH_TCPMSS
-	tristate "tcpmss match support"
-	depends on IP_NF_IPTABLES
-	help
-	  This option adds a `tcpmss' match, which allows you to examine the
-	  MSS value of TCP SYN packets, which control the maximum packet size
-	  for that connection.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
-config IP_NF_MATCH_HELPER
-	tristate "Helper match support"
-	depends on IP_NF_IPTABLES
-	depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4
-	help
-	  Helper matching allows you to match packets in dynamic connections
-	  tracked by a conntrack-helper, ie. ip_conntrack_ftp
-
-	  To compile it as a module, choose M here.  If unsure, say Y.
-
-config IP_NF_MATCH_STATE
-	tristate "Connection state match support"
-	depends on IP_NF_IPTABLES
-	depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4
-	help
-	  Connection state matching allows you to match packets based on their
-	  relationship to a tracked connection (ie. previous packets).  This
-	  is a powerful tool for packet classification.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
-config IP_NF_MATCH_CONNTRACK
-	tristate "Connection tracking match support"
-	depends on IP_NF_IPTABLES
-	depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4
-	help
-	  This is a general conntrack match module, a superset of the state match.
-
-	  It allows matching on additional conntrack information, which is
-	  useful in complex configurations, such as NAT gateways with multiple
-	  internet links or tunnels.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config IP_NF_MATCH_OWNER
 	tristate "Owner match support"
 	depends on IP_NF_IPTABLES
@@ -372,15 +279,6 @@ config IP_NF_MATCH_OWNER
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP_NF_MATCH_PHYSDEV
-	tristate "Physdev match support"
-	depends on IP_NF_IPTABLES && BRIDGE_NETFILTER
-	help
-	  Physdev packet matching matches against the physical bridge ports
-	  the IP packet arrived on or will leave by.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config IP_NF_MATCH_ADDRTYPE
 	tristate  'address type match support'
 	depends on IP_NF_IPTABLES
@@ -391,75 +289,6 @@ config IP_NF_MATCH_ADDRTYPE
 	  If you want to compile it as a module, say M here and read
 	  <file:Documentation/modules.txt>.  If unsure, say `N'.
 
-config IP_NF_MATCH_REALM
-	tristate  'realm match support'
-	depends on IP_NF_IPTABLES
-	select NET_CLS_ROUTE
-	help
-	  This option adds a `realm' match, which allows you to use the realm
-	  key from the routing subsystem inside iptables.
-	
-	  This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option 
-	  in tc world.
-	
-	  If you want to compile it as a module, say M here and read
-	  <file:Documentation/modules.txt>.  If unsure, say `N'.
-
-config IP_NF_MATCH_SCTP
-	tristate  'SCTP protocol match support'
-	depends on IP_NF_IPTABLES
-	help
-	  With this option enabled, you will be able to use the iptables
-	  `sctp' match in order to match on SCTP source/destination ports
-	  and SCTP chunk types.
-
-	  If you want to compile it as a module, say M here and read
-	  <file:Documentation/modules.txt>.  If unsure, say `N'.
-
-config IP_NF_MATCH_DCCP
-	tristate  'DCCP protocol match support'
-	depends on IP_NF_IPTABLES
-	help
-	  With this option enabled, you will be able to use the iptables
-	  `dccp' match in order to match on DCCP source/destination ports
-	  and DCCP flags.
-
-	  If you want to compile it as a module, say M here and read
-	  <file:Documentation/modules.txt>.  If unsure, say `N'.
-
-config IP_NF_MATCH_COMMENT
-	tristate  'comment match support'
-	depends on IP_NF_IPTABLES
-	help
-	  This option adds a `comment' dummy-match, which allows you to put
-	  comments in your iptables ruleset.
-
-	  If you want to compile it as a module, say M here and read
-	  <file:Documentation/modules.txt>.  If unsure, say `N'.
-
-config IP_NF_MATCH_CONNMARK
-	tristate  'Connection mark match support'
-	depends on IP_NF_IPTABLES
-	depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4)
-	help
-	  This option adds a `connmark' match, which allows you to match the
-	  connection mark value previously set for the session by `CONNMARK'. 
-	
-	  If you want to compile it as a module, say M here and read
-	  <file:Documentation/modules.txt>.  The module will be called
-	  ipt_connmark.o.  If unsure, say `N'.
-
-config IP_NF_MATCH_CONNBYTES
-	tristate  'Connection byte/packet counter match support'
-	depends on IP_NF_IPTABLES
-	depends on (IP_NF_CONNTRACK && IP_NF_CT_ACCT) || (NF_CT_ACCT && NF_CONNTRACK_IPV4)
-	help
-	  This option adds a `connbytes' match, which allows you to match the
-	  number of bytes and/or packets for each direction within a connection.
-
-	  If you want to compile it as a module, say M here and read
-	  <file:Documentation/modules.txt>.  If unsure, say `N'.
-
 config IP_NF_MATCH_HASHLIMIT
 	tristate  'hashlimit match support'
 	depends on IP_NF_IPTABLES
@@ -474,19 +303,6 @@ config IP_NF_MATCH_HASHLIMIT
 	  destination IP' or `500pps from any given source IP'  with a single
 	  IPtables rule.
 
-config IP_NF_MATCH_STRING
-	tristate  'string match support'
-	depends on IP_NF_IPTABLES 
-	select TEXTSEARCH
-	select TEXTSEARCH_KMP
-	select TEXTSEARCH_BM
-	select TEXTSEARCH_FSM
-	help
-	  This option adds a `string' match, which allows you to look for
-	  pattern matchings in packets.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config IP_NF_MATCH_POLICY
        tristate "IPsec policy match support"
        depends on IP_NF_IPTABLES && XFRM
@@ -572,17 +388,6 @@ config IP_NF_TARGET_TCPMSS
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP_NF_TARGET_NFQUEUE
-	tristate "NFQUEUE Target Support"
-	depends on IP_NF_IPTABLES
-	help
-	  This Target replaced the old obsolete QUEUE target.
-
-	  As opposed to QUEUE, it supports 65535 different queues,
-	  not just one.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 # NAT + specific targets
 config IP_NF_NAT
 	tristate "Full NAT"
@@ -735,31 +540,6 @@ config IP_NF_TARGET_DSCP
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP_NF_TARGET_MARK
-	tristate "MARK target support"
-	depends on IP_NF_MANGLE
-	help
-	  This option adds a `MARK' target, which allows you to create rules
-	  in the `mangle' table which alter the netfilter mark (nfmark) field
-	  associated with the packet prior to routing. This can change
-	  the routing method (see `Use netfilter MARK value as routing
-	  key') and can also be used by other subsystems to change their
-	  behavior.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
-config IP_NF_TARGET_CLASSIFY
-	tristate "CLASSIFY target support"
-	depends on IP_NF_MANGLE
-	help
-	  This option adds a `CLASSIFY' target, which enables the user to set
-	  the priority of a packet. Some qdiscs can use this value for
-	  classification, among these are:
-
-  	  atm, cbq, dsmark, pfifo_fast, htb, prio
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config IP_NF_TARGET_TTL
 	tristate  'TTL target support'
 	depends on IP_NF_MANGLE
@@ -774,19 +554,6 @@ config IP_NF_TARGET_TTL
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP_NF_TARGET_CONNMARK
-	tristate  'CONNMARK target support'
-	depends on IP_NF_MANGLE
-	depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4)
-	help
-	  This option adds a `CONNMARK' target, which allows one to manipulate
-	  the connection mark value.  Similar to the MARK target, but
-	  affects the connection mark value rather than the packet mark value.
-	
-	  If you want to compile it as a module, say M here and read
-	  <file:Documentation/modules.txt>.  The module will be called
-	  ipt_CONNMARK.o.  If unsure, say `N'.
-
 config IP_NF_TARGET_CLUSTERIP
 	tristate "CLUSTERIP target support (EXPERIMENTAL)"
 	depends on IP_NF_MANGLE && EXPERIMENTAL
@@ -810,23 +577,10 @@ config IP_NF_RAW
 	  If you want to compile it as a module, say M here and read
 	  <file:Documentation/modules.txt>.  If unsure, say `N'.
 
-config IP_NF_TARGET_NOTRACK
-	tristate  'NOTRACK target support'
-	depends on IP_NF_RAW
-	depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4
-	help
-	  The NOTRACK target allows a select rule to specify
-	  which packets *not* to enter the conntrack/NAT
-	  subsystem with all the consequences (no ICMP error tracking,
-	  no protocol helpers for the selected packets).
-	
-	  If you want to compile it as a module, say M here and read
-	  <file:Documentation/modules.txt>.  If unsure, say `N'.
-
-
 # ARP tables
 config IP_NF_ARPTABLES
 	tristate "ARP tables support"
+	depends on NETFILTER_XTABLES
 	help
 	  arptables is a general, extensible packet identification framework.
 	  The ARP packet filtering and mangling (manipulation)subsystems
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 549b01a648b3..bcefe64b9317 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -47,14 +47,8 @@ obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
 
 # matches
 obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o
-obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o
 obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o
-obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o
-obj-$(CONFIG_IP_NF_MATCH_DCCP) += ipt_dccp.o
-obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o
-obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o
 obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
-obj-$(CONFIG_IP_NF_MATCH_PKTTYPE) += ipt_pkttype.o
 obj-$(CONFIG_IP_NF_MATCH_MULTIPORT) += ipt_multiport.o
 obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o
 obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o
@@ -62,40 +56,25 @@ obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o
 obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
 obj-$(CONFIG_IP_NF_MATCH_DSCP) += ipt_dscp.o
 obj-$(CONFIG_IP_NF_MATCH_AH_ESP) += ipt_ah.o ipt_esp.o
-obj-$(CONFIG_IP_NF_MATCH_LENGTH) += ipt_length.o
 obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
-obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o
-obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o
-obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o
-obj-$(CONFIG_IP_NF_MATCH_CONNBYTES) += ipt_connbytes.o
-obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o
-obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o
 obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
-obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
 obj-$(CONFIG_IP_NF_MATCH_POLICY) += ipt_policy.o
-obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o
-obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o
 
 # targets
 obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
 obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o
 obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
 obj-$(CONFIG_IP_NF_TARGET_DSCP) += ipt_DSCP.o
-obj-$(CONFIG_IP_NF_TARGET_MARK) += ipt_MARK.o
 obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
 obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
 obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
 obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o
-obj-$(CONFIG_IP_NF_TARGET_CLASSIFY) += ipt_CLASSIFY.o
 obj-$(CONFIG_IP_NF_NAT_SNMP_BASIC) += ip_nat_snmp_basic.o
 obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o
-obj-$(CONFIG_IP_NF_TARGET_CONNMARK) += ipt_CONNMARK.o
 obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
 obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
-obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o
 obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
 obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o
-obj-$(CONFIG_IP_NF_TARGET_NFQUEUE) += ipt_NFQUEUE.o
 
 # generic ARP tables
 obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index b6d5284c8020..afe3d8f8177d 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -24,6 +24,7 @@
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
 
+#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_arp/arp_tables.h>
 
 MODULE_LICENSE("GPL");
@@ -55,28 +56,9 @@ do {								\
 #else
 #define ARP_NF_ASSERT(x)
 #endif
-#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
 
-static DECLARE_MUTEX(arpt_mutex);
-
-#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
-#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
 #include <linux/netfilter_ipv4/listhelp.h>
 
-struct arpt_table_info {
-	unsigned int size;
-	unsigned int number;
-	unsigned int initial_entries;
-	unsigned int hook_entry[NF_ARP_NUMHOOKS];
-	unsigned int underflow[NF_ARP_NUMHOOKS];
-	void *entries[NR_CPUS];
-};
-
-static LIST_HEAD(arpt_target);
-static LIST_HEAD(arpt_tables);
-#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
-#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
-
 static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
 				      char *hdr_addr, int len)
 {
@@ -223,9 +205,9 @@ static inline int arp_checkentry(const struct arpt_arp *arp)
 }
 
 static unsigned int arpt_error(struct sk_buff **pskb,
-			       unsigned int hooknum,
 			       const struct net_device *in,
 			       const struct net_device *out,
+			       unsigned int hooknum,
 			       const void *targinfo,
 			       void *userinfo)
 {
@@ -254,6 +236,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
 	struct arpt_entry *e, *back;
 	const char *indev, *outdev;
 	void *table_base;
+	struct xt_table_info *private = table->private;
 
 	/* ARP header, plus 2 device addresses, plus 2 IP addresses.  */
 	if (!pskb_may_pull((*pskb), (sizeof(struct arphdr) +
@@ -265,9 +248,9 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
 	outdev = out ? out->name : nulldevname;
 
 	read_lock_bh(&table->lock);
-	table_base = (void *)table->private->entries[smp_processor_id()];
-	e = get_entry(table_base, table->private->hook_entry[hook]);
-	back = get_entry(table_base, table->private->underflow[hook]);
+	table_base = (void *)private->entries[smp_processor_id()];
+	e = get_entry(table_base, private->hook_entry[hook]);
+	back = get_entry(table_base, private->underflow[hook]);
 
 	arp = (*pskb)->nh.arph;
 	do {
@@ -315,8 +298,8 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
 				 * abs. verdicts
 				 */
 				verdict = t->u.kernel.target->target(pskb,
-								     hook,
 								     in, out,
+								     hook,
 								     t->data,
 								     userdata);
 
@@ -341,106 +324,6 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
 		return verdict;
 }
 
-/*
- * These are weird, but module loading must not be done with mutex
- * held (since they will register), and we have to have a single
- * function to use try_then_request_module().
- */
-
-/* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
-static inline struct arpt_table *find_table_lock(const char *name)
-{
-	struct arpt_table *t;
-
-	if (down_interruptible(&arpt_mutex) != 0)
-		return ERR_PTR(-EINTR);
-
-	list_for_each_entry(t, &arpt_tables, list)
-		if (strcmp(t->name, name) == 0 && try_module_get(t->me))
-			return t;
-	up(&arpt_mutex);
-	return NULL;
-}
-
-
-/* Find target, grabs ref.  Returns ERR_PTR() on error. */
-static inline struct arpt_target *find_target(const char *name, u8 revision)
-{
-	struct arpt_target *t;
-	int err = 0;
-
-	if (down_interruptible(&arpt_mutex) != 0)
-		return ERR_PTR(-EINTR);
-
-	list_for_each_entry(t, &arpt_target, list) {
-		if (strcmp(t->name, name) == 0) {
-			if (t->revision == revision) {
-				if (try_module_get(t->me)) {
-					up(&arpt_mutex);
-					return t;
-				}
-			} else
-				err = -EPROTOTYPE; /* Found something. */
-		}
-	}
-	up(&arpt_mutex);
-	return ERR_PTR(err);
-}
-
-struct arpt_target *arpt_find_target(const char *name, u8 revision)
-{
-	struct arpt_target *target;
-
-	target = try_then_request_module(find_target(name, revision),
-					 "arpt_%s", name);
-	if (IS_ERR(target) || !target)
-		return NULL;
-	return target;
-}
-
-static int target_revfn(const char *name, u8 revision, int *bestp)
-{
-	struct arpt_target *t;
-	int have_rev = 0;
-
-	list_for_each_entry(t, &arpt_target, list) {
-		if (strcmp(t->name, name) == 0) {
-			if (t->revision > *bestp)
-				*bestp = t->revision;
-			if (t->revision == revision)
-				have_rev =1;
-		}
-	}
-	return have_rev;
-}
-
-/* Returns true or false (if no such extension at all) */
-static inline int find_revision(const char *name, u8 revision,
-				int (*revfn)(const char *, u8, int *),
-				int *err)
-{
-	int have_rev, best = -1;
-
-	if (down_interruptible(&arpt_mutex) != 0) {
-		*err = -EINTR;
-		return 1;
-	}
-	have_rev = revfn(name, revision, &best);
-	up(&arpt_mutex);
-
-	/* Nothing at all?  Return 0 to try loading module. */
-	if (best == -1) {
-		*err = -ENOENT;
-		return 0;
-	}
-
-	*err = best;
-	if (!have_rev)
-		*err = -EPROTONOSUPPORT;
-	return 1;
-}
-
-
 /* All zeroes == unconditional rule. */
 static inline int unconditional(const struct arpt_arp *arp)
 {
@@ -456,7 +339,7 @@ static inline int unconditional(const struct arpt_arp *arp)
 /* Figures out from what hook each rule can be called: returns 0 if
  * there are loops.  Puts hook bitmask in comefrom.
  */
-static int mark_source_chains(struct arpt_table_info *newinfo,
+static int mark_source_chains(struct xt_table_info *newinfo,
 			      unsigned int valid_hooks, void *entry0)
 {
 	unsigned int hook;
@@ -587,8 +470,8 @@ static inline int check_entry(struct arpt_entry *e, const char *name, unsigned i
 	}
 
 	t = arpt_get_target(e);
-	target = try_then_request_module(find_target(t->u.user.name,
-						     t->u.user.revision),
+	target = try_then_request_module(xt_find_target(NF_ARP, t->u.user.name,
+							t->u.user.revision),
 					 "arpt_%s", t->u.user.name);
 	if (IS_ERR(target) || !target) {
 		duprintf("check_entry: `%s' not found\n", t->u.user.name);
@@ -622,7 +505,7 @@ out:
 }
 
 static inline int check_entry_size_and_hooks(struct arpt_entry *e,
-					     struct arpt_table_info *newinfo,
+					     struct xt_table_info *newinfo,
 					     unsigned char *base,
 					     unsigned char *limit,
 					     const unsigned int *hook_entries,
@@ -656,7 +539,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
            < 0 (not ARPT_RETURN). --RR */
 
 	/* Clear counters and comefrom */
-	e->counters = ((struct arpt_counters) { 0, 0 });
+	e->counters = ((struct xt_counters) { 0, 0 });
 	e->comefrom = 0;
 
 	(*i)++;
@@ -683,7 +566,7 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
  */
 static int translate_table(const char *name,
 			   unsigned int valid_hooks,
-			   struct arpt_table_info *newinfo,
+			   struct xt_table_info *newinfo,
 			   void *entry0,
 			   unsigned int size,
 			   unsigned int number,
@@ -764,34 +647,9 @@ static int translate_table(const char *name,
 	return ret;
 }
 
-static struct arpt_table_info *replace_table(struct arpt_table *table,
-					     unsigned int num_counters,
-					     struct arpt_table_info *newinfo,
-					     int *error)
-{
-	struct arpt_table_info *oldinfo;
-
-	/* Do the substitution. */
-	write_lock_bh(&table->lock);
-	/* Check inside lock: is the old number correct? */
-	if (num_counters != table->private->number) {
-		duprintf("num_counters != table->private->number (%u/%u)\n",
-			 num_counters, table->private->number);
-		write_unlock_bh(&table->lock);
-		*error = -EAGAIN;
-		return NULL;
-	}
-	oldinfo = table->private;
-	table->private = newinfo;
-	newinfo->initial_entries = oldinfo->initial_entries;
-	write_unlock_bh(&table->lock);
-
-	return oldinfo;
-}
-
 /* Gets counters. */
 static inline int add_entry_to_counter(const struct arpt_entry *e,
-				       struct arpt_counters total[],
+				       struct xt_counters total[],
 				       unsigned int *i)
 {
 	ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
@@ -801,7 +659,7 @@ static inline int add_entry_to_counter(const struct arpt_entry *e,
 }
 
 static inline int set_entry_to_counter(const struct arpt_entry *e,
-				       struct arpt_counters total[],
+				       struct xt_counters total[],
 				       unsigned int *i)
 {
 	SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
@@ -810,8 +668,8 @@ static inline int set_entry_to_counter(const struct arpt_entry *e,
 	return 0;
 }
 
-static void get_counters(const struct arpt_table_info *t,
-			 struct arpt_counters counters[])
+static void get_counters(const struct xt_table_info *t,
+			 struct xt_counters counters[])
 {
 	unsigned int cpu;
 	unsigned int i;
@@ -849,7 +707,8 @@ static int copy_entries_to_user(unsigned int total_size,
 {
 	unsigned int off, num, countersize;
 	struct arpt_entry *e;
-	struct arpt_counters *counters;
+	struct xt_counters *counters;
+	struct xt_table_info *private = table->private;
 	int ret = 0;
 	void *loc_cpu_entry;
 
@@ -857,18 +716,18 @@ static int copy_entries_to_user(unsigned int total_size,
 	 * (other than comefrom, which userspace doesn't care
 	 * about).
 	 */
-	countersize = sizeof(struct arpt_counters) * table->private->number;
-	counters = vmalloc(countersize);
+	countersize = sizeof(struct xt_counters) * private->number;
+	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
 		return -ENOMEM;
 
 	/* First, sum counters... */
 	write_lock_bh(&table->lock);
-	get_counters(table->private, counters);
+	get_counters(private, counters);
 	write_unlock_bh(&table->lock);
 
-	loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries[raw_smp_processor_id()];
 	/* ... then copy entire thing ... */
 	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
 		ret = -EFAULT;
@@ -911,75 +770,34 @@ static int get_entries(const struct arpt_get_entries *entries,
 	int ret;
 	struct arpt_table *t;
 
-	t = find_table_lock(entries->name);
+	t = xt_find_table_lock(NF_ARP, entries->name);
 	if (t || !IS_ERR(t)) {
+		struct xt_table_info *private = t->private;
 		duprintf("t->private->number = %u\n",
-			 t->private->number);
-		if (entries->size == t->private->size)
-			ret = copy_entries_to_user(t->private->size,
+			 private->number);
+		if (entries->size == private->size)
+			ret = copy_entries_to_user(private->size,
 						   t, uptr->entrytable);
 		else {
 			duprintf("get_entries: I've got %u not %u!\n",
-				 t->private->size,
-				 entries->size);
+				 private->size, entries->size);
 			ret = -EINVAL;
 		}
 		module_put(t->me);
-		up(&arpt_mutex);
+		xt_table_unlock(t);
 	} else
 		ret = t ? PTR_ERR(t) : -ENOENT;
 
 	return ret;
 }
 
-static void free_table_info(struct arpt_table_info *info)
-{
-	int cpu;
-	for_each_cpu(cpu) {
-		if (info->size <= PAGE_SIZE)
-			kfree(info->entries[cpu]);
-		else
-			vfree(info->entries[cpu]);
-	}
-	kfree(info);
-}
-
-static struct arpt_table_info *alloc_table_info(unsigned int size)
-{
-	struct arpt_table_info *newinfo;
-	int cpu;
-	
-	newinfo = kzalloc(sizeof(struct arpt_table_info), GFP_KERNEL);
-	if (!newinfo)
-		return NULL;
-
-	newinfo->size = size;
-
-	for_each_cpu(cpu) {
-		if (size <= PAGE_SIZE)
-			newinfo->entries[cpu] = kmalloc_node(size,
-							GFP_KERNEL,
-							cpu_to_node(cpu));
-		else
-			newinfo->entries[cpu] = vmalloc_node(size,
-							     cpu_to_node(cpu));
-
-		if (newinfo->entries[cpu] == NULL) {
-			free_table_info(newinfo);
-			return NULL;
-		}
-	}
-
-	return newinfo;
-}
-
 static int do_replace(void __user *user, unsigned int len)
 {
 	int ret;
 	struct arpt_replace tmp;
 	struct arpt_table *t;
-	struct arpt_table_info *newinfo, *oldinfo;
-	struct arpt_counters *counters;
+	struct xt_table_info *newinfo, *oldinfo;
+	struct xt_counters *counters;
 	void *loc_cpu_entry, *loc_cpu_old_entry;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
@@ -989,11 +807,7 @@ static int do_replace(void __user *user, unsigned int len)
 	if (len != sizeof(tmp) + tmp.size)
 		return -ENOPROTOOPT;
 
-	/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
-	if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
-		return -ENOMEM;
-
-	newinfo = alloc_table_info(tmp.size);
+	newinfo = xt_alloc_table_info(tmp.size);
 	if (!newinfo)
 		return -ENOMEM;
 
@@ -1005,7 +819,7 @@ static int do_replace(void __user *user, unsigned int len)
 		goto free_newinfo;
 	}
 
-	counters = vmalloc(tmp.num_counters * sizeof(struct arpt_counters));
+	counters = vmalloc(tmp.num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto free_newinfo;
@@ -1019,7 +833,7 @@ static int do_replace(void __user *user, unsigned int len)
 
 	duprintf("arp_tables: Translated table\n");
 
-	t = try_then_request_module(find_table_lock(tmp.name),
+	t = try_then_request_module(xt_find_table_lock(NF_ARP, tmp.name),
 				    "arptable_%s", tmp.name);
 	if (!t || IS_ERR(t)) {
 		ret = t ? PTR_ERR(t) : -ENOENT;
@@ -1034,7 +848,7 @@ static int do_replace(void __user *user, unsigned int len)
 		goto put_module;
 	}
 
-	oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
+	oldinfo = xt_replace_table(t, tmp.num_counters, newinfo, &ret);
 	if (!oldinfo)
 		goto put_module;
 
@@ -1054,23 +868,23 @@ static int do_replace(void __user *user, unsigned int len)
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
 	ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
 
-	free_table_info(oldinfo);
+	xt_free_table_info(oldinfo);
 	if (copy_to_user(tmp.counters, counters,
-			 sizeof(struct arpt_counters) * tmp.num_counters) != 0)
+			 sizeof(struct xt_counters) * tmp.num_counters) != 0)
 		ret = -EFAULT;
 	vfree(counters);
-	up(&arpt_mutex);
+	xt_table_unlock(t);
 	return ret;
 
  put_module:
 	module_put(t->me);
-	up(&arpt_mutex);
+	xt_table_unlock(t);
  free_newinfo_counters_untrans:
 	ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
  free_newinfo_counters:
 	vfree(counters);
  free_newinfo:
-	free_table_info(newinfo);
+	xt_free_table_info(newinfo);
 	return ret;
 }
 
@@ -1078,7 +892,7 @@ static int do_replace(void __user *user, unsigned int len)
  * and everything is OK.
  */
 static inline int add_counter_to_entry(struct arpt_entry *e,
-				       const struct arpt_counters addme[],
+				       const struct xt_counters addme[],
 				       unsigned int *i)
 {
 
@@ -1091,15 +905,16 @@ static inline int add_counter_to_entry(struct arpt_entry *e,
 static int do_add_counters(void __user *user, unsigned int len)
 {
 	unsigned int i;
-	struct arpt_counters_info tmp, *paddc;
+	struct xt_counters_info tmp, *paddc;
 	struct arpt_table *t;
+	struct xt_table_info *private;
 	int ret = 0;
 	void *loc_cpu_entry;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
 		return -EFAULT;
 
-	if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct arpt_counters))
+	if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct xt_counters))
 		return -EINVAL;
 
 	paddc = vmalloc(len);
@@ -1111,29 +926,30 @@ static int do_add_counters(void __user *user, unsigned int len)
 		goto free;
 	}
 
-	t = find_table_lock(tmp.name);
+	t = xt_find_table_lock(NF_ARP, tmp.name);
 	if (!t || IS_ERR(t)) {
 		ret = t ? PTR_ERR(t) : -ENOENT;
 		goto free;
 	}
 
 	write_lock_bh(&t->lock);
-	if (t->private->number != paddc->num_counters) {
+	private = t->private;
+	if (private->number != paddc->num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = t->private->entries[smp_processor_id()];
+	loc_cpu_entry = private->entries[smp_processor_id()];
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
-			   t->private->size,
+			   private->size,
 			   add_counter_to_entry,
 			   paddc->counters,
 			   &i);
  unlock_up_free:
 	write_unlock_bh(&t->lock);
-	up(&arpt_mutex);
+	xt_table_unlock(t);
 	module_put(t->me);
  free:
 	vfree(paddc);
@@ -1190,25 +1006,26 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
 		}
 		name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
 
-		t = try_then_request_module(find_table_lock(name),
+		t = try_then_request_module(xt_find_table_lock(NF_ARP, name),
 					    "arptable_%s", name);
 		if (t && !IS_ERR(t)) {
 			struct arpt_getinfo info;
+			struct xt_table_info *private = t->private;
 
 			info.valid_hooks = t->valid_hooks;
-			memcpy(info.hook_entry, t->private->hook_entry,
+			memcpy(info.hook_entry, private->hook_entry,
 			       sizeof(info.hook_entry));
-			memcpy(info.underflow, t->private->underflow,
+			memcpy(info.underflow, private->underflow,
 			       sizeof(info.underflow));
-			info.num_entries = t->private->number;
-			info.size = t->private->size;
+			info.num_entries = private->number;
+			info.size = private->size;
 			strcpy(info.name, name);
 
 			if (copy_to_user(user, &info, *len) != 0)
 				ret = -EFAULT;
 			else
 				ret = 0;
-			up(&arpt_mutex);
+			xt_table_unlock(t);
 			module_put(t->me);
 		} else
 			ret = t ? PTR_ERR(t) : -ENOENT;
@@ -1233,7 +1050,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
 	}
 
 	case ARPT_SO_GET_REVISION_TARGET: {
-		struct arpt_get_revision rev;
+		struct xt_get_revision rev;
 
 		if (*len != sizeof(rev)) {
 			ret = -EINVAL;
@@ -1244,8 +1061,8 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
 			break;
 		}
 
-		try_then_request_module(find_revision(rev.name, rev.revision,
-						      target_revfn, &ret),
+		try_then_request_module(xt_find_revision(NF_ARP, rev.name,
+							 rev.revision, 1, &ret),
 					"arpt_%s", rev.name);
 		break;
 	}
@@ -1258,38 +1075,16 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
 	return ret;
 }
 
-/* Registration hooks for targets. */
-int arpt_register_target(struct arpt_target *target)
-{
-	int ret;
-
-	ret = down_interruptible(&arpt_mutex);
-	if (ret != 0)
-		return ret;
-
-	list_add(&target->list, &arpt_target);
-	up(&arpt_mutex);
-
-	return ret;
-}
-
-void arpt_unregister_target(struct arpt_target *target)
-{
-	down(&arpt_mutex);
-	LIST_DELETE(&arpt_target, target);
-	up(&arpt_mutex);
-}
-
 int arpt_register_table(struct arpt_table *table,
 			const struct arpt_replace *repl)
 {
 	int ret;
-	struct arpt_table_info *newinfo;
-	static struct arpt_table_info bootstrap
+	struct xt_table_info *newinfo;
+	static struct xt_table_info bootstrap
 		= { 0, 0, 0, { 0 }, { 0 }, { } };
 	void *loc_cpu_entry;
 
-	newinfo = alloc_table_info(repl->size);
+	newinfo = xt_alloc_table_info(repl->size);
 	if (!newinfo) {
 		ret = -ENOMEM;
 		return ret;
@@ -1304,60 +1099,33 @@ int arpt_register_table(struct arpt_table *table,
 			      repl->num_entries,
 			      repl->hook_entry,
 			      repl->underflow);
+
 	duprintf("arpt_register_table: translate table gives %d\n", ret);
 	if (ret != 0) {
-		free_table_info(newinfo);
+		xt_free_table_info(newinfo);
 		return ret;
 	}
 
-	ret = down_interruptible(&arpt_mutex);
-	if (ret != 0) {
-		free_table_info(newinfo);
+	if (xt_register_table(table, &bootstrap, newinfo) != 0) {
+		xt_free_table_info(newinfo);
 		return ret;
 	}
 
-	/* Don't autoload: we'd eat our tail... */
-	if (list_named_find(&arpt_tables, table->name)) {
-		ret = -EEXIST;
-		goto free_unlock;
-	}
-
-	/* Simplifies replace_table code. */
-	table->private = &bootstrap;
-	if (!replace_table(table, 0, newinfo, &ret))
-		goto free_unlock;
-
-	duprintf("table->private->number = %u\n",
-		 table->private->number);
-	
-	/* save number of initial entries */
-	table->private->initial_entries = table->private->number;
-
-	rwlock_init(&table->lock);
-	list_prepend(&arpt_tables, table);
-
- unlock:
-	up(&arpt_mutex);
-	return ret;
-
- free_unlock:
-	free_table_info(newinfo);
-	goto unlock;
+	return 0;
 }
 
 void arpt_unregister_table(struct arpt_table *table)
 {
+	struct xt_table_info *private;
 	void *loc_cpu_entry;
 
-	down(&arpt_mutex);
-	LIST_DELETE(&arpt_tables, table);
-	up(&arpt_mutex);
+	private = xt_unregister_table(table);
 
 	/* Decrease module usage counts and free resources */
-	loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
-	ARPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
+	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size,
 			   cleanup_entry, NULL);
-	free_table_info(table->private);
+	xt_free_table_info(private);
 }
 
 /* The built-in targets: standard (NULL) and error. */
@@ -1380,52 +1148,15 @@ static struct nf_sockopt_ops arpt_sockopts = {
 	.get		= do_arpt_get_ctl,
 };
 
-#ifdef CONFIG_PROC_FS
-static inline int print_name(const struct arpt_table *t,
-			     off_t start_offset, char *buffer, int length,
-			     off_t *pos, unsigned int *count)
-{
-	if ((*count)++ >= start_offset) {
-		unsigned int namelen;
-
-		namelen = sprintf(buffer + *pos, "%s\n", t->name);
-		if (*pos + namelen > length) {
-			/* Stop iterating */
-			return 1;
-		}
-		*pos += namelen;
-	}
-	return 0;
-}
-
-static int arpt_get_tables(char *buffer, char **start, off_t offset, int length)
-{
-	off_t pos = 0;
-	unsigned int count = 0;
-
-	if (down_interruptible(&arpt_mutex) != 0)
-		return 0;
-
-	LIST_FIND(&arpt_tables, print_name, struct arpt_table *,
-		  offset, buffer, length, &pos, &count);
-
-	up(&arpt_mutex);
-
-	/* `start' hack - see fs/proc/generic.c line ~105 */
-	*start=(char *)((unsigned long)count-offset);
-	return pos;
-}
-#endif /*CONFIG_PROC_FS*/
-
 static int __init init(void)
 {
 	int ret;
 
+	xt_proto_init(NF_ARP);
+
 	/* Noone else will be downing sem now, so we won't sleep */
-	down(&arpt_mutex);
-	list_append(&arpt_target, &arpt_standard_target);
-	list_append(&arpt_target, &arpt_error_target);
-	up(&arpt_mutex);
+	xt_register_target(NF_ARP, &arpt_standard_target);
+	xt_register_target(NF_ARP, &arpt_error_target);
 
 	/* Register setsockopt */
 	ret = nf_register_sockopt(&arpt_sockopts);
@@ -1434,19 +1165,6 @@ static int __init init(void)
 		return ret;
 	}
 
-#ifdef CONFIG_PROC_FS
-	{
-		struct proc_dir_entry *proc;
-
-		proc = proc_net_create("arp_tables_names", 0, arpt_get_tables);
-		if (!proc) {
-			nf_unregister_sockopt(&arpt_sockopts);
-			return -ENOMEM;
-		}
-		proc->owner = THIS_MODULE;
-	}
-#endif
-
 	printk("arp_tables: (C) 2002 David S. Miller\n");
 	return 0;
 }
@@ -1454,16 +1172,12 @@ static int __init init(void)
 static void __exit fini(void)
 {
 	nf_unregister_sockopt(&arpt_sockopts);
-#ifdef CONFIG_PROC_FS
-	proc_net_remove("arp_tables_names");
-#endif
+	xt_proto_fini(NF_ARP);
 }
 
 EXPORT_SYMBOL(arpt_register_table);
 EXPORT_SYMBOL(arpt_unregister_table);
 EXPORT_SYMBOL(arpt_do_table);
-EXPORT_SYMBOL(arpt_register_target);
-EXPORT_SYMBOL(arpt_unregister_target);
 
 module_init(init);
 module_exit(fini);
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index 3e592ec86482..c97650a16a5b 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -8,8 +8,9 @@ MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
 MODULE_DESCRIPTION("arptables arp payload mangle target");
 
 static unsigned int
-target(struct sk_buff **pskb, unsigned int hooknum, const struct net_device *in,
-   const struct net_device *out, const void *targinfo, void *userinfo)
+target(struct sk_buff **pskb, const struct net_device *in,
+   const struct net_device *out, unsigned int hooknum, const void *targinfo,
+   void *userinfo)
 {
 	const struct arpt_mangle *mangle = targinfo;
 	struct arphdr *arp;
@@ -64,7 +65,7 @@ target(struct sk_buff **pskb, unsigned int hooknum, const struct net_device *in,
 }
 
 static int
-checkentry(const char *tablename, const struct arpt_entry *e, void *targinfo,
+checkentry(const char *tablename, const void *e, void *targinfo,
    unsigned int targinfosize, unsigned int hook_mask)
 {
 	const struct arpt_mangle *mangle = targinfo;
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 0d759f5a4ef0..f6ab45f48681 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -145,6 +145,7 @@ static struct arpt_table packet_filter = {
 	.lock		= RW_LOCK_UNLOCKED,
 	.private	= NULL,
 	.me		= THIS_MODULE,
+	.af		= NF_ARP,
 };
 
 /* The work comes in here from netfilter.c */
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 9dec1293f67a..833fcb4be5e7 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -944,7 +944,7 @@ module_exit(fini);
 
 /* Some modules need us, but don't depend directly on any symbol.
    They should call this. */
-void need_ip_conntrack(void)
+void need_conntrack(void)
 {
 }
 
@@ -962,7 +962,7 @@ EXPORT_SYMBOL(ip_ct_get_tuple);
 EXPORT_SYMBOL(invert_tuplepr);
 EXPORT_SYMBOL(ip_conntrack_alter_reply);
 EXPORT_SYMBOL(ip_conntrack_destroyed);
-EXPORT_SYMBOL(need_ip_conntrack);
+EXPORT_SYMBOL(need_conntrack);
 EXPORT_SYMBOL(ip_conntrack_helper_register);
 EXPORT_SYMBOL(ip_conntrack_helper_unregister);
 EXPORT_SYMBOL(ip_ct_iterate_cleanup);
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
index cb66b8bddeb3..1de86282d232 100644
--- a/net/ipv4/netfilter/ip_nat_rule.c
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -95,6 +95,7 @@ static struct ipt_table nat_table = {
 	.valid_hooks	= NAT_VALID_HOOKS,
 	.lock		= RW_LOCK_UNLOCKED,
 	.me		= THIS_MODULE,
+	.af		= AF_INET,
 };
 
 /* Source NAT */
@@ -168,7 +169,7 @@ static unsigned int ipt_dnat_target(struct sk_buff **pskb,
 }
 
 static int ipt_snat_checkentry(const char *tablename,
-			       const struct ipt_entry *e,
+			       const void *entry,
 			       void *targinfo,
 			       unsigned int targinfosize,
 			       unsigned int hook_mask)
@@ -201,7 +202,7 @@ static int ipt_snat_checkentry(const char *tablename,
 }
 
 static int ipt_dnat_checkentry(const char *tablename,
-			       const struct ipt_entry *e,
+			       const void *entry,
 			       void *targinfo,
 			       unsigned int targinfosize,
 			       unsigned int hook_mask)
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index 8b8a1f00bbf4..ad438fb185b8 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -364,7 +364,7 @@ static int init_or_cleanup(int init)
 {
 	int ret = 0;
 
-	need_ip_conntrack();
+	need_conntrack();
 
 	if (!init) goto cleanup;
 
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 877bc96d3336..2371b2062c2d 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -2,7 +2,7 @@
  * Packet matching code.
  *
  * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
- * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -11,6 +11,8 @@
  * 19 Jan 2002 Harald Welte <laforge@gnumonks.org>
  * 	- increase module usage count as soon as we have rules inside
  * 	  a table
+ * 08 Oct 2005 Harald Welte <lafore@netfilter.org>
+ * 	- Generalize into "x_tables" layer and "{ip,ip6,arp}_tables"
  */
 #include <linux/config.h>
 #include <linux/cache.h>
@@ -20,8 +22,6 @@
 #include <linux/vmalloc.h>
 #include <linux/netdevice.h>
 #include <linux/module.h>
-#include <linux/tcp.h>
-#include <linux/udp.h>
 #include <linux/icmp.h>
 #include <net/ip.h>
 #include <asm/uaccess.h>
@@ -30,6 +30,7 @@
 #include <linux/err.h>
 #include <linux/cpumask.h>
 
+#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
 
 MODULE_LICENSE("GPL");
@@ -62,14 +63,6 @@ do {								\
 #else
 #define IP_NF_ASSERT(x)
 #endif
-#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
-
-static DECLARE_MUTEX(ipt_mutex);
-
-/* Must have mutex */
-#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
-#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
-#include <linux/netfilter_ipv4/listhelp.h>
 
 #if 0
 /* All the better to debug you with... */
@@ -86,36 +79,6 @@ static DECLARE_MUTEX(ipt_mutex);
 
    Hence the start of any table is given by get_table() below.  */
 
-/* The table itself */
-struct ipt_table_info
-{
-	/* Size per table */
-	unsigned int size;
-	/* Number of entries: FIXME. --RR */
-	unsigned int number;
-	/* Initial number of entries. Needed for module usage count */
-	unsigned int initial_entries;
-
-	/* Entry points and underflows */
-	unsigned int hook_entry[NF_IP_NUMHOOKS];
-	unsigned int underflow[NF_IP_NUMHOOKS];
-
-	/* ipt_entry tables: one per CPU */
-	void *entries[NR_CPUS];
-};
-
-static LIST_HEAD(ipt_target);
-static LIST_HEAD(ipt_match);
-static LIST_HEAD(ipt_tables);
-#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
-#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
-
-#if 0
-#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
-#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
-#define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0)
-#endif
-
 /* Returns whether matches rule or not. */
 static inline int
 ip_packet_match(const struct iphdr *ip,
@@ -234,7 +197,8 @@ int do_match(struct ipt_entry_match *m,
 	     int *hotdrop)
 {
 	/* Stop iteration if it doesn't match */
-	if (!m->u.kernel.match->match(skb, in, out, m->data, offset, hotdrop))
+	if (!m->u.kernel.match->match(skb, in, out, m->data, offset, 
+	    skb->nh.iph->ihl*4, hotdrop))
 		return 1;
 	else
 		return 0;
@@ -265,6 +229,7 @@ ipt_do_table(struct sk_buff **pskb,
 	const char *indev, *outdev;
 	void *table_base;
 	struct ipt_entry *e, *back;
+	struct xt_table_info *private = table->private;
 
 	/* Initialization */
 	ip = (*pskb)->nh.iph;
@@ -281,24 +246,11 @@ ipt_do_table(struct sk_buff **pskb,
 
 	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	table_base = (void *)table->private->entries[smp_processor_id()];
-	e = get_entry(table_base, table->private->hook_entry[hook]);
-
-#ifdef CONFIG_NETFILTER_DEBUG
-	/* Check noone else using our table */
-	if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac
-	    && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) {
-		printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n",
-		       smp_processor_id(),
-		       table->name,
-		       &((struct ipt_entry *)table_base)->comefrom,
-		       ((struct ipt_entry *)table_base)->comefrom);
-	}
-	((struct ipt_entry *)table_base)->comefrom = 0x57acc001;
-#endif
+	table_base = (void *)private->entries[smp_processor_id()];
+	e = get_entry(table_base, private->hook_entry[hook]);
 
 	/* For return from builtin chain */
-	back = get_entry(table_base, table->private->underflow[hook]);
+	back = get_entry(table_base, private->underflow[hook]);
 
 	do {
 		IP_NF_ASSERT(e);
@@ -384,9 +336,6 @@ ipt_do_table(struct sk_buff **pskb,
 		}
 	} while (!hotdrop);
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	((struct ipt_entry *)table_base)->comefrom = 0xdead57ac;
-#endif
 	read_unlock_bh(&table->lock);
 
 #ifdef DEBUG_ALLOW_ALL
@@ -398,145 +347,6 @@ ipt_do_table(struct sk_buff **pskb,
 #endif
 }
 
-/*
- * These are weird, but module loading must not be done with mutex
- * held (since they will register), and we have to have a single
- * function to use try_then_request_module().
- */
-
-/* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
-static inline struct ipt_table *find_table_lock(const char *name)
-{
-	struct ipt_table *t;
-
-	if (down_interruptible(&ipt_mutex) != 0)
-		return ERR_PTR(-EINTR);
-
-	list_for_each_entry(t, &ipt_tables, list)
-		if (strcmp(t->name, name) == 0 && try_module_get(t->me))
-			return t;
-	up(&ipt_mutex);
-	return NULL;
-}
-
-/* Find match, grabs ref.  Returns ERR_PTR() on error. */
-static inline struct ipt_match *find_match(const char *name, u8 revision)
-{
-	struct ipt_match *m;
-	int err = 0;
-
-	if (down_interruptible(&ipt_mutex) != 0)
-		return ERR_PTR(-EINTR);
-
-	list_for_each_entry(m, &ipt_match, list) {
-		if (strcmp(m->name, name) == 0) {
-			if (m->revision == revision) {
-				if (try_module_get(m->me)) {
-					up(&ipt_mutex);
-					return m;
-				}
-			} else
-				err = -EPROTOTYPE; /* Found something. */
-		}
-	}
-	up(&ipt_mutex);
-	return ERR_PTR(err);
-}
-
-/* Find target, grabs ref.  Returns ERR_PTR() on error. */
-static inline struct ipt_target *find_target(const char *name, u8 revision)
-{
-	struct ipt_target *t;
-	int err = 0;
-
-	if (down_interruptible(&ipt_mutex) != 0)
-		return ERR_PTR(-EINTR);
-
-	list_for_each_entry(t, &ipt_target, list) {
-		if (strcmp(t->name, name) == 0) {
-			if (t->revision == revision) {
-				if (try_module_get(t->me)) {
-					up(&ipt_mutex);
-					return t;
-				}
-			} else
-				err = -EPROTOTYPE; /* Found something. */
-		}
-	}
-	up(&ipt_mutex);
-	return ERR_PTR(err);
-}
-
-struct ipt_target *ipt_find_target(const char *name, u8 revision)
-{
-	struct ipt_target *target;
-
-	target = try_then_request_module(find_target(name, revision),
-					 "ipt_%s", name);
-	if (IS_ERR(target) || !target)
-		return NULL;
-	return target;
-}
-
-static int match_revfn(const char *name, u8 revision, int *bestp)
-{
-	struct ipt_match *m;
-	int have_rev = 0;
-
-	list_for_each_entry(m, &ipt_match, list) {
-		if (strcmp(m->name, name) == 0) {
-			if (m->revision > *bestp)
-				*bestp = m->revision;
-			if (m->revision == revision)
-				have_rev = 1;
-		}
-	}
-	return have_rev;
-}
-
-static int target_revfn(const char *name, u8 revision, int *bestp)
-{
-	struct ipt_target *t;
-	int have_rev = 0;
-
-	list_for_each_entry(t, &ipt_target, list) {
-		if (strcmp(t->name, name) == 0) {
-			if (t->revision > *bestp)
-				*bestp = t->revision;
-			if (t->revision == revision)
-				have_rev = 1;
-		}
-	}
-	return have_rev;
-}
-
-/* Returns true or false (if no such extension at all) */
-static inline int find_revision(const char *name, u8 revision,
-				int (*revfn)(const char *, u8, int *),
-				int *err)
-{
-	int have_rev, best = -1;
-
-	if (down_interruptible(&ipt_mutex) != 0) {
-		*err = -EINTR;
-		return 1;
-	}
-	have_rev = revfn(name, revision, &best);
-	up(&ipt_mutex);
-
-	/* Nothing at all?  Return 0 to try loading module. */
-	if (best == -1) {
-		*err = -ENOENT;
-		return 0;
-	}
-
-	*err = best;
-	if (!have_rev)
-		*err = -EPROTONOSUPPORT;
-	return 1;
-}
-
-
 /* All zeroes == unconditional rule. */
 static inline int
 unconditional(const struct ipt_ip *ip)
@@ -553,7 +363,7 @@ unconditional(const struct ipt_ip *ip)
 /* Figures out from what hook each rule can be called: returns 0 if
    there are loops.  Puts hook bitmask in comefrom. */
 static int
-mark_source_chains(struct ipt_table_info *newinfo,
+mark_source_chains(struct xt_table_info *newinfo,
 		   unsigned int valid_hooks, void *entry0)
 {
 	unsigned int hook;
@@ -699,7 +509,7 @@ check_match(struct ipt_entry_match *m,
 {
 	struct ipt_match *match;
 
-	match = try_then_request_module(find_match(m->u.user.name,
+	match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name,
 						   m->u.user.revision),
 					"ipt_%s", m->u.user.name);
 	if (IS_ERR(match) || !match) {
@@ -744,7 +554,8 @@ check_entry(struct ipt_entry *e, const char *name, unsigned int size,
 		goto cleanup_matches;
 
 	t = ipt_get_target(e);
-	target = try_then_request_module(find_target(t->u.user.name,
+	target = try_then_request_module(xt_find_target(AF_INET,
+						     t->u.user.name,
 						     t->u.user.revision),
 					 "ipt_%s", t->u.user.name);
 	if (IS_ERR(target) || !target) {
@@ -781,7 +592,7 @@ check_entry(struct ipt_entry *e, const char *name, unsigned int size,
 
 static inline int
 check_entry_size_and_hooks(struct ipt_entry *e,
-			   struct ipt_table_info *newinfo,
+			   struct xt_table_info *newinfo,
 			   unsigned char *base,
 			   unsigned char *limit,
 			   const unsigned int *hook_entries,
@@ -815,7 +626,7 @@ check_entry_size_and_hooks(struct ipt_entry *e,
            < 0 (not IPT_RETURN). --RR */
 
 	/* Clear counters and comefrom */
-	e->counters = ((struct ipt_counters) { 0, 0 });
+	e->counters = ((struct xt_counters) { 0, 0 });
 	e->comefrom = 0;
 
 	(*i)++;
@@ -845,7 +656,7 @@ cleanup_entry(struct ipt_entry *e, unsigned int *i)
 static int
 translate_table(const char *name,
 		unsigned int valid_hooks,
-		struct ipt_table_info *newinfo,
+		struct xt_table_info *newinfo,
 		void *entry0,
 		unsigned int size,
 		unsigned int number,
@@ -922,48 +733,10 @@ translate_table(const char *name,
 	return ret;
 }
 
-static struct ipt_table_info *
-replace_table(struct ipt_table *table,
-	      unsigned int num_counters,
-	      struct ipt_table_info *newinfo,
-	      int *error)
-{
-	struct ipt_table_info *oldinfo;
-
-#ifdef CONFIG_NETFILTER_DEBUG
-	{
-		int cpu;
-
-		for_each_cpu(cpu) {
-			struct ipt_entry *table_base = newinfo->entries[cpu];
-			if (table_base)
-				table_base->comefrom = 0xdead57ac;
-		}
-	}
-#endif
-
-	/* Do the substitution. */
-	write_lock_bh(&table->lock);
-	/* Check inside lock: is the old number correct? */
-	if (num_counters != table->private->number) {
-		duprintf("num_counters != table->private->number (%u/%u)\n",
-			 num_counters, table->private->number);
-		write_unlock_bh(&table->lock);
-		*error = -EAGAIN;
-		return NULL;
-	}
-	oldinfo = table->private;
-	table->private = newinfo;
-	newinfo->initial_entries = oldinfo->initial_entries;
-	write_unlock_bh(&table->lock);
-
-	return oldinfo;
-}
-
 /* Gets counters. */
 static inline int
 add_entry_to_counter(const struct ipt_entry *e,
-		     struct ipt_counters total[],
+		     struct xt_counters total[],
 		     unsigned int *i)
 {
 	ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
@@ -984,8 +757,8 @@ set_entry_to_counter(const struct ipt_entry *e,
 }
 
 static void
-get_counters(const struct ipt_table_info *t,
-	     struct ipt_counters counters[])
+get_counters(const struct xt_table_info *t,
+	     struct xt_counters counters[])
 {
 	unsigned int cpu;
 	unsigned int i;
@@ -1024,14 +797,15 @@ copy_entries_to_user(unsigned int total_size,
 {
 	unsigned int off, num, countersize;
 	struct ipt_entry *e;
-	struct ipt_counters *counters;
+	struct xt_counters *counters;
+	struct xt_table_info *private = table->private;
 	int ret = 0;
 	void *loc_cpu_entry;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
 	   about). */
-	countersize = sizeof(struct ipt_counters) * table->private->number;
+	countersize = sizeof(struct xt_counters) * private->number;
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
@@ -1039,14 +813,14 @@ copy_entries_to_user(unsigned int total_size,
 
 	/* First, sum counters... */
 	write_lock_bh(&table->lock);
-	get_counters(table->private, counters);
+	get_counters(private, counters);
 	write_unlock_bh(&table->lock);
 
 	/* choose the copy that is on our node/cpu, ...
 	 * This choice is lazy (because current thread is
 	 * allowed to migrate to another cpu)
 	 */
-	loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries[raw_smp_processor_id()];
 	/* ... then copy entire thing ... */
 	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
 		ret = -EFAULT;
@@ -1108,74 +882,36 @@ get_entries(const struct ipt_get_entries *entries,
 	int ret;
 	struct ipt_table *t;
 
-	t = find_table_lock(entries->name);
+	t = xt_find_table_lock(AF_INET, entries->name);
 	if (t && !IS_ERR(t)) {
+		struct xt_table_info *private = t->private;
 		duprintf("t->private->number = %u\n",
-			 t->private->number);
-		if (entries->size == t->private->size)
-			ret = copy_entries_to_user(t->private->size,
+			 private->number);
+		if (entries->size == private->size)
+			ret = copy_entries_to_user(private->size,
 						   t, uptr->entrytable);
 		else {
 			duprintf("get_entries: I've got %u not %u!\n",
-				 t->private->size,
+				 private->size,
 				 entries->size);
 			ret = -EINVAL;
 		}
 		module_put(t->me);
-		up(&ipt_mutex);
+		xt_table_unlock(t);
 	} else
 		ret = t ? PTR_ERR(t) : -ENOENT;
 
 	return ret;
 }
 
-static void free_table_info(struct ipt_table_info *info)
-{
-	int cpu;
-	for_each_cpu(cpu) {
-		if (info->size <= PAGE_SIZE)
-			kfree(info->entries[cpu]);
-		else
-			vfree(info->entries[cpu]);
-	}
-	kfree(info);
-}
-
-static struct ipt_table_info *alloc_table_info(unsigned int size)
-{
-	struct ipt_table_info *newinfo;
-	int cpu;
-
-	newinfo = kzalloc(sizeof(struct ipt_table_info), GFP_KERNEL);
-	if (!newinfo)
-		return NULL;
-
-	newinfo->size = size;
-
-	for_each_cpu(cpu) {
-		if (size <= PAGE_SIZE)
-			newinfo->entries[cpu] = kmalloc_node(size,
-				GFP_KERNEL,
-				cpu_to_node(cpu));
-		else
-			newinfo->entries[cpu] = vmalloc_node(size, cpu_to_node(cpu));
-		if (newinfo->entries[cpu] == 0) {
-			free_table_info(newinfo);
-			return NULL;
-		}
-	}
-
-	return newinfo;
-}
-
 static int
 do_replace(void __user *user, unsigned int len)
 {
 	int ret;
 	struct ipt_replace tmp;
 	struct ipt_table *t;
-	struct ipt_table_info *newinfo, *oldinfo;
-	struct ipt_counters *counters;
+	struct xt_table_info *newinfo, *oldinfo;
+	struct xt_counters *counters;
 	void *loc_cpu_entry, *loc_cpu_old_entry;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
@@ -1185,11 +921,7 @@ do_replace(void __user *user, unsigned int len)
 	if (len != sizeof(tmp) + tmp.size)
 		return -ENOPROTOOPT;
 
-	/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
-	if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
-		return -ENOMEM;
-
-	newinfo = alloc_table_info(tmp.size);
+	newinfo = xt_alloc_table_info(tmp.size);
 	if (!newinfo)
 		return -ENOMEM;
 
@@ -1201,7 +933,7 @@ do_replace(void __user *user, unsigned int len)
 		goto free_newinfo;
 	}
 
-	counters = vmalloc(tmp.num_counters * sizeof(struct ipt_counters));
+	counters = vmalloc(tmp.num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto free_newinfo;
@@ -1215,7 +947,7 @@ do_replace(void __user *user, unsigned int len)
 
 	duprintf("ip_tables: Translated table\n");
 
-	t = try_then_request_module(find_table_lock(tmp.name),
+	t = try_then_request_module(xt_find_table_lock(AF_INET, tmp.name),
 				    "iptable_%s", tmp.name);
 	if (!t || IS_ERR(t)) {
 		ret = t ? PTR_ERR(t) : -ENOENT;
@@ -1230,7 +962,7 @@ do_replace(void __user *user, unsigned int len)
 		goto put_module;
 	}
 
-	oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
+	oldinfo = xt_replace_table(t, tmp.num_counters, newinfo, &ret);
 	if (!oldinfo)
 		goto put_module;
 
@@ -1249,23 +981,23 @@ do_replace(void __user *user, unsigned int len)
 	/* Decrease module usage counts and free resource */
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
 	IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
-	free_table_info(oldinfo);
+	xt_free_table_info(oldinfo);
 	if (copy_to_user(tmp.counters, counters,
-			 sizeof(struct ipt_counters) * tmp.num_counters) != 0)
+			 sizeof(struct xt_counters) * tmp.num_counters) != 0)
 		ret = -EFAULT;
 	vfree(counters);
-	up(&ipt_mutex);
+	xt_table_unlock(t);
 	return ret;
 
  put_module:
 	module_put(t->me);
-	up(&ipt_mutex);
+	xt_table_unlock(t);
  free_newinfo_counters_untrans:
 	IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
  free_newinfo_counters:
 	vfree(counters);
  free_newinfo:
-	free_table_info(newinfo);
+	xt_free_table_info(newinfo);
 	return ret;
 }
 
@@ -1273,7 +1005,7 @@ do_replace(void __user *user, unsigned int len)
  * and everything is OK. */
 static inline int
 add_counter_to_entry(struct ipt_entry *e,
-		     const struct ipt_counters addme[],
+		     const struct xt_counters addme[],
 		     unsigned int *i)
 {
 #if 0
@@ -1295,15 +1027,16 @@ static int
 do_add_counters(void __user *user, unsigned int len)
 {
 	unsigned int i;
-	struct ipt_counters_info tmp, *paddc;
+	struct xt_counters_info tmp, *paddc;
 	struct ipt_table *t;
+	struct xt_table_info *private;
 	int ret = 0;
 	void *loc_cpu_entry;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
 		return -EFAULT;
 
-	if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters))
+	if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct xt_counters))
 		return -EINVAL;
 
 	paddc = vmalloc_node(len, numa_node_id());
@@ -1315,29 +1048,30 @@ do_add_counters(void __user *user, unsigned int len)
 		goto free;
 	}
 
-	t = find_table_lock(tmp.name);
+	t = xt_find_table_lock(AF_INET, tmp.name);
 	if (!t || IS_ERR(t)) {
 		ret = t ? PTR_ERR(t) : -ENOENT;
 		goto free;
 	}
 
 	write_lock_bh(&t->lock);
-	if (t->private->number != paddc->num_counters) {
+	private = t->private;
+	if (private->number != paddc->num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = t->private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries[raw_smp_processor_id()];
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
-			  t->private->size,
+			  private->size,
 			  add_counter_to_entry,
 			  paddc->counters,
 			  &i);
  unlock_up_free:
 	write_unlock_bh(&t->lock);
-	up(&ipt_mutex);
+	xt_table_unlock(t);
 	module_put(t->me);
  free:
 	vfree(paddc);
@@ -1396,25 +1130,26 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 		}
 		name[IPT_TABLE_MAXNAMELEN-1] = '\0';
 
-		t = try_then_request_module(find_table_lock(name),
+		t = try_then_request_module(xt_find_table_lock(AF_INET, name),
 					    "iptable_%s", name);
 		if (t && !IS_ERR(t)) {
 			struct ipt_getinfo info;
+			struct xt_table_info *private = t->private;
 
 			info.valid_hooks = t->valid_hooks;
-			memcpy(info.hook_entry, t->private->hook_entry,
+			memcpy(info.hook_entry, private->hook_entry,
 			       sizeof(info.hook_entry));
-			memcpy(info.underflow, t->private->underflow,
+			memcpy(info.underflow, private->underflow,
 			       sizeof(info.underflow));
-			info.num_entries = t->private->number;
-			info.size = t->private->size;
+			info.num_entries = private->number;
+			info.size = private->size;
 			memcpy(info.name, name, sizeof(info.name));
 
 			if (copy_to_user(user, &info, *len) != 0)
 				ret = -EFAULT;
 			else
 				ret = 0;
-			up(&ipt_mutex);
+			xt_table_unlock(t);
 			module_put(t->me);
 		} else
 			ret = t ? PTR_ERR(t) : -ENOENT;
@@ -1441,7 +1176,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	case IPT_SO_GET_REVISION_MATCH:
 	case IPT_SO_GET_REVISION_TARGET: {
 		struct ipt_get_revision rev;
-		int (*revfn)(const char *, u8, int *);
+		int target;
 
 		if (*len != sizeof(rev)) {
 			ret = -EINVAL;
@@ -1453,12 +1188,13 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 		}
 
 		if (cmd == IPT_SO_GET_REVISION_TARGET)
-			revfn = target_revfn;
+			target = 1;
 		else
-			revfn = match_revfn;
+			target = 0;
 
-		try_then_request_module(find_revision(rev.name, rev.revision,
-						      revfn, &ret),
+		try_then_request_module(xt_find_revision(AF_INET, rev.name,
+							 rev.revision,
+							 target, &ret),
 					"ipt_%s", rev.name);
 		break;
 	}
@@ -1471,60 +1207,15 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	return ret;
 }
 
-/* Registration hooks for targets. */
-int
-ipt_register_target(struct ipt_target *target)
+int ipt_register_table(struct xt_table *table, const struct ipt_replace *repl)
 {
 	int ret;
-
-	ret = down_interruptible(&ipt_mutex);
-	if (ret != 0)
-		return ret;
-	list_add(&target->list, &ipt_target);
-	up(&ipt_mutex);
-	return ret;
-}
-
-void
-ipt_unregister_target(struct ipt_target *target)
-{
-	down(&ipt_mutex);
-	LIST_DELETE(&ipt_target, target);
-	up(&ipt_mutex);
-}
-
-int
-ipt_register_match(struct ipt_match *match)
-{
-	int ret;
-
-	ret = down_interruptible(&ipt_mutex);
-	if (ret != 0)
-		return ret;
-
-	list_add(&match->list, &ipt_match);
-	up(&ipt_mutex);
-
-	return ret;
-}
-
-void
-ipt_unregister_match(struct ipt_match *match)
-{
-	down(&ipt_mutex);
-	LIST_DELETE(&ipt_match, match);
-	up(&ipt_mutex);
-}
-
-int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
-{
-	int ret;
-	struct ipt_table_info *newinfo;
-	static struct ipt_table_info bootstrap
+	struct xt_table_info *newinfo;
+	static struct xt_table_info bootstrap
 		= { 0, 0, 0, { 0 }, { 0 }, { } };
 	void *loc_cpu_entry;
 
-	newinfo = alloc_table_info(repl->size);
+	newinfo = xt_alloc_table_info(repl->size);
 	if (!newinfo)
 		return -ENOMEM;
 
@@ -1540,246 +1231,29 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
 			      repl->hook_entry,
 			      repl->underflow);
 	if (ret != 0) {
-		free_table_info(newinfo);
+		xt_free_table_info(newinfo);
 		return ret;
 	}
 
-	ret = down_interruptible(&ipt_mutex);
-	if (ret != 0) {
-		free_table_info(newinfo);
+	if (xt_register_table(table, &bootstrap, newinfo) != 0) {
+		xt_free_table_info(newinfo);
 		return ret;
 	}
 
-	/* Don't autoload: we'd eat our tail... */
-	if (list_named_find(&ipt_tables, table->name)) {
-		ret = -EEXIST;
-		goto free_unlock;
-	}
-
-	/* Simplifies replace_table code. */
-	table->private = &bootstrap;
-	if (!replace_table(table, 0, newinfo, &ret))
-		goto free_unlock;
-
-	duprintf("table->private->number = %u\n",
-		 table->private->number);
-	
-	/* save number of initial entries */
-	table->private->initial_entries = table->private->number;
-
-	rwlock_init(&table->lock);
-	list_prepend(&ipt_tables, table);
-
- unlock:
-	up(&ipt_mutex);
-	return ret;
-
- free_unlock:
-	free_table_info(newinfo);
-	goto unlock;
+	return 0;
 }
 
 void ipt_unregister_table(struct ipt_table *table)
 {
+	struct xt_table_info *private;
 	void *loc_cpu_entry;
 
-	down(&ipt_mutex);
-	LIST_DELETE(&ipt_tables, table);
-	up(&ipt_mutex);
+ 	private = xt_unregister_table(table);
 
 	/* Decrease module usage counts and free resources */
-	loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
-	IPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
-			  cleanup_entry, NULL);
-	free_table_info(table->private);
-}
-
-/* Returns 1 if the port is matched by the range, 0 otherwise */
-static inline int
-port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert)
-{
-	int ret;
-
-	ret = (port >= min && port <= max) ^ invert;
-	return ret;
-}
-
-static int
-tcp_find_option(u_int8_t option,
-		const struct sk_buff *skb,
-		unsigned int optlen,
-		int invert,
-		int *hotdrop)
-{
-	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
-	u_int8_t _opt[60 - sizeof(struct tcphdr)], *op;
-	unsigned int i;
-
-	duprintf("tcp_match: finding option\n");
-
-	if (!optlen)
-		return invert;
-
-	/* If we don't have the whole header, drop packet. */
-	op = skb_header_pointer(skb,
-				skb->nh.iph->ihl*4 + sizeof(struct tcphdr),
-				optlen, _opt);
-	if (op == NULL) {
-		*hotdrop = 1;
-		return 0;
-	}
-
-	for (i = 0; i < optlen; ) {
-		if (op[i] == option) return !invert;
-		if (op[i] < 2) i++;
-		else i += op[i+1]?:1;
-	}
-
-	return invert;
-}
-
-static int
-tcp_match(const struct sk_buff *skb,
-	  const struct net_device *in,
-	  const struct net_device *out,
-	  const void *matchinfo,
-	  int offset,
-	  int *hotdrop)
-{
-	struct tcphdr _tcph, *th;
-	const struct ipt_tcp *tcpinfo = matchinfo;
-
-	if (offset) {
-		/* To quote Alan:
-
-		   Don't allow a fragment of TCP 8 bytes in. Nobody normal
-		   causes this. Its a cracker trying to break in by doing a
-		   flag overwrite to pass the direction checks.
-		*/
-		if (offset == 1) {
-			duprintf("Dropping evil TCP offset=1 frag.\n");
-			*hotdrop = 1;
-		}
-		/* Must not be a fragment. */
-		return 0;
-	}
-
-#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg))
-
-	th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
-				sizeof(_tcph), &_tcph);
-	if (th == NULL) {
-		/* We've been asked to examine this packet, and we
-		   can't.  Hence, no choice but to drop. */
-		duprintf("Dropping evil TCP offset=0 tinygram.\n");
-		*hotdrop = 1;
-		return 0;
-	}
-
-	if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1],
-			ntohs(th->source),
-			!!(tcpinfo->invflags & IPT_TCP_INV_SRCPT)))
-		return 0;
-	if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1],
-			ntohs(th->dest),
-			!!(tcpinfo->invflags & IPT_TCP_INV_DSTPT)))
-		return 0;
-	if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask)
-		      == tcpinfo->flg_cmp,
-		      IPT_TCP_INV_FLAGS))
-		return 0;
-	if (tcpinfo->option) {
-		if (th->doff * 4 < sizeof(_tcph)) {
-			*hotdrop = 1;
-			return 0;
-		}
-		if (!tcp_find_option(tcpinfo->option, skb,
-				     th->doff*4 - sizeof(_tcph),
-				     tcpinfo->invflags & IPT_TCP_INV_OPTION,
-				     hotdrop))
-			return 0;
-	}
-	return 1;
-}
-
-/* Called when user tries to insert an entry of this type. */
-static int
-tcp_checkentry(const char *tablename,
-	       const struct ipt_ip *ip,
-	       void *matchinfo,
-	       unsigned int matchsize,
-	       unsigned int hook_mask)
-{
-	const struct ipt_tcp *tcpinfo = matchinfo;
-
-	/* Must specify proto == TCP, and no unknown invflags */
-	return ip->proto == IPPROTO_TCP
-		&& !(ip->invflags & IPT_INV_PROTO)
-		&& matchsize == IPT_ALIGN(sizeof(struct ipt_tcp))
-		&& !(tcpinfo->invflags & ~IPT_TCP_INV_MASK);
-}
-
-static int
-udp_match(const struct sk_buff *skb,
-	  const struct net_device *in,
-	  const struct net_device *out,
-	  const void *matchinfo,
-	  int offset,
-	  int *hotdrop)
-{
-	struct udphdr _udph, *uh;
-	const struct ipt_udp *udpinfo = matchinfo;
-
-	/* Must not be a fragment. */
-	if (offset)
-		return 0;
-
-	uh = skb_header_pointer(skb, skb->nh.iph->ihl*4,
-				sizeof(_udph), &_udph);
-	if (uh == NULL) {
-		/* We've been asked to examine this packet, and we
-		   can't.  Hence, no choice but to drop. */
-		duprintf("Dropping evil UDP tinygram.\n");
-		*hotdrop = 1;
-		return 0;
-	}
-
-	return port_match(udpinfo->spts[0], udpinfo->spts[1],
-			  ntohs(uh->source),
-			  !!(udpinfo->invflags & IPT_UDP_INV_SRCPT))
-		&& port_match(udpinfo->dpts[0], udpinfo->dpts[1],
-			      ntohs(uh->dest),
-			      !!(udpinfo->invflags & IPT_UDP_INV_DSTPT));
-}
-
-/* Called when user tries to insert an entry of this type. */
-static int
-udp_checkentry(const char *tablename,
-	       const struct ipt_ip *ip,
-	       void *matchinfo,
-	       unsigned int matchinfosize,
-	       unsigned int hook_mask)
-{
-	const struct ipt_udp *udpinfo = matchinfo;
-
-	/* Must specify proto == UDP, and no unknown invflags */
-	if (ip->proto != IPPROTO_UDP || (ip->invflags & IPT_INV_PROTO)) {
-		duprintf("ipt_udp: Protocol %u != %u\n", ip->proto,
-			 IPPROTO_UDP);
-		return 0;
-	}
-	if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_udp))) {
-		duprintf("ipt_udp: matchsize %u != %u\n",
-			 matchinfosize, IPT_ALIGN(sizeof(struct ipt_udp)));
-		return 0;
-	}
-	if (udpinfo->invflags & ~IPT_UDP_INV_MASK) {
-		duprintf("ipt_udp: unknown flags %X\n",
-			 udpinfo->invflags);
-		return 0;
-	}
-
-	return 1;
+	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL);
+	xt_free_table_info(private);
 }
 
 /* Returns 1 if the type and code is matched by the range, 0 otherwise */
@@ -1798,6 +1272,7 @@ icmp_match(const struct sk_buff *skb,
 	   const struct net_device *out,
 	   const void *matchinfo,
 	   int offset,
+	   unsigned int protoff,
 	   int *hotdrop)
 {
 	struct icmphdr _icmph, *ic;
@@ -1807,8 +1282,7 @@ icmp_match(const struct sk_buff *skb,
 	if (offset)
 		return 0;
 
-	ic = skb_header_pointer(skb, skb->nh.iph->ihl*4,
-				sizeof(_icmph), &_icmph);
+	ic = skb_header_pointer(skb, protoff, sizeof(_icmph), &_icmph);
 	if (ic == NULL) {
 		/* We've been asked to examine this packet, and we
 		 * can't.  Hence, no choice but to drop.
@@ -1828,11 +1302,12 @@ icmp_match(const struct sk_buff *skb,
 /* Called when user tries to insert an entry of this type. */
 static int
 icmp_checkentry(const char *tablename,
-	   const struct ipt_ip *ip,
+	   const void *info,
 	   void *matchinfo,
 	   unsigned int matchsize,
 	   unsigned int hook_mask)
 {
+	const struct ipt_ip *ip = info;
 	const struct ipt_icmp *icmpinfo = matchinfo;
 
 	/* Must specify proto == ICMP, and no unknown invflags */
@@ -1862,123 +1337,22 @@ static struct nf_sockopt_ops ipt_sockopts = {
 	.get		= do_ipt_get_ctl,
 };
 
-static struct ipt_match tcp_matchstruct = {
-	.name		= "tcp",
-	.match		= &tcp_match,
-	.checkentry	= &tcp_checkentry,
-};
-
-static struct ipt_match udp_matchstruct = {
-	.name		= "udp",
-	.match		= &udp_match,
-	.checkentry	= &udp_checkentry,
-};
-
 static struct ipt_match icmp_matchstruct = {
 	.name		= "icmp",
 	.match		= &icmp_match,
 	.checkentry	= &icmp_checkentry,
 };
 
-#ifdef CONFIG_PROC_FS
-static inline int print_name(const char *i,
-			     off_t start_offset, char *buffer, int length,
-			     off_t *pos, unsigned int *count)
-{
-	if ((*count)++ >= start_offset) {
-		unsigned int namelen;
-
-		namelen = sprintf(buffer + *pos, "%s\n",
-				  i + sizeof(struct list_head));
-		if (*pos + namelen > length) {
-			/* Stop iterating */
-			return 1;
-		}
-		*pos += namelen;
-	}
-	return 0;
-}
-
-static inline int print_target(const struct ipt_target *t,
-                               off_t start_offset, char *buffer, int length,
-                               off_t *pos, unsigned int *count)
-{
-	if (t == &ipt_standard_target || t == &ipt_error_target)
-		return 0;
-	return print_name((char *)t, start_offset, buffer, length, pos, count);
-}
-
-static int ipt_get_tables(char *buffer, char **start, off_t offset, int length)
-{
-	off_t pos = 0;
-	unsigned int count = 0;
-
-	if (down_interruptible(&ipt_mutex) != 0)
-		return 0;
-
-	LIST_FIND(&ipt_tables, print_name, void *,
-		  offset, buffer, length, &pos, &count);
-
-	up(&ipt_mutex);
-
-	/* `start' hack - see fs/proc/generic.c line ~105 */
-	*start=(char *)((unsigned long)count-offset);
-	return pos;
-}
-
-static int ipt_get_targets(char *buffer, char **start, off_t offset, int length)
-{
-	off_t pos = 0;
-	unsigned int count = 0;
-
-	if (down_interruptible(&ipt_mutex) != 0)
-		return 0;
-
-	LIST_FIND(&ipt_target, print_target, struct ipt_target *,
-		  offset, buffer, length, &pos, &count);
-	
-	up(&ipt_mutex);
-
-	*start = (char *)((unsigned long)count - offset);
-	return pos;
-}
-
-static int ipt_get_matches(char *buffer, char **start, off_t offset, int length)
-{
-	off_t pos = 0;
-	unsigned int count = 0;
-
-	if (down_interruptible(&ipt_mutex) != 0)
-		return 0;
-	
-	LIST_FIND(&ipt_match, print_name, void *,
-		  offset, buffer, length, &pos, &count);
-
-	up(&ipt_mutex);
-
-	*start = (char *)((unsigned long)count - offset);
-	return pos;
-}
-
-static const struct { char *name; get_info_t *get_info; } ipt_proc_entry[] =
-{ { "ip_tables_names", ipt_get_tables },
-  { "ip_tables_targets", ipt_get_targets },
-  { "ip_tables_matches", ipt_get_matches },
-  { NULL, NULL} };
-#endif /*CONFIG_PROC_FS*/
-
 static int __init init(void)
 {
 	int ret;
 
+	xt_proto_init(AF_INET);
+
 	/* Noone else will be downing sem now, so we won't sleep */
-	down(&ipt_mutex);
-	list_append(&ipt_target, &ipt_standard_target);
-	list_append(&ipt_target, &ipt_error_target);
-	list_append(&ipt_match, &tcp_matchstruct);
-	list_append(&ipt_match, &udp_matchstruct);
-	list_append(&ipt_match, &icmp_matchstruct);
-	up(&ipt_mutex);
+	xt_register_target(AF_INET, &ipt_standard_target);
+	xt_register_target(AF_INET, &ipt_error_target);
+	xt_register_match(AF_INET, &icmp_matchstruct);
 
 	/* Register setsockopt */
 	ret = nf_register_sockopt(&ipt_sockopts);
@@ -1987,49 +1361,23 @@ static int __init init(void)
 		return ret;
 	}
 
-#ifdef CONFIG_PROC_FS
-	{
-	struct proc_dir_entry *proc;
-	int i;
-
-	for (i = 0; ipt_proc_entry[i].name; i++) {
-		proc = proc_net_create(ipt_proc_entry[i].name, 0,
-				       ipt_proc_entry[i].get_info);
-		if (!proc) {
-			while (--i >= 0)
-				proc_net_remove(ipt_proc_entry[i].name);
-			nf_unregister_sockopt(&ipt_sockopts);
-			return -ENOMEM;
-		}
-		proc->owner = THIS_MODULE;
-	}
-	}
-#endif
-
-	printk("ip_tables: (C) 2000-2002 Netfilter core team\n");
+	printk("ip_tables: (C) 2000-2006 Netfilter Core Team\n");
 	return 0;
 }
 
 static void __exit fini(void)
 {
 	nf_unregister_sockopt(&ipt_sockopts);
-#ifdef CONFIG_PROC_FS
-	{
-	int i;
-	for (i = 0; ipt_proc_entry[i].name; i++)
-		proc_net_remove(ipt_proc_entry[i].name);
-	}
-#endif
+
+	xt_unregister_match(AF_INET, &icmp_matchstruct);
+	xt_unregister_target(AF_INET, &ipt_error_target);
+	xt_unregister_target(AF_INET, &ipt_standard_target);
+
+	xt_proto_fini(AF_INET);
 }
 
 EXPORT_SYMBOL(ipt_register_table);
 EXPORT_SYMBOL(ipt_unregister_table);
-EXPORT_SYMBOL(ipt_register_match);
-EXPORT_SYMBOL(ipt_unregister_match);
 EXPORT_SYMBOL(ipt_do_table);
-EXPORT_SYMBOL(ipt_register_target);
-EXPORT_SYMBOL(ipt_unregister_target);
-EXPORT_SYMBOL(ipt_find_target);
-
 module_init(init);
 module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c
deleted file mode 100644
index dab78d8bd494..000000000000
--- a/net/ipv4/netfilter/ipt_CLASSIFY.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * This is a module which is used for setting the skb->priority field
- * of an skb for qdisc classification.
- */
-
-/* (C) 2001-2002 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <net/checksum.h>
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_CLASSIFY.h>
-
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("iptables qdisc classification target module");
-
-static unsigned int
-target(struct sk_buff **pskb,
-       const struct net_device *in,
-       const struct net_device *out,
-       unsigned int hooknum,
-       const void *targinfo,
-       void *userinfo)
-{
-	const struct ipt_classify_target_info *clinfo = targinfo;
-
-	if((*pskb)->priority != clinfo->priority) 
-		(*pskb)->priority = clinfo->priority;
-
-	return IPT_CONTINUE;
-}
-
-static int
-checkentry(const char *tablename,
-           const struct ipt_entry *e,
-           void *targinfo,
-           unsigned int targinfosize,
-           unsigned int hook_mask)
-{
-	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_classify_target_info))){
-		printk(KERN_ERR "CLASSIFY: invalid size (%u != %Zu).\n",
-		       targinfosize,
-		       IPT_ALIGN(sizeof(struct ipt_classify_target_info)));
-		return 0;
-	}
-	
-	if (hook_mask & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_FORWARD) |
-	                  (1 << NF_IP_POST_ROUTING))) {
-		printk(KERN_ERR "CLASSIFY: only valid in LOCAL_OUT, FORWARD "
-		                "and POST_ROUTING.\n");
-		return 0;
-	}
-
-	if (strcmp(tablename, "mangle") != 0) {
-		printk(KERN_ERR "CLASSIFY: can only be called from "
-		                "\"mangle\" table, not \"%s\".\n",
-		                tablename);
-		return 0;
-	}
-
-	return 1;
-}
-
-static struct ipt_target ipt_classify_reg = { 
-	.name 		= "CLASSIFY", 
-	.target 	= target,
-	.checkentry	= checkentry,
-	.me 		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	return ipt_register_target(&ipt_classify_reg);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_target(&ipt_classify_reg);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 45c52d8f4d99..d9bc971f03af 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -379,12 +379,13 @@ target(struct sk_buff **pskb,
 
 static int
 checkentry(const char *tablename,
-	   const struct ipt_entry *e,
+	   const void *e_void,
            void *targinfo,
            unsigned int targinfosize,
            unsigned int hook_mask)
 {
 	struct ipt_clusterip_tgt_info *cipinfo = targinfo;
+	const struct ipt_entry *e = e_void;
 
 	struct clusterip_config *config;
 
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c
deleted file mode 100644
index 8acac5a40a92..000000000000
--- a/net/ipv4/netfilter/ipt_CONNMARK.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/* This kernel module is used to modify the connection mark values, or
- * to optionally restore the skb nfmark from the connection mark
- *
- * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
- * by Henrik Nordstrom <hno@marasystems.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <net/checksum.h>
-
-MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>");
-MODULE_DESCRIPTION("IP tables CONNMARK matching module");
-MODULE_LICENSE("GPL");
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_CONNMARK.h>
-#include <net/netfilter/nf_conntrack_compat.h>
-
-static unsigned int
-target(struct sk_buff **pskb,
-       const struct net_device *in,
-       const struct net_device *out,
-       unsigned int hooknum,
-       const void *targinfo,
-       void *userinfo)
-{
-	const struct ipt_connmark_target_info *markinfo = targinfo;
-	u_int32_t diff;
-	u_int32_t nfmark;
-	u_int32_t newmark;
-	u_int32_t ctinfo;
-	u_int32_t *ctmark = nf_ct_get_mark(*pskb, &ctinfo);
-
-	if (ctmark) {
-	    switch(markinfo->mode) {
-	    case IPT_CONNMARK_SET:
-		newmark = (*ctmark & ~markinfo->mask) | markinfo->mark;
-		if (newmark != *ctmark)
-		    *ctmark = newmark;
-		break;
-	    case IPT_CONNMARK_SAVE:
-		newmark = (*ctmark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask);
-		if (*ctmark != newmark)
-		    *ctmark = newmark;
-		break;
-	    case IPT_CONNMARK_RESTORE:
-		nfmark = (*pskb)->nfmark;
-		diff = (*ctmark ^ nfmark) & markinfo->mask;
-		if (diff != 0)
-		    (*pskb)->nfmark = nfmark ^ diff;
-		break;
-	    }
-	}
-
-	return IPT_CONTINUE;
-}
-
-static int
-checkentry(const char *tablename,
-	   const struct ipt_entry *e,
-	   void *targinfo,
-	   unsigned int targinfosize,
-	   unsigned int hook_mask)
-{
-	struct ipt_connmark_target_info *matchinfo = targinfo;
-	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_connmark_target_info))) {
-		printk(KERN_WARNING "CONNMARK: targinfosize %u != %Zu\n",
-		       targinfosize,
-		       IPT_ALIGN(sizeof(struct ipt_connmark_target_info)));
-		return 0;
-	}
-
-	if (matchinfo->mode == IPT_CONNMARK_RESTORE) {
-	    if (strcmp(tablename, "mangle") != 0) {
-		    printk(KERN_WARNING "CONNMARK: restore can only be called from \"mangle\" table, not \"%s\"\n", tablename);
-		    return 0;
-	    }
-	}
-
-	if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) {
-		printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n");
-		return 0;
-	}
-
-	return 1;
-}
-
-static struct ipt_target ipt_connmark_reg = {
-	.name = "CONNMARK",
-	.target = &target,
-	.checkentry = &checkentry,
-	.me = THIS_MODULE
-};
-
-static int __init init(void)
-{
-	need_ip_conntrack();
-	return ipt_register_target(&ipt_connmark_reg);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_target(&ipt_connmark_reg);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c
index 6e319570a28c..898cdf79ce18 100644
--- a/net/ipv4/netfilter/ipt_DSCP.c
+++ b/net/ipv4/netfilter/ipt_DSCP.c
@@ -57,7 +57,7 @@ target(struct sk_buff **pskb,
 
 static int
 checkentry(const char *tablename,
-	   const struct ipt_entry *e,
+	   const void *e_void,
            void *targinfo,
            unsigned int targinfosize,
            unsigned int hook_mask)
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index a1319693f648..706445426a6d 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -113,12 +113,13 @@ target(struct sk_buff **pskb,
 
 static int
 checkentry(const char *tablename,
-	   const struct ipt_entry *e,
+	   const void *e_void,
            void *targinfo,
            unsigned int targinfosize,
            unsigned int hook_mask)
 {
 	const struct ipt_ECN_info *einfo = (struct ipt_ECN_info *)targinfo;
+	const struct ipt_entry *e = e_void;
 
 	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_ECN_info))) {
 		printk(KERN_WARNING "ECN: targinfosize %u != %Zu\n",
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 30be0f1dae37..6606ddb66a29 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -431,7 +431,7 @@ ipt_log_target(struct sk_buff **pskb,
 }
 
 static int ipt_log_checkentry(const char *tablename,
-			      const struct ipt_entry *e,
+			      const void *e,
 			      void *targinfo,
 			      unsigned int targinfosize,
 			      unsigned int hook_mask)
diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c
deleted file mode 100644
index 52b4f2c296bf..000000000000
--- a/net/ipv4/netfilter/ipt_MARK.c
+++ /dev/null
@@ -1,172 +0,0 @@
-/* This is a module which is used for setting the NFMARK field of an skb. */
-
-/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <net/checksum.h>
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_MARK.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
-MODULE_DESCRIPTION("iptables MARK modification module");
-
-static unsigned int
-target_v0(struct sk_buff **pskb,
-	  const struct net_device *in,
-	  const struct net_device *out,
-	  unsigned int hooknum,
-	  const void *targinfo,
-	  void *userinfo)
-{
-	const struct ipt_mark_target_info *markinfo = targinfo;
-
-	if((*pskb)->nfmark != markinfo->mark)
-		(*pskb)->nfmark = markinfo->mark;
-
-	return IPT_CONTINUE;
-}
-
-static unsigned int
-target_v1(struct sk_buff **pskb,
-	  const struct net_device *in,
-	  const struct net_device *out,
-	  unsigned int hooknum,
-	  const void *targinfo,
-	  void *userinfo)
-{
-	const struct ipt_mark_target_info_v1 *markinfo = targinfo;
-	int mark = 0;
-
-	switch (markinfo->mode) {
-	case IPT_MARK_SET:
-		mark = markinfo->mark;
-		break;
-		
-	case IPT_MARK_AND:
-		mark = (*pskb)->nfmark & markinfo->mark;
-		break;
-		
-	case IPT_MARK_OR:
-		mark = (*pskb)->nfmark | markinfo->mark;
-		break;
-	}
-
-	if((*pskb)->nfmark != mark)
-		(*pskb)->nfmark = mark;
-
-	return IPT_CONTINUE;
-}
-
-
-static int
-checkentry_v0(const char *tablename,
-	      const struct ipt_entry *e,
-	      void *targinfo,
-	      unsigned int targinfosize,
-	      unsigned int hook_mask)
-{
-	struct ipt_mark_target_info *markinfo = targinfo;
-
-	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) {
-		printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
-		       targinfosize,
-		       IPT_ALIGN(sizeof(struct ipt_mark_target_info)));
-		return 0;
-	}
-
-	if (strcmp(tablename, "mangle") != 0) {
-		printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
-		return 0;
-	}
-
-	if (markinfo->mark > 0xffffffff) {
-		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
-		return 0;
-	}
-
-	return 1;
-}
-
-static int
-checkentry_v1(const char *tablename,
-	      const struct ipt_entry *e,
-	      void *targinfo,
-	      unsigned int targinfosize,
-	      unsigned int hook_mask)
-{
-	struct ipt_mark_target_info_v1 *markinfo = targinfo;
-
-	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1))){
-		printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
-		       targinfosize,
-		       IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1)));
-		return 0;
-	}
-
-	if (strcmp(tablename, "mangle") != 0) {
-		printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
-		return 0;
-	}
-
-	if (markinfo->mode != IPT_MARK_SET
-	    && markinfo->mode != IPT_MARK_AND
-	    && markinfo->mode != IPT_MARK_OR) {
-		printk(KERN_WARNING "MARK: unknown mode %u\n",
-		       markinfo->mode);
-		return 0;
-	}
-
-	if (markinfo->mark > 0xffffffff) {
-		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
-		return 0;
-	}
-
-	return 1;
-}
-
-static struct ipt_target ipt_mark_reg_v0 = {
-	.name		= "MARK",
-	.target		= target_v0,
-	.checkentry	= checkentry_v0,
-	.me		= THIS_MODULE,
-	.revision	= 0,
-};
-
-static struct ipt_target ipt_mark_reg_v1 = {
-	.name		= "MARK",
-	.target		= target_v1,
-	.checkentry	= checkentry_v1,
-	.me		= THIS_MODULE,
-	.revision	= 1,
-};
-
-static int __init init(void)
-{
-	int err;
-
-	err = ipt_register_target(&ipt_mark_reg_v0);
-	if (!err) {
-		err = ipt_register_target(&ipt_mark_reg_v1);
-		if (err)
-			ipt_unregister_target(&ipt_mark_reg_v0);
-	}
-	return err;
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_target(&ipt_mark_reg_v0);
-	ipt_unregister_target(&ipt_mark_reg_v1);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 27860510ca6d..12c56d3343ca 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -40,7 +40,7 @@ static DEFINE_RWLOCK(masq_lock);
 /* FIXME: Multiple targets. --RR */
 static int
 masquerade_check(const char *tablename,
-		 const struct ipt_entry *e,
+		 const void *e,
 		 void *targinfo,
 		 unsigned int targinfosize,
 		 unsigned int hook_mask)
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index e6e7b6095363..b074467fe67b 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -31,7 +31,7 @@ MODULE_DESCRIPTION("iptables 1:1 NAT mapping of IP networks target");
 
 static int
 check(const char *tablename,
-      const struct ipt_entry *e,
+      const void *e,
       void *targinfo,
       unsigned int targinfosize,
       unsigned int hook_mask)
diff --git a/net/ipv4/netfilter/ipt_NFQUEUE.c b/net/ipv4/netfilter/ipt_NFQUEUE.c
deleted file mode 100644
index 3cedc9be8807..000000000000
--- a/net/ipv4/netfilter/ipt_NFQUEUE.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/* iptables module for using new netfilter netlink queue
- *
- * (C) 2005 by Harald Welte <laforge@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as 
- * published by the Free Software Foundation.
- * 
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_NFQUEUE.h>
-
-MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("iptables NFQUEUE target");
-MODULE_LICENSE("GPL");
-
-static unsigned int
-target(struct sk_buff **pskb,
-       const struct net_device *in,
-       const struct net_device *out,
-       unsigned int hooknum,
-       const void *targinfo,
-       void *userinfo)
-{
-	const struct ipt_NFQ_info *tinfo = targinfo;
-
-	return NF_QUEUE_NR(tinfo->queuenum);
-}
-
-static int
-checkentry(const char *tablename,
-	   const struct ipt_entry *e,
-           void *targinfo,
-           unsigned int targinfosize,
-           unsigned int hook_mask)
-{
-	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_NFQ_info))) {
-		printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n",
-		       targinfosize,
-		       IPT_ALIGN(sizeof(struct ipt_NFQ_info)));
-		return 0;
-	}
-
-	return 1;
-}
-
-static struct ipt_target ipt_NFQ_reg = {
-	.name		= "NFQUEUE",
-	.target		= target,
-	.checkentry	= checkentry,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	return ipt_register_target(&ipt_NFQ_reg);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_target(&ipt_NFQ_reg);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_NOTRACK.c b/net/ipv4/netfilter/ipt_NOTRACK.c
deleted file mode 100644
index e3c69d072c6e..000000000000
--- a/net/ipv4/netfilter/ipt_NOTRACK.c
+++ /dev/null
@@ -1,76 +0,0 @@
-/* This is a module which is used for setting up fake conntracks
- * on packets so that they are not seen by the conntrack/NAT code.
- */
-#include <linux/module.h>
-#include <linux/skbuff.h>
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <net/netfilter/nf_conntrack_compat.h>
-
-static unsigned int
-target(struct sk_buff **pskb,
-       const struct net_device *in,
-       const struct net_device *out,
-       unsigned int hooknum,
-       const void *targinfo,
-       void *userinfo)
-{
-	/* Previously seen (loopback)? Ignore. */
-	if ((*pskb)->nfct != NULL)
-		return IPT_CONTINUE;
-
-	/* Attach fake conntrack entry. 
-	   If there is a real ct entry correspondig to this packet, 
-	   it'll hang aroun till timing out. We don't deal with it
-	   for performance reasons. JK */
-	nf_ct_untrack(*pskb);
-	(*pskb)->nfctinfo = IP_CT_NEW;
-	nf_conntrack_get((*pskb)->nfct);
-
-	return IPT_CONTINUE;
-}
-
-static int
-checkentry(const char *tablename,
-	   const struct ipt_entry *e,
-           void *targinfo,
-           unsigned int targinfosize,
-           unsigned int hook_mask)
-{
-	if (targinfosize != 0) {
-		printk(KERN_WARNING "NOTRACK: targinfosize %u != 0\n",
-		       targinfosize);
-		return 0;
-	}
-
-	if (strcmp(tablename, "raw") != 0) {
-		printk(KERN_WARNING "NOTRACK: can only be called from \"raw\" table, not \"%s\"\n", tablename);
-		return 0;
-	}
-
-	return 1;
-}
-
-static struct ipt_target ipt_notrack_reg = { 
-	.name = "NOTRACK", 
-	.target = target, 
-	.checkentry = checkentry,
-	.me = THIS_MODULE 
-};
-
-static int __init init(void)
-{
-	if (ipt_register_target(&ipt_notrack_reg))
-		return -EINVAL;
-
-	return 0;
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_target(&ipt_notrack_reg);
-}
-
-module_init(init);
-module_exit(fini);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index 5245bfd33d52..140be51f2f01 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -33,7 +33,7 @@ MODULE_DESCRIPTION("iptables REDIRECT target module");
 /* FIXME: Take multiple ranges --RR */
 static int
 redirect_check(const char *tablename,
-	       const struct ipt_entry *e,
+	       const void *e,
 	       void *targinfo,
 	       unsigned int targinfosize,
 	       unsigned int hook_mask)
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 6693526ae128..3eb47aae78c5 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -282,12 +282,13 @@ static unsigned int reject(struct sk_buff **pskb,
 }
 
 static int check(const char *tablename,
-		 const struct ipt_entry *e,
+		 const void *e_void,
 		 void *targinfo,
 		 unsigned int targinfosize,
 		 unsigned int hook_mask)
 {
  	const struct ipt_reject_info *rejinfo = targinfo;
+	const struct ipt_entry *e = e_void;
 
  	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_reject_info))) {
   		DEBUGP("REJECT: targinfosize %u != 0\n", targinfosize);
diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c
index 7a0536d864ac..a22de59bba0e 100644
--- a/net/ipv4/netfilter/ipt_SAME.c
+++ b/net/ipv4/netfilter/ipt_SAME.c
@@ -49,7 +49,7 @@ MODULE_DESCRIPTION("iptables special SNAT module for consistent sourceip");
 
 static int
 same_check(const char *tablename,
-	      const struct ipt_entry *e,
+	      const void *e,
 	      void *targinfo,
 	      unsigned int targinfosize,
 	      unsigned int hook_mask)
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
index 8db70d6908c3..c122841e182c 100644
--- a/net/ipv4/netfilter/ipt_TCPMSS.c
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -210,12 +210,13 @@ static inline int find_syn_match(const struct ipt_entry_match *m)
 /* Must specify -p tcp --syn/--tcp-flags SYN */
 static int
 ipt_tcpmss_checkentry(const char *tablename,
-		      const struct ipt_entry *e,
+		      const void *e_void,
 		      void *targinfo,
 		      unsigned int targinfosize,
 		      unsigned int hook_mask)
 {
 	const struct ipt_tcpmss_info *tcpmssinfo = targinfo;
+	const struct ipt_entry *e = e_void;
 
 	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tcpmss_info))) {
 		DEBUGP("ipt_tcpmss_checkentry: targinfosize %u != %u\n",
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
index deadb36d4428..3a44a56db239 100644
--- a/net/ipv4/netfilter/ipt_TOS.c
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -52,7 +52,7 @@ target(struct sk_buff **pskb,
 
 static int
 checkentry(const char *tablename,
-	   const struct ipt_entry *e,
+	   const void *e_void,
            void *targinfo,
            unsigned int targinfosize,
            unsigned int hook_mask)
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
index b9ae6a9382f3..b769eb231970 100644
--- a/net/ipv4/netfilter/ipt_TTL.c
+++ b/net/ipv4/netfilter/ipt_TTL.c
@@ -66,7 +66,7 @@ ipt_ttl_target(struct sk_buff **pskb, const struct net_device *in,
 }
 
 static int ipt_ttl_checkentry(const char *tablename,
-		const struct ipt_entry *e,
+		const void *e,
 		void *targinfo,
 		unsigned int targinfosize,
 		unsigned int hook_mask)
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 38641cd06123..641dbc477650 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -330,7 +330,7 @@ static void ipt_logfn(unsigned int pf,
 }
 
 static int ipt_ulog_checkentry(const char *tablename,
-			       const struct ipt_entry *e,
+			       const void *e,
 			       void *targinfo,
 			       unsigned int targinfosize,
 			       unsigned int hookmask)
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
index e19c2a52d00c..d6b83a976518 100644
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -29,7 +29,7 @@ static inline int match_type(u_int32_t addr, u_int16_t mask)
 
 static int match(const struct sk_buff *skb, const struct net_device *in,
 		 const struct net_device *out, const void *matchinfo,
-		 int offset, int *hotdrop)
+		 int offset, unsigned int protoff, int *hotdrop)
 {
 	const struct ipt_addrtype_info *info = matchinfo;
 	const struct iphdr *iph = skb->nh.iph;
@@ -43,7 +43,7 @@ static int match(const struct sk_buff *skb, const struct net_device *in,
 	return ret;
 }
 
-static int checkentry(const char *tablename, const struct ipt_ip *ip,
+static int checkentry(const char *tablename, const void *ip,
 		      void *matchinfo, unsigned int matchsize,
 		      unsigned int hook_mask)
 {
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index a0fea847cb72..144adfec13cc 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -41,6 +41,7 @@ match(const struct sk_buff *skb,
       const struct net_device *out,
       const void *matchinfo,
       int offset,
+      unsigned int protoff,
       int *hotdrop)
 {
 	struct ip_auth_hdr _ahdr, *ah;
@@ -50,7 +51,7 @@ match(const struct sk_buff *skb,
 	if (offset)
 		return 0;
 
-	ah = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+	ah = skb_header_pointer(skb, protoff,
 				sizeof(_ahdr), &_ahdr);
 	if (ah == NULL) {
 		/* We've been asked to examine this packet, and we
@@ -69,12 +70,13 @@ match(const struct sk_buff *skb,
 /* Called when user tries to insert an entry of this type. */
 static int
 checkentry(const char *tablename,
-	   const struct ipt_ip *ip,
+	   const void *ip_void,
 	   void *matchinfo,
 	   unsigned int matchinfosize,
 	   unsigned int hook_mask)
 {
 	const struct ipt_ah *ahinfo = matchinfo;
+	const struct ipt_ip *ip = ip_void;
 
 	/* Must specify proto == AH, and no unknown invflags */
 	if (ip->proto != IPPROTO_AH || (ip->invflags & IPT_INV_PROTO)) {
diff --git a/net/ipv4/netfilter/ipt_comment.c b/net/ipv4/netfilter/ipt_comment.c
deleted file mode 100644
index 6b76a1ea5245..000000000000
--- a/net/ipv4/netfilter/ipt_comment.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Implements a dummy match to allow attaching comments to rules
- *
- * 2003-05-13 Brad Fisher (brad@info-link.net)
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_comment.h>
-
-MODULE_AUTHOR("Brad Fisher <brad@info-link.net>");
-MODULE_DESCRIPTION("iptables comment match module");
-MODULE_LICENSE("GPL");
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      int *hotdrop)
-{
-	/* We always match */
-	return 1;
-}
-
-static int
-checkentry(const char *tablename,
-           const struct ipt_ip *ip,
-           void *matchinfo,
-           unsigned int matchsize,
-           unsigned int hook_mask)
-{
-	/* Check the size */
-	if (matchsize != IPT_ALIGN(sizeof(struct ipt_comment_info)))
-		return 0;
-	return 1;
-}
-
-static struct ipt_match comment_match = {
-	.name		= "comment",
-	.match		= match,
-	.checkentry	= checkentry,
-	.me		= THIS_MODULE
-};
-
-static int __init init(void)
-{
-	return ipt_register_match(&comment_match);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&comment_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c
deleted file mode 100644
index d68a048b7176..000000000000
--- a/net/ipv4/netfilter/ipt_connbytes.c
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Kernel module to match connection tracking byte counter.
- * GPL (C) 2002 Martin Devera (devik@cdi.cz).
- *
- * 2004-07-20 Harald Welte <laforge@netfilter.org>
- * 	- reimplemented to use per-connection accounting counters
- * 	- add functionality to match number of packets
- * 	- add functionality to match average packet size
- * 	- add support to match directions seperately
- *
- */
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <net/netfilter/nf_conntrack_compat.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_connbytes.h>
-
-#include <asm/div64.h>
-#include <asm/bitops.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection");
-
-/* 64bit divisor, dividend and result. dynamic precision */
-static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
-{
-	u_int32_t d = divisor;
-
-	if (divisor > 0xffffffffULL) {
-		unsigned int shift = fls(divisor >> 32);
-
-		d = divisor >> shift;
-		dividend >>= shift;
-	}
-
-	do_div(dividend, d);
-	return dividend;
-}
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      int *hotdrop)
-{
-	const struct ipt_connbytes_info *sinfo = matchinfo;
-	u_int64_t what = 0;	/* initialize to make gcc happy */
-	const struct ip_conntrack_counter *counters;
-
-	if (!(counters = nf_ct_get_counters(skb)))
-		return 0; /* no match */
-
-	switch (sinfo->what) {
-	case IPT_CONNBYTES_PKTS:
-		switch (sinfo->direction) {
-		case IPT_CONNBYTES_DIR_ORIGINAL:
-			what = counters[IP_CT_DIR_ORIGINAL].packets;
-			break;
-		case IPT_CONNBYTES_DIR_REPLY:
-			what = counters[IP_CT_DIR_REPLY].packets;
-			break;
-		case IPT_CONNBYTES_DIR_BOTH:
-			what = counters[IP_CT_DIR_ORIGINAL].packets;
-			what += counters[IP_CT_DIR_REPLY].packets;
-			break;
-		}
-		break;
-	case IPT_CONNBYTES_BYTES:
-		switch (sinfo->direction) {
-		case IPT_CONNBYTES_DIR_ORIGINAL:
-			what = counters[IP_CT_DIR_ORIGINAL].bytes;
-			break;
-		case IPT_CONNBYTES_DIR_REPLY:
-			what = counters[IP_CT_DIR_REPLY].bytes;
-			break;
-		case IPT_CONNBYTES_DIR_BOTH:
-			what = counters[IP_CT_DIR_ORIGINAL].bytes;
-			what += counters[IP_CT_DIR_REPLY].bytes;
-			break;
-		}
-		break;
-	case IPT_CONNBYTES_AVGPKT:
-		switch (sinfo->direction) {
-		case IPT_CONNBYTES_DIR_ORIGINAL:
-			what = div64_64(counters[IP_CT_DIR_ORIGINAL].bytes,
-					counters[IP_CT_DIR_ORIGINAL].packets);
-			break;
-		case IPT_CONNBYTES_DIR_REPLY:
-			what = div64_64(counters[IP_CT_DIR_REPLY].bytes,
-					counters[IP_CT_DIR_REPLY].packets);
-			break;
-		case IPT_CONNBYTES_DIR_BOTH:
-			{
-				u_int64_t bytes;
-				u_int64_t pkts;
-				bytes = counters[IP_CT_DIR_ORIGINAL].bytes +
-					counters[IP_CT_DIR_REPLY].bytes;
-				pkts = counters[IP_CT_DIR_ORIGINAL].packets+
-					counters[IP_CT_DIR_REPLY].packets;
-
-				/* FIXME_THEORETICAL: what to do if sum
-				 * overflows ? */
-
-				what = div64_64(bytes, pkts);
-			}
-			break;
-		}
-		break;
-	}
-
-	if (sinfo->count.to)
-		return (what <= sinfo->count.to && what >= sinfo->count.from);
-	else
-		return (what >= sinfo->count.from);
-}
-
-static int check(const char *tablename,
-		 const struct ipt_ip *ip,
-		 void *matchinfo,
-		 unsigned int matchsize,
-		 unsigned int hook_mask)
-{
-	const struct ipt_connbytes_info *sinfo = matchinfo;
-
-	if (matchsize != IPT_ALIGN(sizeof(struct ipt_connbytes_info)))
-		return 0;
-
-	if (sinfo->what != IPT_CONNBYTES_PKTS &&
-	    sinfo->what != IPT_CONNBYTES_BYTES &&
-	    sinfo->what != IPT_CONNBYTES_AVGPKT)
-		return 0;
-
-	if (sinfo->direction != IPT_CONNBYTES_DIR_ORIGINAL &&
-	    sinfo->direction != IPT_CONNBYTES_DIR_REPLY &&
-	    sinfo->direction != IPT_CONNBYTES_DIR_BOTH)
-		return 0;
-
-	return 1;
-}
-
-static struct ipt_match state_match = {
-	.name		= "connbytes",
-	.match		= &match,
-	.checkentry	= &check,
-	.me		= THIS_MODULE
-};
-
-static int __init init(void)
-{
-	return ipt_register_match(&state_match);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&state_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c
deleted file mode 100644
index 5306ef293b92..000000000000
--- a/net/ipv4/netfilter/ipt_connmark.c
+++ /dev/null
@@ -1,88 +0,0 @@
-/* This kernel module matches connection mark values set by the
- * CONNMARK target
- *
- * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
- * by Henrik Nordstrom <hno@marasystems.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-
-MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>");
-MODULE_DESCRIPTION("IP tables connmark match module");
-MODULE_LICENSE("GPL");
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_connmark.h>
-#include <net/netfilter/nf_conntrack_compat.h>
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      int *hotdrop)
-{
-	const struct ipt_connmark_info *info = matchinfo;
-	u_int32_t ctinfo;
-	const u_int32_t *ctmark = nf_ct_get_mark(skb, &ctinfo);
-	if (!ctmark)
-		return 0;
-
-	return (((*ctmark) & info->mask) == info->mark) ^ info->invert;
-}
-
-static int
-checkentry(const char *tablename,
-	   const struct ipt_ip *ip,
-	   void *matchinfo,
-	   unsigned int matchsize,
-	   unsigned int hook_mask)
-{
-	struct ipt_connmark_info *cm = 
-				(struct ipt_connmark_info *)matchinfo;
-	if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info)))
-		return 0;
-
-	if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) {
-		printk(KERN_WARNING "connmark: only support 32bit mark\n");
-		return 0;
-	}
-
-	return 1;
-}
-
-static struct ipt_match connmark_match = {
-	.name = "connmark",
-	.match = &match,
-	.checkentry = &checkentry,
-	.me = THIS_MODULE
-};
-
-static int __init init(void)
-{
-	return ipt_register_match(&connmark_match);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&connmark_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_conntrack.c b/net/ipv4/netfilter/ipt_conntrack.c
deleted file mode 100644
index c8d18705469b..000000000000
--- a/net/ipv4/netfilter/ipt_conntrack.c
+++ /dev/null
@@ -1,232 +0,0 @@
-/* Kernel module to match connection tracking information.
- * Superset of Rusty's minimalistic state match.
- *
- * (C) 2001  Marc Boucher (marc@mbsi.ca).
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-
-#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)
-#include <linux/netfilter_ipv4/ip_conntrack.h>
-#include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
-#else
-#include <net/netfilter/nf_conntrack.h>
-#endif
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_conntrack.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
-MODULE_DESCRIPTION("iptables connection tracking match module");
-
-#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      int *hotdrop)
-{
-	const struct ipt_conntrack_info *sinfo = matchinfo;
-	struct ip_conntrack *ct;
-	enum ip_conntrack_info ctinfo;
-	unsigned int statebit;
-
-	ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
-
-#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg))
-
-	if (ct == &ip_conntrack_untracked)
-		statebit = IPT_CONNTRACK_STATE_UNTRACKED;
-	else if (ct)
- 		statebit = IPT_CONNTRACK_STATE_BIT(ctinfo);
- 	else
- 		statebit = IPT_CONNTRACK_STATE_INVALID;
- 
-	if(sinfo->flags & IPT_CONNTRACK_STATE) {
-		if (ct) {
-			if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip !=
-			    ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip)
-				statebit |= IPT_CONNTRACK_STATE_SNAT;
-
-			if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip !=
-			    ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip)
-				statebit |= IPT_CONNTRACK_STATE_DNAT;
-		}
-
-		if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE))
-			return 0;
-	}
-
-	if(sinfo->flags & IPT_CONNTRACK_PROTO) {
-		if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO))
-                	return 0;
-	}
-
-	if(sinfo->flags & IPT_CONNTRACK_ORIGSRC) {
-		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC))
-			return 0;
-	}
-
-	if(sinfo->flags & IPT_CONNTRACK_ORIGDST) {
-		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST))
-			return 0;
-	}
-
-	if(sinfo->flags & IPT_CONNTRACK_REPLSRC) {
-		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC))
-			return 0;
-	}
-
-	if(sinfo->flags & IPT_CONNTRACK_REPLDST) {
-		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST))
-			return 0;
-	}
-
-	if(sinfo->flags & IPT_CONNTRACK_STATUS) {
-		if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS))
-			return 0;
-	}
-
-	if(sinfo->flags & IPT_CONNTRACK_EXPIRES) {
-		unsigned long expires;
-
-		if(!ct)
-			return 0;
-
-		expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0;
-
-		if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES))
-			return 0;
-	}
-
-	return 1;
-}
-
-#else /* CONFIG_IP_NF_CONNTRACK */
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      int *hotdrop)
-{
-	const struct ipt_conntrack_info *sinfo = matchinfo;
-	struct nf_conn *ct;
-	enum ip_conntrack_info ctinfo;
-	unsigned int statebit;
-
-	ct = nf_ct_get((struct sk_buff *)skb, &ctinfo);
-
-#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg))
-
-	if (ct == &nf_conntrack_untracked)
-		statebit = IPT_CONNTRACK_STATE_UNTRACKED;
-	else if (ct)
- 		statebit = IPT_CONNTRACK_STATE_BIT(ctinfo);
- 	else
- 		statebit = IPT_CONNTRACK_STATE_INVALID;
- 
-	if(sinfo->flags & IPT_CONNTRACK_STATE) {
-		if (ct) {
-			if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip !=
-			    ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip)
-				statebit |= IPT_CONNTRACK_STATE_SNAT;
-
-			if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip !=
-			    ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip)
-				statebit |= IPT_CONNTRACK_STATE_DNAT;
-		}
-
-		if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE))
-			return 0;
-	}
-
-	if(sinfo->flags & IPT_CONNTRACK_PROTO) {
-		if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO))
-                	return 0;
-	}
-
-	if(sinfo->flags & IPT_CONNTRACK_ORIGSRC) {
-		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC))
-			return 0;
-	}
-
-	if(sinfo->flags & IPT_CONNTRACK_ORIGDST) {
-		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST))
-			return 0;
-	}
-
-	if(sinfo->flags & IPT_CONNTRACK_REPLSRC) {
-		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC))
-			return 0;
-	}
-
-	if(sinfo->flags & IPT_CONNTRACK_REPLDST) {
-		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST))
-			return 0;
-	}
-
-	if(sinfo->flags & IPT_CONNTRACK_STATUS) {
-		if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS))
-			return 0;
-	}
-
-	if(sinfo->flags & IPT_CONNTRACK_EXPIRES) {
-		unsigned long expires;
-
-		if(!ct)
-			return 0;
-
-		expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0;
-
-		if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES))
-			return 0;
-	}
-
-	return 1;
-}
-
-#endif /* CONFIG_NF_IP_CONNTRACK */
-
-static int check(const char *tablename,
-		 const struct ipt_ip *ip,
-		 void *matchinfo,
-		 unsigned int matchsize,
-		 unsigned int hook_mask)
-{
-	if (matchsize != IPT_ALIGN(sizeof(struct ipt_conntrack_info)))
-		return 0;
-
-	return 1;
-}
-
-static struct ipt_match conntrack_match = {
-	.name		= "conntrack",
-	.match		= &match,
-	.checkentry	= &check,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	need_ip_conntrack();
-	return ipt_register_match(&conntrack_match);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&conntrack_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_dccp.c b/net/ipv4/netfilter/ipt_dccp.c
deleted file mode 100644
index ad3278bba6c1..000000000000
--- a/net/ipv4/netfilter/ipt_dccp.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * iptables module for DCCP protocol header matching
- *
- * (C) 2005 by Harald Welte <laforge@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/spinlock.h>
-#include <net/ip.h>
-#include <linux/dccp.h>
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_dccp.h>
-
-#define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
-		                  || (!!((invflag) & (option)) ^ (cond)))
-
-static unsigned char *dccp_optbuf;
-static DEFINE_SPINLOCK(dccp_buflock);
-
-static inline int
-dccp_find_option(u_int8_t option,
-		 const struct sk_buff *skb,
-		 const struct dccp_hdr *dh,
-		 int *hotdrop)
-{
-	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
-	unsigned char *op;
-	unsigned int optoff = __dccp_hdr_len(dh);
-	unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh);
-	unsigned int i;
-
-	if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) {
-		*hotdrop = 1;
-		return 0;
-	}
-
-	if (!optlen)
-		return 0;
-
-	spin_lock_bh(&dccp_buflock);
-	op = skb_header_pointer(skb,
-				skb->nh.iph->ihl*4 + optoff,
-				optlen, dccp_optbuf);
-	if (op == NULL) {
-		/* If we don't have the whole header, drop packet. */
-		spin_unlock_bh(&dccp_buflock);
-		*hotdrop = 1;
-		return 0;
-	}
-
-	for (i = 0; i < optlen; ) {
-		if (op[i] == option) {
-			spin_unlock_bh(&dccp_buflock);
-			return 1;
-		}
-
-		if (op[i] < 2) 
-			i++;
-		else 
-			i += op[i+1]?:1;
-	}
-
-	spin_unlock_bh(&dccp_buflock);
-	return 0;
-}
-
-
-static inline int
-match_types(const struct dccp_hdr *dh, u_int16_t typemask)
-{
-	return (typemask & (1 << dh->dccph_type));
-}
-
-static inline int
-match_option(u_int8_t option, const struct sk_buff *skb,
-	     const struct dccp_hdr *dh, int *hotdrop)
-{
-	return dccp_find_option(option, skb, dh, hotdrop);
-}
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      int *hotdrop)
-{
-	const struct ipt_dccp_info *info = 
-				(const struct ipt_dccp_info *)matchinfo;
-	struct dccp_hdr _dh, *dh;
-
-	if (offset)
-		return 0;
-	
-	dh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_dh), &_dh);
-	if (dh == NULL) {
-		*hotdrop = 1;
-		return 0;
-       	}
-
-	return  DCCHECK(((ntohs(dh->dccph_sport) >= info->spts[0]) 
-			&& (ntohs(dh->dccph_sport) <= info->spts[1])), 
-		   	IPT_DCCP_SRC_PORTS, info->flags, info->invflags)
-		&& DCCHECK(((ntohs(dh->dccph_dport) >= info->dpts[0]) 
-			&& (ntohs(dh->dccph_dport) <= info->dpts[1])), 
-			IPT_DCCP_DEST_PORTS, info->flags, info->invflags)
-		&& DCCHECK(match_types(dh, info->typemask),
-			   IPT_DCCP_TYPE, info->flags, info->invflags)
-		&& DCCHECK(match_option(info->option, skb, dh, hotdrop),
-			   IPT_DCCP_OPTION, info->flags, info->invflags);
-}
-
-static int
-checkentry(const char *tablename,
-	   const struct ipt_ip *ip,
-	   void *matchinfo,
-	   unsigned int matchsize,
-	   unsigned int hook_mask)
-{
-	const struct ipt_dccp_info *info;
-
-	info = (const struct ipt_dccp_info *)matchinfo;
-
-	return ip->proto == IPPROTO_DCCP
-		&& !(ip->invflags & IPT_INV_PROTO)
-		&& matchsize == IPT_ALIGN(sizeof(struct ipt_dccp_info))
-		&& !(info->flags & ~IPT_DCCP_VALID_FLAGS)
-		&& !(info->invflags & ~IPT_DCCP_VALID_FLAGS)
-		&& !(info->invflags & ~info->flags);
-}
-
-static struct ipt_match dccp_match = 
-{ 
-	.name 		= "dccp",
-	.match		= &match,
-	.checkentry	= &checkentry,
-	.me 		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	int ret;
-
-	/* doff is 8 bits, so the maximum option size is (4*256).  Don't put
-	 * this in BSS since DaveM is worried about locked TLB's for kernel
-	 * BSS. */
-	dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL);
-	if (!dccp_optbuf)
-		return -ENOMEM;
-	ret = ipt_register_match(&dccp_match);
-	if (ret)
-		kfree(dccp_optbuf);
-
-	return ret;
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&dccp_match);
-	kfree(dccp_optbuf);
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("Match for DCCP protocol packets");
-
diff --git a/net/ipv4/netfilter/ipt_dscp.c b/net/ipv4/netfilter/ipt_dscp.c
index 5df52a64a5d4..92063b4f8602 100644
--- a/net/ipv4/netfilter/ipt_dscp.c
+++ b/net/ipv4/netfilter/ipt_dscp.c
@@ -21,7 +21,7 @@ MODULE_LICENSE("GPL");
 
 static int match(const struct sk_buff *skb, const struct net_device *in,
 		 const struct net_device *out, const void *matchinfo,
-		 int offset, int *hotdrop)
+		 int offset, unsigned int protoff, int *hotdrop)
 {
 	const struct ipt_dscp_info *info = matchinfo;
 	const struct iphdr *iph = skb->nh.iph;
@@ -31,7 +31,7 @@ static int match(const struct sk_buff *skb, const struct net_device *in,
 	return ((iph->tos&IPT_DSCP_MASK) == sh_dscp) ^ info->invert;
 }
 
-static int checkentry(const char *tablename, const struct ipt_ip *ip,
+static int checkentry(const char *tablename, const void *ip,
 		      void *matchinfo, unsigned int matchsize,
 		      unsigned int hook_mask)
 {
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index b6f7181e89cc..e68b0c7981f0 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -67,7 +67,7 @@ static inline int match_tcp(const struct sk_buff *skb,
 
 static int match(const struct sk_buff *skb, const struct net_device *in,
 		 const struct net_device *out, const void *matchinfo,
-		 int offset, int *hotdrop)
+		 int offset, unsigned int protoff, int *hotdrop)
 {
 	const struct ipt_ecn_info *info = matchinfo;
 
@@ -85,11 +85,12 @@ static int match(const struct sk_buff *skb, const struct net_device *in,
 	return 1;
 }
 
-static int checkentry(const char *tablename, const struct ipt_ip *ip,
+static int checkentry(const char *tablename, const void *ip_void,
 		      void *matchinfo, unsigned int matchsize,
 		      unsigned int hook_mask)
 {
 	const struct ipt_ecn_info *info = matchinfo;
+	const struct ipt_ip *ip = ip_void;
 
 	if (matchsize != IPT_ALIGN(sizeof(struct ipt_ecn_info)))
 		return 0;
diff --git a/net/ipv4/netfilter/ipt_esp.c b/net/ipv4/netfilter/ipt_esp.c
index e1d0dd31e117..9de191a8162d 100644
--- a/net/ipv4/netfilter/ipt_esp.c
+++ b/net/ipv4/netfilter/ipt_esp.c
@@ -42,6 +42,7 @@ match(const struct sk_buff *skb,
       const struct net_device *out,
       const void *matchinfo,
       int offset,
+      unsigned int protoff,
       int *hotdrop)
 {
 	struct ip_esp_hdr _esp, *eh;
@@ -51,7 +52,7 @@ match(const struct sk_buff *skb,
 	if (offset)
 		return 0;
 
-	eh = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+	eh = skb_header_pointer(skb, protoff,
 				sizeof(_esp), &_esp);
 	if (eh == NULL) {
 		/* We've been asked to examine this packet, and we
@@ -70,12 +71,13 @@ match(const struct sk_buff *skb,
 /* Called when user tries to insert an entry of this type. */
 static int
 checkentry(const char *tablename,
-	   const struct ipt_ip *ip,
+	   const void *ip_void,
 	   void *matchinfo,
 	   unsigned int matchinfosize,
 	   unsigned int hook_mask)
 {
 	const struct ipt_esp *espinfo = matchinfo;
+	const struct ipt_ip *ip = ip_void;
 
 	/* Must specify proto == ESP, and no unknown invflags */
 	if (ip->proto != IPPROTO_ESP || (ip->invflags & IPT_INV_PROTO)) {
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index 2dd1cccbdab9..4fe48c1bd5f3 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -429,6 +429,7 @@ hashlimit_match(const struct sk_buff *skb,
 		const struct net_device *out,
 		const void *matchinfo,
 		int offset,
+		unsigned int protoff,
 		int *hotdrop)
 {
 	struct ipt_hashlimit_info *r = 
@@ -504,7 +505,7 @@ hashlimit_match(const struct sk_buff *skb,
 
 static int
 hashlimit_checkentry(const char *tablename,
-		     const struct ipt_ip *ip,
+		     const void *inf,
 		     void *matchinfo,
 		     unsigned int matchsize,
 		     unsigned int hook_mask)
diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c
deleted file mode 100644
index aef649e393af..000000000000
--- a/net/ipv4/netfilter/ipt_helper.c
+++ /dev/null
@@ -1,168 +0,0 @@
-/* iptables module to match on related connections */
-/*
- * (C) 2001 Martin Josefsson <gandalf@wlug.westbo.se>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- *   19 Mar 2002 Harald Welte <laforge@gnumonks.org>:
- *   		 - Port to newnat infrastructure
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/netfilter.h>
-#include <linux/interrupt.h>
-#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)
-#include <linux/netfilter_ipv4/ip_conntrack.h>
-#include <linux/netfilter_ipv4/ip_conntrack_core.h>
-#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#else
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#endif
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_helper.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>");
-MODULE_DESCRIPTION("iptables helper match module");
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
-#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      int *hotdrop)
-{
-	const struct ipt_helper_info *info = matchinfo;
-	struct ip_conntrack *ct;
-	enum ip_conntrack_info ctinfo;
-	int ret = info->invert;
-	
-	ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
-	if (!ct) {
-		DEBUGP("ipt_helper: Eek! invalid conntrack?\n");
-		return ret;
-	}
-
-	if (!ct->master) {
-		DEBUGP("ipt_helper: conntrack %p has no master\n", ct);
-		return ret;
-	}
-
-	read_lock_bh(&ip_conntrack_lock);
-	if (!ct->master->helper) {
-		DEBUGP("ipt_helper: master ct %p has no helper\n", 
-			exp->expectant);
-		goto out_unlock;
-	}
-
-	DEBUGP("master's name = %s , info->name = %s\n", 
-		ct->master->helper->name, info->name);
-
-	if (info->name[0] == '\0')
-		ret ^= 1;
-	else
-		ret ^= !strncmp(ct->master->helper->name, info->name, 
-		                strlen(ct->master->helper->name));
-out_unlock:
-	read_unlock_bh(&ip_conntrack_lock);
-	return ret;
-}
-
-#else /* CONFIG_IP_NF_CONNTRACK */
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      int *hotdrop)
-{
-	const struct ipt_helper_info *info = matchinfo;
-	struct nf_conn *ct;
-	enum ip_conntrack_info ctinfo;
-	int ret = info->invert;
-	
-	ct = nf_ct_get((struct sk_buff *)skb, &ctinfo);
-	if (!ct) {
-		DEBUGP("ipt_helper: Eek! invalid conntrack?\n");
-		return ret;
-	}
-
-	if (!ct->master) {
-		DEBUGP("ipt_helper: conntrack %p has no master\n", ct);
-		return ret;
-	}
-
-	read_lock_bh(&nf_conntrack_lock);
-	if (!ct->master->helper) {
-		DEBUGP("ipt_helper: master ct %p has no helper\n", 
-			exp->expectant);
-		goto out_unlock;
-	}
-
-	DEBUGP("master's name = %s , info->name = %s\n", 
-		ct->master->helper->name, info->name);
-
-	if (info->name[0] == '\0')
-		ret ^= 1;
-	else
-		ret ^= !strncmp(ct->master->helper->name, info->name, 
-		                strlen(ct->master->helper->name));
-out_unlock:
-	read_unlock_bh(&nf_conntrack_lock);
-	return ret;
-}
-#endif
-
-static int check(const char *tablename,
-		 const struct ipt_ip *ip,
-		 void *matchinfo,
-		 unsigned int matchsize,
-		 unsigned int hook_mask)
-{
-	struct ipt_helper_info *info = matchinfo;
-
-	info->name[29] = '\0';
-
-	/* verify size */
-	if (matchsize != IPT_ALIGN(sizeof(struct ipt_helper_info)))
-		return 0;
-
-	return 1;
-}
-
-static struct ipt_match helper_match = {
-	.name		= "helper",
-	.match		= &match,
-	.checkentry	= &check,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	need_ip_conntrack();
-	return ipt_register_match(&helper_match);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&helper_match);
-}
-
-module_init(init);
-module_exit(fini);
-
diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c
index b835b7b2e560..13fb16fb7892 100644
--- a/net/ipv4/netfilter/ipt_iprange.c
+++ b/net/ipv4/netfilter/ipt_iprange.c
@@ -28,7 +28,7 @@ match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
       const void *matchinfo,
-      int offset, int *hotdrop)
+      int offset, unsigned int protoff, int *hotdrop)
 {
 	const struct ipt_iprange_info *info = matchinfo;
 	const struct iphdr *iph = skb->nh.iph;
@@ -63,7 +63,7 @@ match(const struct sk_buff *skb,
 }
 
 static int check(const char *tablename,
-		 const struct ipt_ip *ip,
+		 const void *inf,
 		 void *matchinfo,
 		 unsigned int matchsize,
 		 unsigned int hook_mask)
diff --git a/net/ipv4/netfilter/ipt_length.c b/net/ipv4/netfilter/ipt_length.c
deleted file mode 100644
index 4eabcfbda9d1..000000000000
--- a/net/ipv4/netfilter/ipt_length.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Kernel module to match packet length. */
-/* (C) 1999-2001 James Morris <jmorros@intercode.com.au>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-
-#include <linux/netfilter_ipv4/ipt_length.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-
-MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
-MODULE_DESCRIPTION("IP tables packet length matching module");
-MODULE_LICENSE("GPL");
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      int *hotdrop)
-{
-	const struct ipt_length_info *info = matchinfo;
-	u_int16_t pktlen = ntohs(skb->nh.iph->tot_len);
-	
-	return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
-}
-
-static int
-checkentry(const char *tablename,
-           const struct ipt_ip *ip,
-           void *matchinfo,
-           unsigned int matchsize,
-           unsigned int hook_mask)
-{
-	if (matchsize != IPT_ALIGN(sizeof(struct ipt_length_info)))
-		return 0;
-
-	return 1;
-}
-
-static struct ipt_match length_match = {
-	.name		= "length",
-	.match		= &match,
-	.checkentry	= &checkentry,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	return ipt_register_match(&length_match);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&length_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_limit.c b/net/ipv4/netfilter/ipt_limit.c
deleted file mode 100644
index 0c24dcc703a5..000000000000
--- a/net/ipv4/netfilter/ipt_limit.c
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Kernel module to control the rate
- *
- * 2 September 1999: Changed from the target RATE to the match
- *                   `limit', removed logging.  Did I mention that
- *                   Alexey is a fucking genius?
- *                   Rusty Russell (rusty@rustcorp.com.au).  */
-
-/* (C) 1999 J�r�me de Vivie <devivie@info.enserb.u-bordeaux.fr>
- * (C) 1999 Herv� Eychenne <eychenne@info.enserb.u-bordeaux.fr>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/spinlock.h>
-#include <linux/interrupt.h>
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_limit.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>");
-MODULE_DESCRIPTION("iptables rate limit match");
-
-/* The algorithm used is the Simple Token Bucket Filter (TBF)
- * see net/sched/sch_tbf.c in the linux source tree
- */
-
-static DEFINE_SPINLOCK(limit_lock);
-
-/* Rusty: This is my (non-mathematically-inclined) understanding of
-   this algorithm.  The `average rate' in jiffies becomes your initial
-   amount of credit `credit' and the most credit you can ever have
-   `credit_cap'.  The `peak rate' becomes the cost of passing the
-   test, `cost'.
-
-   `prev' tracks the last packet hit: you gain one credit per jiffy.
-   If you get credit balance more than this, the extra credit is
-   discarded.  Every time the match passes, you lose `cost' credits;
-   if you don't have that many, the test fails.
-
-   See Alexey's formal explanation in net/sched/sch_tbf.c.
-
-   To get the maxmum range, we multiply by this factor (ie. you get N
-   credits per jiffy).  We want to allow a rate as low as 1 per day
-   (slowest userspace tool allows), which means
-   CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32. ie. */
-#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
-
-/* Repeated shift and or gives us all 1s, final shift and add 1 gives
- * us the power of 2 below the theoretical max, so GCC simply does a
- * shift. */
-#define _POW2_BELOW2(x) ((x)|((x)>>1))
-#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
-#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
-#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
-#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
-#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
-
-#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
-
-static int
-ipt_limit_match(const struct sk_buff *skb,
-		const struct net_device *in,
-		const struct net_device *out,
-		const void *matchinfo,
-		int offset,
-		int *hotdrop)
-{
-	struct ipt_rateinfo *r = ((struct ipt_rateinfo *)matchinfo)->master;
-	unsigned long now = jiffies;
-
-	spin_lock_bh(&limit_lock);
-	r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY;
-	if (r->credit > r->credit_cap)
-		r->credit = r->credit_cap;
-
-	if (r->credit >= r->cost) {
-		/* We're not limited. */
-		r->credit -= r->cost;
-		spin_unlock_bh(&limit_lock);
-		return 1;
-	}
-
-       	spin_unlock_bh(&limit_lock);
-	return 0;
-}
-
-/* Precision saver. */
-static u_int32_t
-user2credits(u_int32_t user)
-{
-	/* If multiplying would overflow... */
-	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
-		/* Divide first. */
-		return (user / IPT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
-
-	return (user * HZ * CREDITS_PER_JIFFY) / IPT_LIMIT_SCALE;
-}
-
-static int
-ipt_limit_checkentry(const char *tablename,
-		     const struct ipt_ip *ip,
-		     void *matchinfo,
-		     unsigned int matchsize,
-		     unsigned int hook_mask)
-{
-	struct ipt_rateinfo *r = matchinfo;
-
-	if (matchsize != IPT_ALIGN(sizeof(struct ipt_rateinfo)))
-		return 0;
-
-	/* Check for overflow. */
-	if (r->burst == 0
-	    || user2credits(r->avg * r->burst) < user2credits(r->avg)) {
-		printk("Overflow in ipt_limit, try lower: %u/%u\n",
-		       r->avg, r->burst);
-		return 0;
-	}
-
-	/* User avg in seconds * IPT_LIMIT_SCALE: convert to jiffies *
-	   128. */
-	r->prev = jiffies;
-	r->credit = user2credits(r->avg * r->burst);	 /* Credits full. */
-	r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */
-	r->cost = user2credits(r->avg);
-
-	/* For SMP, we only want to use one set of counters. */
-	r->master = r;
-
-	return 1;
-}
-
-static struct ipt_match ipt_limit_reg = {
-	.name		= "limit",
-	.match		= ipt_limit_match,
-	.checkentry	= ipt_limit_checkentry,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	if (ipt_register_match(&ipt_limit_reg))
-		return -EINVAL;
-	return 0;
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&ipt_limit_reg);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_mac.c b/net/ipv4/netfilter/ipt_mac.c
deleted file mode 100644
index 1b9bb4559f80..000000000000
--- a/net/ipv4/netfilter/ipt_mac.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Kernel module to match MAC address parameters. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/if_ether.h>
-#include <linux/etherdevice.h>
-
-#include <linux/netfilter_ipv4/ipt_mac.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("iptables mac matching module");
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      int *hotdrop)
-{
-    const struct ipt_mac_info *info = matchinfo;
-
-    /* Is mac pointer valid? */
-    return (skb->mac.raw >= skb->head
-	    && (skb->mac.raw + ETH_HLEN) <= skb->data
-	    /* If so, compare... */
-	    && ((!compare_ether_addr(eth_hdr(skb)->h_source, info->srcaddr))
-		^ info->invert));
-}
-
-static int
-ipt_mac_checkentry(const char *tablename,
-		   const struct ipt_ip *ip,
-		   void *matchinfo,
-		   unsigned int matchsize,
-		   unsigned int hook_mask)
-{
-	/* FORWARD isn't always valid, but it's nice to be able to do --RR */
-	if (hook_mask
-	    & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN)
-		| (1 << NF_IP_FORWARD))) {
-		printk("ipt_mac: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n");
-		return 0;
-	}
-
-	if (matchsize != IPT_ALIGN(sizeof(struct ipt_mac_info)))
-		return 0;
-
-	return 1;
-}
-
-static struct ipt_match mac_match = {
-	.name		= "mac",
-	.match		= &match,
-	.checkentry	= &ipt_mac_checkentry,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	return ipt_register_match(&mac_match);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&mac_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c
deleted file mode 100644
index 00bef6cdd3f8..000000000000
--- a/net/ipv4/netfilter/ipt_mark.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Kernel module to match NFMARK values. */
-
-/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-
-#include <linux/netfilter_ipv4/ipt_mark.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
-MODULE_DESCRIPTION("iptables mark matching module");
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      int *hotdrop)
-{
-	const struct ipt_mark_info *info = matchinfo;
-
-	return ((skb->nfmark & info->mask) == info->mark) ^ info->invert;
-}
-
-static int
-checkentry(const char *tablename,
-           const struct ipt_ip *ip,
-           void *matchinfo,
-           unsigned int matchsize,
-           unsigned int hook_mask)
-{
-	struct ipt_mark_info *minfo = (struct ipt_mark_info *) matchinfo;
-
-	if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info)))
-		return 0;
-
-	if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) {
-		printk(KERN_WARNING "mark: only supports 32bit mark\n");
-		return 0;
-	}
-
-	return 1;
-}
-
-static struct ipt_match mark_match = {
-	.name		= "mark",
-	.match		= &match,
-	.checkentry	= &checkentry,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	return ipt_register_match(&mark_match);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&mark_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_multiport.c b/net/ipv4/netfilter/ipt_multiport.c
index 99e8188162e2..2d52326553f1 100644
--- a/net/ipv4/netfilter/ipt_multiport.c
+++ b/net/ipv4/netfilter/ipt_multiport.c
@@ -97,6 +97,7 @@ match(const struct sk_buff *skb,
       const struct net_device *out,
       const void *matchinfo,
       int offset,
+      unsigned int protoff,
       int *hotdrop)
 {
 	u16 _ports[2], *pptr;
@@ -105,7 +106,7 @@ match(const struct sk_buff *skb,
 	if (offset)
 		return 0;
 
-	pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+	pptr = skb_header_pointer(skb, protoff,
 				  sizeof(_ports), _ports);
 	if (pptr == NULL) {
 		/* We've been asked to examine this packet, and we
@@ -128,6 +129,7 @@ match_v1(const struct sk_buff *skb,
 	 const struct net_device *out,
 	 const void *matchinfo,
 	 int offset,
+	 unsigned int protoff,
 	 int *hotdrop)
 {
 	u16 _ports[2], *pptr;
@@ -136,7 +138,7 @@ match_v1(const struct sk_buff *skb,
 	if (offset)
 		return 0;
 
-	pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+	pptr = skb_header_pointer(skb, protoff,
 				  sizeof(_ports), _ports);
 	if (pptr == NULL) {
 		/* We've been asked to examine this packet, and we
@@ -154,7 +156,7 @@ match_v1(const struct sk_buff *skb,
 /* Called when user tries to insert an entry of this type. */
 static int
 checkentry(const char *tablename,
-	   const struct ipt_ip *ip,
+	   const void *ip,
 	   void *matchinfo,
 	   unsigned int matchsize,
 	   unsigned int hook_mask)
@@ -164,7 +166,7 @@ checkentry(const char *tablename,
 
 static int
 checkentry_v1(const char *tablename,
-	      const struct ipt_ip *ip,
+	      const void *ip,
 	      void *matchinfo,
 	      unsigned int matchsize,
 	      unsigned int hook_mask)
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
index 0cee2862ed85..4843d0c9734f 100644
--- a/net/ipv4/netfilter/ipt_owner.c
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -27,6 +27,7 @@ match(const struct sk_buff *skb,
       const struct net_device *out,
       const void *matchinfo,
       int offset,
+      unsigned int protoff,
       int *hotdrop)
 {
 	const struct ipt_owner_info *info = matchinfo;
@@ -51,7 +52,7 @@ match(const struct sk_buff *skb,
 
 static int
 checkentry(const char *tablename,
-           const struct ipt_ip *ip,
+           const void *ip,
            void *matchinfo,
            unsigned int matchsize,
            unsigned int hook_mask)
diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c
deleted file mode 100644
index 03f554857a4d..000000000000
--- a/net/ipv4/netfilter/ipt_physdev.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Kernel module to match the bridge port in and
- * out device for IP packets coming into contact with a bridge. */
-
-/* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/skbuff.h>
-#include <linux/netfilter_ipv4/ipt_physdev.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_bridge.h>
-#define MATCH   1
-#define NOMATCH 0
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
-MODULE_DESCRIPTION("iptables bridge physical device match module");
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      int *hotdrop)
-{
-	int i;
-	static const char nulldevname[IFNAMSIZ];
-	const struct ipt_physdev_info *info = matchinfo;
-	unsigned int ret;
-	const char *indev, *outdev;
-	struct nf_bridge_info *nf_bridge;
-
-	/* Not a bridged IP packet or no info available yet:
-	 * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if
-	 * the destination device will be a bridge. */
-	if (!(nf_bridge = skb->nf_bridge)) {
-		/* Return MATCH if the invert flags of the used options are on */
-		if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) &&
-		    !(info->invert & IPT_PHYSDEV_OP_BRIDGED))
-			return NOMATCH;
-		if ((info->bitmask & IPT_PHYSDEV_OP_ISIN) &&
-		    !(info->invert & IPT_PHYSDEV_OP_ISIN))
-			return NOMATCH;
-		if ((info->bitmask & IPT_PHYSDEV_OP_ISOUT) &&
-		    !(info->invert & IPT_PHYSDEV_OP_ISOUT))
-			return NOMATCH;
-		if ((info->bitmask & IPT_PHYSDEV_OP_IN) &&
-		    !(info->invert & IPT_PHYSDEV_OP_IN))
-			return NOMATCH;
-		if ((info->bitmask & IPT_PHYSDEV_OP_OUT) &&
-		    !(info->invert & IPT_PHYSDEV_OP_OUT))
-			return NOMATCH;
-		return MATCH;
-	}
-
-	/* This only makes sense in the FORWARD and POSTROUTING chains */
-	if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) &&
-	    (!!(nf_bridge->mask & BRNF_BRIDGED) ^
-	    !(info->invert & IPT_PHYSDEV_OP_BRIDGED)))
-		return NOMATCH;
-
-	if ((info->bitmask & IPT_PHYSDEV_OP_ISIN &&
-	    (!nf_bridge->physindev ^ !!(info->invert & IPT_PHYSDEV_OP_ISIN))) ||
-	    (info->bitmask & IPT_PHYSDEV_OP_ISOUT &&
-	    (!nf_bridge->physoutdev ^ !!(info->invert & IPT_PHYSDEV_OP_ISOUT))))
-		return NOMATCH;
-
-	if (!(info->bitmask & IPT_PHYSDEV_OP_IN))
-		goto match_outdev;
-	indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname;
-	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
-		ret |= (((const unsigned int *)indev)[i]
-			^ ((const unsigned int *)info->physindev)[i])
-			& ((const unsigned int *)info->in_mask)[i];
-	}
-
-	if ((ret == 0) ^ !(info->invert & IPT_PHYSDEV_OP_IN))
-		return NOMATCH;
-
-match_outdev:
-	if (!(info->bitmask & IPT_PHYSDEV_OP_OUT))
-		return MATCH;
-	outdev = nf_bridge->physoutdev ?
-		 nf_bridge->physoutdev->name : nulldevname;
-	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
-		ret |= (((const unsigned int *)outdev)[i]
-			^ ((const unsigned int *)info->physoutdev)[i])
-			& ((const unsigned int *)info->out_mask)[i];
-	}
-
-	return (ret != 0) ^ !(info->invert & IPT_PHYSDEV_OP_OUT);
-}
-
-static int
-checkentry(const char *tablename,
-		       const struct ipt_ip *ip,
-		       void *matchinfo,
-		       unsigned int matchsize,
-		       unsigned int hook_mask)
-{
-	const struct ipt_physdev_info *info = matchinfo;
-
-	if (matchsize != IPT_ALIGN(sizeof(struct ipt_physdev_info)))
-		return 0;
-	if (!(info->bitmask & IPT_PHYSDEV_OP_MASK) ||
-	    info->bitmask & ~IPT_PHYSDEV_OP_MASK)
-		return 0;
-	return 1;
-}
-
-static struct ipt_match physdev_match = {
-	.name		= "physdev",
-	.match		= &match,
-	.checkentry	= &checkentry,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	return ipt_register_match(&physdev_match);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&physdev_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_pkttype.c b/net/ipv4/netfilter/ipt_pkttype.c
deleted file mode 100644
index 8ddb1dc5e5ae..000000000000
--- a/net/ipv4/netfilter/ipt_pkttype.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/* (C) 1999-2001 Michal Ludvig <michal@logix.cz>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/if_ether.h>
-#include <linux/if_packet.h>
-
-#include <linux/netfilter_ipv4/ipt_pkttype.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Michal Ludvig <michal@logix.cz>");
-MODULE_DESCRIPTION("IP tables match to match on linklayer packet type");
-
-static int match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      int *hotdrop)
-{
-    const struct ipt_pkttype_info *info = matchinfo;
-
-    return (skb->pkt_type == info->pkttype) ^ info->invert;
-}
-
-static int checkentry(const char *tablename,
-		   const struct ipt_ip *ip,
-		   void *matchinfo,
-		   unsigned int matchsize,
-		   unsigned int hook_mask)
-{
-/*
-	if (hook_mask
-	    & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN)
-		| (1 << NF_IP_FORWARD))) {
-		printk("ipt_pkttype: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n");
-		return 0;
-	}
-*/
-	if (matchsize != IPT_ALIGN(sizeof(struct ipt_pkttype_info)))
-		return 0;
-
-	return 1;
-}
-
-static struct ipt_match pkttype_match = {
-	.name		= "pkttype",
-	.match		= &match,
-	.checkentry	= &checkentry,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	return ipt_register_match(&pkttype_match);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&pkttype_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_realm.c b/net/ipv4/netfilter/ipt_realm.c
deleted file mode 100644
index 54a6897ebaa6..000000000000
--- a/net/ipv4/netfilter/ipt_realm.c
+++ /dev/null
@@ -1,76 +0,0 @@
-/* IP tables module for matching the routing realm
- *
- * $Id: ipt_realm.c,v 1.3 2004/03/05 13:25:40 laforge Exp $
- *
- * (C) 2003 by Sampsa Ranta <sampsa@netsonic.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <net/route.h>
-
-#include <linux/netfilter_ipv4/ipt_realm.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-
-MODULE_AUTHOR("Sampsa Ranta <sampsa@netsonic.fi>");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("iptables realm match");
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      int *hotdrop)
-{
-	const struct ipt_realm_info *info = matchinfo;
-	struct dst_entry *dst = skb->dst;
-    
-	return (info->id == (dst->tclassid & info->mask)) ^ info->invert;
-}
-
-static int check(const char *tablename,
-                 const struct ipt_ip *ip,
-                 void *matchinfo,
-                 unsigned int matchsize,
-                 unsigned int hook_mask)
-{
-	if (hook_mask
-	    & ~((1 << NF_IP_POST_ROUTING) | (1 << NF_IP_FORWARD) |
-	        (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_LOCAL_IN))) {
-		printk("ipt_realm: only valid for POST_ROUTING, LOCAL_OUT, "
-		       "LOCAL_IN or FORWARD.\n");
-		return 0;
-	}
-	if (matchsize != IPT_ALIGN(sizeof(struct ipt_realm_info))) {
-		printk("ipt_realm: invalid matchsize.\n");
-		return 0;
-	}
-	return 1;
-}
-
-static struct ipt_match realm_match = {
-	.name		= "realm",
-	.match		= match, 
-	.checkentry	= check,
-	.me		= THIS_MODULE
-};
-
-static int __init init(void)
-{
-	return ipt_register_match(&realm_match);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&realm_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index 5ddccb18c65e..44611d6d14f5 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -104,6 +104,7 @@ match(const struct sk_buff *skb,
       const struct net_device *out,
       const void *matchinfo,
       int offset,
+      unsigned int protoff,
       int *hotdrop);
 
 /* Function to hash a given address into the hash table of table_size size */
@@ -317,7 +318,7 @@ static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned
 	skb->nh.iph->daddr = 0;
 	/* Clear ttl since we have no way of knowing it */
 	skb->nh.iph->ttl = 0;
-	match(skb,NULL,NULL,info,0,NULL);
+	match(skb,NULL,NULL,info,0,0,NULL);
 
 	kfree(skb->nh.iph);
 out_free_skb:
@@ -357,6 +358,7 @@ match(const struct sk_buff *skb,
       const struct net_device *out,
       const void *matchinfo,
       int offset,
+      unsigned int protoff,
       int *hotdrop)
 {
 	int pkt_count, hits_found, ans;
@@ -654,7 +656,7 @@ match(const struct sk_buff *skb,
  */
 static int
 checkentry(const char *tablename,
-           const struct ipt_ip *ip,
+           const void *ip,
            void *matchinfo,
            unsigned int matchsize,
            unsigned int hook_mask)
diff --git a/net/ipv4/netfilter/ipt_sctp.c b/net/ipv4/netfilter/ipt_sctp.c
deleted file mode 100644
index fe2b327bcaa4..000000000000
--- a/net/ipv4/netfilter/ipt_sctp.c
+++ /dev/null
@@ -1,203 +0,0 @@
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <net/ip.h>
-#include <linux/sctp.h>
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_sctp.h>
-
-#ifdef DEBUG_SCTP
-#define duprintf(format, args...) printk(format , ## args)
-#else
-#define duprintf(format, args...)
-#endif
-
-#define SCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
-					      || (!!((invflag) & (option)) ^ (cond)))
-
-static int
-match_flags(const struct ipt_sctp_flag_info *flag_info,
-	    const int flag_count,
-	    u_int8_t chunktype,
-	    u_int8_t chunkflags)
-{
-	int i;
-
-	for (i = 0; i < flag_count; i++) {
-		if (flag_info[i].chunktype == chunktype) {
-			return (chunkflags & flag_info[i].flag_mask) == flag_info[i].flag;
-		}
-	}
-
-	return 1;
-}
-
-static int
-match_packet(const struct sk_buff *skb,
-	     const u_int32_t *chunkmap,
-	     int chunk_match_type,
-	     const struct ipt_sctp_flag_info *flag_info,
-	     const int flag_count,
-	     int *hotdrop)
-{
-	int offset;
-	u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)];
-	sctp_chunkhdr_t _sch, *sch;
-
-#ifdef DEBUG_SCTP
-	int i = 0;
-#endif
-
-	if (chunk_match_type == SCTP_CHUNK_MATCH_ALL) {
-		SCTP_CHUNKMAP_COPY(chunkmapcopy, chunkmap);
-	}
-
-	offset = skb->nh.iph->ihl * 4 + sizeof (sctp_sctphdr_t);
-	do {
-		sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch);
-		if (sch == NULL) {
-			duprintf("Dropping invalid SCTP packet.\n");
-			*hotdrop = 1;
-			return 0;
-        	}
-
-		duprintf("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d\tflags: %x\n", 
-				++i, offset, sch->type, htons(sch->length), sch->flags);
-
-		offset += (htons(sch->length) + 3) & ~3;
-
-		duprintf("skb->len: %d\toffset: %d\n", skb->len, offset);
-
-		if (SCTP_CHUNKMAP_IS_SET(chunkmap, sch->type)) {
-			switch (chunk_match_type) {
-			case SCTP_CHUNK_MATCH_ANY:
-				if (match_flags(flag_info, flag_count, 
-					sch->type, sch->flags)) {
-					return 1;
-				}
-				break;
-
-			case SCTP_CHUNK_MATCH_ALL:
-				if (match_flags(flag_info, flag_count, 
-					sch->type, sch->flags)) {
-					SCTP_CHUNKMAP_CLEAR(chunkmapcopy, sch->type);
-				}
-				break;
-
-			case SCTP_CHUNK_MATCH_ONLY:
-				if (!match_flags(flag_info, flag_count, 
-					sch->type, sch->flags)) {
-					return 0;
-				}
-				break;
-			}
-		} else {
-			switch (chunk_match_type) {
-			case SCTP_CHUNK_MATCH_ONLY:
-				return 0;
-			}
-		}
-	} while (offset < skb->len);
-
-	switch (chunk_match_type) {
-	case SCTP_CHUNK_MATCH_ALL:
-		return SCTP_CHUNKMAP_IS_CLEAR(chunkmap);
-	case SCTP_CHUNK_MATCH_ANY:
-		return 0;
-	case SCTP_CHUNK_MATCH_ONLY:
-		return 1;
-	}
-
-	/* This will never be reached, but required to stop compiler whine */
-	return 0;
-}
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      int *hotdrop)
-{
-	const struct ipt_sctp_info *info;
-	sctp_sctphdr_t _sh, *sh;
-
-	info = (const struct ipt_sctp_info *)matchinfo;
-
-	if (offset) {
-		duprintf("Dropping non-first fragment.. FIXME\n");
-		return 0;
-	}
-	
-	sh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_sh), &_sh);
-	if (sh == NULL) {
-		duprintf("Dropping evil TCP offset=0 tinygram.\n");
-		*hotdrop = 1;
-		return 0;
-       	}
-	duprintf("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest));
-
-	return  SCCHECK(((ntohs(sh->source) >= info->spts[0]) 
-			&& (ntohs(sh->source) <= info->spts[1])), 
-		   	IPT_SCTP_SRC_PORTS, info->flags, info->invflags)
-		&& SCCHECK(((ntohs(sh->dest) >= info->dpts[0]) 
-			&& (ntohs(sh->dest) <= info->dpts[1])), 
-			IPT_SCTP_DEST_PORTS, info->flags, info->invflags)
-		&& SCCHECK(match_packet(skb, info->chunkmap, info->chunk_match_type,
- 					info->flag_info, info->flag_count, 
-					hotdrop),
-			   IPT_SCTP_CHUNK_TYPES, info->flags, info->invflags);
-}
-
-static int
-checkentry(const char *tablename,
-	   const struct ipt_ip *ip,
-	   void *matchinfo,
-	   unsigned int matchsize,
-	   unsigned int hook_mask)
-{
-	const struct ipt_sctp_info *info;
-
-	info = (const struct ipt_sctp_info *)matchinfo;
-
-	return ip->proto == IPPROTO_SCTP
-		&& !(ip->invflags & IPT_INV_PROTO)
-		&& matchsize == IPT_ALIGN(sizeof(struct ipt_sctp_info))
-		&& !(info->flags & ~IPT_SCTP_VALID_FLAGS)
-		&& !(info->invflags & ~IPT_SCTP_VALID_FLAGS)
-		&& !(info->invflags & ~info->flags)
-		&& ((!(info->flags & IPT_SCTP_CHUNK_TYPES)) || 
-			(info->chunk_match_type &
-				(SCTP_CHUNK_MATCH_ALL 
-				| SCTP_CHUNK_MATCH_ANY
-				| SCTP_CHUNK_MATCH_ONLY)));
-}
-
-static struct ipt_match sctp_match = 
-{ 
-	.list = { NULL, NULL},
-	.name = "sctp",
-	.match = &match,
-	.checkentry = &checkentry,
-	.destroy = NULL,
-	.me = THIS_MODULE
-};
-
-static int __init init(void)
-{
-	return ipt_register_match(&sctp_match);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&sctp_match);
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kiran Kumar Immidi");
-MODULE_DESCRIPTION("Match for SCTP protocol packets");
-
diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c
deleted file mode 100644
index 4d7f16b70cec..000000000000
--- a/net/ipv4/netfilter/ipt_state.c
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Kernel module to match connection tracking information. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <net/netfilter/nf_conntrack_compat.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_state.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
-MODULE_DESCRIPTION("iptables connection tracking state match module");
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      int *hotdrop)
-{
-	const struct ipt_state_info *sinfo = matchinfo;
-	enum ip_conntrack_info ctinfo;
-	unsigned int statebit;
-
-	if (nf_ct_is_untracked(skb))
-		statebit = IPT_STATE_UNTRACKED;
-	else if (!nf_ct_get_ctinfo(skb, &ctinfo))
-		statebit = IPT_STATE_INVALID;
-	else
-		statebit = IPT_STATE_BIT(ctinfo);
-
-	return (sinfo->statemask & statebit);
-}
-
-static int check(const char *tablename,
-		 const struct ipt_ip *ip,
-		 void *matchinfo,
-		 unsigned int matchsize,
-		 unsigned int hook_mask)
-{
-	if (matchsize != IPT_ALIGN(sizeof(struct ipt_state_info)))
-		return 0;
-
-	return 1;
-}
-
-static struct ipt_match state_match = {
-	.name		= "state",
-	.match		= &match,
-	.checkentry	= &check,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	need_ip_conntrack();
-	return ipt_register_match(&state_match);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&state_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_string.c b/net/ipv4/netfilter/ipt_string.c
deleted file mode 100644
index b5def204d798..000000000000
--- a/net/ipv4/netfilter/ipt_string.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/* String matching match for iptables
- * 
- * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_string.h>
-#include <linux/textsearch.h>
-
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>");
-MODULE_DESCRIPTION("IP tables string match module");
-MODULE_LICENSE("GPL");
-
-static int match(const struct sk_buff *skb,
-		 const struct net_device *in,
-		 const struct net_device *out,
-		 const void *matchinfo,
-		 int offset,
-		 int *hotdrop)
-{
-	struct ts_state state;
-	struct ipt_string_info *conf = (struct ipt_string_info *) matchinfo;
-
-	memset(&state, 0, sizeof(struct ts_state));
-
-	return (skb_find_text((struct sk_buff *)skb, conf->from_offset, 
-			     conf->to_offset, conf->config, &state) 
-			     != UINT_MAX) && !conf->invert;
-}
-
-#define STRING_TEXT_PRIV(m) ((struct ipt_string_info *) m)
-
-static int checkentry(const char *tablename,
-		      const struct ipt_ip *ip,
-		      void *matchinfo,
-		      unsigned int matchsize,
-		      unsigned int hook_mask)
-{
-	struct ipt_string_info *conf = matchinfo;
-	struct ts_config *ts_conf;
-
-	if (matchsize != IPT_ALIGN(sizeof(struct ipt_string_info)))
-		return 0;
-
-	/* Damn, can't handle this case properly with iptables... */
-	if (conf->from_offset > conf->to_offset)
-		return 0;
-
-	ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen,
-				     GFP_KERNEL, TS_AUTOLOAD);
-	if (IS_ERR(ts_conf))
-		return 0;
-
-	conf->config = ts_conf;
-
-	return 1;
-}
-
-static void destroy(void *matchinfo, unsigned int matchsize)
-{
-	textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config);
-}
-
-static struct ipt_match string_match = {
-	.name 		= "string",
-	.match 		= match,
-	.checkentry	= checkentry,
-	.destroy 	= destroy,
-	.me 		= THIS_MODULE
-};
-
-static int __init init(void)
-{
-	return ipt_register_match(&string_match);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&string_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_tcpmss.c b/net/ipv4/netfilter/ipt_tcpmss.c
deleted file mode 100644
index 4dc9b16ab4a3..000000000000
--- a/net/ipv4/netfilter/ipt_tcpmss.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Kernel module to match TCP MSS values. */
-
-/* Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <net/tcp.h>
-
-#include <linux/netfilter_ipv4/ipt_tcpmss.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-
-#define TH_SYN 0x02
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
-MODULE_DESCRIPTION("iptables TCP MSS match module");
-
-/* Returns 1 if the mss option is set and matched by the range, 0 otherwise */
-static inline int
-mssoption_match(u_int16_t min, u_int16_t max,
-		const struct sk_buff *skb,
-		int invert,
-		int *hotdrop)
-{
-	struct tcphdr _tcph, *th;
-	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
-	u8 _opt[15 * 4 - sizeof(_tcph)], *op;
-	unsigned int i, optlen;
-
-	/* If we don't have the whole header, drop packet. */
-	th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
-				sizeof(_tcph), &_tcph);
-	if (th == NULL)
-		goto dropit;
-
-	/* Malformed. */
-	if (th->doff*4 < sizeof(*th))
-		goto dropit;
-
-	optlen = th->doff*4 - sizeof(*th);
-	if (!optlen)
-		goto out;
-
-	/* Truncated options. */
-	op = skb_header_pointer(skb, skb->nh.iph->ihl * 4 + sizeof(*th),
-				optlen, _opt);
-	if (op == NULL)
-		goto dropit;
-
-	for (i = 0; i < optlen; ) {
-		if (op[i] == TCPOPT_MSS
-		    && (optlen - i) >= TCPOLEN_MSS
-		    && op[i+1] == TCPOLEN_MSS) {
-			u_int16_t mssval;
-
-			mssval = (op[i+2] << 8) | op[i+3];
-			
-			return (mssval >= min && mssval <= max) ^ invert;
-		}
-		if (op[i] < 2) i++;
-		else i += op[i+1]?:1;
-	}
-out:
-	return invert;
-
- dropit:
-	*hotdrop = 1;
-	return 0;
-}
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      int *hotdrop)
-{
-	const struct ipt_tcpmss_match_info *info = matchinfo;
-
-	return mssoption_match(info->mss_min, info->mss_max, skb,
-			       info->invert, hotdrop);
-}
-
-static int
-checkentry(const char *tablename,
-           const struct ipt_ip *ip,
-           void *matchinfo,
-           unsigned int matchsize,
-           unsigned int hook_mask)
-{
-	if (matchsize != IPT_ALIGN(sizeof(struct ipt_tcpmss_match_info)))
-		return 0;
-
-	/* Must specify -p tcp */
-	if (ip->proto != IPPROTO_TCP || (ip->invflags & IPT_INV_PROTO)) {
-		printk("tcpmss: Only works on TCP packets\n");
-		return 0;
-	}
-
-	return 1;
-}
-
-static struct ipt_match tcpmss_match = {
-	.name		= "tcpmss",
-	.match		= &match,
-	.checkentry	= &checkentry,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	return ipt_register_match(&tcpmss_match);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&tcpmss_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c
index 086a1bb61e3e..9ab765e126f2 100644
--- a/net/ipv4/netfilter/ipt_tos.c
+++ b/net/ipv4/netfilter/ipt_tos.c
@@ -23,6 +23,7 @@ match(const struct sk_buff *skb,
       const struct net_device *out,
       const void *matchinfo,
       int offset,
+      unsigned int protoff,
       int *hotdrop)
 {
 	const struct ipt_tos_info *info = matchinfo;
@@ -32,7 +33,7 @@ match(const struct sk_buff *skb,
 
 static int
 checkentry(const char *tablename,
-           const struct ipt_ip *ip,
+           const void *ip,
            void *matchinfo,
            unsigned int matchsize,
            unsigned int hook_mask)
diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c
index 219aa9de88cc..82da53f430ab 100644
--- a/net/ipv4/netfilter/ipt_ttl.c
+++ b/net/ipv4/netfilter/ipt_ttl.c
@@ -21,7 +21,7 @@ MODULE_LICENSE("GPL");
 
 static int match(const struct sk_buff *skb, const struct net_device *in,
 		 const struct net_device *out, const void *matchinfo,
-		 int offset, int *hotdrop)
+		 int offset, unsigned int protoff, int *hotdrop)
 {
 	const struct ipt_ttl_info *info = matchinfo;
 
@@ -47,7 +47,7 @@ static int match(const struct sk_buff *skb, const struct net_device *in,
 	return 0;
 }
 
-static int checkentry(const char *tablename, const struct ipt_ip *ip,
+static int checkentry(const char *tablename, const void  *ip,
 		      void *matchinfo, unsigned int matchsize,
 		      unsigned int hook_mask)
 {
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 260a4f0a2a90..212a3079085b 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -78,7 +78,8 @@ static struct ipt_table packet_filter = {
 	.name		= "filter",
 	.valid_hooks	= FILTER_VALID_HOOKS,
 	.lock		= RW_LOCK_UNLOCKED,
-	.me		= THIS_MODULE
+	.me		= THIS_MODULE,
+	.af		= AF_INET,
 };
 
 /* The work comes in here from netfilter.c. */
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 160eb11b6e2f..3212a5cc4b6b 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -109,6 +109,7 @@ static struct ipt_table packet_mangler = {
 	.valid_hooks	= MANGLE_VALID_HOOKS,
 	.lock		= RW_LOCK_UNLOCKED,
 	.me		= THIS_MODULE,
+	.af		= AF_INET,
 };
 
 /* The work comes in here from netfilter.c. */
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 47449ba83eb9..fdb9e9c81e81 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -83,7 +83,8 @@ static struct ipt_table packet_raw = {
 	.name = "raw", 
 	.valid_hooks =  RAW_VALID_HOOKS, 
 	.lock = RW_LOCK_UNLOCKED, 
-	.me = THIS_MODULE
+	.me = THIS_MODULE,
+	.af = AF_INET,
 };
 
 /* The work comes in here from netfilter.c. */
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 0c56c52a3831..167619f638c6 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -575,7 +575,7 @@ MODULE_LICENSE("GPL");
 
 static int __init init(void)
 {
-	need_nf_conntrack();
+	need_conntrack();
 	return init_or_cleanup(1);
 }
 
@@ -587,9 +587,4 @@ static void __exit fini(void)
 module_init(init);
 module_exit(fini);
 
-void need_ip_conntrack(void)
-{
-}
-
-EXPORT_SYMBOL(need_ip_conntrack);
 EXPORT_SYMBOL(nf_ct_ipv4_gather_frags);
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 105dd69ee9fb..2d6f8ecbc27b 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -41,6 +41,7 @@ config IP6_NF_QUEUE
 
 config IP6_NF_IPTABLES
 	tristate "IP6 tables support (required for filtering/masq/NAT)"
+	depends on NETFILTER_XTABLES
 	help
 	  ip6tables is a general, extensible packet identification framework.
 	  Currently only the packet filtering and packet mangling subsystem
@@ -50,25 +51,6 @@ config IP6_NF_IPTABLES
 	  To compile it as a module, choose M here.  If unsure, say N.
 
 # The simple matches.
-config IP6_NF_MATCH_LIMIT
-	tristate "limit match support"
-	depends on IP6_NF_IPTABLES
-	help
-	  limit matching allows you to control the rate at which a rule can be
-	  matched: mainly useful in combination with the LOG target ("LOG
-	  target support", below) and to avoid some Denial of Service attacks.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
-config IP6_NF_MATCH_MAC
-	tristate "MAC address match support"
-	depends on IP6_NF_IPTABLES
-	help
-	  mac matching allows you to match packets based on the source
-	  Ethernet address of the packet.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config IP6_NF_MATCH_RT
 	tristate "Routing header match support"
 	depends on IP6_NF_IPTABLES
@@ -124,16 +106,6 @@ config IP6_NF_MATCH_OWNER
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP6_NF_MATCH_MARK
-	tristate "netfilter MARK match support"
-	depends on IP6_NF_IPTABLES
-	help
-	  Netfilter mark matching allows you to match packets based on the
-	  `nfmark' value in the packet.  This can be set by the MARK target
-	  (see below).
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config IP6_NF_MATCH_IPV6HEADER
 	tristate "IPv6 Extension Headers Match"
 	depends on IP6_NF_IPTABLES
@@ -151,15 +123,6 @@ config IP6_NF_MATCH_AHESP
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP6_NF_MATCH_LENGTH
-	tristate "Packet Length match support"
-	depends on IP6_NF_IPTABLES
-	help
-	  This option allows you to match the length of a packet against a
-	  specific value or range of values.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config IP6_NF_MATCH_EUI64
 	tristate "EUI64 address check"
 	depends on IP6_NF_IPTABLES
@@ -170,15 +133,6 @@ config IP6_NF_MATCH_EUI64
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP6_NF_MATCH_PHYSDEV
-	tristate "Physdev match support"
-	depends on IP6_NF_IPTABLES && BRIDGE_NETFILTER
-	help
-	  Physdev packet matching matches against the physical bridge ports
-	  the IP packet arrived on or will leave by.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config IP6_NF_MATCH_POLICY
 	tristate "IPsec policy match support"
 	depends on IP6_NF_IPTABLES && XFRM
@@ -219,17 +173,6 @@ config IP6_NF_TARGET_REJECT
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP6_NF_TARGET_NFQUEUE
-	tristate "NFQUEUE Target Support"
-	depends on IP6_NF_IPTABLES
-	help
-	  This Target replaced the old obsolete QUEUE target.
-
-	  As opposed to QUEUE, it supports 65535 different queues,
-	  not just one.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config IP6_NF_MANGLE
 	tristate "Packet mangling"
 	depends on IP6_NF_IPTABLES
@@ -240,19 +183,6 @@ config IP6_NF_MANGLE
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP6_NF_TARGET_MARK
-	tristate "MARK target support"
-	depends on IP6_NF_MANGLE
-	help
-	  This option adds a `MARK' target, which allows you to create rules
-	  in the `mangle' table which alter the netfilter mark (nfmark) field
-	  associated with the packet packet prior to routing. This can change
-	  the routing method (see `Use netfilter MARK value as routing
-	  key') and can also be used by other subsystems to change their
-	  behavior.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config IP6_NF_TARGET_HL
 	tristate  'HL (hoplimit) target support'
 	depends on IP6_NF_MANGLE
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index c0c809b426e8..663b4749820d 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -4,10 +4,7 @@
 
 # Link order matters here.
 obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o
-obj-$(CONFIG_IP6_NF_MATCH_LIMIT) += ip6t_limit.o
-obj-$(CONFIG_IP6_NF_MATCH_MARK) += ip6t_mark.o
 obj-$(CONFIG_IP6_NF_MATCH_LENGTH) += ip6t_length.o
-obj-$(CONFIG_IP6_NF_MATCH_MAC) += ip6t_mac.o
 obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o
 obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o ip6t_dst.o
 obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o
@@ -17,12 +14,9 @@ obj-$(CONFIG_IP6_NF_MATCH_POLICY) += ip6t_policy.o
 obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o
 obj-$(CONFIG_IP6_NF_MATCH_MULTIPORT) += ip6t_multiport.o
 obj-$(CONFIG_IP6_NF_MATCH_OWNER) += ip6t_owner.o
-obj-$(CONFIG_IP6_NF_MATCH_PHYSDEV) += ip6t_physdev.o
 obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o
 obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o
-obj-$(CONFIG_IP6_NF_TARGET_MARK) += ip6t_MARK.o
 obj-$(CONFIG_IP6_NF_TARGET_HL) += ip6t_HL.o
-obj-$(CONFIG_IP6_NF_TARGET_NFQUEUE) += ip6t_NFQUEUE.o
 obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o
 obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o
 obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 1390370186d9..847068fd3367 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -13,6 +13,9 @@
  * 	  a table
  * 06 Jun 2002 Andras Kis-Szabo <kisza@sch.bme.hu>
  *      - new extension header parser code
+ * 15 Oct 2005 Harald Welte <laforge@netfilter.org>
+ * 	- Unification of {ip,ip6}_tables into x_tables
+ * 	- Removed tcp and udp code, since it's not ipv6 specific
  */
 
 #include <linux/capability.h>
@@ -23,8 +26,6 @@
 #include <linux/vmalloc.h>
 #include <linux/netdevice.h>
 #include <linux/module.h>
-#include <linux/tcp.h>
-#include <linux/udp.h>
 #include <linux/icmpv6.h>
 #include <net/ipv6.h>
 #include <asm/uaccess.h>
@@ -33,6 +34,7 @@
 #include <linux/cpumask.h>
 
 #include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter/x_tables.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -67,13 +69,8 @@ do {								\
 #else
 #define IP_NF_ASSERT(x)
 #endif
-#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
 
-static DECLARE_MUTEX(ip6t_mutex);
 
-/* Must have mutex */
-#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0)
-#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0)
 #include <linux/netfilter_ipv4/listhelp.h>
 
 #if 0
@@ -91,30 +88,6 @@ static DECLARE_MUTEX(ip6t_mutex);
 
    Hence the start of any table is given by get_table() below.  */
 
-/* The table itself */
-struct ip6t_table_info
-{
-	/* Size per table */
-	unsigned int size;
-	/* Number of entries: FIXME. --RR */
-	unsigned int number;
-	/* Initial number of entries. Needed for module usage count */
-	unsigned int initial_entries;
-
-	/* Entry points and underflows */
-	unsigned int hook_entry[NF_IP6_NUMHOOKS];
-	unsigned int underflow[NF_IP6_NUMHOOKS];
-
-	/* ip6t_entry tables: one per CPU */
-	void *entries[NR_CPUS];
-};
-
-static LIST_HEAD(ip6t_target);
-static LIST_HEAD(ip6t_match);
-static LIST_HEAD(ip6t_tables);
-#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
-#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
-
 #if 0
 #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
 #define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
@@ -297,7 +270,7 @@ ip6t_do_table(struct sk_buff **pskb,
 	      unsigned int hook,
 	      const struct net_device *in,
 	      const struct net_device *out,
-	      struct ip6t_table *table,
+	      struct xt_table *table,
 	      void *userdata)
 {
 	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
@@ -309,6 +282,7 @@ ip6t_do_table(struct sk_buff **pskb,
 	const char *indev, *outdev;
 	void *table_base;
 	struct ip6t_entry *e, *back;
+	struct xt_table_info *private;
 
 	/* Initialization */
 	indev = in ? in->name : nulldevname;
@@ -321,9 +295,10 @@ ip6t_do_table(struct sk_buff **pskb,
 	 * match it. */
 
 	read_lock_bh(&table->lock);
+	private = table->private;
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	table_base = (void *)table->private->entries[smp_processor_id()];
-	e = get_entry(table_base, table->private->hook_entry[hook]);
+	table_base = (void *)private->entries[smp_processor_id()];
+	e = get_entry(table_base, private->hook_entry[hook]);
 
 #ifdef CONFIG_NETFILTER_DEBUG
 	/* Check noone else using our table */
@@ -339,7 +314,7 @@ ip6t_do_table(struct sk_buff **pskb,
 #endif
 
 	/* For return from builtin chain */
-	back = get_entry(table_base, table->private->underflow[hook]);
+	back = get_entry(table_base, private->underflow[hook]);
 
 	do {
 		IP_NF_ASSERT(e);
@@ -439,145 +414,6 @@ ip6t_do_table(struct sk_buff **pskb,
 #endif
 }
 
-/*
- * These are weird, but module loading must not be done with mutex
- * held (since they will register), and we have to have a single
- * function to use try_then_request_module().
- */
-
-/* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
-static inline struct ip6t_table *find_table_lock(const char *name)
-{
-	struct ip6t_table *t;
-
-	if (down_interruptible(&ip6t_mutex) != 0)
-		return ERR_PTR(-EINTR);
-
-	list_for_each_entry(t, &ip6t_tables, list)
-		if (strcmp(t->name, name) == 0 && try_module_get(t->me))
-			return t;
-	up(&ip6t_mutex);
-	return NULL;
-}
-
-/* Find match, grabs ref.  Returns ERR_PTR() on error. */
-static inline struct ip6t_match *find_match(const char *name, u8 revision)
-{
-	struct ip6t_match *m;
-	int err = 0;
-
-	if (down_interruptible(&ip6t_mutex) != 0)
-		return ERR_PTR(-EINTR);
-
-	list_for_each_entry(m, &ip6t_match, list) {
-		if (strcmp(m->name, name) == 0) {
-			if (m->revision == revision) {
-				if (try_module_get(m->me)) {
-					up(&ip6t_mutex);
-					return m;
-				}
-			} else
-				err = -EPROTOTYPE; /* Found something. */
-		}
-	}
-	up(&ip6t_mutex);
-	return ERR_PTR(err);
-}
-
-/* Find target, grabs ref.  Returns ERR_PTR() on error. */
-static inline struct ip6t_target *find_target(const char *name, u8 revision)
-{
-	struct ip6t_target *t;
-	int err = 0;
-
-	if (down_interruptible(&ip6t_mutex) != 0)
-		return ERR_PTR(-EINTR);
-
-	list_for_each_entry(t, &ip6t_target, list) {
-		if (strcmp(t->name, name) == 0) {
-			if (t->revision == revision) {
-				if (try_module_get(t->me)) {
-					up(&ip6t_mutex);
-					return t;
-				}
-			} else
-				err = -EPROTOTYPE; /* Found something. */
-		}
-	}
-	up(&ip6t_mutex);
-	return ERR_PTR(err);
-}
-
-struct ip6t_target *ip6t_find_target(const char *name, u8 revision)
-{
-	struct ip6t_target *target;
-
-	target = try_then_request_module(find_target(name, revision),
-					 "ip6t_%s", name);
-	if (IS_ERR(target) || !target)
-		return NULL;
-	return target;
-}
-
-static int match_revfn(const char *name, u8 revision, int *bestp)
-{
-	struct ip6t_match *m;
-	int have_rev = 0;
-
-	list_for_each_entry(m, &ip6t_match, list) {
-		if (strcmp(m->name, name) == 0) {
-			if (m->revision > *bestp)
-				*bestp = m->revision;
-			if (m->revision == revision)
-				have_rev = 1;
-		}
-	}
-	return have_rev;
-}
-
-static int target_revfn(const char *name, u8 revision, int *bestp)
-{
-	struct ip6t_target *t;
-	int have_rev = 0;
-
-	list_for_each_entry(t, &ip6t_target, list) {
-		if (strcmp(t->name, name) == 0) {
-			if (t->revision > *bestp)
-				*bestp = t->revision;
-			if (t->revision == revision)
-				have_rev = 1;
-		}
-	}
-	return have_rev;
-}
-
-/* Returns true or fals (if no such extension at all) */
-static inline int find_revision(const char *name, u8 revision,
-				int (*revfn)(const char *, u8, int *),
-				int *err)
-{
-	int have_rev, best = -1;
-
-	if (down_interruptible(&ip6t_mutex) != 0) {
-		*err = -EINTR;
-		return 1;
-	}
-	have_rev = revfn(name, revision, &best);
-	up(&ip6t_mutex);
-
-	/* Nothing at all?  Return 0 to try loading module. */
-	if (best == -1) {
-		*err = -ENOENT;
-		return 0;
-	}
-
-	*err = best;
-	if (!have_rev)
-		*err = -EPROTONOSUPPORT;
-	return 1;
-}
-
-
 /* All zeroes == unconditional rule. */
 static inline int
 unconditional(const struct ip6t_ip6 *ipv6)
@@ -594,7 +430,7 @@ unconditional(const struct ip6t_ip6 *ipv6)
 /* Figures out from what hook each rule can be called: returns 0 if
    there are loops.  Puts hook bitmask in comefrom. */
 static int
-mark_source_chains(struct ip6t_table_info *newinfo,
+mark_source_chains(struct xt_table_info *newinfo,
 		   unsigned int valid_hooks, void *entry0)
 {
 	unsigned int hook;
@@ -740,11 +576,11 @@ check_match(struct ip6t_entry_match *m,
 {
 	struct ip6t_match *match;
 
-	match = try_then_request_module(find_match(m->u.user.name,
-						   m->u.user.revision),
+	match = try_then_request_module(xt_find_match(AF_INET6, m->u.user.name,
+			      		m->u.user.revision),
 					"ip6t_%s", m->u.user.name);
 	if (IS_ERR(match) || !match) {
-		duprintf("check_match: `%s' not found\n", m->u.user.name);
+	  	duprintf("check_match: `%s' not found\n", m->u.user.name);
 		return match ? PTR_ERR(match) : -ENOENT;
 	}
 	m->u.kernel.match = match;
@@ -785,8 +621,9 @@ check_entry(struct ip6t_entry *e, const char *name, unsigned int size,
 		goto cleanup_matches;
 
 	t = ip6t_get_target(e);
-	target = try_then_request_module(find_target(t->u.user.name,
-						     t->u.user.revision),
+	target = try_then_request_module(xt_find_target(AF_INET6,
+							t->u.user.name,
+							t->u.user.revision),
 					 "ip6t_%s", t->u.user.name);
 	if (IS_ERR(target) || !target) {
 		duprintf("check_entry: `%s' not found\n", t->u.user.name);
@@ -822,7 +659,7 @@ check_entry(struct ip6t_entry *e, const char *name, unsigned int size,
 
 static inline int
 check_entry_size_and_hooks(struct ip6t_entry *e,
-			   struct ip6t_table_info *newinfo,
+			   struct xt_table_info *newinfo,
 			   unsigned char *base,
 			   unsigned char *limit,
 			   const unsigned int *hook_entries,
@@ -856,7 +693,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
            < 0 (not IP6T_RETURN). --RR */
 
 	/* Clear counters and comefrom */
-	e->counters = ((struct ip6t_counters) { 0, 0 });
+	e->counters = ((struct xt_counters) { 0, 0 });
 	e->comefrom = 0;
 
 	(*i)++;
@@ -886,7 +723,7 @@ cleanup_entry(struct ip6t_entry *e, unsigned int *i)
 static int
 translate_table(const char *name,
 		unsigned int valid_hooks,
-		struct ip6t_table_info *newinfo,
+		struct xt_table_info *newinfo,
 		void *entry0,
 		unsigned int size,
 		unsigned int number,
@@ -963,48 +800,10 @@ translate_table(const char *name,
 	return ret;
 }
 
-static struct ip6t_table_info *
-replace_table(struct ip6t_table *table,
-	      unsigned int num_counters,
-	      struct ip6t_table_info *newinfo,
-	      int *error)
-{
-	struct ip6t_table_info *oldinfo;
-
-#ifdef CONFIG_NETFILTER_DEBUG
-	{
-		int cpu;
-
-		for_each_cpu(cpu) {
-			struct ip6t_entry *table_base = newinfo->entries[cpu];
-			if (table_base)
-				table_base->comefrom = 0xdead57ac;
-		}
-	}
-#endif
-
-	/* Do the substitution. */
-	write_lock_bh(&table->lock);
-	/* Check inside lock: is the old number correct? */
-	if (num_counters != table->private->number) {
-		duprintf("num_counters != table->private->number (%u/%u)\n",
-			 num_counters, table->private->number);
-		write_unlock_bh(&table->lock);
-		*error = -EAGAIN;
-		return NULL;
-	}
-	oldinfo = table->private;
-	table->private = newinfo;
-	newinfo->initial_entries = oldinfo->initial_entries;
-	write_unlock_bh(&table->lock);
-
-	return oldinfo;
-}
-
 /* Gets counters. */
 static inline int
 add_entry_to_counter(const struct ip6t_entry *e,
-		     struct ip6t_counters total[],
+		     struct xt_counters total[],
 		     unsigned int *i)
 {
 	ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
@@ -1025,8 +824,8 @@ set_entry_to_counter(const struct ip6t_entry *e,
 }
 
 static void
-get_counters(const struct ip6t_table_info *t,
-	     struct ip6t_counters counters[])
+get_counters(const struct xt_table_info *t,
+	     struct xt_counters counters[])
 {
 	unsigned int cpu;
 	unsigned int i;
@@ -1060,19 +859,20 @@ get_counters(const struct ip6t_table_info *t,
 
 static int
 copy_entries_to_user(unsigned int total_size,
-		     struct ip6t_table *table,
+		     struct xt_table *table,
 		     void __user *userptr)
 {
 	unsigned int off, num, countersize;
 	struct ip6t_entry *e;
-	struct ip6t_counters *counters;
+	struct xt_counters *counters;
+	struct xt_table_info *private = table->private;
 	int ret = 0;
 	void *loc_cpu_entry;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
 	   about). */
-	countersize = sizeof(struct ip6t_counters) * table->private->number;
+	countersize = sizeof(struct xt_counters) * private->number;
 	counters = vmalloc(countersize);
 
 	if (counters == NULL)
@@ -1080,11 +880,11 @@ copy_entries_to_user(unsigned int total_size,
 
 	/* First, sum counters... */
 	write_lock_bh(&table->lock);
-	get_counters(table->private, counters);
+	get_counters(private, counters);
 	write_unlock_bh(&table->lock);
 
 	/* choose the copy that is on ourc node/cpu */
-	loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries[raw_smp_processor_id()];
 	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
 		ret = -EFAULT;
 		goto free_counters;
@@ -1143,87 +943,42 @@ get_entries(const struct ip6t_get_entries *entries,
 	    struct ip6t_get_entries __user *uptr)
 {
 	int ret;
-	struct ip6t_table *t;
+	struct xt_table *t;
 
-	t = find_table_lock(entries->name);
+	t = xt_find_table_lock(AF_INET6, entries->name);
 	if (t && !IS_ERR(t)) {
-		duprintf("t->private->number = %u\n",
-			 t->private->number);
-		if (entries->size == t->private->size)
-			ret = copy_entries_to_user(t->private->size,
+		struct xt_table_info *private = t->private;
+		duprintf("t->private->number = %u\n", private->number);
+		if (entries->size == private->size)
+			ret = copy_entries_to_user(private->size,
 						   t, uptr->entrytable);
 		else {
 			duprintf("get_entries: I've got %u not %u!\n",
-				 t->private->size,
-				 entries->size);
+				 private->size, entries->size);
 			ret = -EINVAL;
 		}
 		module_put(t->me);
-		up(&ip6t_mutex);
+		xt_table_unlock(t);
 	} else
 		ret = t ? PTR_ERR(t) : -ENOENT;
 
 	return ret;
 }
 
-static void free_table_info(struct ip6t_table_info *info)
-{
-	int cpu;
-	for_each_cpu(cpu) {
-		if (info->size <= PAGE_SIZE)
-			kfree(info->entries[cpu]);
-		else
-			vfree(info->entries[cpu]);
-	}
-	kfree(info);
-}
-
-static struct ip6t_table_info *alloc_table_info(unsigned int size)
-{
-	struct ip6t_table_info *newinfo;
-	int cpu;
-
-	newinfo = kzalloc(sizeof(struct ip6t_table_info), GFP_KERNEL);
-	if (!newinfo)
-		return NULL;
-
-	newinfo->size = size;
-
-	for_each_cpu(cpu) {
-		if (size <= PAGE_SIZE)
-			newinfo->entries[cpu] = kmalloc_node(size,
-							GFP_KERNEL,
-							cpu_to_node(cpu));
-		else
-			newinfo->entries[cpu] = vmalloc_node(size,
-							     cpu_to_node(cpu));
-		if (newinfo->entries[cpu] == NULL) {
-			free_table_info(newinfo);
-			return NULL;
-		}
-	}
-
-	return newinfo;
-}
-
 static int
 do_replace(void __user *user, unsigned int len)
 {
 	int ret;
 	struct ip6t_replace tmp;
-	struct ip6t_table *t;
-	struct ip6t_table_info *newinfo, *oldinfo;
-	struct ip6t_counters *counters;
+	struct xt_table *t;
+	struct xt_table_info *newinfo, *oldinfo;
+	struct xt_counters *counters;
 	void *loc_cpu_entry, *loc_cpu_old_entry;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
 		return -EFAULT;
 
-	/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
-	if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
-		return -ENOMEM;
-
-	newinfo = alloc_table_info(tmp.size);
+	newinfo = xt_alloc_table_info(tmp.size);
 	if (!newinfo)
 		return -ENOMEM;
 
@@ -1235,7 +990,7 @@ do_replace(void __user *user, unsigned int len)
 		goto free_newinfo;
 	}
 
-	counters = vmalloc(tmp.num_counters * sizeof(struct ip6t_counters));
+	counters = vmalloc(tmp.num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto free_newinfo;
@@ -1249,7 +1004,7 @@ do_replace(void __user *user, unsigned int len)
 
 	duprintf("ip_tables: Translated table\n");
 
-	t = try_then_request_module(find_table_lock(tmp.name),
+	t = try_then_request_module(xt_find_table_lock(AF_INET6, tmp.name),
 				    "ip6table_%s", tmp.name);
 	if (!t || IS_ERR(t)) {
 		ret = t ? PTR_ERR(t) : -ENOENT;
@@ -1264,7 +1019,7 @@ do_replace(void __user *user, unsigned int len)
 		goto put_module;
 	}
 
-	oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
+	oldinfo = xt_replace_table(t, tmp.num_counters, newinfo, &ret);
 	if (!oldinfo)
 		goto put_module;
 
@@ -1283,23 +1038,23 @@ do_replace(void __user *user, unsigned int len)
 	/* Decrease module usage counts and free resource */
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
 	IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
-	free_table_info(oldinfo);
+	xt_free_table_info(oldinfo);
 	if (copy_to_user(tmp.counters, counters,
-			 sizeof(struct ip6t_counters) * tmp.num_counters) != 0)
+			 sizeof(struct xt_counters) * tmp.num_counters) != 0)
 		ret = -EFAULT;
 	vfree(counters);
-	up(&ip6t_mutex);
+	xt_table_unlock(t);
 	return ret;
 
  put_module:
 	module_put(t->me);
-	up(&ip6t_mutex);
+	xt_table_unlock(t);
  free_newinfo_counters_untrans:
 	IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
  free_newinfo_counters:
 	vfree(counters);
  free_newinfo:
-	free_table_info(newinfo);
+	xt_free_table_info(newinfo);
 	return ret;
 }
 
@@ -1307,7 +1062,7 @@ do_replace(void __user *user, unsigned int len)
  * and everything is OK. */
 static inline int
 add_counter_to_entry(struct ip6t_entry *e,
-		     const struct ip6t_counters addme[],
+		     const struct xt_counters addme[],
 		     unsigned int *i)
 {
 #if 0
@@ -1329,15 +1084,16 @@ static int
 do_add_counters(void __user *user, unsigned int len)
 {
 	unsigned int i;
-	struct ip6t_counters_info tmp, *paddc;
-	struct ip6t_table *t;
+	struct xt_counters_info tmp, *paddc;
+	struct xt_table_info *private;
+	struct xt_table *t;
 	int ret = 0;
 	void *loc_cpu_entry;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
 		return -EFAULT;
 
-	if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ip6t_counters))
+	if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct xt_counters))
 		return -EINVAL;
 
 	paddc = vmalloc(len);
@@ -1349,29 +1105,30 @@ do_add_counters(void __user *user, unsigned int len)
 		goto free;
 	}
 
-	t = find_table_lock(tmp.name);
+	t = xt_find_table_lock(AF_INET6, tmp.name);
 	if (!t || IS_ERR(t)) {
 		ret = t ? PTR_ERR(t) : -ENOENT;
 		goto free;
 	}
 
 	write_lock_bh(&t->lock);
-	if (t->private->number != paddc->num_counters) {
+	private = t->private;
+	if (private->number != paddc->num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = t->private->entries[smp_processor_id()];
+	loc_cpu_entry = private->entries[smp_processor_id()];
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
-			  t->private->size,
+			  private->size,
 			  add_counter_to_entry,
 			  paddc->counters,
 			  &i);
  unlock_up_free:
 	write_unlock_bh(&t->lock);
-	up(&ip6t_mutex);
+	xt_table_unlock(t);
 	module_put(t->me);
  free:
 	vfree(paddc);
@@ -1415,7 +1172,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	switch (cmd) {
 	case IP6T_SO_GET_INFO: {
 		char name[IP6T_TABLE_MAXNAMELEN];
-		struct ip6t_table *t;
+		struct xt_table *t;
 
 		if (*len != sizeof(struct ip6t_getinfo)) {
 			duprintf("length %u != %u\n", *len,
@@ -1430,25 +1187,26 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 		}
 		name[IP6T_TABLE_MAXNAMELEN-1] = '\0';
 
-		t = try_then_request_module(find_table_lock(name),
+		t = try_then_request_module(xt_find_table_lock(AF_INET6, name),
 					    "ip6table_%s", name);
 		if (t && !IS_ERR(t)) {
 			struct ip6t_getinfo info;
+			struct xt_table_info *private = t->private;
 
 			info.valid_hooks = t->valid_hooks;
-			memcpy(info.hook_entry, t->private->hook_entry,
+			memcpy(info.hook_entry, private->hook_entry,
 			       sizeof(info.hook_entry));
-			memcpy(info.underflow, t->private->underflow,
+			memcpy(info.underflow, private->underflow,
 			       sizeof(info.underflow));
-			info.num_entries = t->private->number;
-			info.size = t->private->size;
+			info.num_entries = private->number;
+			info.size = private->size;
 			memcpy(info.name, name, sizeof(info.name));
 
 			if (copy_to_user(user, &info, *len) != 0)
 				ret = -EFAULT;
 			else
 				ret = 0;
-			up(&ip6t_mutex);
+			xt_table_unlock(t);
 			module_put(t->me);
 		} else
 			ret = t ? PTR_ERR(t) : -ENOENT;
@@ -1475,7 +1233,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	case IP6T_SO_GET_REVISION_MATCH:
 	case IP6T_SO_GET_REVISION_TARGET: {
 		struct ip6t_get_revision rev;
-		int (*revfn)(const char *, u8, int *);
+		int target;
 
 		if (*len != sizeof(rev)) {
 			ret = -EINVAL;
@@ -1487,12 +1245,13 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 		}
 
 		if (cmd == IP6T_SO_GET_REVISION_TARGET)
-			revfn = target_revfn;
+			target = 1;
 		else
-			revfn = match_revfn;
+			target = 0;
 
-		try_then_request_module(find_revision(rev.name, rev.revision,
-						      revfn, &ret),
+		try_then_request_module(xt_find_revision(AF_INET6, rev.name,
+							 rev.revision,
+							 target, &ret),
 					"ip6t_%s", rev.name);
 		break;
 	}
@@ -1505,61 +1264,16 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	return ret;
 }
 
-/* Registration hooks for targets. */
-int
-ip6t_register_target(struct ip6t_target *target)
-{
-	int ret;
-
-	ret = down_interruptible(&ip6t_mutex);
-	if (ret != 0)
-		return ret;
-	list_add(&target->list, &ip6t_target);
-	up(&ip6t_mutex);
-	return ret;
-}
-
-void
-ip6t_unregister_target(struct ip6t_target *target)
-{
-	down(&ip6t_mutex);
-	LIST_DELETE(&ip6t_target, target);
-	up(&ip6t_mutex);
-}
-
-int
-ip6t_register_match(struct ip6t_match *match)
-{
-	int ret;
-
-	ret = down_interruptible(&ip6t_mutex);
-	if (ret != 0)
-		return ret;
-
-	list_add(&match->list, &ip6t_match);
-	up(&ip6t_mutex);
-
-	return ret;
-}
-
-void
-ip6t_unregister_match(struct ip6t_match *match)
-{
-	down(&ip6t_mutex);
-	LIST_DELETE(&ip6t_match, match);
-	up(&ip6t_mutex);
-}
-
-int ip6t_register_table(struct ip6t_table *table,
+int ip6t_register_table(struct xt_table *table,
 			const struct ip6t_replace *repl)
 {
 	int ret;
-	struct ip6t_table_info *newinfo;
-	static struct ip6t_table_info bootstrap
+	struct xt_table_info *newinfo;
+	static struct xt_table_info bootstrap
 		= { 0, 0, 0, { 0 }, { 0 }, { } };
 	void *loc_cpu_entry;
 
-	newinfo = alloc_table_info(repl->size);
+	newinfo = xt_alloc_table_info(repl->size);
 	if (!newinfo)
 		return -ENOMEM;
 
@@ -1573,244 +1287,29 @@ int ip6t_register_table(struct ip6t_table *table,
 			      repl->hook_entry,
 			      repl->underflow);
 	if (ret != 0) {
-		free_table_info(newinfo);
+		xt_free_table_info(newinfo);
 		return ret;
 	}
 
-	ret = down_interruptible(&ip6t_mutex);
-	if (ret != 0) {
-		free_table_info(newinfo);
+	if (xt_register_table(table, &bootstrap, newinfo) != 0) {
+		xt_free_table_info(newinfo);
 		return ret;
 	}
 
-	/* Don't autoload: we'd eat our tail... */
-	if (list_named_find(&ip6t_tables, table->name)) {
-		ret = -EEXIST;
-		goto free_unlock;
-	}
-
-	/* Simplifies replace_table code. */
-	table->private = &bootstrap;
-	if (!replace_table(table, 0, newinfo, &ret))
-		goto free_unlock;
-
-	duprintf("table->private->number = %u\n",
-		 table->private->number);
-
-	/* save number of initial entries */
-	table->private->initial_entries = table->private->number;
-
-	rwlock_init(&table->lock);
-	list_prepend(&ip6t_tables, table);
-
- unlock:
-	up(&ip6t_mutex);
-	return ret;
-
- free_unlock:
-	free_table_info(newinfo);
-	goto unlock;
+	return 0;
 }
 
-void ip6t_unregister_table(struct ip6t_table *table)
+void ip6t_unregister_table(struct xt_table *table)
 {
+	struct xt_table_info *private;
 	void *loc_cpu_entry;
 
-	down(&ip6t_mutex);
-	LIST_DELETE(&ip6t_tables, table);
-	up(&ip6t_mutex);
+	private = xt_unregister_table(table);
 
 	/* Decrease module usage counts and free resources */
-	loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
-	IP6T_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
-			  cleanup_entry, NULL);
-	free_table_info(table->private);
-}
-
-/* Returns 1 if the port is matched by the range, 0 otherwise */
-static inline int
-port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert)
-{
-	int ret;
-
-	ret = (port >= min && port <= max) ^ invert;
-	return ret;
-}
-
-static int
-tcp_find_option(u_int8_t option,
-		const struct sk_buff *skb,
-		unsigned int tcpoff,
-		unsigned int optlen,
-		int invert,
-		int *hotdrop)
-{
-	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
-	u_int8_t _opt[60 - sizeof(struct tcphdr)], *op;
-	unsigned int i;
-
-	duprintf("tcp_match: finding option\n");
-	if (!optlen)
-		return invert;
-	/* If we don't have the whole header, drop packet. */
-	op = skb_header_pointer(skb, tcpoff + sizeof(struct tcphdr), optlen,
-				_opt);
-	if (op == NULL) {
-		*hotdrop = 1;
-		return 0;
-	}
-
-	for (i = 0; i < optlen; ) {
-		if (op[i] == option) return !invert;
-		if (op[i] < 2) i++;
-		else i += op[i+1]?:1;
-	}
-
-	return invert;
-}
-
-static int
-tcp_match(const struct sk_buff *skb,
-	  const struct net_device *in,
-	  const struct net_device *out,
-	  const void *matchinfo,
-	  int offset,
-	  unsigned int protoff,
-	  int *hotdrop)
-{
-	struct tcphdr _tcph, *th;
-	const struct ip6t_tcp *tcpinfo = matchinfo;
-
-	if (offset) {
-		/* To quote Alan:
-
-		   Don't allow a fragment of TCP 8 bytes in. Nobody normal
-		   causes this. Its a cracker trying to break in by doing a
-		   flag overwrite to pass the direction checks.
-		*/
-		if (offset == 1) {
-			duprintf("Dropping evil TCP offset=1 frag.\n");
-			*hotdrop = 1;
-		}
-		/* Must not be a fragment. */
-		return 0;
-	}
-
-#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg))
-
-	th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
-	if (th == NULL) {
-		/* We've been asked to examine this packet, and we
-		   can't.  Hence, no choice but to drop. */
-		duprintf("Dropping evil TCP offset=0 tinygram.\n");
-		*hotdrop = 1;
-		return 0;
-	}
-
-	if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1],
-			ntohs(th->source),
-			!!(tcpinfo->invflags & IP6T_TCP_INV_SRCPT)))
-		return 0;
-	if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1],
-			ntohs(th->dest),
-			!!(tcpinfo->invflags & IP6T_TCP_INV_DSTPT)))
-		return 0;
-	if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask)
-		      == tcpinfo->flg_cmp,
-		      IP6T_TCP_INV_FLAGS))
-		return 0;
-	if (tcpinfo->option) {
-		if (th->doff * 4 < sizeof(_tcph)) {
-			*hotdrop = 1;
-			return 0;
-		}
-		if (!tcp_find_option(tcpinfo->option, skb, protoff,
-				     th->doff*4 - sizeof(*th),
-				     tcpinfo->invflags & IP6T_TCP_INV_OPTION,
-				     hotdrop))
-			return 0;
-	}
-	return 1;
-}
-
-/* Called when user tries to insert an entry of this type. */
-static int
-tcp_checkentry(const char *tablename,
-	       const struct ip6t_ip6 *ipv6,
-	       void *matchinfo,
-	       unsigned int matchsize,
-	       unsigned int hook_mask)
-{
-	const struct ip6t_tcp *tcpinfo = matchinfo;
-
-	/* Must specify proto == TCP, and no unknown invflags */
-	return ipv6->proto == IPPROTO_TCP
-		&& !(ipv6->invflags & IP6T_INV_PROTO)
-		&& matchsize == IP6T_ALIGN(sizeof(struct ip6t_tcp))
-		&& !(tcpinfo->invflags & ~IP6T_TCP_INV_MASK);
-}
-
-static int
-udp_match(const struct sk_buff *skb,
-	  const struct net_device *in,
-	  const struct net_device *out,
-	  const void *matchinfo,
-	  int offset,
-	  unsigned int protoff,
-	  int *hotdrop)
-{
-	struct udphdr _udph, *uh;
-	const struct ip6t_udp *udpinfo = matchinfo;
-
-	/* Must not be a fragment. */
-	if (offset)
-		return 0;
-
-	uh = skb_header_pointer(skb, protoff, sizeof(_udph), &_udph);
-	if (uh == NULL) {
-		/* We've been asked to examine this packet, and we
-		   can't.  Hence, no choice but to drop. */
-		duprintf("Dropping evil UDP tinygram.\n");
-		*hotdrop = 1;
-		return 0;
-	}
-
-	return port_match(udpinfo->spts[0], udpinfo->spts[1],
-			  ntohs(uh->source),
-			  !!(udpinfo->invflags & IP6T_UDP_INV_SRCPT))
-		&& port_match(udpinfo->dpts[0], udpinfo->dpts[1],
-			      ntohs(uh->dest),
-			      !!(udpinfo->invflags & IP6T_UDP_INV_DSTPT));
-}
-
-/* Called when user tries to insert an entry of this type. */
-static int
-udp_checkentry(const char *tablename,
-	       const struct ip6t_ip6 *ipv6,
-	       void *matchinfo,
-	       unsigned int matchinfosize,
-	       unsigned int hook_mask)
-{
-	const struct ip6t_udp *udpinfo = matchinfo;
-
-	/* Must specify proto == UDP, and no unknown invflags */
-	if (ipv6->proto != IPPROTO_UDP || (ipv6->invflags & IP6T_INV_PROTO)) {
-		duprintf("ip6t_udp: Protocol %u != %u\n", ipv6->proto,
-			 IPPROTO_UDP);
-		return 0;
-	}
-	if (matchinfosize != IP6T_ALIGN(sizeof(struct ip6t_udp))) {
-		duprintf("ip6t_udp: matchsize %u != %u\n",
-			 matchinfosize, IP6T_ALIGN(sizeof(struct ip6t_udp)));
-		return 0;
-	}
-	if (udpinfo->invflags & ~IP6T_UDP_INV_MASK) {
-		duprintf("ip6t_udp: unknown flags %X\n",
-			 udpinfo->invflags);
-		return 0;
-	}
-
-	return 1;
+	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL);
+	xt_free_table_info(private);
 }
 
 /* Returns 1 if the type and code is matched by the range, 0 otherwise */
@@ -1858,11 +1357,12 @@ icmp6_match(const struct sk_buff *skb,
 /* Called when user tries to insert an entry of this type. */
 static int
 icmp6_checkentry(const char *tablename,
-	   const struct ip6t_ip6 *ipv6,
+	   const void *entry,
 	   void *matchinfo,
 	   unsigned int matchsize,
 	   unsigned int hook_mask)
 {
+	const struct ip6t_ip6 *ipv6 = entry;
 	const struct ip6t_icmp *icmpinfo = matchinfo;
 
 	/* Must specify proto == ICMP, and no unknown invflags */
@@ -1892,164 +1392,42 @@ static struct nf_sockopt_ops ip6t_sockopts = {
 	.get		= do_ip6t_get_ctl,
 };
 
-static struct ip6t_match tcp_matchstruct = {
-	.name		= "tcp",
-	.match		= &tcp_match,
-	.checkentry	= &tcp_checkentry,
-};
-
-static struct ip6t_match udp_matchstruct = {
-	.name		= "udp",
-	.match		= &udp_match,
-	.checkentry	= &udp_checkentry,
-};
-
 static struct ip6t_match icmp6_matchstruct = {
 	.name		= "icmp6",
 	.match		= &icmp6_match,
 	.checkentry	= &icmp6_checkentry,
 };
 
-#ifdef CONFIG_PROC_FS
-static inline int print_name(const char *i,
-			     off_t start_offset, char *buffer, int length,
-			     off_t *pos, unsigned int *count)
-{
-	if ((*count)++ >= start_offset) {
-		unsigned int namelen;
-
-		namelen = sprintf(buffer + *pos, "%s\n",
-				  i + sizeof(struct list_head));
-		if (*pos + namelen > length) {
-			/* Stop iterating */
-			return 1;
-		}
-		*pos += namelen;
-	}
-	return 0;
-}
-
-static inline int print_target(const struct ip6t_target *t,
-                               off_t start_offset, char *buffer, int length,
-                               off_t *pos, unsigned int *count)
-{
-	if (t == &ip6t_standard_target || t == &ip6t_error_target)
-		return 0;
-	return print_name((char *)t, start_offset, buffer, length, pos, count);
-}
-
-static int ip6t_get_tables(char *buffer, char **start, off_t offset, int length)
-{
-	off_t pos = 0;
-	unsigned int count = 0;
-
-	if (down_interruptible(&ip6t_mutex) != 0)
-		return 0;
-
-	LIST_FIND(&ip6t_tables, print_name, char *,
-		  offset, buffer, length, &pos, &count);
-
-	up(&ip6t_mutex);
-
-	/* `start' hack - see fs/proc/generic.c line ~105 */
-	*start=(char *)((unsigned long)count-offset);
-	return pos;
-}
-
-static int ip6t_get_targets(char *buffer, char **start, off_t offset, int length)
-{
-	off_t pos = 0;
-	unsigned int count = 0;
-
-	if (down_interruptible(&ip6t_mutex) != 0)
-		return 0;
-
-	LIST_FIND(&ip6t_target, print_target, struct ip6t_target *,
-		  offset, buffer, length, &pos, &count);
-
-	up(&ip6t_mutex);
-
-	*start = (char *)((unsigned long)count - offset);
-	return pos;
-}
-
-static int ip6t_get_matches(char *buffer, char **start, off_t offset, int length)
-{
-	off_t pos = 0;
-	unsigned int count = 0;
-
-	if (down_interruptible(&ip6t_mutex) != 0)
-		return 0;
-
-	LIST_FIND(&ip6t_match, print_name, char *,
-		  offset, buffer, length, &pos, &count);
-
-	up(&ip6t_mutex);
-
-	*start = (char *)((unsigned long)count - offset);
-	return pos;
-}
-
-static const struct { char *name; get_info_t *get_info; } ip6t_proc_entry[] =
-{ { "ip6_tables_names", ip6t_get_tables },
-  { "ip6_tables_targets", ip6t_get_targets },
-  { "ip6_tables_matches", ip6t_get_matches },
-  { NULL, NULL} };
-#endif /*CONFIG_PROC_FS*/
-
 static int __init init(void)
 {
 	int ret;
 
+	xt_proto_init(AF_INET6);
+
 	/* Noone else will be downing sem now, so we won't sleep */
-	down(&ip6t_mutex);
-	list_append(&ip6t_target, &ip6t_standard_target);
-	list_append(&ip6t_target, &ip6t_error_target);
-	list_append(&ip6t_match, &tcp_matchstruct);
-	list_append(&ip6t_match, &udp_matchstruct);
-	list_append(&ip6t_match, &icmp6_matchstruct);
-	up(&ip6t_mutex);
+	xt_register_target(AF_INET6, &ip6t_standard_target);
+	xt_register_target(AF_INET6, &ip6t_error_target);
+	xt_register_match(AF_INET6, &icmp6_matchstruct);
 
 	/* Register setsockopt */
 	ret = nf_register_sockopt(&ip6t_sockopts);
 	if (ret < 0) {
 		duprintf("Unable to register sockopts.\n");
+		xt_proto_fini(AF_INET6);
 		return ret;
 	}
 
-#ifdef CONFIG_PROC_FS
-	{
-		struct proc_dir_entry *proc;
-		int i;
-
-		for (i = 0; ip6t_proc_entry[i].name; i++) {
-			proc = proc_net_create(ip6t_proc_entry[i].name, 0,
-					       ip6t_proc_entry[i].get_info);
-			if (!proc) {
-				while (--i >= 0)
-				       proc_net_remove(ip6t_proc_entry[i].name);
-				nf_unregister_sockopt(&ip6t_sockopts);
-				return -ENOMEM;
-			}
-			proc->owner = THIS_MODULE;
-		}
-	}
-#endif
-
-	printk("ip6_tables: (C) 2000-2002 Netfilter core team\n");
+	printk("ip6_tables: (C) 2000-2006 Netfilter Core Team\n");
 	return 0;
 }
 
 static void __exit fini(void)
 {
 	nf_unregister_sockopt(&ip6t_sockopts);
-#ifdef CONFIG_PROC_FS
-	{
-		int i;
-		for (i = 0; ip6t_proc_entry[i].name; i++)
-			proc_net_remove(ip6t_proc_entry[i].name);
-	}
-#endif
+	xt_unregister_match(AF_INET6, &icmp6_matchstruct);
+	xt_unregister_target(AF_INET6, &ip6t_error_target);
+	xt_unregister_target(AF_INET6, &ip6t_standard_target);
+	xt_proto_fini(AF_INET6);
 }
 
 /*
@@ -2128,10 +1506,6 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
 EXPORT_SYMBOL(ip6t_register_table);
 EXPORT_SYMBOL(ip6t_unregister_table);
 EXPORT_SYMBOL(ip6t_do_table);
-EXPORT_SYMBOL(ip6t_register_match);
-EXPORT_SYMBOL(ip6t_unregister_match);
-EXPORT_SYMBOL(ip6t_register_target);
-EXPORT_SYMBOL(ip6t_unregister_target);
 EXPORT_SYMBOL(ip6t_ext_hdr);
 EXPORT_SYMBOL(ipv6_find_hdr);
 EXPORT_SYMBOL(ip6_masked_addrcmp);
diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c
index 8f5549b72720..306200c35057 100644
--- a/net/ipv6/netfilter/ip6t_HL.c
+++ b/net/ipv6/netfilter/ip6t_HL.c
@@ -62,7 +62,7 @@ static unsigned int ip6t_hl_target(struct sk_buff **pskb,
 }
 
 static int ip6t_hl_checkentry(const char *tablename,
-		const struct ip6t_entry *e,
+		const void *entry,
 		void *targinfo,
 		unsigned int targinfosize,
 		unsigned int hook_mask)
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index ae4653bfd654..fc19d8a4ca68 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -444,7 +444,7 @@ ip6t_log_target(struct sk_buff **pskb,
 
 
 static int ip6t_log_checkentry(const char *tablename,
-			       const struct ip6t_entry *e,
+			       const void *entry,
 			       void *targinfo,
 			       unsigned int targinfosize,
 			       unsigned int hook_mask)
diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c
deleted file mode 100644
index eab8fb864ee0..000000000000
--- a/net/ipv6/netfilter/ip6t_MARK.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/* This is a module which is used for setting the NFMARK field of an skb. */
-
-/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <net/checksum.h>
-
-#include <linux/netfilter_ipv6/ip6_tables.h>
-#include <linux/netfilter_ipv6/ip6t_MARK.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-
-static unsigned int
-target(struct sk_buff **pskb,
-       const struct net_device *in,
-       const struct net_device *out,
-       unsigned int hooknum,
-       const void *targinfo,
-       void *userinfo)
-{
-	const struct ip6t_mark_target_info *markinfo = targinfo;
-
-	if((*pskb)->nfmark != markinfo->mark)
-		(*pskb)->nfmark = markinfo->mark;
-
-	return IP6T_CONTINUE;
-}
-
-static int
-checkentry(const char *tablename,
-	   const struct ip6t_entry *e,
-           void *targinfo,
-           unsigned int targinfosize,
-           unsigned int hook_mask)
-{
-	if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_mark_target_info))) {
-		printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
-		       targinfosize,
-		       IP6T_ALIGN(sizeof(struct ip6t_mark_target_info)));
-		return 0;
-	}
-
-	if (strcmp(tablename, "mangle") != 0) {
-		printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
-		return 0;
-	}
-
-	return 1;
-}
-
-static struct ip6t_target ip6t_mark_reg = { 
-	.name		= "MARK",
-	.target		= target,
-	.checkentry	= checkentry,
-	.me		= THIS_MODULE
-};
-
-static int __init init(void)
-{
-	printk(KERN_DEBUG "registering ipv6 mark target\n");
-	if (ip6t_register_target(&ip6t_mark_reg))
-		return -EINVAL;
-
-	return 0;
-}
-
-static void __exit fini(void)
-{
-	ip6t_unregister_target(&ip6t_mark_reg);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_NFQUEUE.c b/net/ipv6/netfilter/ip6t_NFQUEUE.c
deleted file mode 100644
index c6e3730e7409..000000000000
--- a/net/ipv6/netfilter/ip6t_NFQUEUE.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/* ip6tables module for using new netfilter netlink queue
- *
- * (C) 2005 by Harald Welte <laforge@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as 
- * published by the Free Software Foundation.
- * 
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-#include <linux/netfilter_ipv4/ipt_NFQUEUE.h>
-
-MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("ip6tables NFQUEUE target");
-MODULE_LICENSE("GPL");
-
-static unsigned int
-target(struct sk_buff **pskb,
-       const struct net_device *in,
-       const struct net_device *out,
-       unsigned int hooknum,
-       const void *targinfo,
-       void *userinfo)
-{
-	const struct ipt_NFQ_info *tinfo = targinfo;
-
-	return NF_QUEUE_NR(tinfo->queuenum);
-}
-
-static int
-checkentry(const char *tablename,
-	   const struct ip6t_entry *e,
-           void *targinfo,
-           unsigned int targinfosize,
-           unsigned int hook_mask)
-{
-	if (targinfosize != IP6T_ALIGN(sizeof(struct ipt_NFQ_info))) {
-		printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n",
-		       targinfosize,
-		       IP6T_ALIGN(sizeof(struct ipt_NFQ_info)));
-		return 0;
-	}
-
-	return 1;
-}
-
-static struct ip6t_target ipt_NFQ_reg = {
-	.name		= "NFQUEUE",
-	.target		= target,
-	.checkentry	= checkentry,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	return ip6t_register_target(&ipt_NFQ_reg);
-}
-
-static void __exit fini(void)
-{
-	ip6t_unregister_target(&ipt_NFQ_reg);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index b03e87adca93..c745717b4ce2 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -218,12 +218,13 @@ static unsigned int reject6_target(struct sk_buff **pskb,
 }
 
 static int check(const char *tablename,
-		 const struct ip6t_entry *e,
+		 const void *entry,
 		 void *targinfo,
 		 unsigned int targinfosize,
 		 unsigned int hook_mask)
 {
  	const struct ip6t_reject_info *rejinfo = targinfo;
+	const struct ip6t_entry *e = entry;
 
  	if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_reject_info))) {
   		DEBUGP("ip6t_REJECT: targinfosize %u != 0\n", targinfosize);
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index f5c1a7ff4a1f..219a30365dff 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -98,7 +98,7 @@ match(const struct sk_buff *skb,
 /* Called when user tries to insert an entry of this type. */
 static int
 checkentry(const char *tablename,
-          const struct ip6t_ip6 *ip,
+          const void *entry,
           void *matchinfo,
           unsigned int matchinfosize,
           unsigned int hook_mask)
diff --git a/net/ipv6/netfilter/ip6t_dst.c b/net/ipv6/netfilter/ip6t_dst.c
index 48cf5f9efc95..80fe82669ce2 100644
--- a/net/ipv6/netfilter/ip6t_dst.c
+++ b/net/ipv6/netfilter/ip6t_dst.c
@@ -178,7 +178,7 @@ match(const struct sk_buff *skb,
 /* Called when user tries to insert an entry of this type. */
 static int
 checkentry(const char *tablename,
-          const struct ip6t_ip6 *ip,
+          const void *info,
           void *matchinfo,
           unsigned int matchinfosize,
           unsigned int hook_mask)
diff --git a/net/ipv6/netfilter/ip6t_esp.c b/net/ipv6/netfilter/ip6t_esp.c
index e1828f6d0a40..724285df8711 100644
--- a/net/ipv6/netfilter/ip6t_esp.c
+++ b/net/ipv6/netfilter/ip6t_esp.c
@@ -76,7 +76,7 @@ match(const struct sk_buff *skb,
 /* Called when user tries to insert an entry of this type. */
 static int
 checkentry(const char *tablename,
-	   const struct ip6t_ip6 *ip,
+	   const void *ip,
 	   void *matchinfo,
 	   unsigned int matchinfosize,
 	   unsigned int hook_mask)
diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c
index 616c2cbcd54d..ddf5f571909c 100644
--- a/net/ipv6/netfilter/ip6t_eui64.c
+++ b/net/ipv6/netfilter/ip6t_eui64.c
@@ -62,7 +62,7 @@ match(const struct sk_buff *skb,
 
 static int
 ip6t_eui64_checkentry(const char *tablename,
-		   const struct ip6t_ip6 *ip,
+		   const void  *ip,
 		   void *matchinfo,
 		   unsigned int matchsize,
 		   unsigned int hook_mask)
diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c
index d1549b268669..a9964b946ed5 100644
--- a/net/ipv6/netfilter/ip6t_frag.c
+++ b/net/ipv6/netfilter/ip6t_frag.c
@@ -115,7 +115,7 @@ match(const struct sk_buff *skb,
 /* Called when user tries to insert an entry of this type. */
 static int
 checkentry(const char *tablename,
-          const struct ip6t_ip6 *ip,
+          const void *ip,
           void *matchinfo,
           unsigned int matchinfosize,
           unsigned int hook_mask)
diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c
index e3bc8e2700e7..ed8ded18bbd4 100644
--- a/net/ipv6/netfilter/ip6t_hbh.c
+++ b/net/ipv6/netfilter/ip6t_hbh.c
@@ -178,7 +178,7 @@ match(const struct sk_buff *skb,
 /* Called when user tries to insert an entry of this type. */
 static int
 checkentry(const char *tablename,
-          const struct ip6t_ip6 *ip,
+          const void *entry,
           void *matchinfo,
           unsigned int matchinfosize,
           unsigned int hook_mask)
diff --git a/net/ipv6/netfilter/ip6t_hl.c b/net/ipv6/netfilter/ip6t_hl.c
index 0beaff5471dd..c5d9079f2d9d 100644
--- a/net/ipv6/netfilter/ip6t_hl.c
+++ b/net/ipv6/netfilter/ip6t_hl.c
@@ -48,7 +48,7 @@ static int match(const struct sk_buff *skb, const struct net_device *in,
 	return 0;
 }
 
-static int checkentry(const char *tablename, const struct ip6t_ip6 *ip,
+static int checkentry(const char *tablename, const void *entry,
 		      void *matchinfo, unsigned int matchsize,
 		      unsigned int hook_mask)
 {
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
index 32e67f05845b..fda1ceaf5a29 100644
--- a/net/ipv6/netfilter/ip6t_ipv6header.c
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -124,7 +124,7 @@ ipv6header_match(const struct sk_buff *skb,
 
 static int
 ipv6header_checkentry(const char *tablename,
-		      const struct ip6t_ip6 *ip,
+		      const void *ip,
 		      void *matchinfo,
 		      unsigned int matchsize,
 		      unsigned int hook_mask)
diff --git a/net/ipv6/netfilter/ip6t_length.c b/net/ipv6/netfilter/ip6t_length.c
deleted file mode 100644
index e0537d3811d5..000000000000
--- a/net/ipv6/netfilter/ip6t_length.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Length Match - IPv6 Port */
-
-/* (C) 1999-2001 James Morris <jmorros@intercode.com.au>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/netfilter_ipv6/ip6t_length.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
-MODULE_DESCRIPTION("IPv6 packet length match");
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      unsigned int protoff,
-      int *hotdrop)
-{
-	const struct ip6t_length_info *info = matchinfo;
-	u_int16_t pktlen = ntohs(skb->nh.ipv6h->payload_len) + sizeof(struct ipv6hdr);
-	
-	return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
-}
-
-static int
-checkentry(const char *tablename,
-           const struct ip6t_ip6 *ip,
-           void *matchinfo,
-           unsigned int matchsize,
-           unsigned int hook_mask)
-{
-	if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_length_info)))
-		return 0;
-
-	return 1;
-}
-
-static struct ip6t_match length_match = {
-	.name		= "length",
-	.match		= &match,
-	.checkentry	= &checkentry,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	return ip6t_register_match(&length_match);
-}
-
-static void __exit fini(void)
-{
-	ip6t_unregister_match(&length_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_limit.c b/net/ipv6/netfilter/ip6t_limit.c
deleted file mode 100644
index fb782f610be2..000000000000
--- a/net/ipv6/netfilter/ip6t_limit.c
+++ /dev/null
@@ -1,147 +0,0 @@
-/* Kernel module to control the rate
- *
- * 2 September 1999: Changed from the target RATE to the match
- *                   `limit', removed logging.  Did I mention that
- *                   Alexey is a fucking genius?
- *                   Rusty Russell (rusty@rustcorp.com.au).  */
-
-/* (C) 1999 J�r�me de Vivie <devivie@info.enserb.u-bordeaux.fr>
- * (C) 1999 Herv� Eychenne <eychenne@info.enserb.u-bordeaux.fr>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/spinlock.h>
-#include <linux/interrupt.h>
-
-#include <linux/netfilter_ipv6/ip6_tables.h>
-#include <linux/netfilter_ipv6/ip6t_limit.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>");
-MODULE_DESCRIPTION("rate limiting within ip6tables");
-
-/* The algorithm used is the Simple Token Bucket Filter (TBF)
- * see net/sched/sch_tbf.c in the linux source tree
- */
-
-static DEFINE_SPINLOCK(limit_lock);
-
-/* Rusty: This is my (non-mathematically-inclined) understanding of
-   this algorithm.  The `average rate' in jiffies becomes your initial
-   amount of credit `credit' and the most credit you can ever have
-   `credit_cap'.  The `peak rate' becomes the cost of passing the
-   test, `cost'.
-
-   `prev' tracks the last packet hit: you gain one credit per jiffy.
-   If you get credit balance more than this, the extra credit is
-   discarded.  Every time the match passes, you lose `cost' credits;
-   if you don't have that many, the test fails.
-
-   See Alexey's formal explanation in net/sched/sch_tbf.c.
-
-   To avoid underflow, we multiply by 128 (ie. you get 128 credits per
-   jiffy).  Hence a cost of 2^32-1, means one pass per 32768 seconds
-   at 1024HZ (or one every 9 hours).  A cost of 1 means 12800 passes
-   per second at 100HZ.  */
-
-#define CREDITS_PER_JIFFY 128
-
-static int
-ip6t_limit_match(const struct sk_buff *skb,
-		const struct net_device *in,
-		const struct net_device *out,
-		const void *matchinfo,
-		int offset,
-		unsigned int protoff,
-		int *hotdrop)
-{
-	struct ip6t_rateinfo *r = ((struct ip6t_rateinfo *)matchinfo)->master;
-	unsigned long now = jiffies;
-
-	spin_lock_bh(&limit_lock);
-	r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY;
-	if (r->credit > r->credit_cap)
-		r->credit = r->credit_cap;
-
-	if (r->credit >= r->cost) {
-		/* We're not limited. */
-		r->credit -= r->cost;
-		spin_unlock_bh(&limit_lock);
-		return 1;
-	}
-
-       	spin_unlock_bh(&limit_lock);
-	return 0;
-}
-
-/* Precision saver. */
-static u_int32_t
-user2credits(u_int32_t user)
-{
-	/* If multiplying would overflow... */
-	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
-		/* Divide first. */
-		return (user / IP6T_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
-
-	return (user * HZ * CREDITS_PER_JIFFY) / IP6T_LIMIT_SCALE;
-}
-
-static int
-ip6t_limit_checkentry(const char *tablename,
-		     const struct ip6t_ip6 *ip,
-		     void *matchinfo,
-		     unsigned int matchsize,
-		     unsigned int hook_mask)
-{
-	struct ip6t_rateinfo *r = matchinfo;
-
-	if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_rateinfo)))
-		return 0;
-
-	/* Check for overflow. */
-	if (r->burst == 0
-	    || user2credits(r->avg * r->burst) < user2credits(r->avg)) {
-		printk("Call rusty: overflow in ip6t_limit: %u/%u\n",
-		       r->avg, r->burst);
-		return 0;
-	}
-
-	/* User avg in seconds * IP6T_LIMIT_SCALE: convert to jiffies *
-	   128. */
-	r->prev = jiffies;
-	r->credit = user2credits(r->avg * r->burst);	 /* Credits full. */
-	r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */
-	r->cost = user2credits(r->avg);
-
-	/* For SMP, we only want to use one set of counters. */
-	r->master = r;
-
-	return 1;
-}
-
-static struct ip6t_match ip6t_limit_reg = {
-	.name		= "limit",
-	.match		= ip6t_limit_match,
-	.checkentry	= ip6t_limit_checkentry,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	if (ip6t_register_match(&ip6t_limit_reg))
-		return -EINVAL;
-	return 0;
-}
-
-static void __exit fini(void)
-{
-	ip6t_unregister_match(&ip6t_limit_reg);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_mac.c b/net/ipv6/netfilter/ip6t_mac.c
deleted file mode 100644
index c848152315bc..000000000000
--- a/net/ipv6/netfilter/ip6t_mac.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Kernel module to match MAC address parameters. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/if_ether.h>
-#include <linux/etherdevice.h>
-
-#include <linux/netfilter_ipv6/ip6t_mac.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("MAC address matching module for IPv6");
-MODULE_AUTHOR("Netfilter Core Teaam <coreteam@netfilter.org>");
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      unsigned int protoff,
-      int *hotdrop)
-{
-    const struct ip6t_mac_info *info = matchinfo;
-
-    /* Is mac pointer valid? */
-    return (skb->mac.raw >= skb->head
-	    && (skb->mac.raw + ETH_HLEN) <= skb->data
-	    /* If so, compare... */
-	    && ((!compare_ether_addr(eth_hdr(skb)->h_source, info->srcaddr))
-		^ info->invert));
-}
-
-static int
-ip6t_mac_checkentry(const char *tablename,
-		   const struct ip6t_ip6 *ip,
-		   void *matchinfo,
-		   unsigned int matchsize,
-		   unsigned int hook_mask)
-{
-	if (hook_mask
-	    & ~((1 << NF_IP6_PRE_ROUTING) | (1 << NF_IP6_LOCAL_IN)
-		| (1 << NF_IP6_FORWARD))) {
-		printk("ip6t_mac: only valid for PRE_ROUTING, LOCAL_IN or"
-		       " FORWARD\n");
-		return 0;
-	}
-
-	if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_mac_info)))
-		return 0;
-
-	return 1;
-}
-
-static struct ip6t_match mac_match = {
-	.name		= "mac",
-	.match		= &match,
-	.checkentry	= &ip6t_mac_checkentry,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	return ip6t_register_match(&mac_match);
-}
-
-static void __exit fini(void)
-{
-	ip6t_unregister_match(&mac_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_mark.c b/net/ipv6/netfilter/ip6t_mark.c
deleted file mode 100644
index affc3de364fc..000000000000
--- a/net/ipv6/netfilter/ip6t_mark.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Kernel module to match NFMARK values. */
-
-/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-
-#include <linux/netfilter_ipv6/ip6t_mark.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("ip6tables mark match");
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      unsigned int protoff,
-      int *hotdrop)
-{
-	const struct ip6t_mark_info *info = matchinfo;
-
-	return ((skb->nfmark & info->mask) == info->mark) ^ info->invert;
-}
-
-static int
-checkentry(const char *tablename,
-           const struct ip6t_ip6 *ip,
-           void *matchinfo,
-           unsigned int matchsize,
-           unsigned int hook_mask)
-{
-	if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_mark_info)))
-		return 0;
-
-	return 1;
-}
-
-static struct ip6t_match mark_match = {
-	.name		= "mark",
-	.match		= &match,
-	.checkentry	= &checkentry,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	return ip6t_register_match(&mark_match);
-}
-
-static void __exit fini(void)
-{
-	ip6t_unregister_match(&mark_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_multiport.c b/net/ipv6/netfilter/ip6t_multiport.c
index 6e3246153fa3..49f7829dfbc2 100644
--- a/net/ipv6/netfilter/ip6t_multiport.c
+++ b/net/ipv6/netfilter/ip6t_multiport.c
@@ -84,11 +84,12 @@ match(const struct sk_buff *skb,
 /* Called when user tries to insert an entry of this type. */
 static int
 checkentry(const char *tablename,
-	   const struct ip6t_ip6 *ip,
+	   const void *info,
 	   void *matchinfo,
 	   unsigned int matchsize,
 	   unsigned int hook_mask)
 {
+	const struct ip6t_ip6 *ip = info;
 	const struct ip6t_multiport *multiinfo = matchinfo;
 
 	if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_multiport)))
diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c
index 4de4cdad4b7d..5409b375b512 100644
--- a/net/ipv6/netfilter/ip6t_owner.c
+++ b/net/ipv6/netfilter/ip6t_owner.c
@@ -53,7 +53,7 @@ match(const struct sk_buff *skb,
 
 static int
 checkentry(const char *tablename,
-           const struct ip6t_ip6 *ip,
+           const void  *ip,
            void *matchinfo,
            unsigned int matchsize,
            unsigned int hook_mask)
diff --git a/net/ipv6/netfilter/ip6t_physdev.c b/net/ipv6/netfilter/ip6t_physdev.c
deleted file mode 100644
index 71515c86ece1..000000000000
--- a/net/ipv6/netfilter/ip6t_physdev.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Kernel module to match the bridge port in and
- * out device for IP packets coming into contact with a bridge. */
-
-/* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/netfilter_ipv6/ip6t_physdev.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-#include <linux/netfilter_bridge.h>
-#define MATCH   1
-#define NOMATCH 0
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
-MODULE_DESCRIPTION("iptables bridge physical device match module");
-
-static int
-match(const struct sk_buff *skb,
-      const struct net_device *in,
-      const struct net_device *out,
-      const void *matchinfo,
-      int offset,
-      unsigned int protoff,
-      int *hotdrop)
-{
-	int i;
-	static const char nulldevname[IFNAMSIZ];
-	const struct ip6t_physdev_info *info = matchinfo;
-	unsigned int ret;
-	const char *indev, *outdev;
-	struct nf_bridge_info *nf_bridge;
-
-	/* Not a bridged IP packet or no info available yet:
-	 * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if
-	 * the destination device will be a bridge. */
-	if (!(nf_bridge = skb->nf_bridge)) {
-		/* Return MATCH if the invert flags of the used options are on */
-		if ((info->bitmask & IP6T_PHYSDEV_OP_BRIDGED) &&
-		    !(info->invert & IP6T_PHYSDEV_OP_BRIDGED))
-			return NOMATCH;
-		if ((info->bitmask & IP6T_PHYSDEV_OP_ISIN) &&
-		    !(info->invert & IP6T_PHYSDEV_OP_ISIN))
-			return NOMATCH;
-		if ((info->bitmask & IP6T_PHYSDEV_OP_ISOUT) &&
-		    !(info->invert & IP6T_PHYSDEV_OP_ISOUT))
-			return NOMATCH;
-		if ((info->bitmask & IP6T_PHYSDEV_OP_IN) &&
-		    !(info->invert & IP6T_PHYSDEV_OP_IN))
-			return NOMATCH;
-		if ((info->bitmask & IP6T_PHYSDEV_OP_OUT) &&
-		    !(info->invert & IP6T_PHYSDEV_OP_OUT))
-			return NOMATCH;
-		return MATCH;
-	}
-
-	/* This only makes sense in the FORWARD and POSTROUTING chains */
-	if ((info->bitmask & IP6T_PHYSDEV_OP_BRIDGED) &&
-	    (!!(nf_bridge->mask & BRNF_BRIDGED) ^
-	    !(info->invert & IP6T_PHYSDEV_OP_BRIDGED)))
-		return NOMATCH;
-
-	if ((info->bitmask & IP6T_PHYSDEV_OP_ISIN &&
-	    (!nf_bridge->physindev ^ !!(info->invert & IP6T_PHYSDEV_OP_ISIN))) ||
-	    (info->bitmask & IP6T_PHYSDEV_OP_ISOUT &&
-	    (!nf_bridge->physoutdev ^ !!(info->invert & IP6T_PHYSDEV_OP_ISOUT))))
-		return NOMATCH;
-
-	if (!(info->bitmask & IP6T_PHYSDEV_OP_IN))
-		goto match_outdev;
-	indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname;
-	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
-		ret |= (((const unsigned int *)indev)[i]
-			^ ((const unsigned int *)info->physindev)[i])
-			& ((const unsigned int *)info->in_mask)[i];
-	}
-
-	if ((ret == 0) ^ !(info->invert & IP6T_PHYSDEV_OP_IN))
-		return NOMATCH;
-
-match_outdev:
-	if (!(info->bitmask & IP6T_PHYSDEV_OP_OUT))
-		return MATCH;
-	outdev = nf_bridge->physoutdev ?
-		 nf_bridge->physoutdev->name : nulldevname;
-	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
-		ret |= (((const unsigned int *)outdev)[i]
-			^ ((const unsigned int *)info->physoutdev)[i])
-			& ((const unsigned int *)info->out_mask)[i];
-	}
-
-	return (ret != 0) ^ !(info->invert & IP6T_PHYSDEV_OP_OUT);
-}
-
-static int
-checkentry(const char *tablename,
-		       const struct ip6t_ip6 *ip,
-		       void *matchinfo,
-		       unsigned int matchsize,
-		       unsigned int hook_mask)
-{
-	const struct ip6t_physdev_info *info = matchinfo;
-
-	if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_physdev_info)))
-		return 0;
-	if (!(info->bitmask & IP6T_PHYSDEV_OP_MASK) ||
-	    info->bitmask & ~IP6T_PHYSDEV_OP_MASK)
-		return 0;
-	return 1;
-}
-
-static struct ip6t_match physdev_match = {
-	.name		= "physdev",
-	.match		= &match,
-	.checkentry	= &checkentry,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	return ip6t_register_match(&physdev_match);
-}
-
-static void __exit fini(void)
-{
-	ip6t_unregister_match(&physdev_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
index c1e770e45543..8465b4375855 100644
--- a/net/ipv6/netfilter/ip6t_rt.c
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -183,7 +183,7 @@ match(const struct sk_buff *skb,
 /* Called when user tries to insert an entry of this type. */
 static int
 checkentry(const char *tablename,
-          const struct ip6t_ip6 *ip,
+          const void *entry,
           void *matchinfo,
           unsigned int matchinfosize,
           unsigned int hook_mask)
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index 4c0028671c20..ce4a968e1f70 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -97,6 +97,7 @@ static struct ip6t_table packet_filter = {
 	.valid_hooks	= FILTER_VALID_HOOKS,
 	.lock		= RW_LOCK_UNLOCKED,
 	.me		= THIS_MODULE,
+	.af		= AF_INET6,
 };
 
 /* The work comes in here from netfilter.c. */
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index 85c1e6eada19..30a4627e000d 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -127,6 +127,7 @@ static struct ip6t_table packet_mangler = {
 	.valid_hooks	= MANGLE_VALID_HOOKS,
 	.lock		= RW_LOCK_UNLOCKED,
 	.me		= THIS_MODULE,
+	.af		= AF_INET6,
 };
 
 /* The work comes in here from netfilter.c. */
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index c2982efd14af..db28ba3855e2 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -106,11 +106,12 @@ static struct
 	}
 };
 
-static struct ip6t_table packet_raw = { 
+static struct xt_table packet_raw = { 
 	.name = "raw", 
 	.valid_hooks = RAW_VALID_HOOKS, 
 	.lock = RW_LOCK_UNLOCKED, 
-	.me = THIS_MODULE
+	.me = THIS_MODULE,
+	.af = AF_INET6,
 };
 
 /* The work comes in here from netfilter.c. */
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index e57d6fc9957a..92ad53d8f8c3 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -584,7 +584,7 @@ MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>");
 
 static int __init init(void)
 {
-	need_nf_conntrack();
+	need_conntrack();
 	return init_or_cleanup(1);
 }
 
@@ -595,9 +595,3 @@ static void __exit fini(void)
 
 module_init(init);
 module_exit(fini);
-
-void need_ip6_conntrack(void)
-{
-}
-
-EXPORT_SYMBOL(need_ip6_conntrack);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index f3e5ffbd592f..84ef9a13108d 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -70,8 +70,8 @@ struct nf_ct_frag6_skb_cb
 
 struct nf_ct_frag6_queue
 {
-	struct nf_ct_frag6_queue	*next;
-	struct list_head lru_list;		/* lru list member	*/
+	struct hlist_node	list;
+	struct list_head	lru_list;	/* lru list member	*/
 
 	__u32			id;		/* fragment id		*/
 	struct in6_addr		saddr;
@@ -90,14 +90,13 @@ struct nf_ct_frag6_queue
 #define FIRST_IN		2
 #define LAST_IN			1
 	__u16			nhoffset;
-	struct nf_ct_frag6_queue	**pprev;
 };
 
 /* Hash table. */
 
 #define FRAG6Q_HASHSZ	64
 
-static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ];
+static struct hlist_head nf_ct_frag6_hash[FRAG6Q_HASHSZ];
 static DEFINE_RWLOCK(nf_ct_frag6_lock);
 static u32 nf_ct_frag6_hash_rnd;
 static LIST_HEAD(nf_ct_frag6_lru_list);
@@ -105,9 +104,7 @@ int nf_ct_frag6_nqueues = 0;
 
 static __inline__ void __fq_unlink(struct nf_ct_frag6_queue *fq)
 {
-	if (fq->next)
-		fq->next->pprev = fq->pprev;
-	*fq->pprev = fq->next;
+	hlist_del(&fq->list);
 	list_del(&fq->lru_list);
 	nf_ct_frag6_nqueues--;
 }
@@ -158,28 +155,18 @@ static void nf_ct_frag6_secret_rebuild(unsigned long dummy)
 	get_random_bytes(&nf_ct_frag6_hash_rnd, sizeof(u32));
 	for (i = 0; i < FRAG6Q_HASHSZ; i++) {
 		struct nf_ct_frag6_queue *q;
+		struct hlist_node *p, *n;
 
-		q = nf_ct_frag6_hash[i];
-		while (q) {
-			struct nf_ct_frag6_queue *next = q->next;
+		hlist_for_each_entry_safe(q, p, n, &nf_ct_frag6_hash[i], list) {
 			unsigned int hval = ip6qhashfn(q->id,
 						       &q->saddr,
 						       &q->daddr);
-
 			if (hval != i) {
-				/* Unlink. */
-				if (q->next)
-					q->next->pprev = q->pprev;
-				*q->pprev = q->next;
-
+				hlist_del(&q->list);
 				/* Relink to new hash chain. */
-				if ((q->next = nf_ct_frag6_hash[hval]) != NULL)
-					q->next->pprev = &q->next;
-				nf_ct_frag6_hash[hval] = q;
-				q->pprev = &nf_ct_frag6_hash[hval];
+				hlist_add_head(&q->list,
+					       &nf_ct_frag6_hash[hval]);
 			}
-
-			q = next;
 		}
 	}
 	write_unlock(&nf_ct_frag6_lock);
@@ -314,15 +301,17 @@ out:
 
 /* Creation primitives. */
 
-
 static struct nf_ct_frag6_queue *nf_ct_frag6_intern(unsigned int hash,
 					  struct nf_ct_frag6_queue *fq_in)
 {
 	struct nf_ct_frag6_queue *fq;
+#ifdef CONFIG_SMP
+	struct hlist_node *n;
+#endif
 
 	write_lock(&nf_ct_frag6_lock);
 #ifdef CONFIG_SMP
-	for (fq = nf_ct_frag6_hash[hash]; fq; fq = fq->next) {
+	hlist_for_each_entry(fq, n, &nf_ct_frag6_hash[hash], list) {
 		if (fq->id == fq_in->id && 
 		    !ipv6_addr_cmp(&fq_in->saddr, &fq->saddr) &&
 		    !ipv6_addr_cmp(&fq_in->daddr, &fq->daddr)) {
@@ -340,10 +329,7 @@ static struct nf_ct_frag6_queue *nf_ct_frag6_intern(unsigned int hash,
 		atomic_inc(&fq->refcnt);
 
 	atomic_inc(&fq->refcnt);
-	if ((fq->next = nf_ct_frag6_hash[hash]) != NULL)
-		fq->next->pprev = &fq->next;
-	nf_ct_frag6_hash[hash] = fq;
-	fq->pprev = &nf_ct_frag6_hash[hash];
+	hlist_add_head(&fq->list, &nf_ct_frag6_hash[hash]);
 	INIT_LIST_HEAD(&fq->lru_list);
 	list_add_tail(&fq->lru_list, &nf_ct_frag6_lru_list);
 	nf_ct_frag6_nqueues++;
@@ -384,10 +370,11 @@ static __inline__ struct nf_ct_frag6_queue *
 fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst)
 {
 	struct nf_ct_frag6_queue *fq;
+	struct hlist_node *n;
 	unsigned int hash = ip6qhashfn(id, src, dst);
 
 	read_lock(&nf_ct_frag6_lock);
-	for (fq = nf_ct_frag6_hash[hash]; fq; fq = fq->next) {
+	hlist_for_each_entry(fq, n, &nf_ct_frag6_hash[hash], list) {
 		if (fq->id == id && 
 		    !ipv6_addr_cmp(src, &fq->saddr) &&
 		    !ipv6_addr_cmp(dst, &fq->daddr)) {
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 7d55f9cbd853..99c0a0fa4a97 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -103,3 +103,261 @@ config NF_CT_NETLINK
 	  This option enables support for a netlink-based userspace interface
 
 endmenu
+
+config NETFILTER_XTABLES
+	tristate "Netfilter Xtables support (required for ip_tables)"
+	help
+	  This is required if you intend to use any of ip_tables,
+	  ip6_tables or arp_tables.
+
+# alphabetically ordered list of targets
+
+config NETFILTER_XT_TARGET_CLASSIFY
+	tristate '"CLASSIFY" target support'
+	depends on NETFILTER_XTABLES
+	help
+	  This option adds a `CLASSIFY' target, which enables the user to set
+	  the priority of a packet. Some qdiscs can use this value for
+	  classification, among these are:
+
+  	  atm, cbq, dsmark, pfifo_fast, htb, prio
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_TARGET_CONNMARK
+	tristate  '"CONNMARK" target support'
+	depends on NETFILTER_XTABLES
+	depends on IP_NF_MANGLE || IP6_NF_MANGLE
+	depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4)
+	help
+	  This option adds a `CONNMARK' target, which allows one to manipulate
+	  the connection mark value.  Similar to the MARK target, but
+	  affects the connection mark value rather than the packet mark value.
+	
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  The module will be called
+	  ipt_CONNMARK.o.  If unsure, say `N'.
+
+config NETFILTER_XT_TARGET_MARK
+	tristate '"MARK" target support'
+	depends on NETFILTER_XTABLES
+	help
+	  This option adds a `MARK' target, which allows you to create rules
+	  in the `mangle' table which alter the netfilter mark (nfmark) field
+	  associated with the packet prior to routing. This can change
+	  the routing method (see `Use netfilter MARK value as routing
+	  key') and can also be used by other subsystems to change their
+	  behavior.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_TARGET_NFQUEUE
+	tristate '"NFQUEUE" target Support'
+	depends on NETFILTER_XTABLES
+	help
+	  This Target replaced the old obsolete QUEUE target.
+
+	  As opposed to QUEUE, it supports 65535 different queues,
+	  not just one.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_TARGET_NOTRACK
+	tristate  '"NOTRACK" target support'
+	depends on NETFILTER_XTABLES
+	depends on IP_NF_RAW || IP6_NF_RAW
+	depends on IP_NF_CONNTRACK || NF_CONNTRACK
+	help
+	  The NOTRACK target allows a select rule to specify
+	  which packets *not* to enter the conntrack/NAT
+	  subsystem with all the consequences (no ICMP error tracking,
+	  no protocol helpers for the selected packets).
+	
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_COMMENT
+	tristate  '"comment" match support'
+	depends on NETFILTER_XTABLES
+	help
+	  This option adds a `comment' dummy-match, which allows you to put
+	  comments in your iptables ruleset.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_CONNBYTES
+	tristate  '"connbytes" per-connection counter match support'
+	depends on NETFILTER_XTABLES
+	depends on (IP_NF_CONNTRACK && IP_NF_CT_ACCT) || NF_CT_ACCT
+	help
+	  This option adds a `connbytes' match, which allows you to match the
+	  number of bytes and/or packets for each direction within a connection.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_CONNMARK
+	tristate  '"connmark" connection mark match support'
+	depends on NETFILTER_XTABLES
+	depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || NF_CONNTRACK_MARK
+	help
+	  This option adds a `connmark' match, which allows you to match the
+	  connection mark value previously set for the session by `CONNMARK'. 
+	
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  The module will be called
+	  ipt_connmark.o.  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_CONNTRACK
+	tristate '"conntrack" connection tracking match support'
+	depends on NETFILTER_XTABLES
+	depends on IP_NF_CONNTRACK || NF_CONNTRACK
+	help
+	  This is a general conntrack match module, a superset of the state match.
+
+	  It allows matching on additional conntrack information, which is
+	  useful in complex configurations, such as NAT gateways with multiple
+	  internet links or tunnels.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_DCCP
+	tristate  '"DCCP" protocol match support'
+	depends on NETFILTER_XTABLES
+	help
+	  With this option enabled, you will be able to use the iptables
+	  `dccp' match in order to match on DCCP source/destination ports
+	  and DCCP flags.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_HELPER
+	tristate '"helper" match support'
+	depends on NETFILTER_XTABLES
+	depends on IP_NF_CONNTRACK || NF_CONNTRACK
+	help
+	  Helper matching allows you to match packets in dynamic connections
+	  tracked by a conntrack-helper, ie. ip_conntrack_ftp
+
+	  To compile it as a module, choose M here.  If unsure, say Y.
+
+config NETFILTER_XT_MATCH_LENGTH
+	tristate '"length" match support'
+	depends on NETFILTER_XTABLES
+	help
+	  This option allows you to match the length of a packet against a
+	  specific value or range of values.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_LIMIT
+	tristate '"limit" match support'
+	depends on NETFILTER_XTABLES
+	help
+	  limit matching allows you to control the rate at which a rule can be
+	  matched: mainly useful in combination with the LOG target ("LOG
+	  target support", below) and to avoid some Denial of Service attacks.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_MAC
+	tristate '"mac" address match support'
+	depends on NETFILTER_XTABLES
+	help
+	  MAC matching allows you to match packets based on the source
+	  Ethernet address of the packet.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_MARK
+	tristate '"mark" match support'
+	depends on NETFILTER_XTABLES
+	help
+	  Netfilter mark matching allows you to match packets based on the
+	  `nfmark' value in the packet.  This can be set by the MARK target
+	  (see below).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_PHYSDEV
+	tristate '"physdev" match support'
+	depends on NETFILTER_XTABLES && BRIDGE_NETFILTER
+	help
+	  Physdev packet matching matches against the physical bridge ports
+	  the IP packet arrived on or will leave by.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_PKTTYPE
+	tristate '"pkttype" packet type match support'
+	depends on NETFILTER_XTABLES
+	help
+	  Packet type matching allows you to match a packet by
+	  its "class", eg. BROADCAST, MULTICAST, ...
+
+	  Typical usage:
+	  iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_REALM
+	tristate  '"realm" match support'
+	depends on NETFILTER_XTABLES
+	select NET_CLS_ROUTE
+	help
+	  This option adds a `realm' match, which allows you to use the realm
+	  key from the routing subsystem inside iptables.
+	
+	  This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option 
+	  in tc world.
+	
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_SCTP
+	tristate  '"sctp" protocol match support'
+	depends on NETFILTER_XTABLES
+	help
+	  With this option enabled, you will be able to use the 
+	  `sctp' match in order to match on SCTP source/destination ports
+	  and SCTP chunk types.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_STATE
+	tristate '"state" match support'
+	depends on NETFILTER_XTABLES
+	depends on IP_NF_CONNTRACK || NF_CONNTRACK
+	help
+	  Connection state matching allows you to match packets based on their
+	  relationship to a tracked connection (ie. previous packets).  This
+	  is a powerful tool for packet classification.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_STRING
+	tristate  '"string" match support'
+	depends on NETFILTER_XTABLES
+	select TEXTSEARCH
+	select TEXTSEARCH_KMP
+	select TEXTSEARCH_BM
+	select TEXTSEARCH_FSM
+	help
+	  This option adds a `string' match, which allows you to look for
+	  pattern matchings in packets.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_TCPMSS
+	tristate '"tcpmss" match support'
+	depends on NETFILTER_XTABLES
+	help
+	  This option adds a `tcpmss' match, which allows you to examine the
+	  MSS value of TCP SYN packets, which control the maximum packet size
+	  for that connection.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index cb2183145c37..746172ebc91b 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -1,4 +1,5 @@
 netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o
+nf_conntrack-objs	:= nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o
 
 obj-$(CONFIG_NETFILTER) = netfilter.o
 
@@ -6,13 +7,43 @@ obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
 obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
 obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
 
-nf_conntrack-objs	:= nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o
-
+# connection tracking
 obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
-obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o
 
 # SCTP protocol connection tracking
 obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
 
 # netlink interface for nf_conntrack
 obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o
+
+# connection tracking helpers
+obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o
+
+# generic X tables 
+obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
+
+# targets
+obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o
+
+# matches
+obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CONNMARK) += xt_connmark.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_MARK) += xt_mark.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 3531d142f693..617599aeeead 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -821,7 +821,7 @@ module_exit(fini);
 
 /* Some modules need us, but don't depend directly on any symbol.
    They should call this. */
-void need_nf_conntrack(void)
+void need_conntrack(void)
 {
 }
 
@@ -841,7 +841,7 @@ EXPORT_SYMBOL(nf_conntrack_protocol_unregister);
 EXPORT_SYMBOL(nf_ct_invert_tuplepr);
 EXPORT_SYMBOL(nf_conntrack_alter_reply);
 EXPORT_SYMBOL(nf_conntrack_destroyed);
-EXPORT_SYMBOL(need_nf_conntrack);
+EXPORT_SYMBOL(need_conntrack);
 EXPORT_SYMBOL(nf_conntrack_helper_register);
 EXPORT_SYMBOL(nf_conntrack_helper_unregister);
 EXPORT_SYMBOL(nf_ct_iterate_cleanup);
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
new file mode 100644
index 000000000000..d7817afc6b96
--- /dev/null
+++ b/net/netfilter/x_tables.c
@@ -0,0 +1,624 @@
+/*
+ * x_tables core - Backend for {ip,ip6,arp}_tables
+ *
+ * Copyright (C) 2006-2006 Harald Welte <laforge@netfilter.org>
+ *
+ * Based on existing ip_tables code which is
+ *   Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ *   Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_arp.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("[ip,ip6,arp]_tables backend module");
+
+#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
+
+struct xt_af {
+	struct semaphore mutex;
+	struct list_head match;
+	struct list_head target;
+	struct list_head tables;
+};
+
+static struct xt_af *xt;
+
+#ifdef DEBUG_IP_FIREWALL_USER
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+enum {
+	TABLE,
+	TARGET,
+	MATCH,
+};
+
+/* Registration hooks for targets. */
+int
+xt_register_target(int af, struct xt_target *target)
+{
+	int ret;
+
+	ret = down_interruptible(&xt[af].mutex);
+	if (ret != 0)
+		return ret;
+	list_add(&target->list, &xt[af].target);
+	up(&xt[af].mutex);
+	return ret;
+}
+EXPORT_SYMBOL(xt_register_target);
+
+void
+xt_unregister_target(int af, struct xt_target *target)
+{
+	down(&xt[af].mutex);
+	LIST_DELETE(&xt[af].target, target);
+	up(&xt[af].mutex);
+}
+EXPORT_SYMBOL(xt_unregister_target);
+
+int
+xt_register_match(int af, struct xt_match *match)
+{
+	int ret;
+
+	ret = down_interruptible(&xt[af].mutex);
+	if (ret != 0)
+		return ret;
+
+	list_add(&match->list, &xt[af].match);
+	up(&xt[af].mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL(xt_register_match);
+
+void
+xt_unregister_match(int af, struct xt_match *match)
+{
+	down(&xt[af].mutex);
+	LIST_DELETE(&xt[af].match, match);
+	up(&xt[af].mutex);
+}
+EXPORT_SYMBOL(xt_unregister_match);
+
+
+/*
+ * These are weird, but module loading must not be done with mutex
+ * held (since they will register), and we have to have a single
+ * function to use try_then_request_module().
+ */
+
+/* Find match, grabs ref.  Returns ERR_PTR() on error. */
+struct xt_match *xt_find_match(int af, const char *name, u8 revision)
+{
+	struct xt_match *m;
+	int err = 0;
+
+	if (down_interruptible(&xt[af].mutex) != 0)
+		return ERR_PTR(-EINTR);
+
+	list_for_each_entry(m, &xt[af].match, list) {
+		if (strcmp(m->name, name) == 0) {
+			if (m->revision == revision) {
+				if (try_module_get(m->me)) {
+					up(&xt[af].mutex);
+					return m;
+				}
+			} else
+				err = -EPROTOTYPE; /* Found something. */
+		}
+	}
+	up(&xt[af].mutex);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(xt_find_match);
+
+/* Find target, grabs ref.  Returns ERR_PTR() on error. */
+struct xt_target *xt_find_target(int af, const char *name, u8 revision)
+{
+	struct xt_target *t;
+	int err = 0;
+
+	if (down_interruptible(&xt[af].mutex) != 0)
+		return ERR_PTR(-EINTR);
+
+	list_for_each_entry(t, &xt[af].target, list) {
+		if (strcmp(t->name, name) == 0) {
+			if (t->revision == revision) {
+				if (try_module_get(t->me)) {
+					up(&xt[af].mutex);
+					return t;
+				}
+			} else
+				err = -EPROTOTYPE; /* Found something. */
+		}
+	}
+	up(&xt[af].mutex);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(xt_find_target);
+
+static const char *xt_prefix[NPROTO] = {
+	[AF_INET] 	= "ipt_%s",
+	[AF_INET6] 	= "ip6t_%s",
+	[NF_ARP]	= "arpt_%s",
+};
+
+struct xt_target *xt_request_find_target(int af, const char *name, u8 revision)
+{
+	struct xt_target *target;
+
+	target = try_then_request_module(xt_find_target(af, name, revision),
+					 xt_prefix[af], name);
+	if (IS_ERR(target) || !target)
+		return NULL;
+	return target;
+}
+EXPORT_SYMBOL_GPL(xt_request_find_target);
+
+static int match_revfn(int af, const char *name, u8 revision, int *bestp)
+{
+	struct xt_match *m;
+	int have_rev = 0;
+
+	list_for_each_entry(m, &xt[af].match, list) {
+		if (strcmp(m->name, name) == 0) {
+			if (m->revision > *bestp)
+				*bestp = m->revision;
+			if (m->revision == revision)
+				have_rev = 1;
+		}
+	}
+	return have_rev;
+}
+
+static int target_revfn(int af, const char *name, u8 revision, int *bestp)
+{
+	struct xt_target *t;
+	int have_rev = 0;
+
+	list_for_each_entry(t, &xt[af].target, list) {
+		if (strcmp(t->name, name) == 0) {
+			if (t->revision > *bestp)
+				*bestp = t->revision;
+			if (t->revision == revision)
+				have_rev = 1;
+		}
+	}
+	return have_rev;
+}
+
+/* Returns true or false (if no such extension at all) */
+int xt_find_revision(int af, const char *name, u8 revision, int target,
+		     int *err)
+{
+	int have_rev, best = -1;
+
+	if (down_interruptible(&xt[af].mutex) != 0) {
+		*err = -EINTR;
+		return 1;
+	}
+	if (target == 1)
+		have_rev = target_revfn(af, name, revision, &best);
+	else
+		have_rev = match_revfn(af, name, revision, &best);
+	up(&xt[af].mutex);
+
+	/* Nothing at all?  Return 0 to try loading module. */
+	if (best == -1) {
+		*err = -ENOENT;
+		return 0;
+	}
+
+	*err = best;
+	if (!have_rev)
+		*err = -EPROTONOSUPPORT;
+	return 1;
+}
+EXPORT_SYMBOL_GPL(xt_find_revision);
+
+struct xt_table_info *xt_alloc_table_info(unsigned int size)
+{
+	struct xt_table_info *newinfo;
+	int cpu;
+
+	/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
+	if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > num_physpages)
+		return NULL;
+
+	newinfo = kzalloc(sizeof(struct xt_table_info), GFP_KERNEL);
+	if (!newinfo)
+		return NULL;
+
+	newinfo->size = size;
+
+	for_each_cpu(cpu) {
+		if (size <= PAGE_SIZE)
+			newinfo->entries[cpu] = kmalloc_node(size,
+							GFP_KERNEL,
+							cpu_to_node(cpu));
+		else
+			newinfo->entries[cpu] = vmalloc_node(size,
+							cpu_to_node(cpu));
+
+		if (newinfo->entries[cpu] == NULL) {
+			xt_free_table_info(newinfo);
+			return NULL;
+		}
+	}
+
+	return newinfo;
+}
+EXPORT_SYMBOL(xt_alloc_table_info);
+
+void xt_free_table_info(struct xt_table_info *info)
+{
+	int cpu;
+
+	for_each_cpu(cpu) {
+		if (info->size <= PAGE_SIZE)
+			kfree(info->entries[cpu]);
+		else
+			vfree(info->entries[cpu]);
+	}
+	kfree(info);
+}
+EXPORT_SYMBOL(xt_free_table_info);
+
+/* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
+struct xt_table *xt_find_table_lock(int af, const char *name)
+{
+	struct xt_table *t;
+
+	if (down_interruptible(&xt[af].mutex) != 0)
+		return ERR_PTR(-EINTR);
+
+	list_for_each_entry(t, &xt[af].tables, list)
+		if (strcmp(t->name, name) == 0 && try_module_get(t->me))
+			return t;
+	up(&xt[af].mutex);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(xt_find_table_lock);
+
+void xt_table_unlock(struct xt_table *table)
+{
+	up(&xt[table->af].mutex);
+}
+EXPORT_SYMBOL_GPL(xt_table_unlock);
+
+
+struct xt_table_info *
+xt_replace_table(struct xt_table *table,
+	      unsigned int num_counters,
+	      struct xt_table_info *newinfo,
+	      int *error)
+{
+	struct xt_table_info *oldinfo, *private;
+
+	/* Do the substitution. */
+	write_lock_bh(&table->lock);
+	private = table->private;
+	/* Check inside lock: is the old number correct? */
+	if (num_counters != private->number) {
+		duprintf("num_counters != table->private->number (%u/%u)\n",
+			 num_counters, private->number);
+		write_unlock_bh(&table->lock);
+		*error = -EAGAIN;
+		return NULL;
+	}
+	oldinfo = private;
+	table->private = newinfo;
+	newinfo->initial_entries = oldinfo->initial_entries;
+	write_unlock_bh(&table->lock);
+
+	return oldinfo;
+}
+EXPORT_SYMBOL_GPL(xt_replace_table);
+
+int xt_register_table(struct xt_table *table,
+		      struct xt_table_info *bootstrap,
+		      struct xt_table_info *newinfo)
+{
+	int ret;
+	struct xt_table_info *private;
+
+	ret = down_interruptible(&xt[table->af].mutex);
+	if (ret != 0)
+		return ret;
+
+	/* Don't autoload: we'd eat our tail... */
+	if (list_named_find(&xt[table->af].tables, table->name)) {
+		ret = -EEXIST;
+		goto unlock;
+	}
+
+	/* Simplifies replace_table code. */
+	table->private = bootstrap;
+	if (!xt_replace_table(table, 0, newinfo, &ret))
+		goto unlock;
+
+	private = table->private;
+	duprintf("table->private->number = %u\n", private->number);
+
+	/* save number of initial entries */
+	private->initial_entries = private->number;
+
+	rwlock_init(&table->lock);
+	list_prepend(&xt[table->af].tables, table);
+
+	ret = 0;
+ unlock:
+	up(&xt[table->af].mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(xt_register_table);
+
+void *xt_unregister_table(struct xt_table *table)
+{
+	struct xt_table_info *private;
+
+	down(&xt[table->af].mutex);
+	private = table->private;
+	LIST_DELETE(&xt[table->af].tables, table);
+	up(&xt[table->af].mutex);
+
+	return private;
+}
+EXPORT_SYMBOL_GPL(xt_unregister_table);
+
+#ifdef CONFIG_PROC_FS
+static char *xt_proto_prefix[NPROTO] = {
+	[AF_INET]	= "ip",
+	[AF_INET6]	= "ip6",
+	[NF_ARP]	= "arp",
+};
+
+static struct list_head *xt_get_idx(struct list_head *list, struct seq_file *seq, loff_t pos)
+{
+	struct list_head *head = list->next;
+
+	if (!head || list_empty(list))
+		return NULL;
+
+	while (pos && (head = head->next)) {
+		if (head == list)
+			return NULL;
+		pos--;
+	}
+	return pos ? NULL : head;
+}
+
+static struct list_head *type2list(u_int16_t af, u_int16_t type)
+{
+	struct list_head *list;
+
+	switch (type) {
+	case TARGET:
+		list = &xt[af].target;
+		break;
+	case MATCH:
+		list = &xt[af].match;
+		break;
+	case TABLE:
+		list = &xt[af].tables;
+		break;
+	default:
+		list = NULL;
+		break;
+	}
+
+	return list;
+}
+
+static void *xt_tgt_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct proc_dir_entry *pde = (struct proc_dir_entry *) seq->private;
+	u_int16_t af = (unsigned long)pde->data & 0xffff;
+	u_int16_t type = (unsigned long)pde->data >> 16;
+	struct list_head *list;
+
+	if (af >= NPROTO)
+		return NULL;
+
+	list = type2list(af, type);
+	if (!list)
+		return NULL;
+
+	if (down_interruptible(&xt[af].mutex) != 0)
+		return NULL;
+	
+	return xt_get_idx(list, seq, *pos);
+}
+
+static void *xt_tgt_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct proc_dir_entry *pde = seq->private;
+	u_int16_t af = (unsigned long)pde->data & 0xffff;
+	u_int16_t type = (unsigned long)pde->data >> 16;
+	struct list_head *list;
+
+	if (af >= NPROTO)
+		return NULL;
+	
+	list = type2list(af, type);
+	if (!list)
+		return NULL;
+
+	(*pos)++;
+	return xt_get_idx(list, seq, *pos);
+}
+
+static void xt_tgt_seq_stop(struct seq_file *seq, void *v)
+{
+	struct proc_dir_entry *pde = seq->private;
+	u_int16_t af = (unsigned long)pde->data & 0xffff;
+
+	up(&xt[af].mutex);
+}
+
+static int xt_name_seq_show(struct seq_file *seq, void *v)
+{
+	char *name = (char *)v + sizeof(struct list_head);
+
+	if (strlen(name))
+		return seq_printf(seq, "%s\n", name);
+	else
+		return 0;
+}
+
+static struct seq_operations xt_tgt_seq_ops = {
+	.start	= xt_tgt_seq_start,
+	.next	= xt_tgt_seq_next,
+	.stop	= xt_tgt_seq_stop,
+	.show	= xt_name_seq_show,
+};
+
+static int xt_tgt_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open(file, &xt_tgt_seq_ops);
+	if (!ret) {
+		struct seq_file *seq = file->private_data;
+		struct proc_dir_entry *pde = PDE(inode);
+
+		seq->private = pde;
+	}
+
+	return ret;
+}
+
+static struct file_operations xt_file_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = xt_tgt_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release,
+};
+
+#define FORMAT_TABLES	"_tables_names"
+#define	FORMAT_MATCHES	"_tables_matches"
+#define FORMAT_TARGETS 	"_tables_targets"
+
+#endif /* CONFIG_PROC_FS */
+
+int xt_proto_init(int af)
+{
+#ifdef CONFIG_PROC_FS
+	char buf[XT_FUNCTION_MAXNAMELEN];
+	struct proc_dir_entry *proc;
+#endif
+
+	if (af >= NPROTO)
+		return -EINVAL;
+
+
+#ifdef CONFIG_PROC_FS
+	strlcpy(buf, xt_proto_prefix[af], sizeof(buf));
+	strlcat(buf, FORMAT_TABLES, sizeof(buf));
+	proc = proc_net_fops_create(buf, 0440, &xt_file_ops);
+	if (!proc)
+		goto out;
+	proc->data = (void *) ((unsigned long) af | (TABLE << 16));
+
+
+	strlcpy(buf, xt_proto_prefix[af], sizeof(buf));
+	strlcat(buf, FORMAT_MATCHES, sizeof(buf));
+	proc = proc_net_fops_create(buf, 0440, &xt_file_ops);
+	if (!proc)
+		goto out_remove_tables;
+	proc->data = (void *) ((unsigned long) af | (MATCH << 16));
+
+	strlcpy(buf, xt_proto_prefix[af], sizeof(buf));
+	strlcat(buf, FORMAT_TARGETS, sizeof(buf));
+	proc = proc_net_fops_create(buf, 0440, &xt_file_ops);
+	if (!proc)
+		goto out_remove_matches;
+	proc->data = (void *) ((unsigned long) af | (TARGET << 16));
+#endif
+
+	return 0;
+
+#ifdef CONFIG_PROC_FS
+out_remove_matches:
+	strlcpy(buf, xt_proto_prefix[af], sizeof(buf));
+	strlcat(buf, FORMAT_MATCHES, sizeof(buf));
+	proc_net_remove(buf);
+
+out_remove_tables:
+	strlcpy(buf, xt_proto_prefix[af], sizeof(buf));
+	strlcat(buf, FORMAT_TABLES, sizeof(buf));
+	proc_net_remove(buf);
+out:
+	return -1;
+#endif
+}
+EXPORT_SYMBOL_GPL(xt_proto_init);
+
+void xt_proto_fini(int af)
+{
+#ifdef CONFIG_PROC_FS
+	char buf[XT_FUNCTION_MAXNAMELEN];
+
+	strlcpy(buf, xt_proto_prefix[af], sizeof(buf));
+	strlcat(buf, FORMAT_TABLES, sizeof(buf));
+	proc_net_remove(buf);
+
+	strlcpy(buf, xt_proto_prefix[af], sizeof(buf));
+	strlcat(buf, FORMAT_TARGETS, sizeof(buf));
+	proc_net_remove(buf);
+
+	strlcpy(buf, xt_proto_prefix[af], sizeof(buf));
+	strlcat(buf, FORMAT_MATCHES, sizeof(buf));
+	proc_net_remove(buf);
+#endif /*CONFIG_PROC_FS*/
+}
+EXPORT_SYMBOL_GPL(xt_proto_fini);
+
+
+static int __init xt_init(void)
+{
+	int i;
+
+	xt = kmalloc(sizeof(struct xt_af) * NPROTO, GFP_KERNEL);
+	if (!xt)
+		return -ENOMEM;
+
+	for (i = 0; i < NPROTO; i++) {
+		init_MUTEX(&xt[i].mutex);
+		INIT_LIST_HEAD(&xt[i].target);
+		INIT_LIST_HEAD(&xt[i].match);
+		INIT_LIST_HEAD(&xt[i].tables);
+	}
+	return 0;
+}
+
+static void __exit xt_fini(void)
+{
+	kfree(xt);
+}
+
+module_init(xt_init);
+module_exit(xt_fini);
+
diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c
new file mode 100644
index 000000000000..78ee266a12ee
--- /dev/null
+++ b/net/netfilter/xt_CLASSIFY.c
@@ -0,0 +1,109 @@
+/*
+ * This is a module which is used for setting the skb->priority field
+ * of an skb for qdisc classification.
+ */
+
+/* (C) 2001-2002 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CLASSIFY.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("iptables qdisc classification target module");
+MODULE_ALIAS("ipt_CLASSIFY");
+
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct xt_classify_target_info *clinfo = targinfo;
+
+	if ((*pskb)->priority != clinfo->priority)
+		(*pskb)->priority = clinfo->priority;
+
+	return XT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+           const void *e,
+           void *targinfo,
+           unsigned int targinfosize,
+           unsigned int hook_mask)
+{
+	if (targinfosize != XT_ALIGN(sizeof(struct xt_classify_target_info))){
+		printk(KERN_ERR "CLASSIFY: invalid size (%u != %Zu).\n",
+		       targinfosize,
+		       XT_ALIGN(sizeof(struct xt_classify_target_info)));
+		return 0;
+	}
+	
+	if (hook_mask & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_FORWARD) |
+	                  (1 << NF_IP_POST_ROUTING))) {
+		printk(KERN_ERR "CLASSIFY: only valid in LOCAL_OUT, FORWARD "
+		                "and POST_ROUTING.\n");
+		return 0;
+	}
+
+	if (strcmp(tablename, "mangle") != 0) {
+		printk(KERN_ERR "CLASSIFY: can only be called from "
+		                "\"mangle\" table, not \"%s\".\n",
+		                tablename);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct xt_target classify_reg = { 
+	.name 		= "CLASSIFY", 
+	.target 	= target,
+	.checkentry	= checkentry,
+	.me 		= THIS_MODULE,
+};
+static struct xt_target classify6_reg = { 
+	.name 		= "CLASSIFY", 
+	.target 	= target,
+	.checkentry	= checkentry,
+	.me 		= THIS_MODULE,
+};
+
+
+static int __init init(void)
+{
+	int ret;
+
+	ret = xt_register_target(AF_INET, &classify_reg);
+	if (ret)
+		return ret;
+
+	ret = xt_register_target(AF_INET6, &classify6_reg);
+	if (ret)
+		xt_unregister_target(AF_INET, &classify_reg);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_target(AF_INET, &classify_reg);
+	xt_unregister_target(AF_INET6, &classify6_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c
new file mode 100644
index 000000000000..22506e376be5
--- /dev/null
+++ b/net/netfilter/xt_CONNMARK.c
@@ -0,0 +1,141 @@
+/* This kernel module is used to modify the connection mark values, or
+ * to optionally restore the skb nfmark from the connection mark
+ *
+ * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ * by Henrik Nordstrom <hno@marasystems.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>");
+MODULE_DESCRIPTION("IP tables CONNMARK matching module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_CONNMARK");
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CONNMARK.h>
+#include <net/netfilter/nf_conntrack_compat.h>
+
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct xt_connmark_target_info *markinfo = targinfo;
+	u_int32_t diff;
+	u_int32_t nfmark;
+	u_int32_t newmark;
+	u_int32_t ctinfo;
+	u_int32_t *ctmark = nf_ct_get_mark(*pskb, &ctinfo);
+
+	if (ctmark) {
+	    switch(markinfo->mode) {
+	    case XT_CONNMARK_SET:
+		newmark = (*ctmark & ~markinfo->mask) | markinfo->mark;
+		if (newmark != *ctmark)
+		    *ctmark = newmark;
+		break;
+	    case XT_CONNMARK_SAVE:
+		newmark = (*ctmark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask);
+		if (*ctmark != newmark)
+		    *ctmark = newmark;
+		break;
+	    case XT_CONNMARK_RESTORE:
+		nfmark = (*pskb)->nfmark;
+		diff = (*ctmark ^ nfmark) & markinfo->mask;
+		if (diff != 0)
+		    (*pskb)->nfmark = nfmark ^ diff;
+		break;
+	    }
+	}
+
+	return XT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+	   const void *entry,
+	   void *targinfo,
+	   unsigned int targinfosize,
+	   unsigned int hook_mask)
+{
+	struct xt_connmark_target_info *matchinfo = targinfo;
+	if (targinfosize != XT_ALIGN(sizeof(struct xt_connmark_target_info))) {
+		printk(KERN_WARNING "CONNMARK: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       XT_ALIGN(sizeof(struct xt_connmark_target_info)));
+		return 0;
+	}
+
+	if (matchinfo->mode == XT_CONNMARK_RESTORE) {
+	    if (strcmp(tablename, "mangle") != 0) {
+		    printk(KERN_WARNING "CONNMARK: restore can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+		    return 0;
+	    }
+	}
+
+	if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) {
+		printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct xt_target connmark_reg = {
+	.name = "CONNMARK",
+	.target = &target,
+	.checkentry = &checkentry,
+	.me = THIS_MODULE
+};
+static struct xt_target connmark6_reg = {
+	.name = "CONNMARK",
+	.target = &target,
+	.checkentry = &checkentry,
+	.me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+	int ret;
+
+	need_conntrack();
+
+	ret = xt_register_target(AF_INET, &connmark_reg);
+	if (ret)
+		return ret;
+
+	ret = xt_register_target(AF_INET6, &connmark6_reg);
+	if (ret)
+		xt_unregister_target(AF_INET, &connmark_reg);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_target(AF_INET, &connmark_reg);
+	xt_unregister_target(AF_INET6, &connmark6_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c
new file mode 100644
index 000000000000..0c11ee9550f3
--- /dev/null
+++ b/net/netfilter/xt_MARK.c
@@ -0,0 +1,191 @@
+/* This is a module which is used for setting the NFMARK field of an skb. */
+
+/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_MARK.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("ip[6]tables MARK modification module");
+MODULE_ALIAS("ipt_MARK");
+MODULE_ALIAS("ip6t_MARK");
+
+static unsigned int
+target_v0(struct sk_buff **pskb,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  unsigned int hooknum,
+	  const void *targinfo,
+	  void *userinfo)
+{
+	const struct xt_mark_target_info *markinfo = targinfo;
+
+	if((*pskb)->nfmark != markinfo->mark)
+		(*pskb)->nfmark = markinfo->mark;
+
+	return XT_CONTINUE;
+}
+
+static unsigned int
+target_v1(struct sk_buff **pskb,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  unsigned int hooknum,
+	  const void *targinfo,
+	  void *userinfo)
+{
+	const struct xt_mark_target_info_v1 *markinfo = targinfo;
+	int mark = 0;
+
+	switch (markinfo->mode) {
+	case XT_MARK_SET:
+		mark = markinfo->mark;
+		break;
+		
+	case XT_MARK_AND:
+		mark = (*pskb)->nfmark & markinfo->mark;
+		break;
+		
+	case XT_MARK_OR:
+		mark = (*pskb)->nfmark | markinfo->mark;
+		break;
+	}
+
+	if((*pskb)->nfmark != mark)
+		(*pskb)->nfmark = mark;
+
+	return XT_CONTINUE;
+}
+
+
+static int
+checkentry_v0(const char *tablename,
+	      const void *entry,
+	      void *targinfo,
+	      unsigned int targinfosize,
+	      unsigned int hook_mask)
+{
+	struct xt_mark_target_info *markinfo = targinfo;
+
+	if (targinfosize != XT_ALIGN(sizeof(struct xt_mark_target_info))) {
+		printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       XT_ALIGN(sizeof(struct xt_mark_target_info)));
+		return 0;
+	}
+
+	if (strcmp(tablename, "mangle") != 0) {
+		printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+		return 0;
+	}
+
+	if (markinfo->mark > 0xffffffff) {
+		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static int
+checkentry_v1(const char *tablename,
+	      const void *entry,
+	      void *targinfo,
+	      unsigned int targinfosize,
+	      unsigned int hook_mask)
+{
+	struct xt_mark_target_info_v1 *markinfo = targinfo;
+
+	if (targinfosize != XT_ALIGN(sizeof(struct xt_mark_target_info_v1))){
+		printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       XT_ALIGN(sizeof(struct xt_mark_target_info_v1)));
+		return 0;
+	}
+
+	if (strcmp(tablename, "mangle") != 0) {
+		printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+		return 0;
+	}
+
+	if (markinfo->mode != XT_MARK_SET
+	    && markinfo->mode != XT_MARK_AND
+	    && markinfo->mode != XT_MARK_OR) {
+		printk(KERN_WARNING "MARK: unknown mode %u\n",
+		       markinfo->mode);
+		return 0;
+	}
+
+	if (markinfo->mark > 0xffffffff) {
+		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct xt_target ipt_mark_reg_v0 = {
+	.name		= "MARK",
+	.target		= target_v0,
+	.checkentry	= checkentry_v0,
+	.me		= THIS_MODULE,
+	.revision	= 0,
+};
+
+static struct xt_target ipt_mark_reg_v1 = {
+	.name		= "MARK",
+	.target		= target_v1,
+	.checkentry	= checkentry_v1,
+	.me		= THIS_MODULE,
+	.revision	= 1,
+};
+
+static struct xt_target ip6t_mark_reg_v0 = {
+	.name		= "MARK",
+	.target		= target_v0,
+	.checkentry	= checkentry_v0,
+	.me		= THIS_MODULE,
+	.revision	= 0,
+};
+
+static int __init init(void)
+{
+	int err;
+
+	err = xt_register_target(AF_INET, &ipt_mark_reg_v0);
+	if (err)
+		return err;
+
+	err = xt_register_target(AF_INET, &ipt_mark_reg_v1);
+	if (err)
+		xt_unregister_target(AF_INET, &ipt_mark_reg_v0);
+
+	err = xt_register_target(AF_INET6, &ip6t_mark_reg_v0);
+	if (err) {
+		xt_unregister_target(AF_INET, &ipt_mark_reg_v0);
+		xt_unregister_target(AF_INET, &ipt_mark_reg_v1);
+	}
+
+	return err;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_target(AF_INET, &ipt_mark_reg_v0);
+	xt_unregister_target(AF_INET, &ipt_mark_reg_v1);
+	xt_unregister_target(AF_INET6, &ip6t_mark_reg_v0);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
new file mode 100644
index 000000000000..8b76b6f8d1e4
--- /dev/null
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -0,0 +1,107 @@
+/* iptables module for using new netfilter netlink queue
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as 
+ * published by the Free Software Foundation.
+ * 
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_arp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_NFQUEUE.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("[ip,ip6,arp]_tables NFQUEUE target");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_NFQUEUE");
+MODULE_ALIAS("ip6t_NFQUEUE");
+MODULE_ALIAS("arpt_NFQUEUE");
+
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct xt_NFQ_info *tinfo = targinfo;
+
+	return NF_QUEUE_NR(tinfo->queuenum);
+}
+
+static int
+checkentry(const char *tablename,
+	   const void *entry,
+           void *targinfo,
+           unsigned int targinfosize,
+           unsigned int hook_mask)
+{
+	if (targinfosize != XT_ALIGN(sizeof(struct xt_NFQ_info))) {
+		printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       XT_ALIGN(sizeof(struct xt_NFQ_info)));
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct xt_target ipt_NFQ_reg = {
+	.name		= "NFQUEUE",
+	.target		= target,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static struct xt_target ip6t_NFQ_reg = {
+	.name		= "NFQUEUE",
+	.target		= target,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static struct xt_target arpt_NFQ_reg = {
+	.name		= "NFQUEUE",
+	.target		= target,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	int ret;
+	ret = xt_register_target(AF_INET, &ipt_NFQ_reg);
+	if (ret)
+		return ret;
+	ret = xt_register_target(AF_INET6, &ip6t_NFQ_reg);
+	if (ret)
+		goto out_ip;
+	ret = xt_register_target(NF_ARP, &arpt_NFQ_reg);
+	if (ret)
+		goto out_ip6;
+
+	return ret;
+out_ip6:
+	xt_unregister_target(AF_INET6, &ip6t_NFQ_reg);
+out_ip:
+	xt_unregister_target(AF_INET, &ipt_NFQ_reg);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_target(NF_ARP, &arpt_NFQ_reg);
+	xt_unregister_target(AF_INET6, &ip6t_NFQ_reg);
+	xt_unregister_target(AF_INET, &ipt_NFQ_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c
new file mode 100644
index 000000000000..24d477afa939
--- /dev/null
+++ b/net/netfilter/xt_NOTRACK.c
@@ -0,0 +1,92 @@
+/* This is a module which is used for setting up fake conntracks
+ * on packets so that they are not seen by the conntrack/NAT code.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_conntrack_compat.h>
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_NOTRACK");
+
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	/* Previously seen (loopback)? Ignore. */
+	if ((*pskb)->nfct != NULL)
+		return XT_CONTINUE;
+
+	/* Attach fake conntrack entry. 
+	   If there is a real ct entry correspondig to this packet, 
+	   it'll hang aroun till timing out. We don't deal with it
+	   for performance reasons. JK */
+	nf_ct_untrack(*pskb);
+	(*pskb)->nfctinfo = IP_CT_NEW;
+	nf_conntrack_get((*pskb)->nfct);
+
+	return XT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+	   const void *entry,
+           void *targinfo,
+           unsigned int targinfosize,
+           unsigned int hook_mask)
+{
+	if (targinfosize != 0) {
+		printk(KERN_WARNING "NOTRACK: targinfosize %u != 0\n",
+		       targinfosize);
+		return 0;
+	}
+
+	if (strcmp(tablename, "raw") != 0) {
+		printk(KERN_WARNING "NOTRACK: can only be called from \"raw\" table, not \"%s\"\n", tablename);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct xt_target notrack_reg = { 
+	.name = "NOTRACK", 
+	.target = target, 
+	.checkentry = checkentry,
+	.me = THIS_MODULE,
+};
+static struct xt_target notrack6_reg = { 
+	.name = "NOTRACK", 
+	.target = target, 
+	.checkentry = checkentry,
+	.me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	int ret;
+
+	ret = xt_register_target(AF_INET, &notrack_reg);
+	if (ret)
+		return ret;
+
+	ret = xt_register_target(AF_INET6, &notrack6_reg);
+	if (ret)
+		xt_unregister_target(AF_INET, &notrack_reg);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_target(AF_INET6, &notrack6_reg);
+	xt_unregister_target(AF_INET, &notrack_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c
new file mode 100644
index 000000000000..4ba6fd65c6e9
--- /dev/null
+++ b/net/netfilter/xt_comment.c
@@ -0,0 +1,80 @@
+/*
+ * Implements a dummy match to allow attaching comments to rules
+ *
+ * 2003-05-13 Brad Fisher (brad@info-link.net)
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_comment.h>
+
+MODULE_AUTHOR("Brad Fisher <brad@info-link.net>");
+MODULE_DESCRIPTION("iptables comment match module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_comment");
+MODULE_ALIAS("ip6t_comment");
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      unsigned int protooff,
+      int *hotdrop)
+{
+	/* We always match */
+	return 1;
+}
+
+static int
+checkentry(const char *tablename,
+           const void *ip,
+           void *matchinfo,
+           unsigned int matchsize,
+           unsigned int hook_mask)
+{
+	/* Check the size */
+	if (matchsize != XT_ALIGN(sizeof(struct xt_comment_info)))
+		return 0;
+	return 1;
+}
+
+static struct xt_match comment_match = {
+	.name		= "comment",
+	.match		= match,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE
+};
+
+static struct xt_match comment6_match = {
+	.name		= "comment",
+	.match		= match,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	int ret;
+
+	ret = xt_register_match(AF_INET, &comment_match);
+	if (ret)
+		return ret;
+
+	ret = xt_register_match(AF_INET6, &comment6_match);
+	if (ret)
+		xt_unregister_match(AF_INET, &comment_match);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET, &comment_match);
+	xt_unregister_match(AF_INET6, &comment6_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
new file mode 100644
index 000000000000..150d2a4b0f71
--- /dev/null
+++ b/net/netfilter/xt_connbytes.c
@@ -0,0 +1,180 @@
+/* Kernel module to match connection tracking byte counter.
+ * GPL (C) 2002 Martin Devera (devik@cdi.cz).
+ *
+ * 2004-07-20 Harald Welte <laforge@netfilter.org>
+ * 	- reimplemented to use per-connection accounting counters
+ * 	- add functionality to match number of packets
+ * 	- add functionality to match average packet size
+ * 	- add support to match directions seperately
+ * 2005-10-16 Harald Welte <laforge@netfilter.org>
+ * 	- Port to x_tables
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/netfilter/nf_conntrack_compat.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_connbytes.h>
+
+#include <asm/div64.h>
+#include <asm/bitops.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection");
+MODULE_ALIAS("ipt_connbytes");
+
+/* 64bit divisor, dividend and result. dynamic precision */
+static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
+{
+	u_int32_t d = divisor;
+
+	if (divisor > 0xffffffffULL) {
+		unsigned int shift = fls(divisor >> 32);
+
+		d = divisor >> shift;
+		dividend >>= shift;
+	}
+
+	do_div(dividend, d);
+	return dividend;
+}
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      unsigned int protoff,
+      int *hotdrop)
+{
+	const struct xt_connbytes_info *sinfo = matchinfo;
+	u_int64_t what = 0;	/* initialize to make gcc happy */
+	const struct ip_conntrack_counter *counters;
+
+	if (!(counters = nf_ct_get_counters(skb)))
+		return 0; /* no match */
+
+	switch (sinfo->what) {
+	case XT_CONNBYTES_PKTS:
+		switch (sinfo->direction) {
+		case XT_CONNBYTES_DIR_ORIGINAL:
+			what = counters[IP_CT_DIR_ORIGINAL].packets;
+			break;
+		case XT_CONNBYTES_DIR_REPLY:
+			what = counters[IP_CT_DIR_REPLY].packets;
+			break;
+		case XT_CONNBYTES_DIR_BOTH:
+			what = counters[IP_CT_DIR_ORIGINAL].packets;
+			what += counters[IP_CT_DIR_REPLY].packets;
+			break;
+		}
+		break;
+	case XT_CONNBYTES_BYTES:
+		switch (sinfo->direction) {
+		case XT_CONNBYTES_DIR_ORIGINAL:
+			what = counters[IP_CT_DIR_ORIGINAL].bytes;
+			break;
+		case XT_CONNBYTES_DIR_REPLY:
+			what = counters[IP_CT_DIR_REPLY].bytes;
+			break;
+		case XT_CONNBYTES_DIR_BOTH:
+			what = counters[IP_CT_DIR_ORIGINAL].bytes;
+			what += counters[IP_CT_DIR_REPLY].bytes;
+			break;
+		}
+		break;
+	case XT_CONNBYTES_AVGPKT:
+		switch (sinfo->direction) {
+		case XT_CONNBYTES_DIR_ORIGINAL:
+			what = div64_64(counters[IP_CT_DIR_ORIGINAL].bytes,
+					counters[IP_CT_DIR_ORIGINAL].packets);
+			break;
+		case XT_CONNBYTES_DIR_REPLY:
+			what = div64_64(counters[IP_CT_DIR_REPLY].bytes,
+					counters[IP_CT_DIR_REPLY].packets);
+			break;
+		case XT_CONNBYTES_DIR_BOTH:
+			{
+				u_int64_t bytes;
+				u_int64_t pkts;
+				bytes = counters[IP_CT_DIR_ORIGINAL].bytes +
+					counters[IP_CT_DIR_REPLY].bytes;
+				pkts = counters[IP_CT_DIR_ORIGINAL].packets+
+					counters[IP_CT_DIR_REPLY].packets;
+
+				/* FIXME_THEORETICAL: what to do if sum
+				 * overflows ? */
+
+				what = div64_64(bytes, pkts);
+			}
+			break;
+		}
+		break;
+	}
+
+	if (sinfo->count.to)
+		return (what <= sinfo->count.to && what >= sinfo->count.from);
+	else
+		return (what >= sinfo->count.from);
+}
+
+static int check(const char *tablename,
+		 const void *ip,
+		 void *matchinfo,
+		 unsigned int matchsize,
+		 unsigned int hook_mask)
+{
+	const struct xt_connbytes_info *sinfo = matchinfo;
+
+	if (matchsize != XT_ALIGN(sizeof(struct xt_connbytes_info)))
+		return 0;
+
+	if (sinfo->what != XT_CONNBYTES_PKTS &&
+	    sinfo->what != XT_CONNBYTES_BYTES &&
+	    sinfo->what != XT_CONNBYTES_AVGPKT)
+		return 0;
+
+	if (sinfo->direction != XT_CONNBYTES_DIR_ORIGINAL &&
+	    sinfo->direction != XT_CONNBYTES_DIR_REPLY &&
+	    sinfo->direction != XT_CONNBYTES_DIR_BOTH)
+		return 0;
+
+	return 1;
+}
+
+static struct xt_match connbytes_match = {
+	.name		= "connbytes",
+	.match		= &match,
+	.checkentry	= &check,
+	.me		= THIS_MODULE
+};
+static struct xt_match connbytes6_match = {
+	.name		= "connbytes",
+	.match		= &match,
+	.checkentry	= &check,
+	.me		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	int ret;
+	ret = xt_register_match(AF_INET, &connbytes_match);
+	if (ret)
+		return ret;
+
+	ret = xt_register_match(AF_INET6, &connbytes6_match);
+	if (ret)
+		xt_unregister_match(AF_INET, &connbytes_match);
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET, &connbytes_match);
+	xt_unregister_match(AF_INET6, &connbytes6_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
new file mode 100644
index 000000000000..d06e925032da
--- /dev/null
+++ b/net/netfilter/xt_connmark.c
@@ -0,0 +1,109 @@
+/* This kernel module matches connection mark values set by the
+ * CONNMARK target
+ *
+ * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ * by Henrik Nordstrom <hno@marasystems.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>");
+MODULE_DESCRIPTION("IP tables connmark match module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_connmark");
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_connmark.h>
+#include <net/netfilter/nf_conntrack_compat.h>
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      unsigned int protoff,
+      int *hotdrop)
+{
+	const struct xt_connmark_info *info = matchinfo;
+	u_int32_t ctinfo;
+	const u_int32_t *ctmark = nf_ct_get_mark(skb, &ctinfo);
+	if (!ctmark)
+		return 0;
+
+	return (((*ctmark) & info->mask) == info->mark) ^ info->invert;
+}
+
+static int
+checkentry(const char *tablename,
+	   const void *ip,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	struct xt_connmark_info *cm = 
+				(struct xt_connmark_info *)matchinfo;
+	if (matchsize != XT_ALIGN(sizeof(struct xt_connmark_info)))
+		return 0;
+
+	if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) {
+		printk(KERN_WARNING "connmark: only support 32bit mark\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct xt_match connmark_match = {
+	.name = "connmark",
+	.match = &match,
+	.checkentry = &checkentry,
+	.me = THIS_MODULE
+};
+static struct xt_match connmark6_match = {
+	.name = "connmark",
+	.match = &match,
+	.checkentry = &checkentry,
+	.me = THIS_MODULE
+};
+
+
+static int __init init(void)
+{
+	int ret;
+
+	need_conntrack();
+
+	ret = xt_register_match(AF_INET, &connmark_match);
+	if (ret)
+		return ret;
+
+	ret = xt_register_match(AF_INET6, &connmark6_match);
+	if (ret)
+		xt_unregister_match(AF_INET, &connmark_match);
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET6, &connmark6_match);
+	xt_unregister_match(AF_INET, &connmark_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
new file mode 100644
index 000000000000..ffdebc95eb95
--- /dev/null
+++ b/net/netfilter/xt_conntrack.c
@@ -0,0 +1,238 @@
+/* Kernel module to match connection tracking information.
+ * Superset of Rusty's minimalistic state match.
+ *
+ * (C) 2001  Marc Boucher (marc@mbsi.ca).
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
+#else
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_conntrack.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables connection tracking match module");
+MODULE_ALIAS("ipt_conntrack");
+
+#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      unsigned int protoff,
+      int *hotdrop)
+{
+	const struct xt_conntrack_info *sinfo = matchinfo;
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	unsigned int statebit;
+
+	ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
+
+#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg))
+
+	if (ct == &ip_conntrack_untracked)
+		statebit = XT_CONNTRACK_STATE_UNTRACKED;
+	else if (ct)
+ 		statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
+ 	else
+ 		statebit = XT_CONNTRACK_STATE_INVALID;
+ 
+	if(sinfo->flags & XT_CONNTRACK_STATE) {
+		if (ct) {
+			if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip !=
+			    ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip)
+				statebit |= XT_CONNTRACK_STATE_SNAT;
+
+			if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip !=
+			    ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip)
+				statebit |= XT_CONNTRACK_STATE_DNAT;
+		}
+
+		if (FWINV((statebit & sinfo->statemask) == 0, XT_CONNTRACK_STATE))
+			return 0;
+	}
+
+	if(sinfo->flags & XT_CONNTRACK_PROTO) {
+		if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, XT_CONNTRACK_PROTO))
+                	return 0;
+	}
+
+	if(sinfo->flags & XT_CONNTRACK_ORIGSRC) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, XT_CONNTRACK_ORIGSRC))
+			return 0;
+	}
+
+	if(sinfo->flags & XT_CONNTRACK_ORIGDST) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, XT_CONNTRACK_ORIGDST))
+			return 0;
+	}
+
+	if(sinfo->flags & XT_CONNTRACK_REPLSRC) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, XT_CONNTRACK_REPLSRC))
+			return 0;
+	}
+
+	if(sinfo->flags & XT_CONNTRACK_REPLDST) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, XT_CONNTRACK_REPLDST))
+			return 0;
+	}
+
+	if(sinfo->flags & XT_CONNTRACK_STATUS) {
+		if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, XT_CONNTRACK_STATUS))
+			return 0;
+	}
+
+	if(sinfo->flags & XT_CONNTRACK_EXPIRES) {
+		unsigned long expires;
+
+		if(!ct)
+			return 0;
+
+		expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0;
+
+		if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), XT_CONNTRACK_EXPIRES))
+			return 0;
+	}
+
+	return 1;
+}
+
+#else /* CONFIG_IP_NF_CONNTRACK */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      unsigned int protoff,
+      int *hotdrop)
+{
+	const struct xt_conntrack_info *sinfo = matchinfo;
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	unsigned int statebit;
+
+	ct = nf_ct_get((struct sk_buff *)skb, &ctinfo);
+
+#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg))
+
+	if (ct == &nf_conntrack_untracked)
+		statebit = XT_CONNTRACK_STATE_UNTRACKED;
+	else if (ct)
+ 		statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
+ 	else
+ 		statebit = XT_CONNTRACK_STATE_INVALID;
+ 
+	if(sinfo->flags & XT_CONNTRACK_STATE) {
+		if (ct) {
+			if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip !=
+			    ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip)
+				statebit |= XT_CONNTRACK_STATE_SNAT;
+
+			if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip !=
+			    ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip)
+				statebit |= XT_CONNTRACK_STATE_DNAT;
+		}
+
+		if (FWINV((statebit & sinfo->statemask) == 0, XT_CONNTRACK_STATE))
+			return 0;
+	}
+
+	if(sinfo->flags & XT_CONNTRACK_PROTO) {
+		if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, XT_CONNTRACK_PROTO))
+                	return 0;
+	}
+
+	if(sinfo->flags & XT_CONNTRACK_ORIGSRC) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, XT_CONNTRACK_ORIGSRC))
+			return 0;
+	}
+
+	if(sinfo->flags & XT_CONNTRACK_ORIGDST) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, XT_CONNTRACK_ORIGDST))
+			return 0;
+	}
+
+	if(sinfo->flags & XT_CONNTRACK_REPLSRC) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, XT_CONNTRACK_REPLSRC))
+			return 0;
+	}
+
+	if(sinfo->flags & XT_CONNTRACK_REPLDST) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, XT_CONNTRACK_REPLDST))
+			return 0;
+	}
+
+	if(sinfo->flags & XT_CONNTRACK_STATUS) {
+		if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, XT_CONNTRACK_STATUS))
+			return 0;
+	}
+
+	if(sinfo->flags & XT_CONNTRACK_EXPIRES) {
+		unsigned long expires;
+
+		if(!ct)
+			return 0;
+
+		expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0;
+
+		if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), XT_CONNTRACK_EXPIRES))
+			return 0;
+	}
+
+	return 1;
+}
+
+#endif /* CONFIG_NF_IP_CONNTRACK */
+
+static int check(const char *tablename,
+		 const void *ip,
+		 void *matchinfo,
+		 unsigned int matchsize,
+		 unsigned int hook_mask)
+{
+	if (matchsize != XT_ALIGN(sizeof(struct xt_conntrack_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct xt_match conntrack_match = {
+	.name		= "conntrack",
+	.match		= &match,
+	.checkentry	= &check,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	int ret;
+	need_conntrack();
+	ret = xt_register_match(AF_INET, &conntrack_match);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET, &conntrack_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c
new file mode 100644
index 000000000000..779f42fc9524
--- /dev/null
+++ b/net/netfilter/xt_dccp.c
@@ -0,0 +1,221 @@
+/*
+ * iptables module for DCCP protocol header matching
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <net/ip.h>
+#include <linux/dccp.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_dccp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Match for DCCP protocol packets");
+MODULE_ALIAS("ipt_dccp");
+
+#define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
+		                  || (!!((invflag) & (option)) ^ (cond)))
+
+static unsigned char *dccp_optbuf;
+static DEFINE_SPINLOCK(dccp_buflock);
+
+static inline int
+dccp_find_option(u_int8_t option,
+		 const struct sk_buff *skb,
+		 unsigned int protoff,
+		 const struct dccp_hdr *dh,
+		 int *hotdrop)
+{
+	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
+	unsigned char *op;
+	unsigned int optoff = __dccp_hdr_len(dh);
+	unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh);
+	unsigned int i;
+
+	if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) {
+		*hotdrop = 1;
+		return 0;
+	}
+
+	if (!optlen)
+		return 0;
+
+	spin_lock_bh(&dccp_buflock);
+	op = skb_header_pointer(skb, protoff + optoff, optlen, dccp_optbuf);
+	if (op == NULL) {
+		/* If we don't have the whole header, drop packet. */
+		spin_unlock_bh(&dccp_buflock);
+		*hotdrop = 1;
+		return 0;
+	}
+
+	for (i = 0; i < optlen; ) {
+		if (op[i] == option) {
+			spin_unlock_bh(&dccp_buflock);
+			return 1;
+		}
+
+		if (op[i] < 2) 
+			i++;
+		else 
+			i += op[i+1]?:1;
+	}
+
+	spin_unlock_bh(&dccp_buflock);
+	return 0;
+}
+
+
+static inline int
+match_types(const struct dccp_hdr *dh, u_int16_t typemask)
+{
+	return (typemask & (1 << dh->dccph_type));
+}
+
+static inline int
+match_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff,
+	     const struct dccp_hdr *dh, int *hotdrop)
+{
+	return dccp_find_option(option, skb, protoff, dh, hotdrop);
+}
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      unsigned int protoff,
+      int *hotdrop)
+{
+	const struct xt_dccp_info *info = 
+				(const struct xt_dccp_info *)matchinfo;
+	struct dccp_hdr _dh, *dh;
+
+	if (offset)
+		return 0;
+	
+	dh = skb_header_pointer(skb, protoff, sizeof(_dh), &_dh);
+	if (dh == NULL) {
+		*hotdrop = 1;
+		return 0;
+       	}
+
+	return  DCCHECK(((ntohs(dh->dccph_sport) >= info->spts[0]) 
+			&& (ntohs(dh->dccph_sport) <= info->spts[1])), 
+		   	XT_DCCP_SRC_PORTS, info->flags, info->invflags)
+		&& DCCHECK(((ntohs(dh->dccph_dport) >= info->dpts[0]) 
+			&& (ntohs(dh->dccph_dport) <= info->dpts[1])), 
+			XT_DCCP_DEST_PORTS, info->flags, info->invflags)
+		&& DCCHECK(match_types(dh, info->typemask),
+			   XT_DCCP_TYPE, info->flags, info->invflags)
+		&& DCCHECK(match_option(info->option, skb, protoff, dh,
+					hotdrop),
+			   XT_DCCP_OPTION, info->flags, info->invflags);
+}
+
+static int
+checkentry(const char *tablename,
+	   const void *inf,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	const struct ipt_ip *ip = inf;
+	const struct xt_dccp_info *info;
+
+	info = (const struct xt_dccp_info *)matchinfo;
+
+	return ip->proto == IPPROTO_DCCP
+		&& !(ip->invflags & XT_INV_PROTO)
+		&& matchsize == XT_ALIGN(sizeof(struct xt_dccp_info))
+		&& !(info->flags & ~XT_DCCP_VALID_FLAGS)
+		&& !(info->invflags & ~XT_DCCP_VALID_FLAGS)
+		&& !(info->invflags & ~info->flags);
+}
+
+static int
+checkentry6(const char *tablename,
+	   const void *inf,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	const struct ip6t_ip6 *ip = inf;
+	const struct xt_dccp_info *info;
+
+	info = (const struct xt_dccp_info *)matchinfo;
+
+	return ip->proto == IPPROTO_DCCP
+		&& !(ip->invflags & XT_INV_PROTO)
+		&& matchsize == XT_ALIGN(sizeof(struct xt_dccp_info))
+		&& !(info->flags & ~XT_DCCP_VALID_FLAGS)
+		&& !(info->invflags & ~XT_DCCP_VALID_FLAGS)
+		&& !(info->invflags & ~info->flags);
+}
+
+
+static struct xt_match dccp_match = 
+{ 
+	.name 		= "dccp",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me 		= THIS_MODULE,
+};
+static struct xt_match dccp6_match = 
+{ 
+	.name 		= "dccp",
+	.match		= &match,
+	.checkentry	= &checkentry6,
+	.me 		= THIS_MODULE,
+};
+
+
+static int __init init(void)
+{
+	int ret;
+
+	/* doff is 8 bits, so the maximum option size is (4*256).  Don't put
+	 * this in BSS since DaveM is worried about locked TLB's for kernel
+	 * BSS. */
+	dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL);
+	if (!dccp_optbuf)
+		return -ENOMEM;
+	ret = xt_register_match(AF_INET, &dccp_match);
+	if (ret)
+		goto out_kfree;
+	ret = xt_register_match(AF_INET6, &dccp6_match);
+	if (ret)
+		goto out_unreg;
+
+	return ret;
+
+out_unreg:
+	xt_unregister_match(AF_INET, &dccp_match);
+out_kfree:
+	kfree(dccp_optbuf);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET6, &dccp6_match);
+	xt_unregister_match(AF_INET, &dccp_match);
+	kfree(dccp_optbuf);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
new file mode 100644
index 000000000000..38b6715e1db4
--- /dev/null
+++ b/net/netfilter/xt_helper.c
@@ -0,0 +1,188 @@
+/* iptables module to match on related connections */
+/*
+ * (C) 2001 Martin Josefsson <gandalf@wlug.westbo.se>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *   19 Mar 2002 Harald Welte <laforge@gnumonks.org>:
+ *   		 - Port to newnat infrastructure
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#else
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#endif
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_helper.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>");
+MODULE_DESCRIPTION("iptables helper match module");
+MODULE_ALIAS("ipt_helper");
+MODULE_ALIAS("ip6t_helper");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      unsigned int protoff,
+      int *hotdrop)
+{
+	const struct xt_helper_info *info = matchinfo;
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	int ret = info->invert;
+	
+	ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
+	if (!ct) {
+		DEBUGP("xt_helper: Eek! invalid conntrack?\n");
+		return ret;
+	}
+
+	if (!ct->master) {
+		DEBUGP("xt_helper: conntrack %p has no master\n", ct);
+		return ret;
+	}
+
+	read_lock_bh(&ip_conntrack_lock);
+	if (!ct->master->helper) {
+		DEBUGP("xt_helper: master ct %p has no helper\n", 
+			exp->expectant);
+		goto out_unlock;
+	}
+
+	DEBUGP("master's name = %s , info->name = %s\n", 
+		ct->master->helper->name, info->name);
+
+	if (info->name[0] == '\0')
+		ret ^= 1;
+	else
+		ret ^= !strncmp(ct->master->helper->name, info->name, 
+		                strlen(ct->master->helper->name));
+out_unlock:
+	read_unlock_bh(&ip_conntrack_lock);
+	return ret;
+}
+
+#else /* CONFIG_IP_NF_CONNTRACK */
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      unsigned int protoff,
+      int *hotdrop)
+{
+	const struct xt_helper_info *info = matchinfo;
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	int ret = info->invert;
+	
+	ct = nf_ct_get((struct sk_buff *)skb, &ctinfo);
+	if (!ct) {
+		DEBUGP("xt_helper: Eek! invalid conntrack?\n");
+		return ret;
+	}
+
+	if (!ct->master) {
+		DEBUGP("xt_helper: conntrack %p has no master\n", ct);
+		return ret;
+	}
+
+	read_lock_bh(&nf_conntrack_lock);
+	if (!ct->master->helper) {
+		DEBUGP("xt_helper: master ct %p has no helper\n", 
+			exp->expectant);
+		goto out_unlock;
+	}
+
+	DEBUGP("master's name = %s , info->name = %s\n", 
+		ct->master->helper->name, info->name);
+
+	if (info->name[0] == '\0')
+		ret ^= 1;
+	else
+		ret ^= !strncmp(ct->master->helper->name, info->name, 
+		                strlen(ct->master->helper->name));
+out_unlock:
+	read_unlock_bh(&nf_conntrack_lock);
+	return ret;
+}
+#endif
+
+static int check(const char *tablename,
+		 const void *inf,
+		 void *matchinfo,
+		 unsigned int matchsize,
+		 unsigned int hook_mask)
+{
+	struct xt_helper_info *info = matchinfo;
+
+	info->name[29] = '\0';
+
+	/* verify size */
+	if (matchsize != XT_ALIGN(sizeof(struct xt_helper_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct xt_match helper_match = {
+	.name		= "helper",
+	.match		= &match,
+	.checkentry	= &check,
+	.me		= THIS_MODULE,
+};
+static struct xt_match helper6_match = {
+	.name		= "helper",
+	.match		= &match,
+	.checkentry	= &check,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	int ret;
+	need_conntrack();
+
+	ret = xt_register_match(AF_INET, &helper_match);
+	if (ret < 0)
+		return ret;
+
+	ret = xt_register_match(AF_INET6, &helper6_match);
+	if (ret < 0)
+		xt_unregister_match(AF_INET, &helper_match);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET, &helper_match);
+	xt_unregister_match(AF_INET6, &helper6_match);
+}
+
+module_init(init);
+module_exit(fini);
+
diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c
new file mode 100644
index 000000000000..ab6c710cf88f
--- /dev/null
+++ b/net/netfilter/xt_length.c
@@ -0,0 +1,98 @@
+/* Kernel module to match packet length. */
+/* (C) 1999-2001 James Morris <jmorros@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+
+#include <linux/netfilter/xt_length.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+MODULE_DESCRIPTION("IP tables packet length matching module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_length");
+MODULE_ALIAS("ip6t_length");
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      unsigned int protoff,
+      int *hotdrop)
+{
+	const struct xt_length_info *info = matchinfo;
+	u_int16_t pktlen = ntohs(skb->nh.iph->tot_len);
+	
+	return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
+}
+
+static int
+match6(const struct sk_buff *skb,
+       const struct net_device *in,
+       const struct net_device *out,
+       const void *matchinfo,
+       int offset,
+       unsigned int protoff,
+       int *hotdrop)
+{
+	const struct xt_length_info *info = matchinfo;
+	u_int16_t pktlen = ntohs(skb->nh.ipv6h->payload_len) + sizeof(struct ipv6hdr);
+	
+	return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
+}
+
+static int
+checkentry(const char *tablename,
+           const void *ip,
+           void *matchinfo,
+           unsigned int matchsize,
+           unsigned int hook_mask)
+{
+	if (matchsize != XT_ALIGN(sizeof(struct xt_length_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct xt_match length_match = {
+	.name		= "length",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+static struct xt_match length6_match = {
+	.name		= "length",
+	.match		= &match6,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	int ret;
+	ret = xt_register_match(AF_INET, &length_match);
+	if (ret)
+		return ret;
+	ret = xt_register_match(AF_INET6, &length6_match);
+	if (ret)
+		xt_unregister_match(AF_INET, &length_match);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET, &length_match);
+	xt_unregister_match(AF_INET6, &length6_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
new file mode 100644
index 000000000000..15e40506bc3a
--- /dev/null
+++ b/net/netfilter/xt_limit.c
@@ -0,0 +1,175 @@
+/* Kernel module to control the rate
+ *
+ * 2 September 1999: Changed from the target RATE to the match
+ *                   `limit', removed logging.  Did I mention that
+ *                   Alexey is a fucking genius?
+ *                   Rusty Russell (rusty@rustcorp.com.au).  */
+
+/* (C) 1999 J�r�me de Vivie <devivie@info.enserb.u-bordeaux.fr>
+ * (C) 1999 Herv� Eychenne <eychenne@info.enserb.u-bordeaux.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_limit.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>");
+MODULE_DESCRIPTION("iptables rate limit match");
+MODULE_ALIAS("ipt_limit");
+MODULE_ALIAS("ip6t_limit");
+
+/* The algorithm used is the Simple Token Bucket Filter (TBF)
+ * see net/sched/sch_tbf.c in the linux source tree
+ */
+
+static DEFINE_SPINLOCK(limit_lock);
+
+/* Rusty: This is my (non-mathematically-inclined) understanding of
+   this algorithm.  The `average rate' in jiffies becomes your initial
+   amount of credit `credit' and the most credit you can ever have
+   `credit_cap'.  The `peak rate' becomes the cost of passing the
+   test, `cost'.
+
+   `prev' tracks the last packet hit: you gain one credit per jiffy.
+   If you get credit balance more than this, the extra credit is
+   discarded.  Every time the match passes, you lose `cost' credits;
+   if you don't have that many, the test fails.
+
+   See Alexey's formal explanation in net/sched/sch_tbf.c.
+
+   To get the maxmum range, we multiply by this factor (ie. you get N
+   credits per jiffy).  We want to allow a rate as low as 1 per day
+   (slowest userspace tool allows), which means
+   CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32. ie. */
+#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
+
+/* Repeated shift and or gives us all 1s, final shift and add 1 gives
+ * us the power of 2 below the theoretical max, so GCC simply does a
+ * shift. */
+#define _POW2_BELOW2(x) ((x)|((x)>>1))
+#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
+#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
+#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
+#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+
+#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+
+static int
+ipt_limit_match(const struct sk_buff *skb,
+		const struct net_device *in,
+		const struct net_device *out,
+		const void *matchinfo,
+		int offset,
+		unsigned int protoff,
+		int *hotdrop)
+{
+	struct xt_rateinfo *r = ((struct xt_rateinfo *)matchinfo)->master;
+	unsigned long now = jiffies;
+
+	spin_lock_bh(&limit_lock);
+	r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY;
+	if (r->credit > r->credit_cap)
+		r->credit = r->credit_cap;
+
+	if (r->credit >= r->cost) {
+		/* We're not limited. */
+		r->credit -= r->cost;
+		spin_unlock_bh(&limit_lock);
+		return 1;
+	}
+
+       	spin_unlock_bh(&limit_lock);
+	return 0;
+}
+
+/* Precision saver. */
+static u_int32_t
+user2credits(u_int32_t user)
+{
+	/* If multiplying would overflow... */
+	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+		/* Divide first. */
+		return (user / XT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+
+	return (user * HZ * CREDITS_PER_JIFFY) / XT_LIMIT_SCALE;
+}
+
+static int
+ipt_limit_checkentry(const char *tablename,
+		     const void *inf,
+		     void *matchinfo,
+		     unsigned int matchsize,
+		     unsigned int hook_mask)
+{
+	struct xt_rateinfo *r = matchinfo;
+
+	if (matchsize != XT_ALIGN(sizeof(struct xt_rateinfo)))
+		return 0;
+
+	/* Check for overflow. */
+	if (r->burst == 0
+	    || user2credits(r->avg * r->burst) < user2credits(r->avg)) {
+		printk("Overflow in xt_limit, try lower: %u/%u\n",
+		       r->avg, r->burst);
+		return 0;
+	}
+
+	/* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies *
+	   128. */
+	r->prev = jiffies;
+	r->credit = user2credits(r->avg * r->burst);	 /* Credits full. */
+	r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */
+	r->cost = user2credits(r->avg);
+
+	/* For SMP, we only want to use one set of counters. */
+	r->master = r;
+
+	return 1;
+}
+
+static struct xt_match ipt_limit_reg = {
+	.name		= "limit",
+	.match		= ipt_limit_match,
+	.checkentry	= ipt_limit_checkentry,
+	.me		= THIS_MODULE,
+};
+static struct xt_match limit6_reg = {
+	.name		= "limit",
+	.match		= ipt_limit_match,
+	.checkentry	= ipt_limit_checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	int ret;
+	
+	ret = xt_register_match(AF_INET, &ipt_limit_reg);
+	if (ret)
+		return ret;
+	
+	ret = xt_register_match(AF_INET6, &limit6_reg);
+	if (ret)
+		xt_unregister_match(AF_INET, &ipt_limit_reg);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET, &ipt_limit_reg);
+	xt_unregister_match(AF_INET6, &limit6_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c
new file mode 100644
index 000000000000..0461dcb5fc7a
--- /dev/null
+++ b/net/netfilter/xt_mac.c
@@ -0,0 +1,100 @@
+/* Kernel module to match MAC address parameters. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/if_ether.h>
+#include <linux/etherdevice.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/xt_mac.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables mac matching module");
+MODULE_ALIAS("ipt_mac");
+MODULE_ALIAS("ip6t_mac");
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      unsigned int protoff,
+      int *hotdrop)
+{
+    const struct xt_mac_info *info = matchinfo;
+
+    /* Is mac pointer valid? */
+    return (skb->mac.raw >= skb->head
+	    && (skb->mac.raw + ETH_HLEN) <= skb->data
+	    /* If so, compare... */
+	    && ((!compare_ether_addr(eth_hdr(skb)->h_source, info->srcaddr))
+		^ info->invert));
+}
+
+static int
+ipt_mac_checkentry(const char *tablename,
+		   const void *inf,
+		   void *matchinfo,
+		   unsigned int matchsize,
+		   unsigned int hook_mask)
+{
+	/* FORWARD isn't always valid, but it's nice to be able to do --RR */
+	if (hook_mask
+	    & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN)
+		| (1 << NF_IP_FORWARD))) {
+		printk("xt_mac: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n");
+		return 0;
+	}
+
+	if (matchsize != XT_ALIGN(sizeof(struct xt_mac_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct xt_match mac_match = {
+	.name		= "mac",
+	.match		= &match,
+	.checkentry	= &ipt_mac_checkentry,
+	.me		= THIS_MODULE,
+};
+static struct xt_match mac6_match = {
+	.name		= "mac",
+	.match		= &match,
+	.checkentry	= &ipt_mac_checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	int ret;
+	ret = xt_register_match(AF_INET, &mac_match);
+	if (ret)
+		return ret;
+
+	ret = xt_register_match(AF_INET6, &mac6_match);
+	if (ret)
+		xt_unregister_match(AF_INET, &mac_match);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET, &mac_match);
+	xt_unregister_match(AF_INET6, &mac6_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
new file mode 100644
index 000000000000..2a0ac62b72c8
--- /dev/null
+++ b/net/netfilter/xt_mark.c
@@ -0,0 +1,91 @@
+/* Kernel module to match NFMARK values. */
+
+/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/xt_mark.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables mark matching module");
+MODULE_ALIAS("ipt_mark");
+MODULE_ALIAS("ip6t_mark");
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      unsigned int protoff,
+      int *hotdrop)
+{
+	const struct xt_mark_info *info = matchinfo;
+
+	return ((skb->nfmark & info->mask) == info->mark) ^ info->invert;
+}
+
+static int
+checkentry(const char *tablename,
+           const void *entry,
+           void *matchinfo,
+           unsigned int matchsize,
+           unsigned int hook_mask)
+{
+	struct xt_mark_info *minfo = (struct xt_mark_info *) matchinfo;
+
+	if (matchsize != XT_ALIGN(sizeof(struct xt_mark_info)))
+		return 0;
+
+	if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) {
+		printk(KERN_WARNING "mark: only supports 32bit mark\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct xt_match mark_match = {
+	.name		= "mark",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static struct xt_match mark6_match = {
+	.name		= "mark",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	int ret;
+	ret = xt_register_match(AF_INET, &mark_match);
+	if (ret)
+		return ret;
+
+	ret = xt_register_match(AF_INET6, &mark6_match);
+	if (ret)
+		xt_unregister_match(AF_INET, &mark_match);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET, &mark_match);
+	xt_unregister_match(AF_INET6, &mark6_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
new file mode 100644
index 000000000000..19bb57c14dfe
--- /dev/null
+++ b/net/netfilter/xt_physdev.c
@@ -0,0 +1,155 @@
+/* Kernel module to match the bridge port in and
+ * out device for IP packets coming into contact with a bridge. */
+
+/* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/xt_physdev.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge.h>
+#define MATCH   1
+#define NOMATCH 0
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
+MODULE_DESCRIPTION("iptables bridge physical device match module");
+MODULE_ALIAS("ipt_physdev");
+MODULE_ALIAS("ip6t_physdev");
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      unsigned int protoff,
+      int *hotdrop)
+{
+	int i;
+	static const char nulldevname[IFNAMSIZ];
+	const struct xt_physdev_info *info = matchinfo;
+	unsigned int ret;
+	const char *indev, *outdev;
+	struct nf_bridge_info *nf_bridge;
+
+	/* Not a bridged IP packet or no info available yet:
+	 * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if
+	 * the destination device will be a bridge. */
+	if (!(nf_bridge = skb->nf_bridge)) {
+		/* Return MATCH if the invert flags of the used options are on */
+		if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) &&
+		    !(info->invert & XT_PHYSDEV_OP_BRIDGED))
+			return NOMATCH;
+		if ((info->bitmask & XT_PHYSDEV_OP_ISIN) &&
+		    !(info->invert & XT_PHYSDEV_OP_ISIN))
+			return NOMATCH;
+		if ((info->bitmask & XT_PHYSDEV_OP_ISOUT) &&
+		    !(info->invert & XT_PHYSDEV_OP_ISOUT))
+			return NOMATCH;
+		if ((info->bitmask & XT_PHYSDEV_OP_IN) &&
+		    !(info->invert & XT_PHYSDEV_OP_IN))
+			return NOMATCH;
+		if ((info->bitmask & XT_PHYSDEV_OP_OUT) &&
+		    !(info->invert & XT_PHYSDEV_OP_OUT))
+			return NOMATCH;
+		return MATCH;
+	}
+
+	/* This only makes sense in the FORWARD and POSTROUTING chains */
+	if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) &&
+	    (!!(nf_bridge->mask & BRNF_BRIDGED) ^
+	    !(info->invert & XT_PHYSDEV_OP_BRIDGED)))
+		return NOMATCH;
+
+	if ((info->bitmask & XT_PHYSDEV_OP_ISIN &&
+	    (!nf_bridge->physindev ^ !!(info->invert & XT_PHYSDEV_OP_ISIN))) ||
+	    (info->bitmask & XT_PHYSDEV_OP_ISOUT &&
+	    (!nf_bridge->physoutdev ^ !!(info->invert & XT_PHYSDEV_OP_ISOUT))))
+		return NOMATCH;
+
+	if (!(info->bitmask & XT_PHYSDEV_OP_IN))
+		goto match_outdev;
+	indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname;
+	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
+		ret |= (((const unsigned int *)indev)[i]
+			^ ((const unsigned int *)info->physindev)[i])
+			& ((const unsigned int *)info->in_mask)[i];
+	}
+
+	if ((ret == 0) ^ !(info->invert & XT_PHYSDEV_OP_IN))
+		return NOMATCH;
+
+match_outdev:
+	if (!(info->bitmask & XT_PHYSDEV_OP_OUT))
+		return MATCH;
+	outdev = nf_bridge->physoutdev ?
+		 nf_bridge->physoutdev->name : nulldevname;
+	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
+		ret |= (((const unsigned int *)outdev)[i]
+			^ ((const unsigned int *)info->physoutdev)[i])
+			& ((const unsigned int *)info->out_mask)[i];
+	}
+
+	return (ret != 0) ^ !(info->invert & XT_PHYSDEV_OP_OUT);
+}
+
+static int
+checkentry(const char *tablename,
+		       const void *ip,
+		       void *matchinfo,
+		       unsigned int matchsize,
+		       unsigned int hook_mask)
+{
+	const struct xt_physdev_info *info = matchinfo;
+
+	if (matchsize != XT_ALIGN(sizeof(struct xt_physdev_info)))
+		return 0;
+	if (!(info->bitmask & XT_PHYSDEV_OP_MASK) ||
+	    info->bitmask & ~XT_PHYSDEV_OP_MASK)
+		return 0;
+	return 1;
+}
+
+static struct xt_match physdev_match = {
+	.name		= "physdev",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static struct xt_match physdev6_match = {
+	.name		= "physdev",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	int ret;
+
+	ret = xt_register_match(AF_INET, &physdev_match);
+	if (ret < 0)
+		return ret;
+
+	ret = xt_register_match(AF_INET6, &physdev6_match);
+	if (ret < 0)
+		xt_unregister_match(AF_INET, &physdev_match);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET, &physdev_match);
+	xt_unregister_match(AF_INET6, &physdev6_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c
new file mode 100644
index 000000000000..ab1b2630f97d
--- /dev/null
+++ b/net/netfilter/xt_pkttype.c
@@ -0,0 +1,82 @@
+/* (C) 1999-2001 Michal Ludvig <michal@logix.cz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+
+#include <linux/netfilter/xt_pkttype.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michal Ludvig <michal@logix.cz>");
+MODULE_DESCRIPTION("IP tables match to match on linklayer packet type");
+MODULE_ALIAS("ipt_pkttype");
+MODULE_ALIAS("ip6t_pkttype");
+
+static int match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      unsigned int protoff,
+      int *hotdrop)
+{
+	const struct xt_pkttype_info *info = matchinfo;
+
+	return (skb->pkt_type == info->pkttype) ^ info->invert;
+}
+
+static int checkentry(const char *tablename,
+		   const void *ip,
+		   void *matchinfo,
+		   unsigned int matchsize,
+		   unsigned int hook_mask)
+{
+	if (matchsize != XT_ALIGN(sizeof(struct xt_pkttype_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct xt_match pkttype_match = {
+	.name		= "pkttype",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+static struct xt_match pkttype6_match = {
+	.name		= "pkttype",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+
+static int __init init(void)
+{
+	int ret;
+	ret = xt_register_match(AF_INET, &pkttype_match);
+	if (ret)
+		return ret;
+
+	ret = xt_register_match(AF_INET6, &pkttype6_match);
+	if (ret)
+		xt_unregister_match(AF_INET, &pkttype_match);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET, &pkttype_match);
+	xt_unregister_match(AF_INET6, &pkttype6_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c
new file mode 100644
index 000000000000..2b7e1781d34d
--- /dev/null
+++ b/net/netfilter/xt_realm.c
@@ -0,0 +1,79 @@
+/* IP tables module for matching the routing realm
+ *
+ * $Id: ipt_realm.c,v 1.3 2004/03/05 13:25:40 laforge Exp $
+ *
+ * (C) 2003 by Sampsa Ranta <sampsa@netsonic.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <net/route.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/xt_realm.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_AUTHOR("Sampsa Ranta <sampsa@netsonic.fi>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("X_tables realm match");
+MODULE_ALIAS("ipt_realm");
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      unsigned int protoff,
+      int *hotdrop)
+{
+	const struct xt_realm_info *info = matchinfo;
+	struct dst_entry *dst = skb->dst;
+    
+	return (info->id == (dst->tclassid & info->mask)) ^ info->invert;
+}
+
+static int check(const char *tablename,
+                 const void *ip,
+                 void *matchinfo,
+                 unsigned int matchsize,
+                 unsigned int hook_mask)
+{
+	if (hook_mask
+	    & ~((1 << NF_IP_POST_ROUTING) | (1 << NF_IP_FORWARD) |
+	        (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_LOCAL_IN))) {
+		printk("xt_realm: only valid for POST_ROUTING, LOCAL_OUT, "
+		       "LOCAL_IN or FORWARD.\n");
+		return 0;
+	}
+	if (matchsize != XT_ALIGN(sizeof(struct xt_realm_info))) {
+		printk("xt_realm: invalid matchsize.\n");
+		return 0;
+	}
+	return 1;
+}
+
+static struct xt_match realm_match = {
+	.name		= "realm",
+	.match		= match, 
+	.checkentry	= check,
+	.me		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	return xt_register_match(AF_INET, &realm_match);
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET, &realm_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
new file mode 100644
index 000000000000..10fbfc5ba758
--- /dev/null
+++ b/net/netfilter/xt_sctp.c
@@ -0,0 +1,250 @@
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <linux/sctp.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_sctp.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kiran Kumar Immidi");
+MODULE_DESCRIPTION("Match for SCTP protocol packets");
+MODULE_ALIAS("ipt_sctp");
+
+#ifdef DEBUG_SCTP
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+#define SCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
+					      || (!!((invflag) & (option)) ^ (cond)))
+
+static int
+match_flags(const struct xt_sctp_flag_info *flag_info,
+	    const int flag_count,
+	    u_int8_t chunktype,
+	    u_int8_t chunkflags)
+{
+	int i;
+
+	for (i = 0; i < flag_count; i++) {
+		if (flag_info[i].chunktype == chunktype) {
+			return (chunkflags & flag_info[i].flag_mask) == flag_info[i].flag;
+		}
+	}
+
+	return 1;
+}
+
+static inline int
+match_packet(const struct sk_buff *skb,
+	     unsigned int offset,
+	     const u_int32_t *chunkmap,
+	     int chunk_match_type,
+	     const struct xt_sctp_flag_info *flag_info,
+	     const int flag_count,
+	     int *hotdrop)
+{
+	u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)];
+	sctp_chunkhdr_t _sch, *sch;
+
+#ifdef DEBUG_SCTP
+	int i = 0;
+#endif
+
+	if (chunk_match_type == SCTP_CHUNK_MATCH_ALL) {
+		SCTP_CHUNKMAP_COPY(chunkmapcopy, chunkmap);
+	}
+
+	do {
+		sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch);
+		if (sch == NULL) {
+			duprintf("Dropping invalid SCTP packet.\n");
+			*hotdrop = 1;
+			return 0;
+        	}
+
+		duprintf("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d\tflags: %x\n", 
+				++i, offset, sch->type, htons(sch->length), sch->flags);
+
+		offset += (htons(sch->length) + 3) & ~3;
+
+		duprintf("skb->len: %d\toffset: %d\n", skb->len, offset);
+
+		if (SCTP_CHUNKMAP_IS_SET(chunkmap, sch->type)) {
+			switch (chunk_match_type) {
+			case SCTP_CHUNK_MATCH_ANY:
+				if (match_flags(flag_info, flag_count, 
+					sch->type, sch->flags)) {
+					return 1;
+				}
+				break;
+
+			case SCTP_CHUNK_MATCH_ALL:
+				if (match_flags(flag_info, flag_count, 
+					sch->type, sch->flags)) {
+					SCTP_CHUNKMAP_CLEAR(chunkmapcopy, sch->type);
+				}
+				break;
+
+			case SCTP_CHUNK_MATCH_ONLY:
+				if (!match_flags(flag_info, flag_count, 
+					sch->type, sch->flags)) {
+					return 0;
+				}
+				break;
+			}
+		} else {
+			switch (chunk_match_type) {
+			case SCTP_CHUNK_MATCH_ONLY:
+				return 0;
+			}
+		}
+	} while (offset < skb->len);
+
+	switch (chunk_match_type) {
+	case SCTP_CHUNK_MATCH_ALL:
+		return SCTP_CHUNKMAP_IS_CLEAR(chunkmap);
+	case SCTP_CHUNK_MATCH_ANY:
+		return 0;
+	case SCTP_CHUNK_MATCH_ONLY:
+		return 1;
+	}
+
+	/* This will never be reached, but required to stop compiler whine */
+	return 0;
+}
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      unsigned int protoff,
+      int *hotdrop)
+{
+	const struct xt_sctp_info *info;
+	sctp_sctphdr_t _sh, *sh;
+
+	info = (const struct xt_sctp_info *)matchinfo;
+
+	if (offset) {
+		duprintf("Dropping non-first fragment.. FIXME\n");
+		return 0;
+	}
+	
+	sh = skb_header_pointer(skb, protoff, sizeof(_sh), &_sh);
+	if (sh == NULL) {
+		duprintf("Dropping evil TCP offset=0 tinygram.\n");
+		*hotdrop = 1;
+		return 0;
+       	}
+	duprintf("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest));
+
+	return  SCCHECK(((ntohs(sh->source) >= info->spts[0]) 
+			&& (ntohs(sh->source) <= info->spts[1])), 
+		   	XT_SCTP_SRC_PORTS, info->flags, info->invflags)
+		&& SCCHECK(((ntohs(sh->dest) >= info->dpts[0]) 
+			&& (ntohs(sh->dest) <= info->dpts[1])), 
+			XT_SCTP_DEST_PORTS, info->flags, info->invflags)
+		&& SCCHECK(match_packet(skb, protoff,
+					info->chunkmap, info->chunk_match_type,
+ 					info->flag_info, info->flag_count, 
+					hotdrop),
+			   XT_SCTP_CHUNK_TYPES, info->flags, info->invflags);
+}
+
+static int
+checkentry(const char *tablename,
+	   const void *inf,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	const struct xt_sctp_info *info;
+	const struct ipt_ip *ip = inf;
+
+	info = (const struct xt_sctp_info *)matchinfo;
+
+	return ip->proto == IPPROTO_SCTP
+		&& !(ip->invflags & XT_INV_PROTO)
+		&& matchsize == XT_ALIGN(sizeof(struct xt_sctp_info))
+		&& !(info->flags & ~XT_SCTP_VALID_FLAGS)
+		&& !(info->invflags & ~XT_SCTP_VALID_FLAGS)
+		&& !(info->invflags & ~info->flags)
+		&& ((!(info->flags & XT_SCTP_CHUNK_TYPES)) || 
+			(info->chunk_match_type &
+				(SCTP_CHUNK_MATCH_ALL 
+				| SCTP_CHUNK_MATCH_ANY
+				| SCTP_CHUNK_MATCH_ONLY)));
+}
+
+static int
+checkentry6(const char *tablename,
+	   const void *inf,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	const struct xt_sctp_info *info;
+	const struct ip6t_ip6 *ip = inf;
+
+	info = (const struct xt_sctp_info *)matchinfo;
+
+	return ip->proto == IPPROTO_SCTP
+		&& !(ip->invflags & XT_INV_PROTO)
+		&& matchsize == XT_ALIGN(sizeof(struct xt_sctp_info))
+		&& !(info->flags & ~XT_SCTP_VALID_FLAGS)
+		&& !(info->invflags & ~XT_SCTP_VALID_FLAGS)
+		&& !(info->invflags & ~info->flags)
+		&& ((!(info->flags & XT_SCTP_CHUNK_TYPES)) || 
+			(info->chunk_match_type &
+				(SCTP_CHUNK_MATCH_ALL 
+				| SCTP_CHUNK_MATCH_ANY
+				| SCTP_CHUNK_MATCH_ONLY)));
+}
+
+
+static struct xt_match sctp_match = 
+{ 
+	.name = "sctp",
+	.match = &match,
+	.checkentry = &checkentry,
+	.me = THIS_MODULE
+};
+static struct xt_match sctp6_match = 
+{ 
+	.name = "sctp",
+	.match = &match,
+	.checkentry = &checkentry6,
+	.me = THIS_MODULE
+};
+
+
+static int __init init(void)
+{
+	int ret;
+	ret = xt_register_match(AF_INET, &sctp_match);
+	if (ret)
+		return ret;
+
+	ret = xt_register_match(AF_INET6, &sctp6_match);
+	if (ret)
+		xt_unregister_match(AF_INET, &sctp_match);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET6, &sctp6_match);
+	xt_unregister_match(AF_INET, &sctp_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
new file mode 100644
index 000000000000..39ce808d40ef
--- /dev/null
+++ b/net/netfilter/xt_state.c
@@ -0,0 +1,96 @@
+/* Kernel module to match connection tracking information. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/netfilter/nf_conntrack_compat.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_state.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("ip[6]_tables connection tracking state match module");
+MODULE_ALIAS("ipt_state");
+MODULE_ALIAS("ip6t_state");
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      unsigned int protoff,
+      int *hotdrop)
+{
+	const struct xt_state_info *sinfo = matchinfo;
+	enum ip_conntrack_info ctinfo;
+	unsigned int statebit;
+
+	if (nf_ct_is_untracked(skb))
+		statebit = XT_STATE_UNTRACKED;
+	else if (!nf_ct_get_ctinfo(skb, &ctinfo))
+		statebit = XT_STATE_INVALID;
+	else
+		statebit = XT_STATE_BIT(ctinfo);
+
+	return (sinfo->statemask & statebit);
+}
+
+static int check(const char *tablename,
+		 const void *ip,
+		 void *matchinfo,
+		 unsigned int matchsize,
+		 unsigned int hook_mask)
+{
+	if (matchsize != XT_ALIGN(sizeof(struct xt_state_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct xt_match state_match = {
+	.name		= "state",
+	.match		= &match,
+	.checkentry	= &check,
+	.me		= THIS_MODULE,
+};
+
+static struct xt_match state6_match = {
+	.name		= "state",
+	.match		= &match,
+	.checkentry	= &check,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	int ret;
+
+	need_conntrack();
+
+	ret = xt_register_match(AF_INET, &state_match);
+	if (ret < 0)
+		return ret;
+
+	ret = xt_register_match(AF_INET6, &state6_match);
+	if (ret < 0)
+		xt_unregister_match(AF_INET,&state_match);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET, &state_match);
+	xt_unregister_match(AF_INET6, &state6_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
new file mode 100644
index 000000000000..7c7d5c8807d6
--- /dev/null
+++ b/net/netfilter/xt_string.c
@@ -0,0 +1,111 @@
+/* String matching match for iptables
+ * 
+ * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_string.h>
+#include <linux/textsearch.h>
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>");
+MODULE_DESCRIPTION("IP tables string match module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_string");
+MODULE_ALIAS("ip6t_string");
+
+static int match(const struct sk_buff *skb,
+		 const struct net_device *in,
+		 const struct net_device *out,
+		 const void *matchinfo,
+		 int offset,
+		 unsigned int protoff,
+		 int *hotdrop)
+{
+	struct ts_state state;
+	struct xt_string_info *conf = (struct xt_string_info *) matchinfo;
+
+	memset(&state, 0, sizeof(struct ts_state));
+
+	return (skb_find_text((struct sk_buff *)skb, conf->from_offset, 
+			     conf->to_offset, conf->config, &state) 
+			     != UINT_MAX) && !conf->invert;
+}
+
+#define STRING_TEXT_PRIV(m) ((struct xt_string_info *) m)
+
+static int checkentry(const char *tablename,
+		      const void *ip,
+		      void *matchinfo,
+		      unsigned int matchsize,
+		      unsigned int hook_mask)
+{
+	struct xt_string_info *conf = matchinfo;
+	struct ts_config *ts_conf;
+
+	if (matchsize != XT_ALIGN(sizeof(struct xt_string_info)))
+		return 0;
+
+	/* Damn, can't handle this case properly with iptables... */
+	if (conf->from_offset > conf->to_offset)
+		return 0;
+
+	ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen,
+				     GFP_KERNEL, TS_AUTOLOAD);
+	if (IS_ERR(ts_conf))
+		return 0;
+
+	conf->config = ts_conf;
+
+	return 1;
+}
+
+static void destroy(void *matchinfo, unsigned int matchsize)
+{
+	textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config);
+}
+
+static struct xt_match string_match = {
+	.name 		= "string",
+	.match 		= match,
+	.checkentry	= checkentry,
+	.destroy 	= destroy,
+	.me 		= THIS_MODULE
+};
+static struct xt_match string6_match = {
+	.name 		= "string",
+	.match 		= match,
+	.checkentry	= checkentry,
+	.destroy 	= destroy,
+	.me 		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	int ret;
+
+	ret = xt_register_match(AF_INET, &string_match);
+	if (ret)
+		return ret;
+	ret = xt_register_match(AF_INET6, &string6_match);
+	if (ret)
+		xt_unregister_match(AF_INET, &string_match);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET, &string_match);
+	xt_unregister_match(AF_INET6, &string6_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c
new file mode 100644
index 000000000000..acf7f533e9f1
--- /dev/null
+++ b/net/netfilter/xt_tcpmss.c
@@ -0,0 +1,172 @@
+/* Kernel module to match TCP MSS values. */
+
+/* Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
+ * Portions (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter/xt_tcpmss.h>
+#include <linux/netfilter/x_tables.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+#define TH_SYN 0x02
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables TCP MSS match module");
+MODULE_ALIAS("ipt_tcpmss");
+
+/* Returns 1 if the mss option is set and matched by the range, 0 otherwise */
+static inline int
+mssoption_match(u_int16_t min, u_int16_t max,
+		const struct sk_buff *skb,
+		unsigned int protoff,
+		int invert,
+		int *hotdrop)
+{
+	struct tcphdr _tcph, *th;
+	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
+	u8 _opt[15 * 4 - sizeof(_tcph)], *op;
+	unsigned int i, optlen;
+
+	/* If we don't have the whole header, drop packet. */
+	th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		goto dropit;
+
+	/* Malformed. */
+	if (th->doff*4 < sizeof(*th))
+		goto dropit;
+
+	optlen = th->doff*4 - sizeof(*th);
+	if (!optlen)
+		goto out;
+
+	/* Truncated options. */
+	op = skb_header_pointer(skb, protoff + sizeof(*th), optlen, _opt);
+	if (op == NULL)
+		goto dropit;
+
+	for (i = 0; i < optlen; ) {
+		if (op[i] == TCPOPT_MSS
+		    && (optlen - i) >= TCPOLEN_MSS
+		    && op[i+1] == TCPOLEN_MSS) {
+			u_int16_t mssval;
+
+			mssval = (op[i+2] << 8) | op[i+3];
+			
+			return (mssval >= min && mssval <= max) ^ invert;
+		}
+		if (op[i] < 2) i++;
+		else i += op[i+1]?:1;
+	}
+out:
+	return invert;
+
+ dropit:
+	*hotdrop = 1;
+	return 0;
+}
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      unsigned int protoff,
+      int *hotdrop)
+{
+	const struct xt_tcpmss_match_info *info = matchinfo;
+
+	return mssoption_match(info->mss_min, info->mss_max, skb, protoff,
+			       info->invert, hotdrop);
+}
+
+static int
+checkentry(const char *tablename,
+           const void *ipinfo,
+           void *matchinfo,
+           unsigned int matchsize,
+           unsigned int hook_mask)
+{
+	const struct ipt_ip *ip = ipinfo;
+	if (matchsize != XT_ALIGN(sizeof(struct xt_tcpmss_match_info)))
+		return 0;
+
+	/* Must specify -p tcp */
+	if (ip->proto != IPPROTO_TCP || (ip->invflags & IPT_INV_PROTO)) {
+		printk("tcpmss: Only works on TCP packets\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static int
+checkentry6(const char *tablename,
+	   const void *ipinfo,
+           void *matchinfo,
+           unsigned int matchsize,
+           unsigned int hook_mask)
+{
+	const struct ip6t_ip6 *ip = ipinfo;
+
+	if (matchsize != XT_ALIGN(sizeof(struct xt_tcpmss_match_info)))
+		return 0;
+
+	/* Must specify -p tcp */
+	if (ip->proto != IPPROTO_TCP || (ip->invflags & XT_INV_PROTO)) {
+		printk("tcpmss: Only works on TCP packets\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct xt_match tcpmss_match = {
+	.name		= "tcpmss",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static struct xt_match tcpmss6_match = {
+	.name		= "tcpmss",
+	.match		= &match,
+	.checkentry	= &checkentry6,
+	.me		= THIS_MODULE,
+};
+
+
+static int __init init(void)
+{
+	int ret;
+	ret = xt_register_match(AF_INET, &tcpmss_match);
+	if (ret)
+		return ret;
+
+	ret = xt_register_match(AF_INET6, &tcpmss6_match);
+	if (ret)
+		xt_unregister_match(AF_INET, &tcpmss_match);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET6, &tcpmss6_match);
+	xt_unregister_match(AF_INET, &tcpmss_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c
new file mode 100644
index 000000000000..33f86fd6f3e6
--- /dev/null
+++ b/net/netfilter/xt_tcpudp.c
@@ -0,0 +1,333 @@
+#include <linux/types.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_tcpudp.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_DESCRIPTION("x_tables match for TCP and UDP, supports IPv4 and IPv6");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("xt_tcp");
+MODULE_ALIAS("xt_udp");
+MODULE_ALIAS("ipt_udp");
+MODULE_ALIAS("ipt_tcp");
+MODULE_ALIAS("ip6t_udp");
+MODULE_ALIAS("ip6t_tcp");
+
+#ifdef DEBUG_IP_FIREWALL_USER
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+
+/* Returns 1 if the port is matched by the range, 0 otherwise */
+static inline int
+port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert)
+{
+	int ret;
+
+	ret = (port >= min && port <= max) ^ invert;
+	return ret;
+}
+
+static int
+tcp_find_option(u_int8_t option,
+		const struct sk_buff *skb,
+		unsigned int protoff,
+		unsigned int optlen,
+		int invert,
+		int *hotdrop)
+{
+	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
+	u_int8_t _opt[60 - sizeof(struct tcphdr)], *op;
+	unsigned int i;
+
+	duprintf("tcp_match: finding option\n");
+
+	if (!optlen)
+		return invert;
+
+	/* If we don't have the whole header, drop packet. */
+	op = skb_header_pointer(skb, protoff + sizeof(struct tcphdr),
+				optlen, _opt);
+	if (op == NULL) {
+		*hotdrop = 1;
+		return 0;
+	}
+
+	for (i = 0; i < optlen; ) {
+		if (op[i] == option) return !invert;
+		if (op[i] < 2) i++;
+		else i += op[i+1]?:1;
+	}
+
+	return invert;
+}
+
+static int
+tcp_match(const struct sk_buff *skb,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  const void *matchinfo,
+	  int offset,
+	  unsigned int protoff,
+	  int *hotdrop)
+{
+	struct tcphdr _tcph, *th;
+	const struct xt_tcp *tcpinfo = matchinfo;
+
+	if (offset) {
+		/* To quote Alan:
+
+		   Don't allow a fragment of TCP 8 bytes in. Nobody normal
+		   causes this. Its a cracker trying to break in by doing a
+		   flag overwrite to pass the direction checks.
+		*/
+		if (offset == 1) {
+			duprintf("Dropping evil TCP offset=1 frag.\n");
+			*hotdrop = 1;
+		}
+		/* Must not be a fragment. */
+		return 0;
+	}
+
+#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg))
+
+	th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+	if (th == NULL) {
+		/* We've been asked to examine this packet, and we
+		   can't.  Hence, no choice but to drop. */
+		duprintf("Dropping evil TCP offset=0 tinygram.\n");
+		*hotdrop = 1;
+		return 0;
+	}
+
+	if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1],
+			ntohs(th->source),
+			!!(tcpinfo->invflags & XT_TCP_INV_SRCPT)))
+		return 0;
+	if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1],
+			ntohs(th->dest),
+			!!(tcpinfo->invflags & XT_TCP_INV_DSTPT)))
+		return 0;
+	if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask)
+		      == tcpinfo->flg_cmp,
+		      XT_TCP_INV_FLAGS))
+		return 0;
+	if (tcpinfo->option) {
+		if (th->doff * 4 < sizeof(_tcph)) {
+			*hotdrop = 1;
+			return 0;
+		}
+		if (!tcp_find_option(tcpinfo->option, skb, protoff,
+				     th->doff*4 - sizeof(_tcph),
+				     tcpinfo->invflags & XT_TCP_INV_OPTION,
+				     hotdrop))
+			return 0;
+	}
+	return 1;
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+tcp_checkentry(const char *tablename,
+	       const void *info,
+	       void *matchinfo,
+	       unsigned int matchsize,
+	       unsigned int hook_mask)
+{
+	const struct ipt_ip *ip = info;
+	const struct xt_tcp *tcpinfo = matchinfo;
+
+	/* Must specify proto == TCP, and no unknown invflags */
+	return ip->proto == IPPROTO_TCP
+		&& !(ip->invflags & XT_INV_PROTO)
+		&& matchsize == XT_ALIGN(sizeof(struct xt_tcp))
+		&& !(tcpinfo->invflags & ~XT_TCP_INV_MASK);
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+tcp6_checkentry(const char *tablename,
+	       const void *entry,
+	       void *matchinfo,
+	       unsigned int matchsize,
+	       unsigned int hook_mask)
+{
+	const struct ip6t_ip6 *ipv6 = entry;
+	const struct xt_tcp *tcpinfo = matchinfo;
+
+	/* Must specify proto == TCP, and no unknown invflags */
+	return ipv6->proto == IPPROTO_TCP
+		&& !(ipv6->invflags & XT_INV_PROTO)
+		&& matchsize == XT_ALIGN(sizeof(struct xt_tcp))
+		&& !(tcpinfo->invflags & ~XT_TCP_INV_MASK);
+}
+
+
+static int
+udp_match(const struct sk_buff *skb,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  const void *matchinfo,
+	  int offset,
+	  unsigned int protoff,
+	  int *hotdrop)
+{
+	struct udphdr _udph, *uh;
+	const struct xt_udp *udpinfo = matchinfo;
+
+	/* Must not be a fragment. */
+	if (offset)
+		return 0;
+
+	uh = skb_header_pointer(skb, protoff, sizeof(_udph), &_udph);
+	if (uh == NULL) {
+		/* We've been asked to examine this packet, and we
+		   can't.  Hence, no choice but to drop. */
+		duprintf("Dropping evil UDP tinygram.\n");
+		*hotdrop = 1;
+		return 0;
+	}
+
+	return port_match(udpinfo->spts[0], udpinfo->spts[1],
+			  ntohs(uh->source),
+			  !!(udpinfo->invflags & XT_UDP_INV_SRCPT))
+		&& port_match(udpinfo->dpts[0], udpinfo->dpts[1],
+			      ntohs(uh->dest),
+			      !!(udpinfo->invflags & XT_UDP_INV_DSTPT));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+udp_checkentry(const char *tablename,
+	       const void *info,
+	       void *matchinfo,
+	       unsigned int matchinfosize,
+	       unsigned int hook_mask)
+{
+	const struct ipt_ip *ip = info;
+	const struct xt_udp *udpinfo = matchinfo;
+
+	/* Must specify proto == UDP, and no unknown invflags */
+	if (ip->proto != IPPROTO_UDP || (ip->invflags & XT_INV_PROTO)) {
+		duprintf("ipt_udp: Protocol %u != %u\n", ip->proto,
+			 IPPROTO_UDP);
+		return 0;
+	}
+	if (matchinfosize != XT_ALIGN(sizeof(struct xt_udp))) {
+		duprintf("ipt_udp: matchsize %u != %u\n",
+			 matchinfosize, XT_ALIGN(sizeof(struct xt_udp)));
+		return 0;
+	}
+	if (udpinfo->invflags & ~XT_UDP_INV_MASK) {
+		duprintf("ipt_udp: unknown flags %X\n",
+			 udpinfo->invflags);
+		return 0;
+	}
+
+	return 1;
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+udp6_checkentry(const char *tablename,
+	       const void *entry,
+	       void *matchinfo,
+	       unsigned int matchinfosize,
+	       unsigned int hook_mask)
+{
+	const struct ip6t_ip6 *ipv6 = entry;
+	const struct xt_udp *udpinfo = matchinfo;
+
+	/* Must specify proto == UDP, and no unknown invflags */
+	if (ipv6->proto != IPPROTO_UDP || (ipv6->invflags & XT_INV_PROTO)) {
+		duprintf("ip6t_udp: Protocol %u != %u\n", ipv6->proto,
+			 IPPROTO_UDP);
+		return 0;
+	}
+	if (matchinfosize != XT_ALIGN(sizeof(struct xt_udp))) {
+		duprintf("ip6t_udp: matchsize %u != %u\n",
+			 matchinfosize, XT_ALIGN(sizeof(struct xt_udp)));
+		return 0;
+	}
+	if (udpinfo->invflags & ~XT_UDP_INV_MASK) {
+		duprintf("ip6t_udp: unknown flags %X\n",
+			 udpinfo->invflags);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct xt_match tcp_matchstruct = {
+	.name		= "tcp",
+	.match		= &tcp_match,
+	.checkentry	= &tcp_checkentry,
+	.me		= THIS_MODULE,
+};
+static struct xt_match tcp6_matchstruct = {
+	.name		= "tcp",
+	.match		= &tcp_match,
+	.checkentry	= &tcp6_checkentry,
+	.me		= THIS_MODULE,
+};
+
+static struct xt_match udp_matchstruct = {
+	.name		= "udp",
+	.match		= &udp_match,
+	.checkentry	= &udp_checkentry,
+	.me		= THIS_MODULE,
+};
+static struct xt_match udp6_matchstruct = {
+	.name		= "udp",
+	.match		= &udp_match,
+	.checkentry	= &udp6_checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	int ret;
+	ret = xt_register_match(AF_INET, &tcp_matchstruct);
+	if (ret)
+		return ret;
+
+	ret = xt_register_match(AF_INET6, &tcp6_matchstruct);
+	if (ret)
+		goto out_unreg_tcp;
+
+	ret = xt_register_match(AF_INET, &udp_matchstruct);
+	if (ret)
+		goto out_unreg_tcp6;
+	
+	ret = xt_register_match(AF_INET6, &udp6_matchstruct);
+	if (ret)
+		goto out_unreg_udp;
+
+	return ret;
+
+out_unreg_udp:
+	xt_unregister_match(AF_INET, &tcp_matchstruct);
+out_unreg_tcp6:
+	xt_unregister_match(AF_INET6, &tcp6_matchstruct);
+out_unreg_tcp:
+	xt_unregister_match(AF_INET, &tcp_matchstruct);
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET6, &udp6_matchstruct);
+	xt_unregister_match(AF_INET, &udp_matchstruct);
+	xt_unregister_match(AF_INET6, &tcp6_matchstruct);
+	xt_unregister_match(AF_INET, &tcp_matchstruct);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index b5001939b74b..39a22a3ffe78 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -62,7 +62,7 @@ ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook)
 	struct ipt_target *target;
 	int ret = 0;
 
-	target = ipt_find_target(t->u.user.name, t->u.user.revision);
+	target = xt_find_target(AF_INET, t->u.user.name, t->u.user.revision);
 	if (!target)
 		return -ENOENT;
 
-- 
cgit v1.2.3-71-gd317


From 898b5395e915210f41223caa30312994d64cba1d Mon Sep 17 00:00:00 2001
From: Dave C Boutcher <sleddog@us.ibm.com>
Date: Thu, 12 Jan 2006 16:07:17 -0600
Subject: [PATCH] powerpc: Add/remove/update properties in /proc/device-tree

Add support to the proc_device_tree file for removing
and updating properties.  Remove just removes the
proc file, update changes the data pointer within
the proc file.  The remainder of the device-tree
changes occur elsewhere.

Signed-off-by: Dave Boutcher <sleddog@us.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 fs/proc/proc_devtree.c  | 24 ++++++++++++++++++++++++
 include/linux/proc_fs.h |  5 +++++
 2 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index fb117b74809e..9bdd077d6f55 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -81,6 +81,30 @@ void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop
 	__proc_device_tree_add_prop(pde, prop);
 }
 
+void proc_device_tree_remove_prop(struct proc_dir_entry *pde,
+				  struct property *prop)
+{
+	remove_proc_entry(prop->name, pde);
+}
+
+void proc_device_tree_update_prop(struct proc_dir_entry *pde,
+				  struct property *newprop,
+				  struct property *oldprop)
+{
+	struct proc_dir_entry *ent;
+
+	for (ent = pde->subdir; ent != NULL; ent = ent->next)
+		if (ent->data == oldprop)
+			break;
+	if (ent == NULL) {
+		printk(KERN_WARNING "device-tree: property \"%s\" "
+		       " does not exist\n", oldprop->name);
+	} else {
+		ent->data = newprop;
+		ent->size = newprop->length;
+	}
+}
+
 /*
  * Process a node, adding entries for its children and its properties.
  */
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 74488e49166d..aa6322d45198 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -146,6 +146,11 @@ struct property;
 extern void proc_device_tree_init(void);
 extern void proc_device_tree_add_node(struct device_node *, struct proc_dir_entry *);
 extern void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop);
+extern void proc_device_tree_remove_prop(struct proc_dir_entry *pde,
+					 struct property *prop);
+extern void proc_device_tree_update_prop(struct proc_dir_entry *pde,
+					 struct property *newprop,
+					 struct property *oldprop);
 #endif /* CONFIG_PROC_DEVICETREE */
 
 extern struct proc_dir_entry *proc_symlink(const char *,
-- 
cgit v1.2.3-71-gd317


From 67daf5f11f06b9b15f8320de1d237ccc2e74fe43 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 13 Jan 2006 14:23:25 +1100
Subject: [PATCH] Increase AT_VECTOR_SIZE

On PowerPC, we want to be able to provide an AT_PLATFORM aux table
entry to userspace, so that glibc can choose optimized libraries for
the processor we're running on.  Unfortunately that would be the 21st
aux table entry on powerpc, meaning that the aux table including the
terminating null entry would overflow the mm->saved_auxv[] array,
leading to userland programs segfaulting.

This increases the size of the mm->saved_auxv array to be large enough
to accommodate an AT_PLATFORM entry on powerpc.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/auxvec.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/auxvec.h b/include/linux/auxvec.h
index 9a7b374c9fb4..d2bc0d66e65d 100644
--- a/include/linux/auxvec.h
+++ b/include/linux/auxvec.h
@@ -26,6 +26,6 @@
 
 #define AT_SECURE 23   /* secure mode boolean */
 
-#define AT_VECTOR_SIZE  42 /* Size of auxiliary table.  */
+#define AT_VECTOR_SIZE  44 /* Size of auxiliary table.  */
 
 #endif /* _LINUX_AUXVEC_H */
-- 
cgit v1.2.3-71-gd317


From 594c8281f90560faf9632d91bb9d402cbe560e63 Mon Sep 17 00:00:00 2001
From: Russell King <rmk@arm.linux.org.uk>
Date: Thu, 5 Jan 2006 14:29:51 +0000
Subject: [PATCH] Add bus_type probe, remove, shutdown methods.

Add bus_type probe, remove and shutdown methods to replace the
corresponding methods in struct device_driver.  This matches
the way we handle the suspend/resume methods.

Since the bus methods override the device_driver methods, warn
if a device driver is registered whose methods will not be
called.

The long-term idea is to remove the device_driver methods entirely.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/dd.c             | 12 ++++++++++--
 drivers/base/driver.c         |  5 +++++
 drivers/base/power/shutdown.c |  5 ++++-
 include/linux/device.h        |  3 +++
 4 files changed, 22 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 2b905016664d..730a9ce0a14a 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -78,7 +78,13 @@ int driver_probe_device(struct device_driver * drv, struct device * dev)
 	pr_debug("%s: Matched Device %s with Driver %s\n",
 		 drv->bus->name, dev->bus_id, drv->name);
 	dev->driver = drv;
-	if (drv->probe) {
+	if (dev->bus->probe) {
+		ret = dev->bus->probe(dev);
+		if (ret) {
+			dev->driver = NULL;
+			goto ProbeFailed;
+		}
+	} else if (drv->probe) {
 		ret = drv->probe(dev);
 		if (ret) {
 			dev->driver = NULL;
@@ -203,7 +209,9 @@ static void __device_release_driver(struct device * dev)
 		sysfs_remove_link(&dev->kobj, "driver");
 		klist_remove(&dev->knode_driver);
 
-		if (drv->remove)
+		if (dev->bus->remove)
+			dev->bus->remove(dev);
+		else if (drv->remove)
 			drv->remove(dev);
 		dev->driver = NULL;
 		put_driver(drv);
diff --git a/drivers/base/driver.c b/drivers/base/driver.c
index 161f3a390d90..b400314e1c62 100644
--- a/drivers/base/driver.c
+++ b/drivers/base/driver.c
@@ -171,6 +171,11 @@ static void klist_devices_put(struct klist_node *n)
  */
 int driver_register(struct device_driver * drv)
 {
+	if ((drv->bus->probe && drv->probe) ||
+	    (drv->bus->remove && drv->remove) ||
+	    (drv->bus->shutdown && drv->shutdown)) {
+		printk(KERN_WARNING "Driver '%s' needs updating - please use bus_type methods\n", drv->name);
+	}
 	klist_init(&drv->klist_devices, klist_devices_get, klist_devices_put);
 	init_completion(&drv->unloaded);
 	return bus_add_driver(drv);
diff --git a/drivers/base/power/shutdown.c b/drivers/base/power/shutdown.c
index f50a08be424b..a47bb74da72b 100644
--- a/drivers/base/power/shutdown.c
+++ b/drivers/base/power/shutdown.c
@@ -40,7 +40,10 @@ void device_shutdown(void)
 	down_write(&devices_subsys.rwsem);
 	list_for_each_entry_reverse(dev, &devices_subsys.kset.list,
 				kobj.entry) {
-		if (dev->driver && dev->driver->shutdown) {
+		if (dev->bus && dev->bus->shutdown) {
+			dev_dbg(dev, "shutdown\n");
+			dev->bus->shutdown(dev);
+		} else if (dev->driver && dev->driver->shutdown) {
 			dev_dbg(dev, "shutdown\n");
 			dev->driver->shutdown(dev);
 		}
diff --git a/include/linux/device.h b/include/linux/device.h
index 0cdee78e5ce1..58df18d9cd3e 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -49,6 +49,9 @@ struct bus_type {
 	int		(*match)(struct device * dev, struct device_driver * drv);
 	int		(*uevent)(struct device *dev, char **envp,
 				  int num_envp, char *buffer, int buffer_size);
+	int		(*probe)(struct device * dev);
+	int		(*remove)(struct device * dev);
+	void		(*shutdown)(struct device * dev);
 	int		(*suspend)(struct device * dev, pm_message_t state);
 	int		(*resume)(struct device * dev);
 };
-- 
cgit v1.2.3-71-gd317


From 4031bbe4bbec6c0fe50412ef7fb43a270b0f29f1 Mon Sep 17 00:00:00 2001
From: Russell King <rmk@arm.linux.org.uk>
Date: Fri, 6 Jan 2006 11:41:00 +0000
Subject: [PATCH] Add ide_bus_type probe and remove methods

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/ide/ide-cd.c     | 14 +++++---------
 drivers/ide/ide-disk.c   | 22 ++++++++--------------
 drivers/ide/ide-floppy.c | 14 +++++---------
 drivers/ide/ide-tape.c   | 18 +++++++-----------
 drivers/ide/ide.c        | 31 +++++++++++++++++++++++++++++++
 include/linux/ide.h      |  5 +++++
 6 files changed, 61 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 9b2ebd219ad0..ef09a7ef2396 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -3256,9 +3256,8 @@ sector_t ide_cdrom_capacity (ide_drive_t *drive)
 }
 #endif
 
-static int ide_cd_remove(struct device *dev)
+static void ide_cd_remove(ide_drive_t *drive)
 {
-	ide_drive_t *drive = to_ide_device(dev);
 	struct cdrom_info *info = drive->driver_data;
 
 	ide_unregister_subdriver(drive, info->driver);
@@ -3266,8 +3265,6 @@ static int ide_cd_remove(struct device *dev)
 	del_gendisk(info->disk);
 
 	ide_cd_put(info);
-
-	return 0;
 }
 
 static void ide_cd_release(struct kref *kref)
@@ -3291,7 +3288,7 @@ static void ide_cd_release(struct kref *kref)
 	kfree(info);
 }
 
-static int ide_cd_probe(struct device *);
+static int ide_cd_probe(ide_drive_t *);
 
 #ifdef CONFIG_PROC_FS
 static int proc_idecd_read_capacity
@@ -3317,9 +3314,9 @@ static ide_driver_t ide_cdrom_driver = {
 		.owner		= THIS_MODULE,
 		.name		= "ide-cdrom",
 		.bus		= &ide_bus_type,
-		.probe		= ide_cd_probe,
-		.remove		= ide_cd_remove,
 	},
+	.probe			= ide_cd_probe,
+	.remove			= ide_cd_remove,
 	.version		= IDECD_VERSION,
 	.media			= ide_cdrom,
 	.supports_dsc_overlap	= 1,
@@ -3413,9 +3410,8 @@ static char *ignore = NULL;
 module_param(ignore, charp, 0400);
 MODULE_DESCRIPTION("ATAPI CD-ROM Driver");
 
-static int ide_cd_probe(struct device *dev)
+static int ide_cd_probe(ide_drive_t *drive)
 {
-	ide_drive_t *drive = to_ide_device(dev);
 	struct cdrom_info *info;
 	struct gendisk *g;
 	struct request_sense sense;
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index cab362ea0336..245b508208df 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -997,9 +997,8 @@ static void ide_cacheflush_p(ide_drive_t *drive)
 		printk(KERN_INFO "%s: wcache flush failed!\n", drive->name);
 }
 
-static int ide_disk_remove(struct device *dev)
+static void ide_disk_remove(ide_drive_t *drive)
 {
-	ide_drive_t *drive = to_ide_device(dev);
 	struct ide_disk_obj *idkp = drive->driver_data;
 	struct gendisk *g = idkp->disk;
 
@@ -1010,8 +1009,6 @@ static int ide_disk_remove(struct device *dev)
 	ide_cacheflush_p(drive);
 
 	ide_disk_put(idkp);
-
-	return 0;
 }
 
 static void ide_disk_release(struct kref *kref)
@@ -1027,12 +1024,10 @@ static void ide_disk_release(struct kref *kref)
 	kfree(idkp);
 }
 
-static int ide_disk_probe(struct device *dev);
+static int ide_disk_probe(ide_drive_t *drive);
 
-static void ide_device_shutdown(struct device *dev)
+static void ide_device_shutdown(ide_drive_t *drive)
 {
-	ide_drive_t *drive = container_of(dev, ide_drive_t, gendev);
-
 #ifdef	CONFIG_ALPHA
 	/* On Alpha, halt(8) doesn't actually turn the machine off,
 	   it puts you into the sort of firmware monitor. Typically,
@@ -1054,7 +1049,7 @@ static void ide_device_shutdown(struct device *dev)
 	}
 
 	printk("Shutdown: %s\n", drive->name);
-	dev->bus->suspend(dev, PMSG_SUSPEND);
+	drive->gendev.bus->suspend(&drive->gendev, PMSG_SUSPEND);
 }
 
 static ide_driver_t idedisk_driver = {
@@ -1062,10 +1057,10 @@ static ide_driver_t idedisk_driver = {
 		.owner		= THIS_MODULE,
 		.name		= "ide-disk",
 		.bus		= &ide_bus_type,
-		.probe		= ide_disk_probe,
-		.remove		= ide_disk_remove,
-		.shutdown	= ide_device_shutdown,
 	},
+	.probe			= ide_disk_probe,
+	.remove			= ide_disk_remove,
+	.shutdown		= ide_device_shutdown,
 	.version		= IDEDISK_VERSION,
 	.media			= ide_disk,
 	.supports_dsc_overlap	= 0,
@@ -1182,9 +1177,8 @@ static struct block_device_operations idedisk_ops = {
 
 MODULE_DESCRIPTION("ATA DISK Driver");
 
-static int ide_disk_probe(struct device *dev)
+static int ide_disk_probe(ide_drive_t *drive)
 {
-	ide_drive_t *drive = to_ide_device(dev);
 	struct ide_disk_obj *idkp;
 	struct gendisk *g;
 
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 5945f551aaaa..1f8db9ac05d1 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -1871,9 +1871,8 @@ static void idefloppy_setup (ide_drive_t *drive, idefloppy_floppy_t *floppy)
 	idefloppy_add_settings(drive);
 }
 
-static int ide_floppy_remove(struct device *dev)
+static void ide_floppy_remove(ide_drive_t *drive)
 {
-	ide_drive_t *drive = to_ide_device(dev);
 	idefloppy_floppy_t *floppy = drive->driver_data;
 	struct gendisk *g = floppy->disk;
 
@@ -1882,8 +1881,6 @@ static int ide_floppy_remove(struct device *dev)
 	del_gendisk(g);
 
 	ide_floppy_put(floppy);
-
-	return 0;
 }
 
 static void ide_floppy_release(struct kref *kref)
@@ -1922,16 +1919,16 @@ static ide_proc_entry_t idefloppy_proc[] = {
 
 #endif	/* CONFIG_PROC_FS */
 
-static int ide_floppy_probe(struct device *);
+static int ide_floppy_probe(ide_drive_t *);
 
 static ide_driver_t idefloppy_driver = {
 	.gen_driver = {
 		.owner		= THIS_MODULE,
 		.name		= "ide-floppy",
 		.bus		= &ide_bus_type,
-		.probe		= ide_floppy_probe,
-		.remove		= ide_floppy_remove,
 	},
+	.probe			= ide_floppy_probe,
+	.remove			= ide_floppy_remove,
 	.version		= IDEFLOPPY_VERSION,
 	.media			= ide_floppy,
 	.supports_dsc_overlap	= 0,
@@ -2136,9 +2133,8 @@ static struct block_device_operations idefloppy_ops = {
 	.revalidate_disk= idefloppy_revalidate_disk
 };
 
-static int ide_floppy_probe(struct device *dev)
+static int ide_floppy_probe(ide_drive_t *drive)
 {
-	ide_drive_t *drive = to_ide_device(dev);
 	idefloppy_floppy_t *floppy;
 	struct gendisk *g;
 
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index fab9b2b02504..0101d0def7c5 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -4682,9 +4682,8 @@ static void idetape_setup (ide_drive_t *drive, idetape_tape_t *tape, int minor)
 	idetape_add_settings(drive);
 }
 
-static int ide_tape_remove(struct device *dev)
+static void ide_tape_remove(ide_drive_t *drive)
 {
-	ide_drive_t *drive = to_ide_device(dev);
 	idetape_tape_t *tape = drive->driver_data;
 
 	ide_unregister_subdriver(drive, tape->driver);
@@ -4692,8 +4691,6 @@ static int ide_tape_remove(struct device *dev)
 	ide_unregister_region(tape->disk);
 
 	ide_tape_put(tape);
-
-	return 0;
 }
 
 static void ide_tape_release(struct kref *kref)
@@ -4745,16 +4742,16 @@ static ide_proc_entry_t idetape_proc[] = {
 
 #endif
 
-static int ide_tape_probe(struct device *);
+static int ide_tape_probe(ide_drive_t *);
 
 static ide_driver_t idetape_driver = {
 	.gen_driver = {
 		.owner		= THIS_MODULE,
 		.name		= "ide-tape",
 		.bus		= &ide_bus_type,
-		.probe		= ide_tape_probe,
-		.remove		= ide_tape_remove,
 	},
+	.probe			= ide_tape_probe,
+	.remove			= ide_tape_remove,
 	.version		= IDETAPE_VERSION,
 	.media			= ide_tape,
 	.supports_dsc_overlap 	= 1,
@@ -4825,9 +4822,8 @@ static struct block_device_operations idetape_block_ops = {
 	.ioctl		= idetape_ioctl,
 };
 
-static int ide_tape_probe(struct device *dev)
+static int ide_tape_probe(ide_drive_t *drive)
 {
-	ide_drive_t *drive = to_ide_device(dev);
 	idetape_tape_t *tape;
 	struct gendisk *g;
 	int minor;
@@ -4883,9 +4879,9 @@ static int ide_tape_probe(struct device *dev)
 	idetape_setup(drive, tape, minor);
 
 	class_device_create(idetape_sysfs_class, NULL,
-			MKDEV(IDETAPE_MAJOR, minor), dev, "%s", tape->name);
+			MKDEV(IDETAPE_MAJOR, minor), &drive->gendev, "%s", tape->name);
 	class_device_create(idetape_sysfs_class, NULL,
-			MKDEV(IDETAPE_MAJOR, minor + 128), dev, "n%s", tape->name);
+			MKDEV(IDETAPE_MAJOR, minor + 128), &drive->gendev, "n%s", tape->name);
 
 	devfs_mk_cdev(MKDEV(HWIF(drive)->major, minor),
 			S_IFCHR | S_IRUGO | S_IWUGO,
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index ec5a4cb173b0..afeb02bbb722 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -1949,10 +1949,41 @@ static int ide_uevent(struct device *dev, char **envp, int num_envp,
 	return 0;
 }
 
+static int generic_ide_probe(struct device *dev)
+{
+	ide_drive_t *drive = to_ide_device(dev);
+	ide_driver_t *drv = to_ide_driver(dev->driver);
+
+	return drv->probe ? drv->probe(drive) : -ENODEV;
+}
+
+static int generic_ide_remove(struct device *dev)
+{
+	ide_drive_t *drive = to_ide_device(dev);
+	ide_driver_t *drv = to_ide_driver(dev->driver);
+
+	if (drv->remove)
+		drv->remove(drive);
+
+	return 0;
+}
+
+static void generic_ide_shutdown(struct device *dev)
+{
+	ide_drive_t *drive = to_ide_device(dev);
+	ide_driver_t *drv = to_ide_driver(dev->driver);
+
+	if (dev->driver && drv->shutdown)
+		drv->shutdown(drive);
+}
+
 struct bus_type ide_bus_type = {
 	.name		= "ide",
 	.match		= ide_bus_match,
 	.uevent		= ide_uevent,
+	.probe		= generic_ide_probe,
+	.remove		= generic_ide_remove,
+	.shutdown	= generic_ide_shutdown,
 	.dev_attrs	= ide_dev_attrs,
 	.suspend	= generic_ide_suspend,
 	.resume		= generic_ide_resume,
diff --git a/include/linux/ide.h b/include/linux/ide.h
index f2e1b5b22898..110b3cfac021 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -983,8 +983,13 @@ typedef struct ide_driver_s {
 	ide_startstop_t	(*abort)(ide_drive_t *, struct request *rq);
 	ide_proc_entry_t	*proc;
 	struct device_driver	gen_driver;
+	int		(*probe)(ide_drive_t *);
+	void		(*remove)(ide_drive_t *);
+	void		(*shutdown)(ide_drive_t *);
 } ide_driver_t;
 
+#define to_ide_driver(drv) container_of(drv, ide_driver_t, gen_driver)
+
 int generic_ide_ioctl(ide_drive_t *, struct file *, struct block_device *, unsigned, unsigned long);
 
 /*
-- 
cgit v1.2.3-71-gd317


From e2862f6a833ea26591c7feb755dc2e46909182a6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Jan 2006 21:37:07 +0000
Subject: [SERIAL] convert uart_state.sem to uart_state.mutex

semaphore to mutex conversion.

the conversion was generated via scripts, and the result was validated
automatically via a script as well.

build and boot tested.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/serial/pmac_zilog.c  |  9 +++----
 drivers/serial/serial_core.c | 60 ++++++++++++++++++++++----------------------
 include/linux/serial_core.h  |  3 ++-
 3 files changed, 36 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/pmac_zilog.c b/drivers/serial/pmac_zilog.c
index 5f52883e64d2..4e03a87f3fb4 100644
--- a/drivers/serial/pmac_zilog.c
+++ b/drivers/serial/pmac_zilog.c
@@ -69,7 +69,6 @@
 #include <asm/pmac_feature.h>
 #include <asm/dbdma.h>
 #include <asm/macio.h>
-#include <asm/semaphore.h>
 
 #if defined (CONFIG_SERIAL_PMACZILOG_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ)
 #define SUPPORT_SYSRQ
@@ -1593,7 +1592,7 @@ static int pmz_suspend(struct macio_dev *mdev, pm_message_t pm_state)
 	state = pmz_uart_reg.state + uap->port.line;
 
 	mutex_lock(&pmz_irq_mutex);
-	down(&state->sem);
+	mutex_lock(&state->mutex);
 
 	spin_lock_irqsave(&uap->port.lock, flags);
 
@@ -1624,7 +1623,7 @@ static int pmz_suspend(struct macio_dev *mdev, pm_message_t pm_state)
 	/* Shut the chip down */
 	pmz_set_scc_power(uap, 0);
 
-	up(&state->sem);
+	mutex_unlock(&state->mutex);
 	mutex_unlock(&pmz_irq_mutex);
 
 	pmz_debug("suspend, switching complete\n");
@@ -1653,7 +1652,7 @@ static int pmz_resume(struct macio_dev *mdev)
 	state = pmz_uart_reg.state + uap->port.line;
 
 	mutex_lock(&pmz_irq_mutex);
-	down(&state->sem);
+	mutex_lock(&state->mutex);
 
 	spin_lock_irqsave(&uap->port.lock, flags);
 	if (!ZS_IS_OPEN(uap) && !ZS_IS_CONS(uap)) {
@@ -1685,7 +1684,7 @@ static int pmz_resume(struct macio_dev *mdev)
 	}
 
  bail:
-	up(&state->sem);
+	mutex_unlock(&state->mutex);
 	mutex_unlock(&pmz_irq_mutex);
 
 	/* Right now, we deal with delay by blocking here, I'll be
diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 2ca620900bcc..943770470b9d 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -638,7 +638,7 @@ static int uart_set_info(struct uart_state *state,
 	 * module insertion/removal doesn't change anything
 	 * under us.
 	 */
-	down(&state->sem);
+	mutex_lock(&state->mutex);
 
 	change_irq  = new_serial.irq != port->irq;
 
@@ -797,7 +797,7 @@ static int uart_set_info(struct uart_state *state,
 	} else
 		retval = uart_startup(state, 1);
  exit:
-	up(&state->sem);
+	mutex_unlock(&state->mutex);
 	return retval;
 }
 
@@ -834,7 +834,7 @@ static int uart_tiocmget(struct tty_struct *tty, struct file *file)
 	struct uart_port *port = state->port;
 	int result = -EIO;
 
-	down(&state->sem);
+	mutex_lock(&state->mutex);
 	if ((!file || !tty_hung_up_p(file)) &&
 	    !(tty->flags & (1 << TTY_IO_ERROR))) {
 		result = port->mctrl;
@@ -843,7 +843,7 @@ static int uart_tiocmget(struct tty_struct *tty, struct file *file)
 		result |= port->ops->get_mctrl(port);
 		spin_unlock_irq(&port->lock);
 	}
-	up(&state->sem);
+	mutex_unlock(&state->mutex);
 
 	return result;
 }
@@ -856,13 +856,13 @@ uart_tiocmset(struct tty_struct *tty, struct file *file,
 	struct uart_port *port = state->port;
 	int ret = -EIO;
 
-	down(&state->sem);
+	mutex_lock(&state->mutex);
 	if ((!file || !tty_hung_up_p(file)) &&
 	    !(tty->flags & (1 << TTY_IO_ERROR))) {
 		uart_update_mctrl(port, set, clear);
 		ret = 0;
 	}
-	up(&state->sem);
+	mutex_unlock(&state->mutex);
 	return ret;
 }
 
@@ -873,12 +873,12 @@ static void uart_break_ctl(struct tty_struct *tty, int break_state)
 
 	BUG_ON(!kernel_locked());
 
-	down(&state->sem);
+	mutex_lock(&state->mutex);
 
 	if (port->type != PORT_UNKNOWN)
 		port->ops->break_ctl(port, break_state);
 
-	up(&state->sem);
+	mutex_unlock(&state->mutex);
 }
 
 static int uart_do_autoconfig(struct uart_state *state)
@@ -894,7 +894,7 @@ static int uart_do_autoconfig(struct uart_state *state)
 	 * changing, and hence any extra opens of the port while
 	 * we're auto-configuring.
 	 */
-	if (down_interruptible(&state->sem))
+	if (mutex_lock_interruptible(&state->mutex))
 		return -ERESTARTSYS;
 
 	ret = -EBUSY;
@@ -920,7 +920,7 @@ static int uart_do_autoconfig(struct uart_state *state)
 
 		ret = uart_startup(state, 1);
 	}
-	up(&state->sem);
+	mutex_unlock(&state->mutex);
 	return ret;
 }
 
@@ -1074,7 +1074,7 @@ uart_ioctl(struct tty_struct *tty, struct file *filp, unsigned int cmd,
 	if (ret != -ENOIOCTLCMD)
 		goto out;
 
-	down(&state->sem);
+	mutex_lock(&state->mutex);
 
 	if (tty_hung_up_p(filp)) {
 		ret = -EIO;
@@ -1098,7 +1098,7 @@ uart_ioctl(struct tty_struct *tty, struct file *filp, unsigned int cmd,
 	}
 	}
  out_up:
-	up(&state->sem);
+	mutex_unlock(&state->mutex);
  out:
 	return ret;
 }
@@ -1186,7 +1186,7 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 
 	DPRINTK("uart_close(%d) called\n", port->line);
 
-	down(&state->sem);
+	mutex_lock(&state->mutex);
 
 	if (tty_hung_up_p(filp))
 		goto done;
@@ -1260,7 +1260,7 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 	wake_up_interruptible(&state->info->open_wait);
 
  done:
-	up(&state->sem);
+	mutex_unlock(&state->mutex);
 }
 
 static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
@@ -1334,7 +1334,7 @@ static void uart_hangup(struct tty_struct *tty)
 	BUG_ON(!kernel_locked());
 	DPRINTK("uart_hangup(%d)\n", state->port->line);
 
-	down(&state->sem);
+	mutex_lock(&state->mutex);
 	if (state->info && state->info->flags & UIF_NORMAL_ACTIVE) {
 		uart_flush_buffer(tty);
 		uart_shutdown(state);
@@ -1344,7 +1344,7 @@ static void uart_hangup(struct tty_struct *tty)
 		wake_up_interruptible(&state->info->open_wait);
 		wake_up_interruptible(&state->info->delta_msr_wait);
 	}
-	up(&state->sem);
+	mutex_unlock(&state->mutex);
 }
 
 /*
@@ -1447,9 +1447,9 @@ uart_block_til_ready(struct file *filp, struct uart_state *state)
 		if (mctrl & TIOCM_CAR)
 			break;
 
-		up(&state->sem);
+		mutex_unlock(&state->mutex);
 		schedule();
-		down(&state->sem);
+		mutex_lock(&state->mutex);
 
 		if (signal_pending(current))
 			break;
@@ -1475,7 +1475,7 @@ static struct uart_state *uart_get(struct uart_driver *drv, int line)
 
 	mutex_lock(&port_mutex);
 	state = drv->state + line;
-	if (down_interruptible(&state->sem)) {
+	if (mutex_lock_interruptible(&state->mutex)) {
 		state = ERR_PTR(-ERESTARTSYS);
 		goto out;
 	}
@@ -1483,7 +1483,7 @@ static struct uart_state *uart_get(struct uart_driver *drv, int line)
 	state->count++;
 	if (!state->port) {
 		state->count--;
-		up(&state->sem);
+		mutex_unlock(&state->mutex);
 		state = ERR_PTR(-ENXIO);
 		goto out;
 	}
@@ -1504,7 +1504,7 @@ static struct uart_state *uart_get(struct uart_driver *drv, int line)
 				     (unsigned long)state);
 		} else {
 			state->count--;
-			up(&state->sem);
+			mutex_unlock(&state->mutex);
 			state = ERR_PTR(-ENOMEM);
 		}
 	}
@@ -1571,7 +1571,7 @@ static int uart_open(struct tty_struct *tty, struct file *filp)
 	if (tty_hung_up_p(filp)) {
 		retval = -EAGAIN;
 		state->count--;
-		up(&state->sem);
+		mutex_unlock(&state->mutex);
 		goto fail;
 	}
 
@@ -1591,7 +1591,7 @@ static int uart_open(struct tty_struct *tty, struct file *filp)
 	 */
 	if (retval == 0)
 		retval = uart_block_til_ready(filp, state);
-	up(&state->sem);
+	mutex_unlock(&state->mutex);
 
 	/*
 	 * If this is the first open to succeed, adjust things to suit.
@@ -1867,7 +1867,7 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *port)
 {
 	struct uart_state *state = drv->state + port->line;
 
-	down(&state->sem);
+	mutex_lock(&state->mutex);
 
 	if (state->info && state->info->flags & UIF_INITIALIZED) {
 		struct uart_ops *ops = port->ops;
@@ -1896,7 +1896,7 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *port)
 
 	uart_change_pm(state, 3);
 
-	up(&state->sem);
+	mutex_unlock(&state->mutex);
 
 	return 0;
 }
@@ -1905,7 +1905,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *port)
 {
 	struct uart_state *state = drv->state + port->line;
 
-	down(&state->sem);
+	mutex_lock(&state->mutex);
 
 	uart_change_pm(state, 0);
 
@@ -1954,7 +1954,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *port)
 		}
 	}
 
-	up(&state->sem);
+	mutex_unlock(&state->mutex);
 
 	return 0;
 }
@@ -2049,7 +2049,7 @@ uart_unconfigure_port(struct uart_driver *drv, struct uart_state *state)
 	if (info && info->tty)
 		tty_vhangup(info->tty);
 
-	down(&state->sem);
+	mutex_lock(&state->mutex);
 
 	state->info = NULL;
 
@@ -2072,7 +2072,7 @@ uart_unconfigure_port(struct uart_driver *drv, struct uart_state *state)
 		kfree(info);
 	}
 
-	up(&state->sem);
+	mutex_unlock(&state->mutex);
 }
 
 static struct tty_operations uart_ops = {
@@ -2161,7 +2161,7 @@ int uart_register_driver(struct uart_driver *drv)
 		state->close_delay     = 500;	/* .5 seconds */
 		state->closing_wait    = 30000;	/* 30 seconds */
 
-		init_MUTEX(&state->sem);
+		mutex_init(&state->mutex);
 	}
 
 	retval = tty_register_driver(normal);
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index a8187c3c8a7b..ec351005bf9d 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -136,6 +136,7 @@
 #include <linux/spinlock.h>
 #include <linux/sched.h>
 #include <linux/tty.h>
+#include <linux/mutex.h>
 
 struct uart_port;
 struct uart_info;
@@ -284,7 +285,7 @@ struct uart_state {
 	struct uart_info	*info;
 	struct uart_port	*port;
 
-	struct semaphore	sem;
+	struct mutex		mutex;
 };
 
 #define UART_XMIT_SIZE	PAGE_SIZE
-- 
cgit v1.2.3-71-gd317


From 46b86a2da0fd14bd49765330df63a62279833acb Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Fri, 13 Jan 2006 14:29:07 -0800
Subject: [NET]: Use NIP6_FMT in kernel.h

There are errors and inconsistency in the display of NIP6 strings.
	ie: net/ipv6/ip6_flowlabel.c

There are errors and inconsistency in the display of NIPQUAD strings too.
	ie: net/netfilter/nf_conntrack_ftp.c

This patch:
	adds NIP6_FMT to kernel.h
	changes all code to use NIP6_FMT
	fixes net/ipv6/ip6_flowlabel.c
	adds NIPQUAD_FMT to kernel.h
	fixes net/netfilter/nf_conntrack_ftp.c
	changes a few uses of "%u.%u.%u.%u" to NIPQUAD_FMT for symmetry to NIP6_FMT

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/kernel.h                         |  2 ++
 include/net/netfilter/nf_conntrack_tuple.h     |  2 +-
 include/net/sctp/sctp.h                        |  4 ++--
 net/ipv6/addrconf.c                            |  2 +-
 net/ipv6/ah6.c                                 |  3 +--
 net/ipv6/anycast.c                             |  4 +---
 net/ipv6/esp6.c                                |  3 +--
 net/ipv6/icmp.c                                |  2 +-
 net/ipv6/ip6_flowlabel.c                       |  8 +++-----
 net/ipv6/ipcomp6.c                             |  3 +--
 net/ipv6/mcast.c                               |  9 +++------
 net/ipv6/ndisc.c                               |  2 +-
 net/ipv6/netfilter/ip6t_LOG.c                  |  5 ++---
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |  2 +-
 net/ipv6/xfrm6_tunnel.c                        |  8 +++-----
 net/netfilter/nf_conntrack_ftp.c               |  4 ++--
 net/sctp/ipv6.c                                | 24 +++++++++---------------
 net/sctp/sm_statefuns.c                        |  4 ++--
 security/selinux/avc.c                         |  5 ++---
 19 files changed, 39 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e6ee2d95da7a..323924edb26a 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -216,6 +216,7 @@ extern void dump_stack(void);
 	((unsigned char *)&addr)[1], \
 	((unsigned char *)&addr)[2], \
 	((unsigned char *)&addr)[3]
+#define NIPQUAD_FMT "%u.%u.%u.%u"
 
 #define NIP6(addr) \
 	ntohs((addr).s6_addr16[0]), \
@@ -226,6 +227,7 @@ extern void dump_stack(void);
 	ntohs((addr).s6_addr16[5]), \
 	ntohs((addr).s6_addr16[6]), \
 	ntohs((addr).s6_addr16[7])
+#define NIP6_FMT "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x"
 
 #if defined(__LITTLE_ENDIAN)
 #define HIPQUAD(addr) \
diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h
index 14ce790e5c65..530ef1f75283 100644
--- a/include/net/netfilter/nf_conntrack_tuple.h
+++ b/include/net/netfilter/nf_conntrack_tuple.h
@@ -111,7 +111,7 @@ struct nf_conntrack_tuple
 #ifdef __KERNEL__
 
 #define NF_CT_DUMP_TUPLE(tp)						    \
-DEBUGP("tuple %p: %u %u %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x %hu -> %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x %hu\n",					    \
+DEBUGP("tuple %p: %u %u " NIP6_FMT " %hu -> " NIP6_FMT " %hu\n",	    \
 	(tp), (tp)->src.l3num, (tp)->dst.protonum,			    \
 	NIP6(*(struct in6_addr *)(tp)->src.u3.all), ntohs((tp)->src.u.all), \
 	NIP6(*(struct in6_addr *)(tp)->dst.u3.all), ntohs((tp)->dst.u.all))
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 8f241216f46b..a553f39f6aee 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -225,13 +225,13 @@ extern int sctp_debug_flag;
 	if (sctp_debug_flag) { \
 		if (saddr->sa.sa_family == AF_INET6) { \
 			printk(KERN_DEBUG \
-			       lead "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x" trail, \
+			       lead NIP6_FMT trail, \
 			       leadparm, \
 			       NIP6(saddr->v6.sin6_addr), \
 			       otherparms); \
 		} else { \
 			printk(KERN_DEBUG \
-			       lead "%u.%u.%u.%u" trail, \
+			       lead NIPQUAD_FMT trail, \
 			       leadparm, \
 			       NIPQUAD(saddr->v4.sin_addr.s_addr), \
 			       otherparms); \
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 7129d4239755..dfb4f145a139 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2644,7 +2644,7 @@ static int if6_seq_show(struct seq_file *seq, void *v)
 {
 	struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v;
 	seq_printf(seq,
-		   "%04x%04x%04x%04x%04x%04x%04x%04x %02x %02x %02x %02x %8s\n",
+		   NIP6_FMT " %02x %02x %02x %02x %8s\n",
 		   NIP6(ifp->addr),
 		   ifp->idev->dev->ifindex,
 		   ifp->prefix_len,
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 13cc7f895583..c7932cb420a5 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -332,8 +332,7 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (!x)
 		return;
 
-	NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/"
-		 "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+	NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/" NIP6_FMT "\n",
 		 ntohl(ah->spi), NIP6(iph->daddr));
 
 	xfrm_state_put(x);
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 65e73ac0d6d0..72bd08af2dfb 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -532,9 +532,7 @@ static int ac6_seq_show(struct seq_file *seq, void *v)
 	struct ac6_iter_state *state = ac6_seq_private(seq);
 
 	seq_printf(seq,
-		   "%-4d %-15s "
-		   "%04x%04x%04x%04x%04x%04x%04x%04x "
-		   "%5d\n",
+		   "%-4d %-15s " NIP6_FMT " %5d\n",
 		   state->dev->ifindex, state->dev->name,
 		   NIP6(im->aca_addr),
 		   im->aca_users);
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 6de8ee1a5ad9..7b5b94f13902 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -266,8 +266,7 @@ static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET6);
 	if (!x)
 		return;
-	printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/"
-			"%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", 
+	printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/" NIP6_FMT "\n", 
 			ntohl(esph->spi), NIP6(iph->daddr));
 	xfrm_state_put(x);
 }
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 53c81fcd20ba..fcf883183cef 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -607,7 +607,7 @@ static int icmpv6_rcv(struct sk_buff **pskb)
 		skb->csum = ~csum_ipv6_magic(saddr, daddr, skb->len,
 					     IPPROTO_ICMPV6, 0);
 		if (__skb_checksum_complete(skb)) {
-			LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n",
+			LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [" NIP6_FMT " > " NIP6_FMT "]\n",
 				       NIP6(*saddr), NIP6(*daddr));
 			goto discard_it;
 		}
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 964ad9d1276d..4183c8dac7f6 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -629,9 +629,7 @@ static void ip6fl_fl_seq_show(struct seq_file *seq, struct ip6_flowlabel *fl)
 {
 	while(fl) {
 		seq_printf(seq,
-			   "%05X %-1d %-6d %-6d %-6ld %-8ld "
-			   "%02x%02x%02x%02x%02x%02x%02x%02x "
-			   "%-4d\n",
+			   "%05X %-1d %-6d %-6d %-6ld %-8ld " NIP6_FMT " %-4d\n",
 			   (unsigned)ntohl(fl->label),
 			   fl->share,
 			   (unsigned)fl->owner,
@@ -647,8 +645,8 @@ static void ip6fl_fl_seq_show(struct seq_file *seq, struct ip6_flowlabel *fl)
 static int ip6fl_seq_show(struct seq_file *seq, void *v)
 {
 	if (v == SEQ_START_TOKEN)
-		seq_puts(seq, "Label S Owner  Users  Linger Expires  "
-			      "Dst                              Opt\n");
+		seq_printf(seq, "%-5s %-1s %-6s %-6s %-6s %-8s %-39s %s\n",
+			   "Label", "S", "Owner", "Users", "Linger", "Expires", "Dst", "Opt");
 	else
 		ip6fl_fl_seq_show(seq, v);
 	return 0;
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 626dd39685f2..d511a884dad0 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -212,8 +212,7 @@ static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (!x)
 		return;
 
-	printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/"
-			"%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+	printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/" NIP6_FMT "\n",
 			spi, NIP6(iph->daddr));
 	xfrm_state_put(x);
 }
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index cc3e9f560867..0e03eabfb9da 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -2373,7 +2373,7 @@ static int igmp6_mc_seq_show(struct seq_file *seq, void *v)
 	struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
 
 	seq_printf(seq,
-		   "%-4d %-15s %04x%04x%04x%04x%04x%04x%04x%04x %5d %08X %ld\n", 
+		   "%-4d %-15s " NIP6_FMT " %5d %08X %ld\n", 
 		   state->dev->ifindex, state->dev->name,
 		   NIP6(im->mca_addr),
 		   im->mca_users, im->mca_flags,
@@ -2542,15 +2542,12 @@ static int igmp6_mcf_seq_show(struct seq_file *seq, void *v)
 	if (v == SEQ_START_TOKEN) {
 		seq_printf(seq, 
 			   "%3s %6s "
-			   "%32s %32s %6s %6s\n", "Idx",
+			   "%39s %39s %6s %6s\n", "Idx",
 			   "Device", "Multicast Address",
 			   "Source Address", "INC", "EXC");
 	} else {
 		seq_printf(seq,
-			   "%3d %6.6s "
-			   "%04x%04x%04x%04x%04x%04x%04x%04x "
-			   "%04x%04x%04x%04x%04x%04x%04x%04x "
-			   "%6lu %6lu\n",
+			   "%3d %6.6s " NIP6_FMT " " NIP6_FMT " %6lu %6lu\n",
 			   state->dev->ifindex, state->dev->name,
 			   NIP6(state->im->mca_addr),
 			   NIP6(psf->sf_addr),
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 305d9ee6d7db..cb8856b1d951 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -692,7 +692,7 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
 		if (!(neigh->nud_state & NUD_VALID)) {
 			ND_PRINTK1(KERN_DEBUG
 				   "%s(): trying to ucast probe in NUD_INVALID: "
-				   "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+				   NIP6_FMT "\n",
 				   __FUNCTION__,
 				   NIP6(*target));
 		}
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index fc19d8a4ca68..77c725832dec 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -63,9 +63,8 @@ static void dump_packet(const struct nf_loginfo *info,
 		return;
 	}
 
-	/* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000" */
-	printk("SRC=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", NIP6(ih->saddr));
-	printk("DST=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", NIP6(ih->daddr));
+	/* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
+	printk("SRC=" NIP6_FMT " DST=" NIP6_FMT " ", NIP6(ih->saddr), NIP6(ih->daddr));
 
 	/* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
 	printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 92ad53d8f8c3..ac702a29dd16 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -74,7 +74,7 @@ static int ipv6_invert_tuple(struct nf_conntrack_tuple *tuple,
 static int ipv6_print_tuple(struct seq_file *s,
 			    const struct nf_conntrack_tuple *tuple)
 {
-	return seq_printf(s, "src=%x:%x:%x:%x:%x:%x:%x:%x dst=%x:%x:%x:%x:%x:%x:%x:%x ",
+	return seq_printf(s, "src=" NIP6_FMT " dst=" NIP6_FMT " ",
 			  NIP6(*((struct in6_addr *)tuple->src.u3.ip6)),
 			  NIP6(*((struct in6_addr *)tuple->dst.u3.ip6)));
 }
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index da09ff258648..8cfc58b96fc2 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -259,8 +259,7 @@ try_next_2:;
 	spi = 0;
 	goto out;
 alloc_spi:
-	X6TPRINTK3(KERN_DEBUG "%s(): allocate new spi for "
-			      "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", 
+	X6TPRINTK3(KERN_DEBUG "%s(): allocate new spi for " NIP6_FMT "\n",
 			      __FUNCTION__, 
 			      NIP6(*(struct in6_addr *)saddr));
 	x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, SLAB_ATOMIC);
@@ -323,9 +322,8 @@ void xfrm6_tunnel_free_spi(xfrm_address_t *saddr)
 				  list_byaddr)
 	{
 		if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) {
-			X6TPRINTK3(KERN_DEBUG "%s(): x6spi object "
-					      "for %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x "
-					      "found at %p\n",
+			X6TPRINTK3(KERN_DEBUG "%s(): x6spi object for " NIP6_FMT 
+					      " found at %p\n",
 				   __FUNCTION__, 
 				   NIP6(*(struct in6_addr *)saddr),
 				   x6spi);
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index d5a6eaf4a1de..ab0c920f0d30 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -545,11 +545,11 @@ static int help(struct sk_buff **pskb,
                    different IP address.  Simply don't record it for
                    NAT. */
 		if (cmd.l3num == PF_INET) {
-                	DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n",
+                	DEBUGP("conntrack_ftp: NOT RECORDING: " NIPQUAD_FMT " != " NIPQUAD_FMT "\n",
 			       NIPQUAD(cmd.u3.ip),
 			       NIPQUAD(ct->tuplehash[dir].tuple.src.u3.ip));
 		} else {
-			DEBUGP("conntrack_ftp: NOT RECORDING: %x:%x:%x:%x:%x:%x:%x:%x != %x:%x:%x:%x:%x:%x:%x:%x\n",
+			DEBUGP("conntrack_ftp: NOT RECORDING: " NIP6_FMT " != " NIP6_FMT "\n",
 			       NIP6(*((struct in6_addr *)cmd.u3.ip6)),
 			       NIP6(*((struct in6_addr *)ct->tuplehash[dir]
 							.tuple.src.u3.ip6)));
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 04c7fab4edc4..2e266129a764 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -180,8 +180,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport,
 	}
 
 	SCTP_DEBUG_PRINTK("%s: skb:%p, len:%d, "
-			  "src:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x "
-			  "dst:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+			  "src:" NIP6_FMT " dst:" NIP6_FMT "\n",
 			  __FUNCTION__, skb, skb->len,
 			  NIP6(fl.fl6_src), NIP6(fl.fl6_dst));
 
@@ -206,13 +205,13 @@ static struct dst_entry *sctp_v6_get_dst(struct sctp_association *asoc,
 		fl.oif = daddr->v6.sin6_scope_id;
 	
 
-	SCTP_DEBUG_PRINTK("%s: DST=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ",
+	SCTP_DEBUG_PRINTK("%s: DST=" NIP6_FMT " ",
 			  __FUNCTION__, NIP6(fl.fl6_dst));
 
 	if (saddr) {
 		ipv6_addr_copy(&fl.fl6_src, &saddr->v6.sin6_addr);
 		SCTP_DEBUG_PRINTK(
-			"SRC=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x - ",
+			"SRC=" NIP6_FMT " - ",
 			NIP6(fl.fl6_src));
 	}
 
@@ -221,8 +220,7 @@ static struct dst_entry *sctp_v6_get_dst(struct sctp_association *asoc,
 		struct rt6_info *rt;
 		rt = (struct rt6_info *)dst;
 		SCTP_DEBUG_PRINTK(
-			"rt6_dst:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x "
-			"rt6_src:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+			"rt6_dst:" NIP6_FMT " rt6_src:" NIP6_FMT "\n",
 			NIP6(rt->rt6i_dst.addr), NIP6(rt->rt6i_src.addr));
 	} else {
 		SCTP_DEBUG_PRINTK("NO ROUTE\n");
@@ -271,13 +269,12 @@ static void sctp_v6_get_saddr(struct sctp_association *asoc,
 	__u8 bmatchlen;
 
 	SCTP_DEBUG_PRINTK("%s: asoc:%p dst:%p "
-			  "daddr:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ",
+			  "daddr:" NIP6_FMT " ",
 			  __FUNCTION__, asoc, dst, NIP6(daddr->v6.sin6_addr));
 
 	if (!asoc) {
 		ipv6_get_saddr(dst, &daddr->v6.sin6_addr,&saddr->v6.sin6_addr);
-		SCTP_DEBUG_PRINTK("saddr from ipv6_get_saddr: "
-				  "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+		SCTP_DEBUG_PRINTK("saddr from ipv6_get_saddr: " NIP6_FMT "\n",
 				  NIP6(saddr->v6.sin6_addr));
 		return;
 	}
@@ -305,13 +302,11 @@ static void sctp_v6_get_saddr(struct sctp_association *asoc,
 
 	if (baddr) {
 		memcpy(saddr, baddr, sizeof(union sctp_addr));
-		SCTP_DEBUG_PRINTK("saddr: "
-				  "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+		SCTP_DEBUG_PRINTK("saddr: " NIP6_FMT "\n",
 				  NIP6(saddr->v6.sin6_addr));
 	} else {
 		printk(KERN_ERR "%s: asoc:%p Could not find a valid source "
-		       "address for the "
-		       "dest:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+		       "address for the dest:" NIP6_FMT "\n",
 		       __FUNCTION__, asoc, NIP6(daddr->v6.sin6_addr));
 	}
 
@@ -675,8 +670,7 @@ static int sctp_v6_is_ce(const struct sk_buff *skb)
 /* Dump the v6 addr to the seq file. */
 static void sctp_v6_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr)
 {
-	seq_printf(seq, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ",
-		   NIP6(addr->v6.sin6_addr));
+	seq_printf(seq, NIP6_FMT " ", NIP6(addr->v6.sin6_addr));
 }
 
 /* Initialize a PF_INET6 socket msg_name. */
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 557a7d90b92a..477d7f80dba6 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -1036,14 +1036,14 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep,
 		if (from_addr.sa.sa_family == AF_INET6) {
 			printk(KERN_WARNING
 			       "%s association %p could not find address "
-			       "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+			       NIP6_FMT "\n",
 			       __FUNCTION__,
 			       asoc,
 			       NIP6(from_addr.v6.sin6_addr));
 		} else {
 			printk(KERN_WARNING
 			       "%s association %p could not find address "
-			       "%u.%u.%u.%u\n",
+			       NIPQUAD_FMT "\n",
 			       __FUNCTION__,
 			       asoc,
 			       NIPQUAD(from_addr.v4.sin_addr.s_addr));
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 12e4fb72bf0f..53d6c7bbf564 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -494,8 +494,7 @@ static inline void avc_print_ipv6_addr(struct audit_buffer *ab,
 				       char *name1, char *name2)
 {
 	if (!ipv6_addr_any(addr))
-		audit_log_format(ab, " %s=%04x:%04x:%04x:%04x:%04x:"
-				 "%04x:%04x:%04x", name1, NIP6(*addr));
+		audit_log_format(ab, " %s=" NIP6_FMT, name1, NIP6(*addr));
 	if (port)
 		audit_log_format(ab, " %s=%d", name2, ntohs(port));
 }
@@ -504,7 +503,7 @@ static inline void avc_print_ipv4_addr(struct audit_buffer *ab, u32 addr,
 				       __be16 port, char *name1, char *name2)
 {
 	if (addr)
-		audit_log_format(ab, " %s=%d.%d.%d.%d", name1, NIPQUAD(addr));
+		audit_log_format(ab, " %s=" NIPQUAD_FMT, name1, NIPQUAD(addr));
 	if (port)
 		audit_log_format(ab, " %s=%d", name2, ntohs(port));
 }
-- 
cgit v1.2.3-71-gd317


From 8ae12a0d85987dc138f8c944cb78a92bf466cea0 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Sun, 8 Jan 2006 13:34:19 -0800
Subject: [PATCH] spi: simple SPI framework

This is the core of a small SPI framework, implementing the model of a
queue of messages which complete asynchronously (with thin synchronous
wrappers on top).

  - It's still less than 2KB of ".text" (ARM).  If there's got to be a
    mid-layer for something so simple, that's the right size budget.  :)

  - The guts use board-specific SPI device tables to build the driver
    model tree.  (Hardware probing is rarely an option.)

  - This version of Kconfig includes no drivers.  At this writing there
    are two known master controller drivers (PXA/SSP, OMAP MicroWire)
    and three protocol drivers (CS8415a, ADS7846, DataFlash) with LKML
    mentions of other drivers in development.

  - No userspace API.  There are several implementations to compare.
    Implement them like any other driver, and bind them with sysfs.

The changes from last version posted to LKML (on 11-Nov-2005) are minor,
and include:

  - One bugfix (removes a FIXME), with the visible effect of making device
    names be "spiB.C" where B is the bus number and C is the chipselect.

  - The "caller provides DMA mappings" mechanism now has kerneldoc, for
    DMA drivers that want to be fancy.

  - Hey, the framework init can be subsys_init.  Even though board init
    logic fires earlier, at arch_init ... since the framework init is
    for driver support, and the board init support uses static init.

  - Various additional spec/doc clarifications based on discussions
    with other folk.  It adds a brief "thank you" at the end, for folk
    who've helped nudge this framework into existence.

As I've said before, I think that "protocol tweaking" is the main support
that this driver framework will need to evolve.

From: Mark Underwood <basicmark@yahoo.com>

  Update the SPI framework to remove a potential priority inversion case by
  reverting to kmalloc if the pre-allocated DMA-safe buffer isn't available.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/spi/spi-summary | 416 +++++++++++++++++++++++++++++++
 arch/arm/Kconfig              |   2 +
 drivers/Kconfig               |   2 +
 drivers/Makefile              |   1 +
 drivers/spi/Kconfig           |  76 ++++++
 drivers/spi/Makefile          |  23 ++
 drivers/spi/spi.c             | 568 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/spi/spi.h       | 542 ++++++++++++++++++++++++++++++++++++++++
 8 files changed, 1630 insertions(+)
 create mode 100644 Documentation/spi/spi-summary
 create mode 100644 drivers/spi/Kconfig
 create mode 100644 drivers/spi/Makefile
 create mode 100644 drivers/spi/spi.c
 create mode 100644 include/linux/spi/spi.h

(limited to 'include/linux')

diff --git a/Documentation/spi/spi-summary b/Documentation/spi/spi-summary
new file mode 100644
index 000000000000..00497f95ca4b
--- /dev/null
+++ b/Documentation/spi/spi-summary
@@ -0,0 +1,416 @@
+Overview of Linux kernel SPI support
+====================================
+
+22-Nov-2005
+
+What is SPI?
+------------
+The "Serial Peripheral Interface" (SPI) is a four-wire point-to-point
+serial link used to connect microcontrollers to sensors and memory.
+
+The three signal wires hold a clock (SCLK, often on the order of 10 MHz),
+and parallel data lines with "Master Out, Slave In" (MOSI) or "Master In,
+Slave Out" (MISO) signals.  (Other names are also used.)  There are four
+clocking modes through which data is exchanged; mode-0 and mode-3 are most
+commonly used.
+
+SPI masters may use a "chip select" line to activate a given SPI slave
+device, so those three signal wires may be connected to several chips
+in parallel.  All SPI slaves support chipselects.  Some devices have
+other signals, often including an interrupt to the master.
+
+Unlike serial busses like USB or SMBUS, even low level protocols for
+SPI slave functions are usually not interoperable between vendors
+(except for cases like SPI memory chips).
+
+  - SPI may be used for request/response style device protocols, as with
+    touchscreen sensors and memory chips.
+
+  - It may also be used to stream data in either direction (half duplex),
+    or both of them at the same time (full duplex).
+
+  - Some devices may use eight bit words.  Others may different word
+    lengths, such as streams of 12-bit or 20-bit digital samples.
+
+In the same way, SPI slaves will only rarely support any kind of automatic
+discovery/enumeration protocol.  The tree of slave devices accessible from
+a given SPI master will normally be set up manually, with configuration
+tables.
+
+SPI is only one of the names used by such four-wire protocols, and
+most controllers have no problem handling "MicroWire" (think of it as
+half-duplex SPI, for request/response protocols), SSP ("Synchronous
+Serial Protocol"), PSP ("Programmable Serial Protocol"), and other
+related protocols.
+
+Microcontrollers often support both master and slave sides of the SPI
+protocol.  This document (and Linux) currently only supports the master
+side of SPI interactions.
+
+
+Who uses it?  On what kinds of systems?
+---------------------------------------
+Linux developers using SPI are probably writing device drivers for embedded
+systems boards.  SPI is used to control external chips, and it is also a
+protocol supported by every MMC or SD memory card.  (The older "DataFlash"
+cards, predating MMC cards but using the same connectors and card shape,
+support only SPI.)  Some PC hardware uses SPI flash for BIOS code.
+
+SPI slave chips range from digital/analog converters used for analog
+sensors and codecs, to memory, to peripherals like USB controllers
+or Ethernet adapters; and more.
+
+Most systems using SPI will integrate a few devices on a mainboard.
+Some provide SPI links on expansion connectors; in cases where no
+dedicated SPI controller exists, GPIO pins can be used to create a
+low speed "bitbanging" adapter.  Very few systems will "hotplug" an SPI
+controller; the reasons to use SPI focus on low cost and simple operation,
+and if dynamic reconfiguration is important, USB will often be a more
+appropriate low-pincount peripheral bus.
+
+Many microcontrollers that can run Linux integrate one or more I/O
+interfaces with SPI modes.  Given SPI support, they could use MMC or SD
+cards without needing a special purpose MMC/SD/SDIO controller.
+
+
+How do these driver programming interfaces work?
+------------------------------------------------
+The <linux/spi/spi.h> header file includes kerneldoc, as does the
+main source code, and you should certainly read that.  This is just
+an overview, so you get the big picture before the details.
+
+There are two types of SPI driver, here called:
+
+  Controller drivers ... these are often built in to System-On-Chip
+	processors, and often support both Master and Slave roles.
+	These drivers touch hardware registers and may use DMA.
+
+  Protocol drivers ... these pass messages through the controller
+	driver to communicate with a Slave or Master device on the
+	other side of an SPI link.
+
+So for example one protocol driver might talk to the MTD layer to export
+data to filesystems stored on SPI flash like DataFlash; and others might
+control audio interfaces, present touchscreen sensors as input interfaces,
+or monitor temperature and voltage levels during industrial processing.
+And those might all be sharing the same controller driver.
+
+A "struct spi_device" encapsulates the master-side interface between
+those two types of driver.  At this writing, Linux has no slave side
+programming interface.
+
+There is a minimal core of SPI programming interfaces, focussing on
+using driver model to connect controller and protocol drivers using
+device tables provided by board specific initialization code.  SPI
+shows up in sysfs in several locations:
+
+   /sys/devices/.../CTLR/spiB.C ... spi_device for on bus "B",
+	chipselect C, accessed through CTLR.
+
+   /sys/bus/spi/devices/spiB.C ... symlink to the physical
+   	spiB-C device
+
+   /sys/bus/spi/drivers/D ... driver for one or more spi*.* devices
+
+   /sys/class/spi_master/spiB ... class device for the controller
+	managing bus "B".  All the spiB.* devices share the same
+	physical SPI bus segment, with SCLK, MOSI, and MISO.
+
+The basic I/O primitive submits an asynchronous message to an I/O queue
+maintained by the controller driver.  A completion callback is issued
+asynchronously when the data transfer(s) in that message completes.
+There are also some simple synchronous wrappers for those calls.
+
+
+How does board-specific init code declare SPI devices?
+------------------------------------------------------
+Linux needs several kinds of information to properly configure SPI devices.
+That information is normally provided by board-specific code, even for
+chips that do support some of automated discovery/enumeration.
+
+DECLARE CONTROLLERS
+
+The first kind of information is a list of what SPI controllers exist.
+For System-on-Chip (SOC) based boards, these will usually be platform
+devices, and the controller may need some platform_data in order to
+operate properly.  The "struct platform_device" will include resources
+like the physical address of the controller's first register and its IRQ.
+
+Platforms will often abstract the "register SPI controller" operation,
+maybe coupling it with code to initialize pin configurations, so that
+the arch/.../mach-*/board-*.c files for several boards can all share the
+same basic controller setup code.  This is because most SOCs have several
+SPI-capable controllers, and only the ones actually usable on a given
+board should normally be set up and registered.
+
+So for example arch/.../mach-*/board-*.c files might have code like:
+
+	#include <asm/arch/spi.h>	/* for mysoc_spi_data */
+
+	/* if your mach-* infrastructure doesn't support kernels that can
+	 * run on multiple boards, pdata wouldn't benefit from "__init".
+	 */
+	static struct mysoc_spi_data __init pdata = { ... };
+
+	static __init board_init(void)
+	{
+		...
+		/* this board only uses SPI controller #2 */
+		mysoc_register_spi(2, &pdata);
+		...
+	}
+
+And SOC-specific utility code might look something like:
+
+	#include <asm/arch/spi.h>
+
+	static struct platform_device spi2 = { ... };
+
+	void mysoc_register_spi(unsigned n, struct mysoc_spi_data *pdata)
+	{
+		struct mysoc_spi_data *pdata2;
+
+		pdata2 = kmalloc(sizeof *pdata2, GFP_KERNEL);
+		*pdata2 = pdata;
+		...
+		if (n == 2) {
+			spi2->dev.platform_data = pdata2;
+			register_platform_device(&spi2);
+
+			/* also: set up pin modes so the spi2 signals are
+			 * visible on the relevant pins ... bootloaders on
+			 * production boards may already have done this, but
+			 * developer boards will often need Linux to do it.
+			 */
+		}
+		...
+	}
+
+Notice how the platform_data for boards may be different, even if the
+same SOC controller is used.  For example, on one board SPI might use
+an external clock, where another derives the SPI clock from current
+settings of some master clock.
+
+
+DECLARE SLAVE DEVICES
+
+The second kind of information is a list of what SPI slave devices exist
+on the target board, often with some board-specific data needed for the
+driver to work correctly.
+
+Normally your arch/.../mach-*/board-*.c files would provide a small table
+listing the SPI devices on each board.  (This would typically be only a
+small handful.)  That might look like:
+
+	static struct ads7846_platform_data ads_info = {
+		.vref_delay_usecs	= 100,
+		.x_plate_ohms		= 580,
+		.y_plate_ohms		= 410,
+	};
+
+	static struct spi_board_info spi_board_info[] __initdata = {
+	{
+		.modalias	= "ads7846",
+		.platform_data	= &ads_info,
+		.mode		= SPI_MODE_0,
+		.irq		= GPIO_IRQ(31),
+		.max_speed_hz	= 120000 /* max sample rate at 3V */ * 16,
+		.bus_num	= 1,
+		.chip_select	= 0,
+	},
+	};
+
+Again, notice how board-specific information is provided; each chip may need
+several types.  This example shows generic constraints like the fastest SPI
+clock to allow (a function of board voltage in this case) or how an IRQ pin
+is wired, plus chip-specific constraints like an important delay that's
+changed by the capacitance at one pin.
+
+(There's also "controller_data", information that may be useful to the
+controller driver.  An example would be peripheral-specific DMA tuning
+data or chipselect callbacks.  This is stored in spi_device later.)
+
+The board_info should provide enough information to let the system work
+without the chip's driver being loaded.  The most troublesome aspect of
+that is likely the SPI_CS_HIGH bit in the spi_device.mode field, since
+sharing a bus with a device that interprets chipselect "backwards" is
+not possible.
+
+Then your board initialization code would register that table with the SPI
+infrastructure, so that it's available later when the SPI master controller
+driver is registered:
+
+	spi_register_board_info(spi_board_info, ARRAY_SIZE(spi_board_info));
+
+Like with other static board-specific setup, you won't unregister those.
+
+
+NON-STATIC CONFIGURATIONS
+
+Developer boards often play by different rules than product boards, and one
+example is the potential need to hotplug SPI devices and/or controllers.
+
+For those cases you might need to use use spi_busnum_to_master() to look
+up the spi bus master, and will likely need spi_new_device() to provide the
+board info based on the board that was hotplugged.  Of course, you'd later
+call at least spi_unregister_device() when that board is removed.
+
+
+How do I write an "SPI Protocol Driver"?
+----------------------------------------
+All SPI drivers are currently kernel drivers.  A userspace driver API
+would just be another kernel driver, probably offering some lowlevel
+access through aio_read(), aio_write(), and ioctl() calls and using the
+standard userspace sysfs mechanisms to bind to a given SPI device.
+
+SPI protocol drivers are normal device drivers, with no more wrapper
+than needed by platform devices:
+
+	static struct device_driver CHIP_driver = {
+		.name		= "CHIP",
+		.bus		= &spi_bus_type,
+		.probe		= CHIP_probe,
+		.remove		= __exit_p(CHIP_remove),
+		.suspend	= CHIP_suspend,
+		.resume		= CHIP_resume,
+	};
+
+The SPI core will autmatically attempt to bind this driver to any SPI
+device whose board_info gave a modalias of "CHIP".  Your probe() code
+might look like this unless you're creating a class_device:
+
+	static int __init CHIP_probe(struct device *dev)
+	{
+		struct spi_device		*spi = to_spi_device(dev);
+		struct CHIP			*chip;
+		struct CHIP_platform_data	*pdata = dev->platform_data;
+
+		/* get memory for driver's per-chip state */
+		chip = kzalloc(sizeof *chip, GFP_KERNEL);
+		if (!chip)
+			return -ENOMEM;
+		dev_set_drvdata(dev, chip);
+
+		... etc
+		return 0;
+	}
+
+As soon as it enters probe(), the driver may issue I/O requests to
+the SPI device using "struct spi_message".  When remove() returns,
+the driver guarantees that it won't submit any more such messages.
+
+  - An spi_message is a sequence of of protocol operations, executed
+    as one atomic sequence.  SPI driver controls include:
+
+      + when bidirectional reads and writes start ... by how its
+        sequence of spi_transfer requests is arranged;
+
+      + optionally defining short delays after transfers ... using
+        the spi_transfer.delay_usecs setting;
+
+      + whether the chipselect becomes inactive after a transfer and
+        any delay ... by using the spi_transfer.cs_change flag;
+
+      + hinting whether the next message is likely to go to this same
+        device ... using the spi_transfer.cs_change flag on the last
+	transfer in that atomic group, and potentially saving costs
+	for chip deselect and select operations.
+
+  - Follow standard kernel rules, and provide DMA-safe buffers in
+    your messages.  That way controller drivers using DMA aren't forced
+    to make extra copies unless the hardware requires it (e.g. working
+    around hardware errata that force the use of bounce buffering).
+
+    If standard dma_map_single() handling of these buffers is inappropriate,
+    you can use spi_message.is_dma_mapped to tell the controller driver
+    that you've already provided the relevant DMA addresses.
+
+  - The basic I/O primitive is spi_async().  Async requests may be
+    issued in any context (irq handler, task, etc) and completion
+    is reported using a callback provided with the message.
+
+  - There are also synchronous wrappers like spi_sync(), and wrappers
+    like spi_read(), spi_write(), and spi_write_then_read().  These
+    may be issued only in contexts that may sleep, and they're all
+    clean (and small, and "optional") layers over spi_async().
+
+  - The spi_write_then_read() call, and convenience wrappers around
+    it, should only be used with small amounts of data where the
+    cost of an extra copy may be ignored.  It's designed to support
+    common RPC-style requests, such as writing an eight bit command
+    and reading a sixteen bit response -- spi_w8r16() being one its
+    wrappers, doing exactly that.
+
+Some drivers may need to modify spi_device characteristics like the
+transfer mode, wordsize, or clock rate.  This is done with spi_setup(),
+which would normally be called from probe() before the first I/O is
+done to the device.
+
+While "spi_device" would be the bottom boundary of the driver, the
+upper boundaries might include sysfs (especially for sensor readings),
+the input layer, ALSA, networking, MTD, the character device framework,
+or other Linux subsystems.
+
+
+How do I write an "SPI Master Controller Driver"?
+-------------------------------------------------
+An SPI controller will probably be registered on the platform_bus; write
+a driver to bind to the device, whichever bus is involved.
+
+The main task of this type of driver is to provide an "spi_master".
+Use spi_alloc_master() to allocate the master, and class_get_devdata()
+to get the driver-private data allocated for that device.
+
+	struct spi_master	*master;
+	struct CONTROLLER	*c;
+
+	master = spi_alloc_master(dev, sizeof *c);
+	if (!master)
+		return -ENODEV;
+
+	c = class_get_devdata(&master->cdev);
+
+The driver will initialize the fields of that spi_master, including the
+bus number (maybe the same as the platform device ID) and three methods
+used to interact with the SPI core and SPI protocol drivers.  It will
+also initialize its own internal state.
+
+    master->setup(struct spi_device *spi)
+	This sets up the device clock rate, SPI mode, and word sizes.
+	Drivers may change the defaults provided by board_info, and then
+	call spi_setup(spi) to invoke this routine.  It may sleep.
+
+    master->transfer(struct spi_device *spi, struct spi_message *message)
+    	This must not sleep.  Its responsibility is arrange that the
+	transfer happens and its complete() callback is issued; the two
+	will normally happen later, after other transfers complete.
+
+    master->cleanup(struct spi_device *spi)
+	Your controller driver may use spi_device.controller_state to hold
+	state it dynamically associates with that device.  If you do that,
+	be sure to provide the cleanup() method to free that state.
+
+The bulk of the driver will be managing the I/O queue fed by transfer().
+
+That queue could be purely conceptual.  For example, a driver used only
+for low-frequency sensor acess might be fine using synchronous PIO.
+
+But the queue will probably be very real, using message->queue, PIO,
+often DMA (especially if the root filesystem is in SPI flash), and
+execution contexts like IRQ handlers, tasklets, or workqueues (such
+as keventd).  Your driver can be as fancy, or as simple, as you need.
+
+
+THANKS TO
+---------
+Contributors to Linux-SPI discussions include (in alphabetical order,
+by last name):
+
+David Brownell
+Russell King
+Dmitry Pervushin
+Stephen Street
+Mark Underwood
+Andrew Victor
+Vitaly Wool
+
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 50b9afa8ae6d..3cfd82a05b20 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -729,6 +729,8 @@ source "drivers/char/Kconfig"
 
 source "drivers/i2c/Kconfig"
 
+source "drivers/spi/Kconfig"
+
 source "drivers/hwmon/Kconfig"
 
 #source "drivers/l3/Kconfig"
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 48f446d3c671..283c089537bc 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -44,6 +44,8 @@ source "drivers/char/Kconfig"
 
 source "drivers/i2c/Kconfig"
 
+source "drivers/spi/Kconfig"
+
 source "drivers/w1/Kconfig"
 
 source "drivers/hwmon/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 7fc3f0f08b29..7c45050ecd03 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_FUSION)		+= message/
 obj-$(CONFIG_IEEE1394)		+= ieee1394/
 obj-y				+= cdrom/
 obj-$(CONFIG_MTD)		+= mtd/
+obj-$(CONFIG_SPI)		+= spi/
 obj-$(CONFIG_PCCARD)		+= pcmcia/
 obj-$(CONFIG_DIO)		+= dio/
 obj-$(CONFIG_SBUS)		+= sbus/
diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig
new file mode 100644
index 000000000000..d3105104a297
--- /dev/null
+++ b/drivers/spi/Kconfig
@@ -0,0 +1,76 @@
+#
+# SPI driver configuration
+#
+# NOTE:  the reason this doesn't show SPI slave support is mostly that
+# nobody's needed a slave side API yet.  The master-role API is not
+# fully appropriate there, so it'd need some thought to do well.
+#
+menu "SPI support"
+
+config SPI
+	bool "SPI support"
+	help
+	  The "Serial Peripheral Interface" is a low level synchronous
+	  protocol.  Chips that support SPI can have data transfer rates
+	  up to several tens of Mbit/sec.  Chips are addressed with a
+	  controller and a chipselect.  Most SPI slaves don't support
+	  dynamic device discovery; some are even write-only or read-only.
+
+	  SPI is widely used by microcontollers to talk with sensors,
+	  eeprom and flash memory, codecs and various other controller
+	  chips, analog to digital (and d-to-a) converters, and more.
+	  MMC and SD cards can be accessed using SPI protocol; and for
+	  DataFlash cards used in MMC sockets, SPI must always be used.
+
+	  SPI is one of a family of similar protocols using a four wire
+	  interface (select, clock, data in, data out) including Microwire
+	  (half duplex), SSP, SSI, and PSP.  This driver framework should
+	  work with most such devices and controllers.
+
+config SPI_DEBUG
+	boolean "Debug support for SPI drivers"
+	depends on SPI && DEBUG_KERNEL
+	help
+	  Say "yes" to enable debug messaging (like dev_dbg and pr_debug),
+	  sysfs, and debugfs support in SPI controller and protocol drivers.
+
+#
+# MASTER side ... talking to discrete SPI slave chips including microcontrollers
+#
+
+config SPI_MASTER
+#	boolean "SPI Master Support"
+	boolean
+	default SPI
+	help
+	  If your system has an master-capable SPI controller (which
+	  provides the clock and chipselect), you can enable that
+	  controller and the protocol drivers for the SPI slave chips
+	  that are connected.
+
+comment "SPI Master Controller Drivers"
+	depends on SPI_MASTER
+
+
+#
+# Add new SPI master controllers in alphabetical order above this line
+#
+
+
+#
+# There are lots of SPI device types, with sensors and memory
+# being probably the most widely used ones.
+#
+comment "SPI Protocol Masters"
+	depends on SPI_MASTER
+
+
+#
+# Add new SPI protocol masters in alphabetical order above this line
+#
+
+
+# (slave support would go here)
+
+endmenu # "SPI support"
+
diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile
new file mode 100644
index 000000000000..afd2321753b3
--- /dev/null
+++ b/drivers/spi/Makefile
@@ -0,0 +1,23 @@
+#
+# Makefile for kernel SPI drivers.
+#
+
+ifeq ($(CONFIG_SPI_DEBUG),y)
+EXTRA_CFLAGS += -DDEBUG
+endif
+
+# small core, mostly translating board-specific
+# config declarations into driver model code
+obj-$(CONFIG_SPI_MASTER)		+= spi.o
+
+# SPI master controller drivers (bus)
+# 	... add above this line ...
+
+# SPI protocol drivers (device/link on bus)
+# 	... add above this line ...
+
+# SPI slave controller drivers (upstream link)
+# 	... add above this line ...
+
+# SPI slave drivers (protocol for that link)
+# 	... add above this line ...
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
new file mode 100644
index 000000000000..7cd356b17644
--- /dev/null
+++ b/drivers/spi/spi.c
@@ -0,0 +1,568 @@
+/*
+ * spi.c - SPI init/core code
+ *
+ * Copyright (C) 2005 David Brownell
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/autoconf.h>
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/cache.h>
+#include <linux/spi/spi.h>
+
+
+/* SPI bustype and spi_master class are registered during early boot,
+ * usually before board init code provides the SPI device tables, and
+ * are available later when driver init code needs them.
+ *
+ * Drivers for SPI devices started out like those for platform bus
+ * devices.  But both have changed in 2.6.15; maybe this should get
+ * an "spi_driver" structure at some point (not currently needed)
+ */
+static void spidev_release(struct device *dev)
+{
+	const struct spi_device	*spi = to_spi_device(dev);
+
+	/* spi masters may cleanup for released devices */
+	if (spi->master->cleanup)
+		spi->master->cleanup(spi);
+
+	class_device_put(&spi->master->cdev);
+	kfree(dev);
+}
+
+static ssize_t
+modalias_show(struct device *dev, struct device_attribute *a, char *buf)
+{
+	const struct spi_device	*spi = to_spi_device(dev);
+
+	return snprintf(buf, BUS_ID_SIZE + 1, "%s\n", spi->modalias);
+}
+
+static struct device_attribute spi_dev_attrs[] = {
+	__ATTR_RO(modalias),
+	__ATTR_NULL,
+};
+
+/* modalias support makes "modprobe $MODALIAS" new-style hotplug work,
+ * and the sysfs version makes coldplug work too.
+ */
+
+static int spi_match_device(struct device *dev, struct device_driver *drv)
+{
+	const struct spi_device	*spi = to_spi_device(dev);
+
+	return strncmp(spi->modalias, drv->name, BUS_ID_SIZE) == 0;
+}
+
+static int spi_uevent(struct device *dev, char **envp, int num_envp,
+		char *buffer, int buffer_size)
+{
+	const struct spi_device		*spi = to_spi_device(dev);
+
+	envp[0] = buffer;
+	snprintf(buffer, buffer_size, "MODALIAS=%s", spi->modalias);
+	envp[1] = NULL;
+	return 0;
+}
+
+#ifdef	CONFIG_PM
+
+/* Suspend/resume in "struct device_driver" don't really need that
+ * strange third parameter, so we just make it a constant and expect
+ * SPI drivers to ignore it just like most platform drivers do.
+ *
+ * NOTE:  the suspend() method for an spi_master controller driver
+ * should verify that all its child devices are marked as suspended;
+ * suspend requests delivered through sysfs power/state files don't
+ * enforce such constraints.
+ */
+static int spi_suspend(struct device *dev, pm_message_t message)
+{
+	int	value;
+
+	if (!dev->driver || !dev->driver->suspend)
+		return 0;
+
+	/* suspend will stop irqs and dma; no more i/o */
+	value = dev->driver->suspend(dev, message);
+	if (value == 0)
+		dev->power.power_state = message;
+	return value;
+}
+
+static int spi_resume(struct device *dev)
+{
+	int	value;
+
+	if (!dev->driver || !dev->driver->resume)
+		return 0;
+
+	/* resume may restart the i/o queue */
+	value = dev->driver->resume(dev);
+	if (value == 0)
+		dev->power.power_state = PMSG_ON;
+	return value;
+}
+
+#else
+#define spi_suspend	NULL
+#define spi_resume	NULL
+#endif
+
+struct bus_type spi_bus_type = {
+	.name		= "spi",
+	.dev_attrs	= spi_dev_attrs,
+	.match		= spi_match_device,
+	.uevent		= spi_uevent,
+	.suspend	= spi_suspend,
+	.resume		= spi_resume,
+};
+EXPORT_SYMBOL_GPL(spi_bus_type);
+
+/*-------------------------------------------------------------------------*/
+
+/* SPI devices should normally not be created by SPI device drivers; that
+ * would make them board-specific.  Similarly with SPI master drivers.
+ * Device registration normally goes into like arch/.../mach.../board-YYY.c
+ * with other readonly (flashable) information about mainboard devices.
+ */
+
+struct boardinfo {
+	struct list_head	list;
+	unsigned		n_board_info;
+	struct spi_board_info	board_info[0];
+};
+
+static LIST_HEAD(board_list);
+static DECLARE_MUTEX(board_lock);
+
+
+/* On typical mainboards, this is purely internal; and it's not needed
+ * after board init creates the hard-wired devices.  Some development
+ * platforms may not be able to use spi_register_board_info though, and
+ * this is exported so that for example a USB or parport based adapter
+ * driver could add devices (which it would learn about out-of-band).
+ */
+struct spi_device *__init_or_module
+spi_new_device(struct spi_master *master, struct spi_board_info *chip)
+{
+	struct spi_device	*proxy;
+	struct device		*dev = master->cdev.dev;
+	int			status;
+
+	/* NOTE:  caller did any chip->bus_num checks necessary */
+
+	if (!class_device_get(&master->cdev))
+		return NULL;
+
+	proxy = kzalloc(sizeof *proxy, GFP_KERNEL);
+	if (!proxy) {
+		dev_err(dev, "can't alloc dev for cs%d\n",
+			chip->chip_select);
+		goto fail;
+	}
+	proxy->master = master;
+	proxy->chip_select = chip->chip_select;
+	proxy->max_speed_hz = chip->max_speed_hz;
+	proxy->irq = chip->irq;
+	proxy->modalias = chip->modalias;
+
+	snprintf(proxy->dev.bus_id, sizeof proxy->dev.bus_id,
+			"%s.%u", master->cdev.class_id,
+			chip->chip_select);
+	proxy->dev.parent = dev;
+	proxy->dev.bus = &spi_bus_type;
+	proxy->dev.platform_data = (void *) chip->platform_data;
+	proxy->controller_data = chip->controller_data;
+	proxy->controller_state = NULL;
+	proxy->dev.release = spidev_release;
+
+	/* drivers may modify this default i/o setup */
+	status = master->setup(proxy);
+	if (status < 0) {
+		dev_dbg(dev, "can't %s %s, status %d\n",
+				"setup", proxy->dev.bus_id, status);
+		goto fail;
+	}
+
+	/* driver core catches callers that misbehave by defining
+	 * devices that already exist.
+	 */
+	status = device_register(&proxy->dev);
+	if (status < 0) {
+		dev_dbg(dev, "can't %s %s, status %d\n",
+				"add", proxy->dev.bus_id, status);
+fail:
+		class_device_put(&master->cdev);
+		kfree(proxy);
+		return NULL;
+	}
+	dev_dbg(dev, "registered child %s\n", proxy->dev.bus_id);
+	return proxy;
+}
+EXPORT_SYMBOL_GPL(spi_new_device);
+
+/*
+ * Board-specific early init code calls this (probably during arch_initcall)
+ * with segments of the SPI device table.  Any device nodes are created later,
+ * after the relevant parent SPI controller (bus_num) is defined.  We keep
+ * this table of devices forever, so that reloading a controller driver will
+ * not make Linux forget about these hard-wired devices.
+ *
+ * Other code can also call this, e.g. a particular add-on board might provide
+ * SPI devices through its expansion connector, so code initializing that board
+ * would naturally declare its SPI devices.
+ *
+ * The board info passed can safely be __initdata ... but be careful of
+ * any embedded pointers (platform_data, etc), they're copied as-is.
+ */
+int __init
+spi_register_board_info(struct spi_board_info const *info, unsigned n)
+{
+	struct boardinfo	*bi;
+
+	bi = kmalloc (sizeof (*bi) + n * sizeof (*info), GFP_KERNEL);
+	if (!bi)
+		return -ENOMEM;
+	bi->n_board_info = n;
+	memcpy(bi->board_info, info, n * sizeof (*info));
+
+	down(&board_lock);
+	list_add_tail(&bi->list, &board_list);
+	up(&board_lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(spi_register_board_info);
+
+/* FIXME someone should add support for a __setup("spi", ...) that
+ * creates board info from kernel command lines
+ */
+
+static void __init_or_module
+scan_boardinfo(struct spi_master *master)
+{
+	struct boardinfo	*bi;
+	struct device		*dev = master->cdev.dev;
+
+	down(&board_lock);
+	list_for_each_entry(bi, &board_list, list) {
+		struct spi_board_info	*chip = bi->board_info;
+		unsigned		n;
+
+		for (n = bi->n_board_info; n > 0; n--, chip++) {
+			if (chip->bus_num != master->bus_num)
+				continue;
+			/* some controllers only have one chip, so they
+			 * might not use chipselects.  otherwise, the
+			 * chipselects are numbered 0..max.
+			 */
+			if (chip->chip_select >= master->num_chipselect
+					&& master->num_chipselect) {
+				dev_dbg(dev, "cs%d > max %d\n",
+					chip->chip_select,
+					master->num_chipselect);
+				continue;
+			}
+			(void) spi_new_device(master, chip);
+		}
+	}
+	up(&board_lock);
+}
+
+/*-------------------------------------------------------------------------*/
+
+static void spi_master_release(struct class_device *cdev)
+{
+	struct spi_master *master;
+
+	master = container_of(cdev, struct spi_master, cdev);
+	put_device(master->cdev.dev);
+	master->cdev.dev = NULL;
+	kfree(master);
+}
+
+static struct class spi_master_class = {
+	.name		= "spi_master",
+	.owner		= THIS_MODULE,
+	.release	= spi_master_release,
+};
+
+
+/**
+ * spi_alloc_master - allocate SPI master controller
+ * @dev: the controller, possibly using the platform_bus
+ * @size: how much driver-private data to preallocate; a pointer to this
+ * 	memory in the class_data field of the returned class_device
+ *
+ * This call is used only by SPI master controller drivers, which are the
+ * only ones directly touching chip registers.  It's how they allocate
+ * an spi_master structure, prior to calling spi_add_master().
+ *
+ * This must be called from context that can sleep.  It returns the SPI
+ * master structure on success, else NULL.
+ *
+ * The caller is responsible for assigning the bus number and initializing
+ * the master's methods before calling spi_add_master(), or else (on error)
+ * calling class_device_put() to prevent a memory leak.
+ */
+struct spi_master * __init_or_module
+spi_alloc_master(struct device *dev, unsigned size)
+{
+	struct spi_master	*master;
+
+	master = kzalloc(size + sizeof *master, SLAB_KERNEL);
+	if (!master)
+		return NULL;
+
+	master->cdev.class = &spi_master_class;
+	master->cdev.dev = get_device(dev);
+	class_set_devdata(&master->cdev, &master[1]);
+
+	return master;
+}
+EXPORT_SYMBOL_GPL(spi_alloc_master);
+
+/**
+ * spi_register_master - register SPI master controller
+ * @master: initialized master, originally from spi_alloc_master()
+ *
+ * SPI master controllers connect to their drivers using some non-SPI bus,
+ * such as the platform bus.  The final stage of probe() in that code
+ * includes calling spi_register_master() to hook up to this SPI bus glue.
+ *
+ * SPI controllers use board specific (often SOC specific) bus numbers,
+ * and board-specific addressing for SPI devices combines those numbers
+ * with chip select numbers.  Since SPI does not directly support dynamic
+ * device identification, boards need configuration tables telling which
+ * chip is at which address.
+ *
+ * This must be called from context that can sleep.  It returns zero on
+ * success, else a negative error code (dropping the master's refcount).
+ */
+int __init_or_module
+spi_register_master(struct spi_master *master)
+{
+	static atomic_t		dyn_bus_id = ATOMIC_INIT(0);
+	struct device		*dev = master->cdev.dev;
+	int			status = -ENODEV;
+	int			dynamic = 0;
+
+	/* convention:  dynamically assigned bus IDs count down from the max */
+	if (master->bus_num == 0) {
+		master->bus_num = atomic_dec_return(&dyn_bus_id);
+		dynamic = 0;
+	}
+
+	/* register the device, then userspace will see it.
+	 * registration fails if the bus ID is in use.
+	 */
+	snprintf(master->cdev.class_id, sizeof master->cdev.class_id,
+		"spi%u", master->bus_num);
+	status = class_device_register(&master->cdev);
+	if (status < 0) {
+		class_device_put(&master->cdev);
+		goto done;
+	}
+	dev_dbg(dev, "registered master %s%s\n", master->cdev.class_id,
+			dynamic ? " (dynamic)" : "");
+
+	/* populate children from any spi device tables */
+	scan_boardinfo(master);
+	status = 0;
+done:
+	return status;
+}
+EXPORT_SYMBOL_GPL(spi_register_master);
+
+
+static int __unregister(struct device *dev, void *unused)
+{
+	/* note: before about 2.6.14-rc1 this would corrupt memory: */
+	device_unregister(dev);
+	return 0;
+}
+
+/**
+ * spi_unregister_master - unregister SPI master controller
+ * @master: the master being unregistered
+ *
+ * This call is used only by SPI master controller drivers, which are the
+ * only ones directly touching chip registers.
+ *
+ * This must be called from context that can sleep.
+ */
+void spi_unregister_master(struct spi_master *master)
+{
+	class_device_unregister(&master->cdev);
+	(void) device_for_each_child(master->cdev.dev, NULL, __unregister);
+}
+EXPORT_SYMBOL_GPL(spi_unregister_master);
+
+/**
+ * spi_busnum_to_master - look up master associated with bus_num
+ * @bus_num: the master's bus number
+ *
+ * This call may be used with devices that are registered after
+ * arch init time.  It returns a refcounted pointer to the relevant
+ * spi_master (which the caller must release), or NULL if there is
+ * no such master registered.
+ */
+struct spi_master *spi_busnum_to_master(u16 bus_num)
+{
+	if (bus_num) {
+		char			name[8];
+		struct kobject		*bus;
+
+		snprintf(name, sizeof name, "spi%u", bus_num);
+		bus = kset_find_obj(&spi_master_class.subsys.kset, name);
+		if (bus)
+			return container_of(bus, struct spi_master, cdev.kobj);
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(spi_busnum_to_master);
+
+
+/*-------------------------------------------------------------------------*/
+
+/**
+ * spi_sync - blocking/synchronous SPI data transfers
+ * @spi: device with which data will be exchanged
+ * @message: describes the data transfers
+ *
+ * This call may only be used from a context that may sleep.  The sleep
+ * is non-interruptible, and has no timeout.  Low-overhead controller
+ * drivers may DMA directly into and out of the message buffers.
+ *
+ * Note that the SPI device's chip select is active during the message,
+ * and then is normally disabled between messages.  Drivers for some
+ * frequently-used devices may want to minimize costs of selecting a chip,
+ * by leaving it selected in anticipation that the next message will go
+ * to the same chip.  (That may increase power usage.)
+ *
+ * The return value is a negative error code if the message could not be
+ * submitted, else zero.  When the value is zero, then message->status is
+ * also defined:  it's the completion code for the transfer, either zero
+ * or a negative error code from the controller driver.
+ */
+int spi_sync(struct spi_device *spi, struct spi_message *message)
+{
+	DECLARE_COMPLETION(done);
+	int status;
+
+	message->complete = (void (*)(void *)) complete;
+	message->context = &done;
+	status = spi_async(spi, message);
+	if (status == 0)
+		wait_for_completion(&done);
+	message->context = NULL;
+	return status;
+}
+EXPORT_SYMBOL_GPL(spi_sync);
+
+#define	SPI_BUFSIZ	(SMP_CACHE_BYTES)
+
+static u8	*buf;
+
+/**
+ * spi_write_then_read - SPI synchronous write followed by read
+ * @spi: device with which data will be exchanged
+ * @txbuf: data to be written (need not be dma-safe)
+ * @n_tx: size of txbuf, in bytes
+ * @rxbuf: buffer into which data will be read
+ * @n_rx: size of rxbuf, in bytes (need not be dma-safe)
+ *
+ * This performs a half duplex MicroWire style transaction with the
+ * device, sending txbuf and then reading rxbuf.  The return value
+ * is zero for success, else a negative errno status code.
+ *
+ * Parameters to this routine are always copied using a small buffer,
+ * large transfers should use use spi_{async,sync}() calls with
+ * dma-safe buffers.
+ */
+int spi_write_then_read(struct spi_device *spi,
+		const u8 *txbuf, unsigned n_tx,
+		u8 *rxbuf, unsigned n_rx)
+{
+	static DECLARE_MUTEX(lock);
+
+	int			status;
+	struct spi_message	message;
+	struct spi_transfer	x[2];
+	u8			*local_buf;
+
+	/* Use preallocated DMA-safe buffer.  We can't avoid copying here,
+	 * (as a pure convenience thing), but we can keep heap costs
+	 * out of the hot path ...
+	 */
+	if ((n_tx + n_rx) > SPI_BUFSIZ)
+		return -EINVAL;
+
+	/* ... unless someone else is using the pre-allocated buffer */
+	if (down_trylock(&lock)) {
+		local_buf = kmalloc(SPI_BUFSIZ, GFP_KERNEL);
+		if (!local_buf)
+			return -ENOMEM;
+	} else
+		local_buf = buf;
+
+	memset(x, 0, sizeof x);
+
+	memcpy(local_buf, txbuf, n_tx);
+	x[0].tx_buf = local_buf;
+	x[0].len = n_tx;
+
+	x[1].rx_buf = local_buf + n_tx;
+	x[1].len = n_rx;
+
+	/* do the i/o */
+	message.transfers = x;
+	message.n_transfer = ARRAY_SIZE(x);
+	status = spi_sync(spi, &message);
+	if (status == 0) {
+		memcpy(rxbuf, x[1].rx_buf, n_rx);
+		status = message.status;
+	}
+
+	if (x[0].tx_buf == buf)
+		up(&lock);
+	else
+		kfree(local_buf);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(spi_write_then_read);
+
+/*-------------------------------------------------------------------------*/
+
+static int __init spi_init(void)
+{
+	buf = kmalloc(SPI_BUFSIZ, SLAB_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	bus_register(&spi_bus_type);
+	class_register(&spi_master_class);
+	return 0;
+}
+/* board_info is normally registered in arch_initcall(),
+ * but even essential drivers wait till later
+ */
+subsys_initcall(spi_init);
+
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
new file mode 100644
index 000000000000..51a6769114df
--- /dev/null
+++ b/include/linux/spi/spi.h
@@ -0,0 +1,542 @@
+/*
+ * Copyright (C) 2005 David Brownell
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __LINUX_SPI_H
+#define __LINUX_SPI_H
+
+/*
+ * INTERFACES between SPI master drivers and infrastructure
+ * (There's no SPI slave support for Linux yet...)
+ *
+ * A "struct device_driver" for an spi_device uses "spi_bus_type" and
+ * needs no special API wrappers (much like platform_bus).  These drivers
+ * are bound to devices based on their names (much like platform_bus),
+ * and are available in dev->driver.
+ */
+extern struct bus_type spi_bus_type;
+
+/**
+ * struct spi_device - Master side proxy for an SPI slave device
+ * @dev: Driver model representation of the device.
+ * @master: SPI controller used with the device.
+ * @max_speed_hz: Maximum clock rate to be used with this chip
+ *	(on this board); may be changed by the device's driver.
+ * @chip-select: Chipselect, distinguishing chips handled by "master".
+ * @mode: The spi mode defines how data is clocked out and in.
+ *	This may be changed by the device's driver.
+ * @bits_per_word: Data transfers involve one or more words; word sizes
+ * 	like eight or 12 bits are common.  In-memory wordsizes are
+ *	powers of two bytes (e.g. 20 bit samples use 32 bits).
+ *	This may be changed by the device's driver.
+ * @irq: Negative, or the number passed to request_irq() to receive
+ * 	interrupts from this device.
+ * @controller_state: Controller's runtime state
+ * @controller_data: Static board-specific definitions for controller, such
+ * 	as FIFO initialization parameters; from board_info.controller_data
+ *
+ * An spi_device is used to interchange data between an SPI slave
+ * (usually a discrete chip) and CPU memory.
+ *
+ * In "dev", the platform_data is used to hold information about this
+ * device that's meaningful to the device's protocol driver, but not
+ * to its controller.  One example might be an identifier for a chip
+ * variant with slightly different functionality.
+ */
+struct spi_device {
+	struct device		dev;
+	struct spi_master	*master;
+	u32			max_speed_hz;
+	u8			chip_select;
+	u8			mode;
+#define	SPI_CPHA	0x01		/* clock phase */
+#define	SPI_CPOL	0x02		/* clock polarity */
+#define	SPI_MODE_0	(0|0)
+#define	SPI_MODE_1	(0|SPI_CPHA)
+#define	SPI_MODE_2	(SPI_CPOL|0)
+#define	SPI_MODE_3	(SPI_CPOL|SPI_CPHA)
+#define	SPI_CS_HIGH	0x04		/* chipselect active high? */
+	u8			bits_per_word;
+	int			irq;
+	void			*controller_state;
+	const void		*controller_data;
+	const char		*modalias;
+
+	// likely need more hooks for more protocol options affecting how
+	// the controller talks to its chips, like:
+	//  - bit order (default is wordwise msb-first)
+	//  - memory packing (12 bit samples into low bits, others zeroed)
+	//  - priority
+	//  - chipselect delays
+	//  - ...
+};
+
+static inline struct spi_device *to_spi_device(struct device *dev)
+{
+	return container_of(dev, struct spi_device, dev);
+}
+
+/* most drivers won't need to care about device refcounting */
+static inline struct spi_device *spi_dev_get(struct spi_device *spi)
+{
+	return (spi && get_device(&spi->dev)) ? spi : NULL;
+}
+
+static inline void spi_dev_put(struct spi_device *spi)
+{
+	if (spi)
+		put_device(&spi->dev);
+}
+
+/* ctldata is for the bus_master driver's runtime state */
+static inline void *spi_get_ctldata(struct spi_device *spi)
+{
+	return spi->controller_state;
+}
+
+static inline void spi_set_ctldata(struct spi_device *spi, void *state)
+{
+	spi->controller_state = state;
+}
+
+
+struct spi_message;
+
+
+/**
+ * struct spi_master - interface to SPI master controller
+ * @cdev: class interface to this driver
+ * @bus_num: board-specific (and often SOC-specific) identifier for a
+ * 	given SPI controller.
+ * @num_chipselects: chipselects are used to distinguish individual
+ * 	SPI slaves, and are numbered from zero to num_chipselects.
+ * 	each slave has a chipselect signal, but it's common that not
+ * 	every chipselect is connected to a slave.
+ * @setup: updates the device mode and clocking records used by a
+ * 	device's SPI controller; protocol code may call this.
+ * @transfer: adds a message to the controller's transfer queue.
+ * @cleanup: frees controller-specific state
+ *
+ * Each SPI master controller can communicate with one or more spi_device
+ * children.  These make a small bus, sharing MOSI, MISO and SCK signals
+ * but not chip select signals.  Each device may be configured to use a
+ * different clock rate, since those shared signals are ignored unless
+ * the chip is selected.
+ *
+ * The driver for an SPI controller manages access to those devices through
+ * a queue of spi_message transactions, copyin data between CPU memory and
+ * an SPI slave device).  For each such message it queues, it calls the
+ * message's completion function when the transaction completes.
+ */
+struct spi_master {
+	struct class_device	cdev;
+
+	/* other than zero (== assign one dynamically), bus_num is fully
+	 * board-specific.  usually that simplifies to being SOC-specific.
+	 * example:  one SOC has three SPI controllers, numbered 1..3,
+	 * and one board's schematics might show it using SPI-2.  software
+	 * would normally use bus_num=2 for that controller.
+	 */
+	u16			bus_num;
+
+	/* chipselects will be integral to many controllers; some others
+	 * might use board-specific GPIOs.
+	 */
+	u16			num_chipselect;
+
+	/* setup mode and clock, etc (spi driver may call many times) */
+	int			(*setup)(struct spi_device *spi);
+
+	/* bidirectional bulk transfers
+	 *
+	 * + The transfer() method may not sleep; its main role is
+	 *   just to add the message to the queue.
+	 * + For now there's no remove-from-queue operation, or
+	 *   any other request management
+	 * + To a given spi_device, message queueing is pure fifo
+	 *
+	 * + The master's main job is to process its message queue,
+	 *   selecting a chip then transferring data
+	 * + If there are multiple spi_device children, the i/o queue
+	 *   arbitration algorithm is unspecified (round robin, fifo,
+	 *   priority, reservations, preemption, etc)
+	 *
+	 * + Chipselect stays active during the entire message
+	 *   (unless modified by spi_transfer.cs_change != 0).
+	 * + The message transfers use clock and SPI mode parameters
+	 *   previously established by setup() for this device
+	 */
+	int			(*transfer)(struct spi_device *spi,
+						struct spi_message *mesg);
+
+	/* called on release() to free memory provided by spi_master */
+	void			(*cleanup)(const struct spi_device *spi);
+};
+
+/* the spi driver core manages memory for the spi_master classdev */
+extern struct spi_master *
+spi_alloc_master(struct device *host, unsigned size);
+
+extern int spi_register_master(struct spi_master *master);
+extern void spi_unregister_master(struct spi_master *master);
+
+extern struct spi_master *spi_busnum_to_master(u16 busnum);
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * I/O INTERFACE between SPI controller and protocol drivers
+ *
+ * Protocol drivers use a queue of spi_messages, each transferring data
+ * between the controller and memory buffers.
+ *
+ * The spi_messages themselves consist of a series of read+write transfer
+ * segments.  Those segments always read the same number of bits as they
+ * write; but one or the other is easily ignored by passing a null buffer
+ * pointer.  (This is unlike most types of I/O API, because SPI hardware
+ * is full duplex.)
+ *
+ * NOTE:  Allocation of spi_transfer and spi_message memory is entirely
+ * up to the protocol driver, which guarantees the integrity of both (as
+ * well as the data buffers) for as long as the message is queued.
+ */
+
+/**
+ * struct spi_transfer - a read/write buffer pair
+ * @tx_buf: data to be written (dma-safe address), or NULL
+ * @rx_buf: data to be read (dma-safe address), or NULL
+ * @tx_dma: DMA address of buffer, if spi_message.is_dma_mapped
+ * @rx_dma: DMA address of buffer, if spi_message.is_dma_mapped
+ * @len: size of rx and tx buffers (in bytes)
+ * @cs_change: affects chipselect after this transfer completes
+ * @delay_usecs: microseconds to delay after this transfer before
+ * 	(optionally) changing the chipselect status, then starting
+ * 	the next transfer or completing this spi_message.
+ *
+ * SPI transfers always write the same number of bytes as they read.
+ * Protocol drivers should always provide rx_buf and/or tx_buf.
+ * In some cases, they may also want to provide DMA addresses for
+ * the data being transferred; that may reduce overhead, when the
+ * underlying driver uses dma.
+ *
+ * All SPI transfers start with the relevant chipselect active.  Drivers
+ * can change behavior of the chipselect after the transfer finishes
+ * (including any mandatory delay).  The normal behavior is to leave it
+ * selected, except for the last transfer in a message.  Setting cs_change
+ * allows two additional behavior options:
+ *
+ * (i) If the transfer isn't the last one in the message, this flag is
+ * used to make the chipselect briefly go inactive in the middle of the
+ * message.  Toggling chipselect in this way may be needed to terminate
+ * a chip command, letting a single spi_message perform all of group of
+ * chip transactions together.
+ *
+ * (ii) When the transfer is the last one in the message, the chip may
+ * stay selected until the next transfer.  This is purely a performance
+ * hint; the controller driver may need to select a different device
+ * for the next message.
+ */
+struct spi_transfer {
+	/* it's ok if tx_buf == rx_buf (right?)
+	 * for MicroWire, one buffer must be null
+	 * buffers must work with dma_*map_single() calls
+	 */
+	const void	*tx_buf;
+	void		*rx_buf;
+	unsigned	len;
+
+	dma_addr_t	tx_dma;
+	dma_addr_t	rx_dma;
+
+	unsigned	cs_change:1;
+	u16		delay_usecs;
+};
+
+/**
+ * struct spi_message - one multi-segment SPI transaction
+ * @transfers: the segements of the transaction
+ * @n_transfer: how many segments
+ * @spi: SPI device to which the transaction is queued
+ * @is_dma_mapped: if true, the caller provided both dma and cpu virtual
+ *	addresses for each transfer buffer
+ * @complete: called to report transaction completions
+ * @context: the argument to complete() when it's called
+ * @actual_length: how many bytes were transferd
+ * @status: zero for success, else negative errno
+ * @queue: for use by whichever driver currently owns the message
+ * @state: for use by whichever driver currently owns the message
+ */
+struct spi_message {
+	struct spi_transfer	*transfers;
+	unsigned		n_transfer;
+
+	struct spi_device	*spi;
+
+	unsigned		is_dma_mapped:1;
+
+	/* REVISIT:  we might want a flag affecting the behavior of the
+	 * last transfer ... allowing things like "read 16 bit length L"
+	 * immediately followed by "read L bytes".  Basically imposing
+	 * a specific message scheduling algorithm.
+	 *
+	 * Some controller drivers (message-at-a-time queue processing)
+	 * could provide that as their default scheduling algorithm.  But
+	 * others (with multi-message pipelines) would need a flag to
+	 * tell them about such special cases.
+	 */
+
+	/* completion is reported through a callback */
+	void 			FASTCALL((*complete)(void *context));
+	void			*context;
+	unsigned		actual_length;
+	int			status;
+
+	/* for optional use by whatever driver currently owns the
+	 * spi_message ...  between calls to spi_async and then later
+	 * complete(), that's the spi_master controller driver.
+	 */
+	struct list_head	queue;
+	void			*state;
+};
+
+/**
+ * spi_setup -- setup SPI mode and clock rate
+ * @spi: the device whose settings are being modified
+ *
+ * SPI protocol drivers may need to update the transfer mode if the
+ * device doesn't work with the mode 0 default.  They may likewise need
+ * to update clock rates or word sizes from initial values.  This function
+ * changes those settings, and must be called from a context that can sleep.
+ */
+static inline int
+spi_setup(struct spi_device *spi)
+{
+	return spi->master->setup(spi);
+}
+
+
+/**
+ * spi_async -- asynchronous SPI transfer
+ * @spi: device with which data will be exchanged
+ * @message: describes the data transfers, including completion callback
+ *
+ * This call may be used in_irq and other contexts which can't sleep,
+ * as well as from task contexts which can sleep.
+ *
+ * The completion callback is invoked in a context which can't sleep.
+ * Before that invocation, the value of message->status is undefined.
+ * When the callback is issued, message->status holds either zero (to
+ * indicate complete success) or a negative error code.
+ *
+ * Note that although all messages to a spi_device are handled in
+ * FIFO order, messages may go to different devices in other orders.
+ * Some device might be higher priority, or have various "hard" access
+ * time requirements, for example.
+ */
+static inline int
+spi_async(struct spi_device *spi, struct spi_message *message)
+{
+	message->spi = spi;
+	return spi->master->transfer(spi, message);
+}
+
+/*---------------------------------------------------------------------------*/
+
+/* All these synchronous SPI transfer routines are utilities layered
+ * over the core async transfer primitive.  Here, "synchronous" means
+ * they will sleep uninterruptibly until the async transfer completes.
+ */
+
+extern int spi_sync(struct spi_device *spi, struct spi_message *message);
+
+/**
+ * spi_write - SPI synchronous write
+ * @spi: device to which data will be written
+ * @buf: data buffer
+ * @len: data buffer size
+ *
+ * This writes the buffer and returns zero or a negative error code.
+ * Callable only from contexts that can sleep.
+ */
+static inline int
+spi_write(struct spi_device *spi, const u8 *buf, size_t len)
+{
+	struct spi_transfer	t = {
+			.tx_buf		= buf,
+			.rx_buf		= NULL,
+			.len		= len,
+			.cs_change	= 0,
+		};
+	struct spi_message	m = {
+			.transfers	= &t,
+			.n_transfer	= 1,
+		};
+
+	return spi_sync(spi, &m);
+}
+
+/**
+ * spi_read - SPI synchronous read
+ * @spi: device from which data will be read
+ * @buf: data buffer
+ * @len: data buffer size
+ *
+ * This writes the buffer and returns zero or a negative error code.
+ * Callable only from contexts that can sleep.
+ */
+static inline int
+spi_read(struct spi_device *spi, u8 *buf, size_t len)
+{
+	struct spi_transfer	t = {
+			.tx_buf		= NULL,
+			.rx_buf		= buf,
+			.len		= len,
+			.cs_change	= 0,
+		};
+	struct spi_message	m = {
+			.transfers	= &t,
+			.n_transfer	= 1,
+		};
+
+	return spi_sync(spi, &m);
+}
+
+extern int spi_write_then_read(struct spi_device *spi,
+		const u8 *txbuf, unsigned n_tx,
+		u8 *rxbuf, unsigned n_rx);
+
+/**
+ * spi_w8r8 - SPI synchronous 8 bit write followed by 8 bit read
+ * @spi: device with which data will be exchanged
+ * @cmd: command to be written before data is read back
+ *
+ * This returns the (unsigned) eight bit number returned by the
+ * device, or else a negative error code.  Callable only from
+ * contexts that can sleep.
+ */
+static inline ssize_t spi_w8r8(struct spi_device *spi, u8 cmd)
+{
+	ssize_t			status;
+	u8			result;
+
+	status = spi_write_then_read(spi, &cmd, 1, &result, 1);
+
+	/* return negative errno or unsigned value */
+	return (status < 0) ? status : result;
+}
+
+/**
+ * spi_w8r16 - SPI synchronous 8 bit write followed by 16 bit read
+ * @spi: device with which data will be exchanged
+ * @cmd: command to be written before data is read back
+ *
+ * This returns the (unsigned) sixteen bit number returned by the
+ * device, or else a negative error code.  Callable only from
+ * contexts that can sleep.
+ *
+ * The number is returned in wire-order, which is at least sometimes
+ * big-endian.
+ */
+static inline ssize_t spi_w8r16(struct spi_device *spi, u8 cmd)
+{
+	ssize_t			status;
+	u16			result;
+
+	status = spi_write_then_read(spi, &cmd, 1, (u8 *) &result, 2);
+
+	/* return negative errno or unsigned value */
+	return (status < 0) ? status : result;
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * INTERFACE between board init code and SPI infrastructure.
+ *
+ * No SPI driver ever sees these SPI device table segments, but
+ * it's how the SPI core (or adapters that get hotplugged) grows
+ * the driver model tree.
+ *
+ * As a rule, SPI devices can't be probed.  Instead, board init code
+ * provides a table listing the devices which are present, with enough
+ * information to bind and set up the device's driver.  There's basic
+ * support for nonstatic configurations too; enough to handle adding
+ * parport adapters, or microcontrollers acting as USB-to-SPI bridges.
+ */
+
+/* board-specific information about each SPI device */
+struct spi_board_info {
+	/* the device name and module name are coupled, like platform_bus;
+	 * "modalias" is normally the driver name.
+	 *
+	 * platform_data goes to spi_device.dev.platform_data,
+	 * controller_data goes to spi_device.platform_data,
+	 * irq is copied too
+	 */
+	char		modalias[KOBJ_NAME_LEN];
+	const void	*platform_data;
+	const void	*controller_data;
+	int		irq;
+
+	/* slower signaling on noisy or low voltage boards */
+	u32		max_speed_hz;
+
+
+	/* bus_num is board specific and matches the bus_num of some
+	 * spi_master that will probably be registered later.
+	 *
+	 * chip_select reflects how this chip is wired to that master;
+	 * it's less than num_chipselect.
+	 */
+	u16		bus_num;
+	u16		chip_select;
+
+	/* ... may need additional spi_device chip config data here.
+	 * avoid stuff protocol drivers can set; but include stuff
+	 * needed to behave without being bound to a driver:
+	 *  - chipselect polarity
+	 *  - quirks like clock rate mattering when not selected
+	 */
+};
+
+#ifdef	CONFIG_SPI
+extern int
+spi_register_board_info(struct spi_board_info const *info, unsigned n);
+#else
+/* board init code may ignore whether SPI is configured or not */
+static inline int
+spi_register_board_info(struct spi_board_info const *info, unsigned n)
+	{ return 0; }
+#endif
+
+
+/* If you're hotplugging an adapter with devices (parport, usb, etc)
+ * use spi_new_device() to describe each device.  You can also call
+ * spi_unregister_device() to get start making that device vanish,
+ * but normally that would be handled by spi_unregister_master().
+ */
+extern struct spi_device *
+spi_new_device(struct spi_master *, struct spi_board_info *);
+
+static inline void
+spi_unregister_device(struct spi_device *spi)
+{
+	if (spi)
+		device_unregister(&spi->dev);
+}
+
+#endif /* __LINUX_SPI_H */
-- 
cgit v1.2.3-71-gd317


From ffa458c1bd9b6f653008d450f337602f3d52a646 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Sun, 8 Jan 2006 13:34:21 -0800
Subject: [PATCH] spi: ads7846 driver

This is a driver for the ADS7846 touchscreen sensor, derived from
the corgi_ts and omap_ts drivers.  Key differences from those two:

  - Uses the new SPI framework (minimalist version)
  - <linux/spi/ads7846.h> abstracts board-specific touchscreen info
  - Sysfs attributes for the temperature and voltage sensors
  - Uses fewer ARM-specific IRQ primitives

The temperature and voltage sensors show up in sysfs like this:

  $ pwd
  /sys/devices/platform/omap-uwire/spi2.0
  $ ls
  bus@          input:event0@ power/        temp1         vbatt
  driver@       modalias      temp0         vaux
  $ cat modalias
  ads7846
  $ cat temp0
  991
  $ cat temp1
  1177
  $

So far only basic testing has been done.  There's a fair amount of hardware
that uses this sensor, and which also runs Linux, which should eventually
be able to use this driver.

One portability note may be of special interest.  It turns out that not all
SPI controllers are happy issuing requests that do things like "write 8 bit
command, read 12 bit response".  Most of them seem happy to handle various
word sizes, so the issue isn't "12 bit response" but rather "different rx
and tx write sizes", despite that being a common MicroWire convention.  So
this version of the driver no longer reads 12 bit native-endian words; it
reads 16-bit big-endian responses, then byteswaps them and shifts the
results to discard the noise.

Signed-off-by: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/input/touchscreen/Kconfig   |  13 +
 drivers/input/touchscreen/Makefile  |   1 +
 drivers/input/touchscreen/ads7846.c | 621 ++++++++++++++++++++++++++++++++++++
 include/linux/spi/ads7846.h         |  18 ++
 4 files changed, 653 insertions(+)
 create mode 100644 drivers/input/touchscreen/ads7846.c
 create mode 100644 include/linux/spi/ads7846.h

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig
index 21d55ed4b88a..2c674023a6ac 100644
--- a/drivers/input/touchscreen/Kconfig
+++ b/drivers/input/touchscreen/Kconfig
@@ -11,6 +11,19 @@ menuconfig INPUT_TOUCHSCREEN
 
 if INPUT_TOUCHSCREEN
 
+config TOUCHSCREEN_ADS7846
+	tristate "ADS 7846 based touchscreens"
+	depends on SPI_MASTER
+	help
+	  Say Y here if you have a touchscreen interface using the
+	  ADS7846 controller, and your board-specific initialization
+	  code includes that in its table of SPI devices.
+
+	  If unsure, say N (but it's safe to say "Y").
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called ads7846.
+
 config TOUCHSCREEN_BITSY
 	tristate "Compaq iPAQ H3600 (Bitsy) touchscreen"
 	depends on SA1100_BITSY
diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile
index 6842869c9a26..5e5557c43121 100644
--- a/drivers/input/touchscreen/Makefile
+++ b/drivers/input/touchscreen/Makefile
@@ -4,6 +4,7 @@
 
 # Each configuration option enables a list of files.
 
+obj-$(CONFIG_TOUCHSCREEN_ADS7846)	+= ads7846.o
 obj-$(CONFIG_TOUCHSCREEN_BITSY)	+= h3600_ts_input.o
 obj-$(CONFIG_TOUCHSCREEN_CORGI)	+= corgi_ts.o
 obj-$(CONFIG_TOUCHSCREEN_GUNZE)	+= gunze.o
diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c
new file mode 100644
index 000000000000..24ff6c5a4021
--- /dev/null
+++ b/drivers/input/touchscreen/ads7846.c
@@ -0,0 +1,621 @@
+/*
+ * ADS7846 based touchscreen and sensor driver
+ *
+ * Copyright (c) 2005 David Brownell
+ *
+ * Using code from:
+ *  - corgi_ts.c
+ *	Copyright (C) 2004-2005 Richard Purdie
+ *  - omap_ts.[hc], ads7846.h, ts_osk.c
+ *	Copyright (C) 2002 MontaVista Software
+ *	Copyright (C) 2004 Texas Instruments
+ *	Copyright (C) 2005 Dirk Behme
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/input.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/spi/spi.h>
+#include <linux/spi/ads7846.h>
+
+#ifdef	CONFIG_ARM
+#include <asm/mach-types.h>
+#ifdef	CONFIG_ARCH_OMAP
+#include <asm/arch/gpio.h>
+#endif
+
+#else
+#define	set_irq_type(irq,type)	do{}while(0)
+#endif
+
+
+/*
+ * This code has been lightly tested on an ads7846.
+ * Support for ads7843 and ads7845 has only been stubbed in.
+ *
+ * Not yet done:  investigate the values reported.  Are x/y/pressure
+ * event values sane enough for X11?  How accurate are the temperature
+ * and voltage readings?  (System-specific calibration should support
+ * accuracy of 0.3 degrees C; otherwise it's 2.0 degrees.)
+ *
+ * app note sbaa036 talks in more detail about accurate sampling...
+ * that ought to help in situations like LCDs inducing noise (which
+ * can also be helped by using synch signals) and more generally.
+ */
+
+#define	TS_POLL_PERIOD	msecs_to_jiffies(10)
+
+struct ts_event {
+	/* For portability, we can't read 12 bit values using SPI (which
+	 * would make the controller deliver them as native byteorder u16
+	 * with msbs zeroed).  Instead, we read them as two 8-byte values,
+	 * which need byteswapping then range adjustment.
+	 */
+	__be16 x;
+	__be16 y;
+	__be16 z1, z2;
+};
+
+struct ads7846 {
+	struct input_dev	input;
+	char			phys[32];
+
+	struct spi_device	*spi;
+	u16			model;
+	u16			vref_delay_usecs;
+	u16			x_plate_ohms;
+
+	struct ts_event		tc;
+
+	struct spi_transfer	xfer[8];
+	struct spi_message	msg;
+
+	spinlock_t		lock;
+	struct timer_list	timer;		/* P: lock */
+	unsigned		pendown:1;	/* P: lock */
+	unsigned		pending:1;	/* P: lock */
+// FIXME remove "irq_disabled"
+	unsigned		irq_disabled:1;	/* P: lock */
+};
+
+/* leave chip selected when we're done, for quicker re-select? */
+#if	0
+#define	CS_CHANGE(xfer)	((xfer).cs_change = 1)
+#else
+#define	CS_CHANGE(xfer)	((xfer).cs_change = 0)
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+/* The ADS7846 has touchscreen and other sensors.
+ * Earlier ads784x chips are somewhat compatible.
+ */
+#define	ADS_START		(1 << 7)
+#define	ADS_A2A1A0_d_y		(1 << 4)	/* differential */
+#define	ADS_A2A1A0_d_z1		(3 << 4)	/* differential */
+#define	ADS_A2A1A0_d_z2		(4 << 4)	/* differential */
+#define	ADS_A2A1A0_d_x		(5 << 4)	/* differential */
+#define	ADS_A2A1A0_temp0	(0 << 4)	/* non-differential */
+#define	ADS_A2A1A0_vbatt	(2 << 4)	/* non-differential */
+#define	ADS_A2A1A0_vaux		(6 << 4)	/* non-differential */
+#define	ADS_A2A1A0_temp1	(7 << 4)	/* non-differential */
+#define	ADS_8_BIT		(1 << 3)
+#define	ADS_12_BIT		(0 << 3)
+#define	ADS_SER			(1 << 2)	/* non-differential */
+#define	ADS_DFR			(0 << 2)	/* differential */
+#define	ADS_PD10_PDOWN		(0 << 0)	/* lowpower mode + penirq */
+#define	ADS_PD10_ADC_ON		(1 << 0)	/* ADC on */
+#define	ADS_PD10_REF_ON		(2 << 0)	/* vREF on + penirq */
+#define	ADS_PD10_ALL_ON		(3 << 0)	/* ADC + vREF on */
+
+#define	MAX_12BIT	((1<<12)-1)
+
+/* leave ADC powered up (disables penirq) between differential samples */
+#define	READ_12BIT_DFR(x) (ADS_START | ADS_A2A1A0_d_ ## x \
+	| ADS_12_BIT | ADS_DFR)
+
+static const u8	read_y  = READ_12BIT_DFR(y)  | ADS_PD10_ADC_ON;
+static const u8	read_z1 = READ_12BIT_DFR(z1) | ADS_PD10_ADC_ON;
+static const u8	read_z2 = READ_12BIT_DFR(z2) | ADS_PD10_ADC_ON;
+static const u8	read_x  = READ_12BIT_DFR(x)  | ADS_PD10_PDOWN;	/* LAST */
+
+/* single-ended samples need to first power up reference voltage;
+ * we leave both ADC and VREF powered
+ */
+#define	READ_12BIT_SER(x) (ADS_START | ADS_A2A1A0_ ## x \
+	| ADS_12_BIT | ADS_SER)
+
+static const u8	ref_on = READ_12BIT_DFR(x) | ADS_PD10_ALL_ON;
+static const u8	ref_off = READ_12BIT_DFR(y) | ADS_PD10_PDOWN;
+
+/*--------------------------------------------------------------------------*/
+
+/*
+ * Non-touchscreen sensors only use single-ended conversions.
+ */
+
+struct ser_req {
+	u8			command;
+	u16			scratch;
+	__be16			sample;
+	struct spi_message	msg;
+	struct spi_transfer	xfer[6];
+};
+
+static int ads7846_read12_ser(struct device *dev, unsigned command)
+{
+	struct spi_device	*spi = to_spi_device(dev);
+	struct ads7846		*ts = dev_get_drvdata(dev);
+	struct ser_req		*req = kzalloc(sizeof *req, SLAB_KERNEL);
+	int			status;
+	int			sample;
+
+	if (!req)
+		return -ENOMEM;
+
+	/* activate reference, so it has time to settle; */
+	req->xfer[0].tx_buf = &ref_on;
+	req->xfer[0].len = 1;
+	req->xfer[1].rx_buf = &req->scratch;
+	req->xfer[1].len = 2;
+
+	/*
+	 * for external VREF, 0 usec (and assume it's always on);
+	 * for 1uF, use 800 usec;
+	 * no cap, 100 usec.
+	 */
+	req->xfer[1].delay_usecs = ts->vref_delay_usecs;
+
+	/* take sample */
+	req->command = (u8) command;
+	req->xfer[2].tx_buf = &req->command;
+	req->xfer[2].len = 1;
+	req->xfer[3].rx_buf = &req->sample;
+	req->xfer[3].len = 2;
+
+	/* REVISIT:  take a few more samples, and compare ... */
+
+	/* turn off reference */
+	req->xfer[4].tx_buf = &ref_off;
+	req->xfer[4].len = 1;
+	req->xfer[5].rx_buf = &req->scratch;
+	req->xfer[5].len = 2;
+
+	CS_CHANGE(req->xfer[5]);
+
+	/* group all the transfers together, so we can't interfere with
+	 * reading touchscreen state; disable penirq while sampling
+	 */
+	req->msg.transfers = req->xfer;
+	req->msg.n_transfer = 6;
+
+	disable_irq(spi->irq);
+	status = spi_sync(spi, &req->msg);
+	enable_irq(spi->irq);
+
+	if (req->msg.status)
+		status = req->msg.status;
+	sample = be16_to_cpu(req->sample);
+	sample = sample >> 4;
+	kfree(req);
+
+	return status ? status : sample;
+}
+
+#define SHOW(name) static ssize_t \
+name ## _show(struct device *dev, struct device_attribute *attr, char *buf) \
+{ \
+	ssize_t v = ads7846_read12_ser(dev, \
+			READ_12BIT_SER(name) | ADS_PD10_ALL_ON); \
+	if (v < 0) \
+		return v; \
+	return sprintf(buf, "%u\n", (unsigned) v); \
+} \
+static DEVICE_ATTR(name, S_IRUGO, name ## _show, NULL);
+
+SHOW(temp0)
+SHOW(temp1)
+SHOW(vaux)
+SHOW(vbatt)
+
+/*--------------------------------------------------------------------------*/
+
+/*
+ * PENIRQ only kicks the timer.  The timer only reissues the SPI transfer,
+ * to retrieve touchscreen status.
+ *
+ * The SPI transfer completion callback does the real work.  It reports
+ * touchscreen events and reactivates the timer (or IRQ) as appropriate.
+ */
+
+static void ads7846_rx(void *ads)
+{
+	struct ads7846	*ts = ads;
+	unsigned	Rt;
+	unsigned	sync = 0;
+	u16		x, y, z1, z2;
+	unsigned long	flags;
+
+	/* adjust:  12 bit samples (left aligned), built from
+	 * two 8 bit values writen msb-first.
+	 */
+	x = be16_to_cpu(ts->tc.x) >> 4;
+	y = be16_to_cpu(ts->tc.y) >> 4;
+	z1 = be16_to_cpu(ts->tc.z1) >> 4;
+	z2 = be16_to_cpu(ts->tc.z2) >> 4;
+
+	/* range filtering */
+	if (x == MAX_12BIT)
+		x = 0;
+
+	if (x && z1 && ts->spi->dev.power.power_state.event == PM_EVENT_ON) {
+		/* compute touch pressure resistance using equation #2 */
+		Rt = z2;
+		Rt -= z1;
+		Rt *= x;
+		Rt *= ts->x_plate_ohms;
+		Rt /= z1;
+		Rt = (Rt + 2047) >> 12;
+	} else
+		Rt = 0;
+
+	/* NOTE:  "pendown" is inferred from pressure; we don't rely on
+	 * being able to check nPENIRQ status, or "friendly" trigger modes
+	 * (both-edges is much better than just-falling or low-level).
+	 *
+	 * REVISIT:  some boards may require reading nPENIRQ; it's
+	 * needed on 7843.  and 7845 reads pressure differently...
+	 *
+	 * REVISIT:  the touchscreen might not be connected; this code
+	 * won't notice that, even if nPENIRQ never fires ...
+	 */
+	if (!ts->pendown && Rt != 0) {
+		input_report_key(&ts->input, BTN_TOUCH, 1);
+		sync = 1;
+	} else if (ts->pendown && Rt == 0) {
+		input_report_key(&ts->input, BTN_TOUCH, 0);
+		sync = 1;
+	}
+
+	if (Rt) {
+		input_report_abs(&ts->input, ABS_X, x);
+		input_report_abs(&ts->input, ABS_Y, y);
+		input_report_abs(&ts->input, ABS_PRESSURE, Rt);
+		sync = 1;
+	}
+	if (sync)
+		input_sync(&ts->input);
+
+#ifdef	VERBOSE
+	if (Rt || ts->pendown)
+		pr_debug("%s: %d/%d/%d%s\n", ts->spi->dev.bus_id,
+			x, y, Rt, Rt ? "" : " UP");
+#endif
+
+	/* don't retrigger while we're suspended */
+	spin_lock_irqsave(&ts->lock, flags);
+
+	ts->pendown = (Rt != 0);
+	ts->pending = 0;
+
+	if (ts->spi->dev.power.power_state.event == PM_EVENT_ON) {
+		if (ts->pendown)
+			mod_timer(&ts->timer, jiffies + TS_POLL_PERIOD);
+		else if (ts->irq_disabled) {
+			ts->irq_disabled = 0;
+			enable_irq(ts->spi->irq);
+		}
+	}
+
+	spin_unlock_irqrestore(&ts->lock, flags);
+}
+
+static void ads7846_timer(unsigned long handle)
+{
+	struct ads7846	*ts = (void *)handle;
+	int		status = 0;
+	unsigned long	flags;
+
+	spin_lock_irqsave(&ts->lock, flags);
+	if (!ts->pending) {
+		ts->pending = 1;
+		if (!ts->irq_disabled) {
+			ts->irq_disabled = 1;
+			disable_irq(ts->spi->irq);
+		}
+		status = spi_async(ts->spi, &ts->msg);
+		if (status)
+			dev_err(&ts->spi->dev, "spi_async --> %d\n",
+					status);
+	}
+	spin_unlock_irqrestore(&ts->lock, flags);
+}
+
+static irqreturn_t ads7846_irq(int irq, void *handle, struct pt_regs *regs)
+{
+	ads7846_timer((unsigned long) handle);
+	return IRQ_HANDLED;
+}
+
+/*--------------------------------------------------------------------------*/
+
+/* non-empty "extra" is needed before 2.6.14-git5 or so */
+#define	EXTRA	//	, u32 level
+#define	EXTRA2	//	, 0
+
+static int
+ads7846_suspend(struct device *dev, pm_message_t message EXTRA)
+{
+	struct ads7846 *ts = dev_get_drvdata(dev);
+	unsigned long	flags;
+
+	spin_lock_irqsave(&ts->lock, flags);
+
+	ts->spi->dev.power.power_state = message;
+
+	/* are we waiting for IRQ, or polling? */
+	if (!ts->pendown) {
+		if (!ts->irq_disabled) {
+			ts->irq_disabled = 1;
+			disable_irq(ts->spi->irq);
+		}
+	} else {
+		/* polling; force a final SPI completion;
+		 * that will clean things up neatly
+		 */
+		if (!ts->pending)
+			mod_timer(&ts->timer, jiffies);
+
+		while (ts->pendown || ts->pending) {
+			spin_unlock_irqrestore(&ts->lock, flags);
+			udelay(10);
+			spin_lock_irqsave(&ts->lock, flags);
+		}
+	}
+
+	/* we know the chip's in lowpower mode since we always
+	 * leave it that way after every request
+	 */
+
+	spin_unlock_irqrestore(&ts->lock, flags);
+	return 0;
+}
+
+static int ads7846_resume(struct device *dev EXTRA)
+{
+	struct ads7846 *ts = dev_get_drvdata(dev);
+
+	ts->irq_disabled = 0;
+	enable_irq(ts->spi->irq);
+	dev->power.power_state = PMSG_ON;
+	return 0;
+}
+
+static int __init ads7846_probe(struct device *dev)
+{
+	struct spi_device		*spi = to_spi_device(dev);
+	struct ads7846			*ts;
+	struct ads7846_platform_data	*pdata = dev->platform_data;
+	struct spi_transfer		*x;
+
+	if (!spi->irq) {
+		dev_dbg(dev, "no IRQ?\n");
+		return -ENODEV;
+	}
+
+	if (!pdata) {
+		dev_dbg(dev, "no platform data?\n");
+		return -ENODEV;
+	}
+
+	/* don't exceed max specified sample rate */
+	if (spi->max_speed_hz > (125000 * 16)) {
+		dev_dbg(dev, "f(sample) %d KHz?\n",
+				(spi->max_speed_hz/16)/1000);
+		return -EINVAL;
+	}
+
+	/* We'd set the wordsize to 12 bits ... except that some controllers
+	 * will then treat the 8 bit command words as 12 bits (and drop the
+	 * four MSBs of the 12 bit result).  Result: inputs must be shifted
+	 * to discard the four garbage LSBs.
+	 */
+
+	if (!(ts = kzalloc(sizeof(struct ads7846), GFP_KERNEL)))
+		return -ENOMEM;
+
+	dev_set_drvdata(dev, ts);
+
+	ts->spi = spi;
+	spi->dev.power.power_state = PMSG_ON;
+
+	init_timer(&ts->timer);
+	ts->timer.data = (unsigned long) ts;
+	ts->timer.function = ads7846_timer;
+
+	ts->model = pdata->model ? : 7846;
+	ts->vref_delay_usecs = pdata->vref_delay_usecs ? : 100;
+	ts->x_plate_ohms = pdata->x_plate_ohms ? : 400;
+
+	init_input_dev(&ts->input);
+
+	ts->input.dev = dev;
+	ts->input.name = "ADS784x Touchscreen";
+	snprintf(ts->phys, sizeof ts->phys, "%s/input0", dev->bus_id);
+	ts->input.phys = ts->phys;
+
+	ts->input.evbit[0] = BIT(EV_KEY) | BIT(EV_ABS);
+	ts->input.keybit[LONG(BTN_TOUCH)] = BIT(BTN_TOUCH);
+	input_set_abs_params(&ts->input, ABS_X,
+			pdata->x_min ? : 0,
+			pdata->x_max ? : MAX_12BIT,
+			0, 0);
+	input_set_abs_params(&ts->input, ABS_Y,
+			pdata->y_min ? : 0,
+			pdata->y_max ? : MAX_12BIT,
+			0, 0);
+	input_set_abs_params(&ts->input, ABS_PRESSURE,
+			pdata->pressure_min, pdata->pressure_max, 0, 0);
+
+	input_register_device(&ts->input);
+
+	/* set up the transfers to read touchscreen state; this assumes we
+	 * use formula #2 for pressure, not #3.
+	 */
+	x = ts->xfer;
+
+	/* y- still on; turn on only y+ (and ADC) */
+	x->tx_buf = &read_y;
+	x->len = 1;
+	x++;
+	x->rx_buf = &ts->tc.y;
+	x->len = 2;
+	x++;
+
+	/* turn y+ off, x- on; we'll use formula #2 */
+	if (ts->model == 7846) {
+		x->tx_buf = &read_z1;
+		x->len = 1;
+		x++;
+		x->rx_buf = &ts->tc.z1;
+		x->len = 2;
+		x++;
+
+		x->tx_buf = &read_z2;
+		x->len = 1;
+		x++;
+		x->rx_buf = &ts->tc.z2;
+		x->len = 2;
+		x++;
+	}
+
+	/* turn y- off, x+ on, then leave in lowpower */
+	x->tx_buf = &read_x;
+	x->len = 1;
+	x++;
+	x->rx_buf = &ts->tc.x;
+	x->len = 2;
+	x++;
+
+	CS_CHANGE(x[-1]);
+
+	ts->msg.transfers = ts->xfer;
+	ts->msg.n_transfer = x - ts->xfer;
+	ts->msg.complete = ads7846_rx;
+	ts->msg.context = ts;
+
+	if (request_irq(spi->irq, ads7846_irq, SA_SAMPLE_RANDOM,
+				dev->bus_id, ts)) {
+		dev_dbg(dev, "irq %d busy?\n", spi->irq);
+		input_unregister_device(&ts->input);
+		kfree(ts);
+		return -EBUSY;
+	}
+	set_irq_type(spi->irq, IRQT_FALLING);
+
+	dev_info(dev, "touchscreen, irq %d\n", spi->irq);
+
+	/* take a first sample, leaving nPENIRQ active; avoid
+	 * the touchscreen, in case it's not connected.
+	 */
+	(void) ads7846_read12_ser(dev,
+			  READ_12BIT_SER(vaux) | ADS_PD10_ALL_ON);
+
+	/* ads7843/7845 don't have temperature sensors, and
+	 * use the other sensors a bit differently too
+	 */
+	if (ts->model == 7846) {
+		device_create_file(dev, &dev_attr_temp0);
+		device_create_file(dev, &dev_attr_temp1);
+	}
+	if (ts->model != 7845)
+		device_create_file(dev, &dev_attr_vbatt);
+	device_create_file(dev, &dev_attr_vaux);
+
+	return 0;
+}
+
+static int __exit ads7846_remove(struct device *dev)
+{
+	struct ads7846		*ts = dev_get_drvdata(dev);
+
+	ads7846_suspend(dev, PMSG_SUSPEND EXTRA2);
+	free_irq(ts->spi->irq, ts);
+	if (ts->irq_disabled)
+		enable_irq(ts->spi->irq);
+
+	if (ts->model == 7846) {
+		device_remove_file(dev, &dev_attr_temp0);
+		device_remove_file(dev, &dev_attr_temp1);
+	}
+	if (ts->model != 7845)
+		device_remove_file(dev, &dev_attr_vbatt);
+	device_remove_file(dev, &dev_attr_vaux);
+
+	input_unregister_device(&ts->input);
+	kfree(ts);
+
+	dev_dbg(dev, "unregistered touchscreen\n");
+	return 0;
+}
+
+static struct device_driver ads7846_driver = {
+	.name		= "ads7846",
+	.bus		= &spi_bus_type,
+	.probe		= ads7846_probe,
+	.remove		= __exit_p(ads7846_remove),
+	.suspend	= ads7846_suspend,
+	.resume		= ads7846_resume,
+};
+
+static int __init ads7846_init(void)
+{
+	/* grr, board-specific init should stay out of drivers!! */
+
+#ifdef	CONFIG_ARCH_OMAP
+	if (machine_is_omap_osk()) {
+		/* GPIO4 = PENIRQ; GPIO6 = BUSY */
+		omap_request_gpio(4);
+		omap_set_gpio_direction(4, 1);
+		omap_request_gpio(6);
+		omap_set_gpio_direction(6, 1);
+	}
+	// also TI 1510 Innovator, bitbanging through FPGA
+	// also Nokia 770
+	// also Palm Tungsten T2
+#endif
+
+	// PXA:
+	// also Dell Axim X50
+	// also HP iPaq H191x/H192x/H415x/H435x
+	// also Intel Lubbock (alternate to UCB1400)
+	// also Sharp Zaurus C7xx, C8xx (corgi/sheperd/husky)
+
+	// also various AMD Au1x00 devel boards
+
+	return driver_register(&ads7846_driver);
+}
+module_init(ads7846_init);
+
+static void __exit ads7846_exit(void)
+{
+	driver_unregister(&ads7846_driver);
+
+#ifdef	CONFIG_ARCH_OMAP
+	if (machine_is_omap_osk()) {
+		omap_free_gpio(4);
+		omap_free_gpio(6);
+	}
+#endif
+
+}
+module_exit(ads7846_exit);
+
+MODULE_DESCRIPTION("ADS7846 TouchScreen Driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/spi/ads7846.h b/include/linux/spi/ads7846.h
new file mode 100644
index 000000000000..84a27013d399
--- /dev/null
+++ b/include/linux/spi/ads7846.h
@@ -0,0 +1,18 @@
+/* linux/spi/ads7846.h */
+
+/* Touchscreen characteristics vary between boards and models.  The
+ * platform_data for the device's "struct device" holts this information.
+ *
+ * It's OK if the min/max values are zero.
+ */
+struct ads7846_platform_data {
+	u16	model;			/* 7843, 7845, 7846. */
+	u16	vref_delay_usecs;	/* 0 for external vref; etc */
+	u16	x_plate_ohms;
+	u16	y_plate_ohms;
+
+	u16	x_min, x_max;
+	u16	y_min, y_max;
+	u16	pressure_min, pressure_max;
+};
+
-- 
cgit v1.2.3-71-gd317


From 1d6432fe10c3e724e307dd7137cd293a0edcae80 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Sun, 8 Jan 2006 13:34:22 -0800
Subject: [PATCH] spi: mtd dataflash driver

This is a conversion of the AT91rm9200 DataFlash MTD driver to use the
lightweight SPI framework, and no longer be AT91-specific.  It compiles
down to less than 3KBytes on ARM.

The driver allows board-specific init code to provide platform_data with
the relevant MTD partitioning information, and hotplugs.

This version has been lightly tested.  Its parent at91_dataflash driver has
been pretty well banged on, although kernel.org JFFS2 dataflash support was
acting broken the last time I tried it.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/mtd/devices/Kconfig         |   8 +
 drivers/mtd/devices/Makefile        |   1 +
 drivers/mtd/devices/mtd_dataflash.c | 623 ++++++++++++++++++++++++++++++++++++
 include/linux/spi/flash.h           |  27 ++
 4 files changed, 659 insertions(+)
 create mode 100644 drivers/mtd/devices/mtd_dataflash.c
 create mode 100644 include/linux/spi/flash.h

(limited to 'include/linux')

diff --git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig
index 9a2aa4033c6a..84f2eb1fae02 100644
--- a/drivers/mtd/devices/Kconfig
+++ b/drivers/mtd/devices/Kconfig
@@ -47,6 +47,14 @@ config MTD_MS02NV
 	  accelerator.  Say Y here if you have a DECstation 5000/2x0 or a
 	  DECsystem 5900 equipped with such a module.
 
+config MTD_DATAFLASH
+	tristate "Support for AT45xxx DataFlash"
+	depends on MTD && SPI_MASTER && EXPERIMENTAL
+	help
+	  This enables access to AT45xxx DataFlash chips, using SPI.
+	  Sometimes DataFlash chips are packaged inside MMC-format
+	  cards; at this writing, the MMC stack won't handle those.
+
 config MTD_SLRAM
 	tristate "Uncached system RAM"
 	depends on MTD
diff --git a/drivers/mtd/devices/Makefile b/drivers/mtd/devices/Makefile
index e38db348057d..cd8d8074b5b6 100644
--- a/drivers/mtd/devices/Makefile
+++ b/drivers/mtd/devices/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_MTD_MTDRAM)	+= mtdram.o
 obj-$(CONFIG_MTD_LART)		+= lart.o
 obj-$(CONFIG_MTD_BLKMTD)	+= blkmtd.o
 obj-$(CONFIG_MTD_BLOCK2MTD)	+= block2mtd.o
+obj-$(CONFIG_MTD_DATAFLASH)	+= mtd_dataflash.o
diff --git a/drivers/mtd/devices/mtd_dataflash.c b/drivers/mtd/devices/mtd_dataflash.c
new file mode 100644
index 000000000000..a39b3b6b266c
--- /dev/null
+++ b/drivers/mtd/devices/mtd_dataflash.c
@@ -0,0 +1,623 @@
+/*
+ * Atmel AT45xxx DataFlash MTD driver for lightweight SPI framework
+ *
+ * Largely derived from at91_dataflash.c:
+ *  Copyright (C) 2003-2005 SAN People (Pty) Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+*/
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/spi/spi.h>
+#include <linux/spi/flash.h>
+
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/partitions.h>
+
+
+/*
+ * DataFlash is a kind of SPI flash.  Most AT45 chips have two buffers in
+ * each chip, which may be used for double buffered I/O; but this driver
+ * doesn't (yet) use these for any kind of i/o overlap or prefetching.
+ *
+ * Sometimes DataFlash is packaged in MMC-format cards, although the
+ * MMC stack can't use SPI (yet), or distinguish between MMC and DataFlash
+ * protocols during enumeration.
+ */
+
+#define CONFIG_DATAFLASH_WRITE_VERIFY
+
+/* reads can bypass the buffers */
+#define OP_READ_CONTINUOUS	0xE8
+#define OP_READ_PAGE		0xD2
+
+/* group B requests can run even while status reports "busy" */
+#define OP_READ_STATUS		0xD7	/* group B */
+
+/* move data between host and buffer */
+#define OP_READ_BUFFER1		0xD4	/* group B */
+#define OP_READ_BUFFER2		0xD6	/* group B */
+#define OP_WRITE_BUFFER1	0x84	/* group B */
+#define OP_WRITE_BUFFER2	0x87	/* group B */
+
+/* erasing flash */
+#define OP_ERASE_PAGE		0x81
+#define OP_ERASE_BLOCK		0x50
+
+/* move data between buffer and flash */
+#define OP_TRANSFER_BUF1	0x53
+#define OP_TRANSFER_BUF2	0x55
+#define OP_MREAD_BUFFER1	0xD4
+#define OP_MREAD_BUFFER2	0xD6
+#define OP_MWERASE_BUFFER1	0x83
+#define OP_MWERASE_BUFFER2	0x86
+#define OP_MWRITE_BUFFER1	0x88	/* sector must be pre-erased */
+#define OP_MWRITE_BUFFER2	0x89	/* sector must be pre-erased */
+
+/* write to buffer, then write-erase to flash */
+#define OP_PROGRAM_VIA_BUF1	0x82
+#define OP_PROGRAM_VIA_BUF2	0x85
+
+/* compare buffer to flash */
+#define OP_COMPARE_BUF1		0x60
+#define OP_COMPARE_BUF2		0x61
+
+/* read flash to buffer, then write-erase to flash */
+#define OP_REWRITE_VIA_BUF1	0x58
+#define OP_REWRITE_VIA_BUF2	0x59
+
+/* newer chips report JEDEC manufacturer and device IDs; chip
+ * serial number and OTP bits; and per-sector writeprotect.
+ */
+#define OP_READ_ID		0x9F
+#define OP_READ_SECURITY	0x77
+#define OP_WRITE_SECURITY	0x9A	/* OTP bits */
+
+
+struct dataflash {
+	u8			command[4];
+	char			name[24];
+
+	unsigned		partitioned:1;
+
+	unsigned short		page_offset;	/* offset in flash address */
+	unsigned int		page_size;	/* of bytes per page */
+
+	struct semaphore	lock;
+	struct spi_device	*spi;
+
+	struct mtd_info		mtd;
+};
+
+#ifdef CONFIG_MTD_PARTITIONS
+#define	mtd_has_partitions()	(1)
+#else
+#define	mtd_has_partitions()	(0)
+#endif
+
+/* ......................................................................... */
+
+/*
+ * Return the status of the DataFlash device.
+ */
+static inline int dataflash_status(struct spi_device *spi)
+{
+	/* NOTE:  at45db321c over 25 MHz wants to write
+	 * a dummy byte after the opcode...
+	 */
+	return spi_w8r8(spi, OP_READ_STATUS);
+}
+
+/*
+ * Poll the DataFlash device until it is READY.
+ * This usually takes 5-20 msec or so; more for sector erase.
+ */
+static int dataflash_waitready(struct spi_device *spi)
+{
+	int	status;
+
+	for (;;) {
+		status = dataflash_status(spi);
+		if (status < 0) {
+			DEBUG(MTD_DEBUG_LEVEL1, "%s: status %d?\n",
+					spi->dev.bus_id, status);
+			status = 0;
+		}
+
+		if (status & (1 << 7))	/* RDY/nBSY */
+			return status;
+
+		msleep(3);
+	}
+}
+
+/* ......................................................................... */
+
+/*
+ * Erase pages of flash.
+ */
+static int dataflash_erase(struct mtd_info *mtd, struct erase_info *instr)
+{
+	struct dataflash	*priv = (struct dataflash *)mtd->priv;
+	struct spi_device	*spi = priv->spi;
+	struct spi_transfer	x[1] = { { .tx_dma = 0, }, };
+	struct spi_message	msg;
+	unsigned		blocksize = priv->page_size << 3;
+	u8			*command;
+
+	DEBUG(MTD_DEBUG_LEVEL2, "%s: erase addr=0x%x len 0x%x\n",
+			spi->dev.bus_id,
+			instr->addr, instr->len);
+
+	/* Sanity checks */
+	if ((instr->addr + instr->len) > mtd->size
+			|| (instr->len % priv->page_size) != 0
+			|| (instr->addr % priv->page_size) != 0)
+		return -EINVAL;
+
+	x[0].tx_buf = command = priv->command;
+	x[0].len = 4;
+	msg.transfers = x;
+	msg.n_transfer = 1;
+
+	down(&priv->lock);
+	while (instr->len > 0) {
+		unsigned int	pageaddr;
+		int		status;
+		int		do_block;
+
+		/* Calculate flash page address; use block erase (for speed) if
+		 * we're at a block boundary and need to erase the whole block.
+		 */
+		pageaddr = instr->addr / priv->page_size;
+		do_block = (pageaddr & 0x7) == 0 && instr->len <= blocksize;
+		pageaddr = pageaddr << priv->page_offset;
+
+		command[0] = do_block ? OP_ERASE_BLOCK : OP_ERASE_PAGE;
+		command[1] = (u8)(pageaddr >> 16);
+		command[2] = (u8)(pageaddr >> 8);
+		command[3] = 0;
+
+		DEBUG(MTD_DEBUG_LEVEL3, "ERASE %s: (%x) %x %x %x [%i]\n",
+			do_block ? "block" : "page",
+			command[0], command[1], command[2], command[3],
+			pageaddr);
+
+		status = spi_sync(spi, &msg);
+		(void) dataflash_waitready(spi);
+
+		if (status < 0) {
+			printk(KERN_ERR "%s: erase %x, err %d\n",
+				spi->dev.bus_id, pageaddr, status);
+			/* REVISIT:  can retry instr->retries times; or
+			 * giveup and instr->fail_addr = instr->addr;
+			 */
+			continue;
+		}
+
+		if (do_block) {
+			instr->addr += blocksize;
+			instr->len -= blocksize;
+		} else {
+			instr->addr += priv->page_size;
+			instr->len -= priv->page_size;
+		}
+	}
+	up(&priv->lock);
+
+	/* Inform MTD subsystem that erase is complete */
+	instr->state = MTD_ERASE_DONE;
+	mtd_erase_callback(instr);
+
+	return 0;
+}
+
+/*
+ * Read from the DataFlash device.
+ *   from   : Start offset in flash device
+ *   len    : Amount to read
+ *   retlen : About of data actually read
+ *   buf    : Buffer containing the data
+ */
+static int dataflash_read(struct mtd_info *mtd, loff_t from, size_t len,
+			       size_t *retlen, u_char *buf)
+{
+	struct dataflash	*priv = (struct dataflash *)mtd->priv;
+	struct spi_transfer	x[2] = { { .tx_dma = 0, }, };
+	struct spi_message	msg;
+	unsigned int		addr;
+	u8			*command;
+	int			status;
+
+	DEBUG(MTD_DEBUG_LEVEL2, "%s: read 0x%x..0x%x\n",
+		priv->spi->dev.bus_id, (unsigned)from, (unsigned)(from + len));
+
+	*retlen = 0;
+
+	/* Sanity checks */
+	if (!len)
+		return 0;
+	if (from + len > mtd->size)
+		return -EINVAL;
+
+	/* Calculate flash page/byte address */
+	addr = (((unsigned)from / priv->page_size) << priv->page_offset)
+		+ ((unsigned)from % priv->page_size);
+
+	command = priv->command;
+
+	DEBUG(MTD_DEBUG_LEVEL3, "READ: (%x) %x %x %x\n",
+		command[0], command[1], command[2], command[3]);
+
+	x[0].tx_buf = command;
+	x[0].len = 8;
+	x[1].rx_buf = buf;
+	x[1].len = len;
+	msg.transfers = x;
+	msg.n_transfer = 2;
+
+	down(&priv->lock);
+
+	/* Continuous read, max clock = f(car) which may be less than
+	 * the peak rate available.  Some chips support commands with
+	 * fewer "don't care" bytes.  Both buffers stay unchanged.
+	 */
+	command[0] = OP_READ_CONTINUOUS;
+	command[1] = (u8)(addr >> 16);
+	command[2] = (u8)(addr >> 8);
+	command[3] = (u8)(addr >> 0);
+	/* plus 4 "don't care" bytes */
+
+	status = spi_sync(priv->spi, &msg);
+	up(&priv->lock);
+
+	if (status >= 0) {
+		*retlen = msg.actual_length - 8;
+		status = 0;
+	} else
+		DEBUG(MTD_DEBUG_LEVEL1, "%s: read %x..%x --> %d\n",
+			priv->spi->dev.bus_id,
+			(unsigned)from, (unsigned)(from + len),
+			status);
+	return status;
+}
+
+/*
+ * Write to the DataFlash device.
+ *   to     : Start offset in flash device
+ *   len    : Amount to write
+ *   retlen : Amount of data actually written
+ *   buf    : Buffer containing the data
+ */
+static int dataflash_write(struct mtd_info *mtd, loff_t to, size_t len,
+				size_t * retlen, const u_char * buf)
+{
+	struct dataflash	*priv = (struct dataflash *)mtd->priv;
+	struct spi_device	*spi = priv->spi;
+	struct spi_transfer	x[2] = { { .tx_dma = 0, }, };
+	struct spi_message	msg;
+	unsigned int		pageaddr, addr, offset, writelen;
+	size_t			remaining = len;
+	u_char			*writebuf = (u_char *) buf;
+	int			status = -EINVAL;
+	u8			*command;
+
+	DEBUG(MTD_DEBUG_LEVEL2, "%s: write 0x%x..0x%x\n",
+		spi->dev.bus_id, (unsigned)to, (unsigned)(to + len));
+
+	*retlen = 0;
+
+	/* Sanity checks */
+	if (!len)
+		return 0;
+	if ((to + len) > mtd->size)
+		return -EINVAL;
+
+	x[0].tx_buf = command = priv->command;
+	x[0].len = 4;
+	msg.transfers = x;
+
+	pageaddr = ((unsigned)to / priv->page_size);
+	offset = ((unsigned)to % priv->page_size);
+	if (offset + len > priv->page_size)
+		writelen = priv->page_size - offset;
+	else
+		writelen = len;
+
+	down(&priv->lock);
+	while (remaining > 0) {
+		DEBUG(MTD_DEBUG_LEVEL3, "write @ %i:%i len=%i\n",
+			pageaddr, offset, writelen);
+
+		/* REVISIT:
+		 * (a) each page in a sector must be rewritten at least
+		 *     once every 10K sibling erase/program operations.
+		 * (b) for pages that are already erased, we could
+		 *     use WRITE+MWRITE not PROGRAM for ~30% speedup.
+		 * (c) WRITE to buffer could be done while waiting for
+		 *     a previous MWRITE/MWERASE to complete ...
+		 * (d) error handling here seems to be mostly missing.
+		 *
+		 * Two persistent bits per page, plus a per-sector counter,
+		 * could support (a) and (b) ... we might consider using
+		 * the second half of sector zero, which is just one block,
+		 * to track that state.  (On AT91, that sector should also
+		 * support boot-from-DataFlash.)
+		 */
+
+		addr = pageaddr << priv->page_offset;
+
+		/* (1) Maybe transfer partial page to Buffer1 */
+		if (writelen != priv->page_size) {
+			command[0] = OP_TRANSFER_BUF1;
+			command[1] = (addr & 0x00FF0000) >> 16;
+			command[2] = (addr & 0x0000FF00) >> 8;
+			command[3] = 0;
+
+			DEBUG(MTD_DEBUG_LEVEL3, "TRANSFER: (%x) %x %x %x\n",
+				command[0], command[1], command[2], command[3]);
+
+			msg.n_transfer = 1;
+			status = spi_sync(spi, &msg);
+			if (status < 0)
+				DEBUG(MTD_DEBUG_LEVEL1, "%s: xfer %u -> %d \n",
+					spi->dev.bus_id, addr, status);
+
+			(void) dataflash_waitready(priv->spi);
+		}
+
+		/* (2) Program full page via Buffer1 */
+		addr += offset;
+		command[0] = OP_PROGRAM_VIA_BUF1;
+		command[1] = (addr & 0x00FF0000) >> 16;
+		command[2] = (addr & 0x0000FF00) >> 8;
+		command[3] = (addr & 0x000000FF);
+
+		DEBUG(MTD_DEBUG_LEVEL3, "PROGRAM: (%x) %x %x %x\n",
+			command[0], command[1], command[2], command[3]);
+
+		x[1].tx_buf = writebuf;
+		x[1].len = writelen;
+		msg.n_transfer = 2;
+		status = spi_sync(spi, &msg);
+		if (status < 0)
+			DEBUG(MTD_DEBUG_LEVEL1, "%s: pgm %u/%u -> %d \n",
+				spi->dev.bus_id, addr, writelen, status);
+
+		(void) dataflash_waitready(priv->spi);
+
+#ifdef	CONFIG_DATAFLASH_WRITE_VERIFY
+
+		/* (3) Compare to Buffer1 */
+		addr = pageaddr << priv->page_offset;
+		command[0] = OP_COMPARE_BUF1;
+		command[1] = (addr & 0x00FF0000) >> 16;
+		command[2] = (addr & 0x0000FF00) >> 8;
+		command[3] = 0;
+
+		DEBUG(MTD_DEBUG_LEVEL3, "COMPARE: (%x) %x %x %x\n",
+			command[0], command[1], command[2], command[3]);
+
+		msg.n_transfer = 1;
+		status = spi_sync(spi, &msg);
+		if (status < 0)
+			DEBUG(MTD_DEBUG_LEVEL1, "%s: compare %u -> %d \n",
+				spi->dev.bus_id, addr, status);
+
+		status = dataflash_waitready(priv->spi);
+
+		/* Check result of the compare operation */
+		if ((status & (1 << 6)) == 1) {
+			printk(KERN_ERR "%s: compare page %u, err %d\n",
+				spi->dev.bus_id, pageaddr, status);
+			remaining = 0;
+			status = -EIO;
+			break;
+		} else
+			status = 0;
+
+#endif	/* CONFIG_DATAFLASH_WRITE_VERIFY */
+
+		remaining = remaining - writelen;
+		pageaddr++;
+		offset = 0;
+		writebuf += writelen;
+		*retlen += writelen;
+
+		if (remaining > priv->page_size)
+			writelen = priv->page_size;
+		else
+			writelen = remaining;
+	}
+	up(&priv->lock);
+
+	return status;
+}
+
+/* ......................................................................... */
+
+/*
+ * Register DataFlash device with MTD subsystem.
+ */
+static int __devinit
+add_dataflash(struct spi_device *spi, char *name,
+		int nr_pages, int pagesize, int pageoffset)
+{
+	struct dataflash		*priv;
+	struct mtd_info			*device;
+	struct flash_platform_data	*pdata = spi->dev.platform_data;
+
+	priv = (struct dataflash *) kzalloc(sizeof *priv, GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	init_MUTEX(&priv->lock);
+	priv->spi = spi;
+	priv->page_size = pagesize;
+	priv->page_offset = pageoffset;
+
+	/* name must be usable with cmdlinepart */
+	sprintf(priv->name, "spi%d.%d-%s",
+			spi->master->bus_num, spi->chip_select,
+			name);
+
+	device = &priv->mtd;
+	device->name = (pdata && pdata->name) ? pdata->name : priv->name;
+	device->size = nr_pages * pagesize;
+	device->erasesize = pagesize;
+	device->owner = THIS_MODULE;
+	device->type = MTD_DATAFLASH;
+	device->flags = MTD_CAP_NORFLASH;
+	device->erase = dataflash_erase;
+	device->read = dataflash_read;
+	device->write = dataflash_write;
+	device->priv = priv;
+
+	dev_info(&spi->dev, "%s (%d KBytes)\n", name, device->size/1024);
+	dev_set_drvdata(&spi->dev, priv);
+
+	if (mtd_has_partitions()) {
+		struct mtd_partition	*parts;
+		int			nr_parts = 0;
+
+#ifdef CONFIG_MTD_CMDLINE_PARTS
+		static const char *part_probes[] = { "cmdlinepart", NULL, };
+
+		nr_parts = parse_mtd_partitions(device, part_probes, &parts, 0);
+#endif
+
+		if (nr_parts <= 0 && pdata && pdata->parts) {
+			parts = pdata->parts;
+			nr_parts = pdata->nr_parts;
+		}
+
+		if (nr_parts > 0) {
+			priv->partitioned = 1;
+			return add_mtd_partitions(device, parts, nr_parts);
+		}
+	} else if (pdata->nr_parts)
+		dev_warn(&spi->dev, "ignoring %d default partitions on %s\n",
+				pdata->nr_parts, device->name);
+
+	return add_mtd_device(device) == 1 ? -ENODEV : 0;
+}
+
+/*
+ * Detect and initialize DataFlash device:
+ *
+ *   Device      Density         ID code          #Pages PageSize  Offset
+ *   AT45DB011B  1Mbit   (128K)  xx0011xx (0x0c)    512    264      9
+ *   AT45DB021B  2Mbit   (256K)  xx0101xx (0x14)   1025    264      9
+ *   AT45DB041B  4Mbit   (512K)  xx0111xx (0x1c)   2048    264      9
+ *   AT45DB081B  8Mbit   (1M)    xx1001xx (0x24)   4096    264      9
+ *   AT45DB0161B 16Mbit  (2M)    xx1011xx (0x2c)   4096    528     10
+ *   AT45DB0321B 32Mbit  (4M)    xx1101xx (0x34)   8192    528     10
+ *   AT45DB0642  64Mbit  (8M)    xx111xxx (0x3c)   8192   1056     11
+ *   AT45DB1282  128Mbit (16M)   xx0100xx (0x10)  16384   1056     11
+ */
+static int __devinit dataflash_probe(struct spi_device *spi)
+{
+	int status;
+
+	status = dataflash_status(spi);
+	if (status <= 0 || status == 0xff) {
+		DEBUG(MTD_DEBUG_LEVEL1, "%s: status error %d\n",
+				spi->dev.bus_id, status);
+		if (status == 0xff)
+			status = -ENODEV;
+		return status;
+	}
+
+	/* if there's a device there, assume it's dataflash.
+	 * board setup should have set spi->max_speed_max to
+	 * match f(car) for continuous reads, mode 0 or 3.
+	 */
+	switch (status & 0x3c) {
+	case 0x0c:	/* 0 0 1 1 x x */
+		status = add_dataflash(spi, "AT45DB011B", 512, 264, 9);
+		break;
+	case 0x14:	/* 0 1 0 1 x x */
+		status = add_dataflash(spi, "AT45DB021B", 1025, 264, 9);
+		break;
+	case 0x1c:	/* 0 1 1 1 x x */
+		status = add_dataflash(spi, "AT45DB041x", 2048, 264, 9);
+		break;
+	case 0x24:	/* 1 0 0 1 x x */
+		status = add_dataflash(spi, "AT45DB081B", 4096, 264, 9);
+		break;
+	case 0x2c:	/* 1 0 1 1 x x */
+		status = add_dataflash(spi, "AT45DB161x", 4096, 528, 10);
+		break;
+	case 0x34:	/* 1 1 0 1 x x */
+		status = add_dataflash(spi, "AT45DB321x", 8192, 528, 10);
+		break;
+	case 0x38:	/* 1 1 1 x x x */
+	case 0x3c:
+		status = add_dataflash(spi, "AT45DB642x", 8192, 1056, 11);
+		break;
+	/* obsolete AT45DB1282 not (yet?) supported */
+	default:
+		DEBUG(MTD_DEBUG_LEVEL1, "%s: unsupported device (%x)\n",
+				spi->dev.bus_id, status & 0x3c);
+		status = -ENODEV;
+	}
+
+	if (status < 0)
+		DEBUG(MTD_DEBUG_LEVEL1, "%s: add_dataflash --> %d\n",
+				spi->dev.bus_id, status);
+
+	return status;
+}
+
+static int __devexit dataflash_remove(struct spi_device *spi)
+{
+	struct dataflash	*flash = dev_get_drvdata(&spi->dev);
+	int			status;
+
+	DEBUG(MTD_DEBUG_LEVEL1, "%s: remove\n", spi->dev.bus_id);
+
+	if (mtd_has_partitions() && flash->partitioned)
+		status = del_mtd_partitions(&flash->mtd);
+	else
+		status = del_mtd_device(&flash->mtd);
+	if (status == 0)
+		kfree(flash);
+	return status;
+}
+
+static struct spi_driver dataflash_driver = {
+	.driver = {
+		.name		= "mtd_dataflash",
+		.bus		= &spi_bus_type,
+		.owner		= THIS_MODULE,
+	},
+
+	.probe		= dataflash_probe,
+	.remove		= __devexit_p(dataflash_remove),
+
+	/* FIXME:  investigate suspend and resume... */
+};
+
+static int __init dataflash_init(void)
+{
+	return spi_register_driver(&dataflash_driver);
+}
+module_init(dataflash_init);
+
+static void __exit dataflash_exit(void)
+{
+	spi_unregister_driver(&dataflash_driver);
+}
+module_exit(dataflash_exit);
+
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Andrew Victor, David Brownell");
+MODULE_DESCRIPTION("MTD DataFlash driver");
diff --git a/include/linux/spi/flash.h b/include/linux/spi/flash.h
new file mode 100644
index 000000000000..2ce6558bf3f8
--- /dev/null
+++ b/include/linux/spi/flash.h
@@ -0,0 +1,27 @@
+#ifndef LINUX_SPI_FLASH_H
+#define LINUX_SPI_FLASH_H
+
+struct mtd_partition;
+
+/**
+ * struct flash_platform_data: board-specific flash data
+ * @name: optional flash device name (eg, as used with mtdparts=)
+ * @parts: optional array of mtd_partitions for static partitioning
+ * @nr_parts: number of mtd_partitions for static partitoning
+ *
+ * Board init code (in arch/.../mach-xxx/board-yyy.c files) can
+ * provide information about SPI flash parts (such as DataFlash) to
+ * help set up the device and its appropriate default partitioning.
+ *
+ * Note that for DataFlash, sizes for pages, blocks, and sectors are
+ * rarely powers of two; and partitions should be sector-aligned.
+ */
+struct flash_platform_data {
+	char		*name;
+	struct mtd_partition *parts;
+	unsigned int	nr_parts;
+
+	/* we'll likely add more ... use JEDEC IDs, etc */
+};
+
+#endif
-- 
cgit v1.2.3-71-gd317


From b885244eb2628e0b8206e7edaaa6a314da78e9a4 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Sun, 8 Jan 2006 13:34:23 -0800
Subject: [PATCH] spi: add spi_driver to SPI framework

This is a refresh of the "Simple SPI Framework" found in 2.6.15-rc3-mm1
which makes the following changes:

  * There's now a "struct spi_driver".  This increase the footprint
    of the core a bit, since it now includes code to do what the driver
    core was previously handling directly.  Documentation and comments
    were updated to match.

  * spi_alloc_master() now does class_device_initialize(), so it can
    at least be refcounted before spi_register_master().  To match,
    spi_register_master() switched over to class_device_add().

  * States explicitly that after transfer errors, spi_devices will be
    deselected.  We want fault recovery procedures to work the same
    for all controller drivers.

  * Minor tweaks:  controller_data no longer points to readonly data;
    prevent some potential cast-from-null bugs with container_of calls;
    clarifies some existing kerneldoc,

And a few small cleanups.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/spi/spi-summary |  52 ++++++++++++-------
 drivers/spi/spi.c             | 118 ++++++++++++++++++++++++++++++------------
 include/linux/spi/spi.h       |  75 +++++++++++++++++++--------
 3 files changed, 170 insertions(+), 75 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/spi/spi-summary b/Documentation/spi/spi-summary
index 00497f95ca4b..c6152d1ff2b0 100644
--- a/Documentation/spi/spi-summary
+++ b/Documentation/spi/spi-summary
@@ -1,18 +1,19 @@
 Overview of Linux kernel SPI support
 ====================================
 
-22-Nov-2005
+02-Dec-2005
 
 What is SPI?
 ------------
-The "Serial Peripheral Interface" (SPI) is a four-wire point-to-point
-serial link used to connect microcontrollers to sensors and memory.
+The "Serial Peripheral Interface" (SPI) is a synchronous four wire serial
+link used to connect microcontrollers to sensors, memory, and peripherals.
 
 The three signal wires hold a clock (SCLK, often on the order of 10 MHz),
 and parallel data lines with "Master Out, Slave In" (MOSI) or "Master In,
 Slave Out" (MISO) signals.  (Other names are also used.)  There are four
 clocking modes through which data is exchanged; mode-0 and mode-3 are most
-commonly used.
+commonly used.  Each clock cycle shifts data out and data in; the clock
+doesn't cycle except when there is data to shift.
 
 SPI masters may use a "chip select" line to activate a given SPI slave
 device, so those three signal wires may be connected to several chips
@@ -79,11 +80,18 @@ The <linux/spi/spi.h> header file includes kerneldoc, as does the
 main source code, and you should certainly read that.  This is just
 an overview, so you get the big picture before the details.
 
+SPI requests always go into I/O queues.  Requests for a given SPI device
+are always executed in FIFO order, and complete asynchronously through
+completion callbacks.  There are also some simple synchronous wrappers
+for those calls, including ones for common transaction types like writing
+a command and then reading its response.
+
 There are two types of SPI driver, here called:
 
   Controller drivers ... these are often built in to System-On-Chip
 	processors, and often support both Master and Slave roles.
 	These drivers touch hardware registers and may use DMA.
+	Or they can be PIO bitbangers, needing just GPIO pins.
 
   Protocol drivers ... these pass messages through the controller
 	driver to communicate with a Slave or Master device on the
@@ -116,11 +124,6 @@ shows up in sysfs in several locations:
 	managing bus "B".  All the spiB.* devices share the same
 	physical SPI bus segment, with SCLK, MOSI, and MISO.
 
-The basic I/O primitive submits an asynchronous message to an I/O queue
-maintained by the controller driver.  A completion callback is issued
-asynchronously when the data transfer(s) in that message completes.
-There are also some simple synchronous wrappers for those calls.
-
 
 How does board-specific init code declare SPI devices?
 ------------------------------------------------------
@@ -263,33 +266,40 @@ would just be another kernel driver, probably offering some lowlevel
 access through aio_read(), aio_write(), and ioctl() calls and using the
 standard userspace sysfs mechanisms to bind to a given SPI device.
 
-SPI protocol drivers are normal device drivers, with no more wrapper
-than needed by platform devices:
+SPI protocol drivers somewhat resemble platform device drivers:
+
+	static struct spi_driver CHIP_driver = {
+		.driver = {
+			.name		= "CHIP",
+			.bus		= &spi_bus_type,
+			.owner		= THIS_MODULE,
+		},
 
-	static struct device_driver CHIP_driver = {
-		.name		= "CHIP",
-		.bus		= &spi_bus_type,
 		.probe		= CHIP_probe,
-		.remove		= __exit_p(CHIP_remove),
+		.remove		= __devexit_p(CHIP_remove),
 		.suspend	= CHIP_suspend,
 		.resume		= CHIP_resume,
 	};
 
-The SPI core will autmatically attempt to bind this driver to any SPI
+The driver core will autmatically attempt to bind this driver to any SPI
 device whose board_info gave a modalias of "CHIP".  Your probe() code
 might look like this unless you're creating a class_device:
 
-	static int __init CHIP_probe(struct device *dev)
+	static int __devinit CHIP_probe(struct spi_device *spi)
 	{
-		struct spi_device		*spi = to_spi_device(dev);
 		struct CHIP			*chip;
-		struct CHIP_platform_data	*pdata = dev->platform_data;
+		struct CHIP_platform_data	*pdata;
+
+		/* assuming the driver requires board-specific data: */
+		pdata = &spi->dev.platform_data;
+		if (!pdata)
+			return -ENODEV;
 
 		/* get memory for driver's per-chip state */
 		chip = kzalloc(sizeof *chip, GFP_KERNEL);
 		if (!chip)
 			return -ENOMEM;
-		dev_set_drvdata(dev, chip);
+		dev_set_drvdata(&spi->dev, chip);
 
 		... etc
 		return 0;
@@ -328,6 +338,8 @@ the driver guarantees that it won't submit any more such messages.
   - The basic I/O primitive is spi_async().  Async requests may be
     issued in any context (irq handler, task, etc) and completion
     is reported using a callback provided with the message.
+    After any detected error, the chip is deselected and processing
+    of that spi_message is aborted.
 
   - There are also synchronous wrappers like spi_sync(), and wrappers
     like spi_read(), spi_write(), and spi_write_then_read().  These
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 7cd356b17644..2ecb86cb3689 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -26,13 +26,9 @@
 #include <linux/spi/spi.h>
 
 
-/* SPI bustype and spi_master class are registered during early boot,
- * usually before board init code provides the SPI device tables, and
- * are available later when driver init code needs them.
- *
- * Drivers for SPI devices started out like those for platform bus
- * devices.  But both have changed in 2.6.15; maybe this should get
- * an "spi_driver" structure at some point (not currently needed)
+/* SPI bustype and spi_master class are registered after board init code
+ * provides the SPI device tables, ensuring that both are present by the
+ * time controller driver registration causes spi_devices to "enumerate".
  */
 static void spidev_release(struct device *dev)
 {
@@ -83,10 +79,7 @@ static int spi_uevent(struct device *dev, char **envp, int num_envp,
 
 #ifdef	CONFIG_PM
 
-/* Suspend/resume in "struct device_driver" don't really need that
- * strange third parameter, so we just make it a constant and expect
- * SPI drivers to ignore it just like most platform drivers do.
- *
+/*
  * NOTE:  the suspend() method for an spi_master controller driver
  * should verify that all its child devices are marked as suspended;
  * suspend requests delivered through sysfs power/state files don't
@@ -94,13 +87,14 @@ static int spi_uevent(struct device *dev, char **envp, int num_envp,
  */
 static int spi_suspend(struct device *dev, pm_message_t message)
 {
-	int	value;
+	int			value;
+	struct spi_driver	*drv = to_spi_driver(dev->driver);
 
-	if (!dev->driver || !dev->driver->suspend)
+	if (!drv || !drv->suspend)
 		return 0;
 
 	/* suspend will stop irqs and dma; no more i/o */
-	value = dev->driver->suspend(dev, message);
+	value = drv->suspend(to_spi_device(dev), message);
 	if (value == 0)
 		dev->power.power_state = message;
 	return value;
@@ -108,13 +102,14 @@ static int spi_suspend(struct device *dev, pm_message_t message)
 
 static int spi_resume(struct device *dev)
 {
-	int	value;
+	int			value;
+	struct spi_driver	*drv = to_spi_driver(dev->driver);
 
-	if (!dev->driver || !dev->driver->resume)
+	if (!drv || !drv->resume)
 		return 0;
 
 	/* resume may restart the i/o queue */
-	value = dev->driver->resume(dev);
+	value = drv->resume(to_spi_device(dev));
 	if (value == 0)
 		dev->power.power_state = PMSG_ON;
 	return value;
@@ -135,6 +130,41 @@ struct bus_type spi_bus_type = {
 };
 EXPORT_SYMBOL_GPL(spi_bus_type);
 
+
+static int spi_drv_probe(struct device *dev)
+{
+	const struct spi_driver		*sdrv = to_spi_driver(dev->driver);
+
+	return sdrv->probe(to_spi_device(dev));
+}
+
+static int spi_drv_remove(struct device *dev)
+{
+	const struct spi_driver		*sdrv = to_spi_driver(dev->driver);
+
+	return sdrv->remove(to_spi_device(dev));
+}
+
+static void spi_drv_shutdown(struct device *dev)
+{
+	const struct spi_driver		*sdrv = to_spi_driver(dev->driver);
+
+	sdrv->shutdown(to_spi_device(dev));
+}
+
+int spi_register_driver(struct spi_driver *sdrv)
+{
+	sdrv->driver.bus = &spi_bus_type;
+	if (sdrv->probe)
+		sdrv->driver.probe = spi_drv_probe;
+	if (sdrv->remove)
+		sdrv->driver.remove = spi_drv_remove;
+	if (sdrv->shutdown)
+		sdrv->driver.shutdown = spi_drv_shutdown;
+	return driver_register(&sdrv->driver);
+}
+EXPORT_SYMBOL_GPL(spi_register_driver);
+
 /*-------------------------------------------------------------------------*/
 
 /* SPI devices should normally not be created by SPI device drivers; that
@@ -208,13 +238,15 @@ spi_new_device(struct spi_master *master, struct spi_board_info *chip)
 	if (status < 0) {
 		dev_dbg(dev, "can't %s %s, status %d\n",
 				"add", proxy->dev.bus_id, status);
-fail:
-		class_device_put(&master->cdev);
-		kfree(proxy);
-		return NULL;
+		goto fail;
 	}
 	dev_dbg(dev, "registered child %s\n", proxy->dev.bus_id);
 	return proxy;
+
+fail:
+	class_device_put(&master->cdev);
+	kfree(proxy);
+	return NULL;
 }
 EXPORT_SYMBOL_GPL(spi_new_device);
 
@@ -237,11 +269,11 @@ spi_register_board_info(struct spi_board_info const *info, unsigned n)
 {
 	struct boardinfo	*bi;
 
-	bi = kmalloc (sizeof (*bi) + n * sizeof (*info), GFP_KERNEL);
+	bi = kmalloc(sizeof(*bi) + n * sizeof *info, GFP_KERNEL);
 	if (!bi)
 		return -ENOMEM;
 	bi->n_board_info = n;
-	memcpy(bi->board_info, info, n * sizeof (*info));
+	memcpy(bi->board_info, info, n * sizeof *info);
 
 	down(&board_lock);
 	list_add_tail(&bi->list, &board_list);
@@ -330,6 +362,7 @@ spi_alloc_master(struct device *dev, unsigned size)
 	if (!master)
 		return NULL;
 
+	class_device_initialize(&master->cdev);
 	master->cdev.class = &spi_master_class;
 	master->cdev.dev = get_device(dev);
 	class_set_devdata(&master->cdev, &master[1]);
@@ -366,7 +399,7 @@ spi_register_master(struct spi_master *master)
 	/* convention:  dynamically assigned bus IDs count down from the max */
 	if (master->bus_num == 0) {
 		master->bus_num = atomic_dec_return(&dyn_bus_id);
-		dynamic = 0;
+		dynamic = 1;
 	}
 
 	/* register the device, then userspace will see it.
@@ -374,11 +407,9 @@ spi_register_master(struct spi_master *master)
 	 */
 	snprintf(master->cdev.class_id, sizeof master->cdev.class_id,
 		"spi%u", master->bus_num);
-	status = class_device_register(&master->cdev);
-	if (status < 0) {
-		class_device_put(&master->cdev);
+	status = class_device_add(&master->cdev);
+	if (status < 0)
 		goto done;
-	}
 	dev_dbg(dev, "registered master %s%s\n", master->cdev.class_id,
 			dynamic ? " (dynamic)" : "");
 
@@ -491,6 +522,7 @@ static u8	*buf;
  * This performs a half duplex MicroWire style transaction with the
  * device, sending txbuf and then reading rxbuf.  The return value
  * is zero for success, else a negative errno status code.
+ * This call may only be used from a context that may sleep.
  *
  * Parameters to this routine are always copied using a small buffer,
  * large transfers should use use spi_{async,sync}() calls with
@@ -553,16 +585,38 @@ EXPORT_SYMBOL_GPL(spi_write_then_read);
 
 static int __init spi_init(void)
 {
+	int	status;
+
 	buf = kmalloc(SPI_BUFSIZ, SLAB_KERNEL);
-	if (!buf)
-		return -ENOMEM;
+	if (!buf) {
+		status = -ENOMEM;
+		goto err0;
+	}
+
+	status = bus_register(&spi_bus_type);
+	if (status < 0)
+		goto err1;
 
-	bus_register(&spi_bus_type);
-	class_register(&spi_master_class);
+	status = class_register(&spi_master_class);
+	if (status < 0)
+		goto err2;
 	return 0;
+
+err2:
+	bus_unregister(&spi_bus_type);
+err1:
+	kfree(buf);
+	buf = NULL;
+err0:
+	return status;
 }
+
 /* board_info is normally registered in arch_initcall(),
  * but even essential drivers wait till later
+ *
+ * REVISIT only boardinfo really needs static linking. the rest (device and
+ * driver registration) _could_ be dynamically linked (modular) ... costs
+ * include needing to have boardinfo data structures be much more public.
  */
 subsys_initcall(spi_init);
 
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 51a6769114df..c851b3d13208 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -20,13 +20,8 @@
 #define __LINUX_SPI_H
 
 /*
- * INTERFACES between SPI master drivers and infrastructure
+ * INTERFACES between SPI master-side drivers and SPI infrastructure.
  * (There's no SPI slave support for Linux yet...)
- *
- * A "struct device_driver" for an spi_device uses "spi_bus_type" and
- * needs no special API wrappers (much like platform_bus).  These drivers
- * are bound to devices based on their names (much like platform_bus),
- * and are available in dev->driver.
  */
 extern struct bus_type spi_bus_type;
 
@@ -46,8 +41,8 @@ extern struct bus_type spi_bus_type;
  * @irq: Negative, or the number passed to request_irq() to receive
  * 	interrupts from this device.
  * @controller_state: Controller's runtime state
- * @controller_data: Static board-specific definitions for controller, such
- * 	as FIFO initialization parameters; from board_info.controller_data
+ * @controller_data: Board-specific definitions for controller, such as
+ * 	FIFO initialization parameters; from board_info.controller_data
  *
  * An spi_device is used to interchange data between an SPI slave
  * (usually a discrete chip) and CPU memory.
@@ -63,31 +58,32 @@ struct spi_device {
 	u32			max_speed_hz;
 	u8			chip_select;
 	u8			mode;
-#define	SPI_CPHA	0x01		/* clock phase */
-#define	SPI_CPOL	0x02		/* clock polarity */
+#define	SPI_CPHA	0x01			/* clock phase */
+#define	SPI_CPOL	0x02			/* clock polarity */
 #define	SPI_MODE_0	(0|0)
-#define	SPI_MODE_1	(0|SPI_CPHA)
+#define	SPI_MODE_1	(0|SPI_CPHA)		/* (original MicroWire) */
 #define	SPI_MODE_2	(SPI_CPOL|0)
 #define	SPI_MODE_3	(SPI_CPOL|SPI_CPHA)
-#define	SPI_CS_HIGH	0x04		/* chipselect active high? */
+#define	SPI_CS_HIGH	0x04			/* chipselect active high? */
 	u8			bits_per_word;
 	int			irq;
 	void			*controller_state;
-	const void		*controller_data;
+	void			*controller_data;
 	const char		*modalias;
 
 	// likely need more hooks for more protocol options affecting how
-	// the controller talks to its chips, like:
+	// the controller talks to each chip, like:
 	//  - bit order (default is wordwise msb-first)
 	//  - memory packing (12 bit samples into low bits, others zeroed)
 	//  - priority
+	//  - drop chipselect after each word
 	//  - chipselect delays
 	//  - ...
 };
 
 static inline struct spi_device *to_spi_device(struct device *dev)
 {
-	return container_of(dev, struct spi_device, dev);
+	return dev ? container_of(dev, struct spi_device, dev) : NULL;
 }
 
 /* most drivers won't need to care about device refcounting */
@@ -117,12 +113,38 @@ static inline void spi_set_ctldata(struct spi_device *spi, void *state)
 struct spi_message;
 
 
+
+struct spi_driver {
+	int			(*probe)(struct spi_device *spi);
+	int			(*remove)(struct spi_device *spi);
+	void			(*shutdown)(struct spi_device *spi);
+	int			(*suspend)(struct spi_device *spi, pm_message_t mesg);
+	int			(*resume)(struct spi_device *spi);
+	struct device_driver	driver;
+};
+
+static inline struct spi_driver *to_spi_driver(struct device_driver *drv)
+{
+	return drv ? container_of(drv, struct spi_driver, driver) : NULL;
+}
+
+extern int spi_register_driver(struct spi_driver *sdrv);
+
+static inline void spi_unregister_driver(struct spi_driver *sdrv)
+{
+	if (!sdrv)
+		return;
+	driver_unregister(&sdrv->driver);
+}
+
+
+
 /**
  * struct spi_master - interface to SPI master controller
  * @cdev: class interface to this driver
  * @bus_num: board-specific (and often SOC-specific) identifier for a
  * 	given SPI controller.
- * @num_chipselects: chipselects are used to distinguish individual
+ * @num_chipselect: chipselects are used to distinguish individual
  * 	SPI slaves, and are numbered from zero to num_chipselects.
  * 	each slave has a chipselect signal, but it's common that not
  * 	every chipselect is connected to a slave.
@@ -275,7 +297,8 @@ struct spi_transfer {
  *	addresses for each transfer buffer
  * @complete: called to report transaction completions
  * @context: the argument to complete() when it's called
- * @actual_length: how many bytes were transferd
+ * @actual_length: the total number of bytes that were transferred in all
+ *	successful segments
  * @status: zero for success, else negative errno
  * @queue: for use by whichever driver currently owns the message
  * @state: for use by whichever driver currently owns the message
@@ -295,7 +318,7 @@ struct spi_message {
 	 *
 	 * Some controller drivers (message-at-a-time queue processing)
 	 * could provide that as their default scheduling algorithm.  But
-	 * others (with multi-message pipelines) would need a flag to
+	 * others (with multi-message pipelines) could need a flag to
 	 * tell them about such special cases.
 	 */
 
@@ -346,6 +369,13 @@ spi_setup(struct spi_device *spi)
  * FIFO order, messages may go to different devices in other orders.
  * Some device might be higher priority, or have various "hard" access
  * time requirements, for example.
+ *
+ * On detection of any fault during the transfer, processing of
+ * the entire message is aborted, and the device is deselected.
+ * Until returning from the associated message completion callback,
+ * no other spi_message queued to that device will be processed.
+ * (This rule applies equally to all the synchronous transfer calls,
+ * which are wrappers around this core asynchronous primitive.)
  */
 static inline int
 spi_async(struct spi_device *spi, struct spi_message *message)
@@ -484,12 +514,12 @@ struct spi_board_info {
 	 * "modalias" is normally the driver name.
 	 *
 	 * platform_data goes to spi_device.dev.platform_data,
-	 * controller_data goes to spi_device.platform_data,
+	 * controller_data goes to spi_device.controller_data,
 	 * irq is copied too
 	 */
 	char		modalias[KOBJ_NAME_LEN];
 	const void	*platform_data;
-	const void	*controller_data;
+	void		*controller_data;
 	int		irq;
 
 	/* slower signaling on noisy or low voltage boards */
@@ -525,9 +555,8 @@ spi_register_board_info(struct spi_board_info const *info, unsigned n)
 
 
 /* If you're hotplugging an adapter with devices (parport, usb, etc)
- * use spi_new_device() to describe each device.  You can also call
- * spi_unregister_device() to get start making that device vanish,
- * but normally that would be handled by spi_unregister_master().
+ * use spi_new_device() to describe each device.  You would then call
+ * spi_unregister_device() to start making that device vanish.
  */
 extern struct spi_device *
 spi_new_device(struct spi_master *, struct spi_board_info *);
-- 
cgit v1.2.3-71-gd317


From 0c868461fcb8413cb9f691d68e5b99b0fd3c0737 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Sun, 8 Jan 2006 13:34:25 -0800
Subject: [PATCH] SPI core tweaks, bugfix

This includes various updates to the SPI core:

  - Fixes a driver model refcount bug in spi_unregister_master() paths.

  - The spi_master structures now have wrappers which help keep drivers
    from needing class-level get/put for device data or for refcounts.

  - Check for a few setup errors that would cause oopsing later.

  - Docs say more about memory management.  Highlights the use of DMA-safe
    i/o buffers, and zero-initializing spi_message and such metadata.

  - Provide a simple alloc/free for spi_message and its spi_transfer;
    this is only one of the possible memory management policies.

Nothing to break code that already works.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/spi/spi-summary | 16 +++++++++
 drivers/spi/spi.c             | 45 ++++++++++++++++----------
 include/linux/spi/spi.h       | 75 +++++++++++++++++++++++++++++++++++++++----
 3 files changed, 113 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/spi/spi-summary b/Documentation/spi/spi-summary
index c6152d1ff2b0..761debf748e9 100644
--- a/Documentation/spi/spi-summary
+++ b/Documentation/spi/spi-summary
@@ -363,6 +363,22 @@ upper boundaries might include sysfs (especially for sensor readings),
 the input layer, ALSA, networking, MTD, the character device framework,
 or other Linux subsystems.
 
+Note that there are two types of memory your driver must manage as part
+of interacting with SPI devices.
+
+  - I/O buffers use the usual Linux rules, and must be DMA-safe.
+    You'd normally allocate them from the heap or free page pool.
+    Don't use the stack, or anything that's declared "static".
+
+  - The spi_message and spi_transfer metadata used to glue those
+    I/O buffers into a group of protocol transactions.  These can
+    be allocated anywhere it's convenient, including as part of
+    other allocate-once driver data structures.  Zero-init these.
+
+If you like, spi_message_alloc() and spi_message_free() convenience
+routines are available to allocate and zero-initialize an spi_message
+with several transfers.
+
 
 How do I write an "SPI Master Controller Driver"?
 -------------------------------------------------
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 2ecb86cb3689..3ecedccdb96c 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -38,7 +38,7 @@ static void spidev_release(struct device *dev)
 	if (spi->master->cleanup)
 		spi->master->cleanup(spi);
 
-	class_device_put(&spi->master->cdev);
+	spi_master_put(spi->master);
 	kfree(dev);
 }
 
@@ -90,7 +90,7 @@ static int spi_suspend(struct device *dev, pm_message_t message)
 	int			value;
 	struct spi_driver	*drv = to_spi_driver(dev->driver);
 
-	if (!drv || !drv->suspend)
+	if (!drv->suspend)
 		return 0;
 
 	/* suspend will stop irqs and dma; no more i/o */
@@ -105,7 +105,7 @@ static int spi_resume(struct device *dev)
 	int			value;
 	struct spi_driver	*drv = to_spi_driver(dev->driver);
 
-	if (!drv || !drv->resume)
+	if (!drv->resume)
 		return 0;
 
 	/* resume may restart the i/o queue */
@@ -198,7 +198,7 @@ spi_new_device(struct spi_master *master, struct spi_board_info *chip)
 
 	/* NOTE:  caller did any chip->bus_num checks necessary */
 
-	if (!class_device_get(&master->cdev))
+	if (!spi_master_get(master))
 		return NULL;
 
 	proxy = kzalloc(sizeof *proxy, GFP_KERNEL);
@@ -244,7 +244,7 @@ spi_new_device(struct spi_master *master, struct spi_board_info *chip)
 	return proxy;
 
 fail:
-	class_device_put(&master->cdev);
+	spi_master_put(master);
 	kfree(proxy);
 	return NULL;
 }
@@ -324,8 +324,6 @@ static void spi_master_release(struct class_device *cdev)
 	struct spi_master *master;
 
 	master = container_of(cdev, struct spi_master, cdev);
-	put_device(master->cdev.dev);
-	master->cdev.dev = NULL;
 	kfree(master);
 }
 
@@ -339,8 +337,9 @@ static struct class spi_master_class = {
 /**
  * spi_alloc_master - allocate SPI master controller
  * @dev: the controller, possibly using the platform_bus
- * @size: how much driver-private data to preallocate; a pointer to this
- * 	memory in the class_data field of the returned class_device
+ * @size: how much driver-private data to preallocate; the pointer to this
+ * 	memory is in the class_data field of the returned class_device,
+ *	accessible with spi_master_get_devdata().
  *
  * This call is used only by SPI master controller drivers, which are the
  * only ones directly touching chip registers.  It's how they allocate
@@ -350,14 +349,17 @@ static struct class spi_master_class = {
  * master structure on success, else NULL.
  *
  * The caller is responsible for assigning the bus number and initializing
- * the master's methods before calling spi_add_master(), or else (on error)
- * calling class_device_put() to prevent a memory leak.
+ * the master's methods before calling spi_add_master(); and (after errors
+ * adding the device) calling spi_master_put() to prevent a memory leak.
  */
 struct spi_master * __init_or_module
 spi_alloc_master(struct device *dev, unsigned size)
 {
 	struct spi_master	*master;
 
+	if (!dev)
+		return NULL;
+
 	master = kzalloc(size + sizeof *master, SLAB_KERNEL);
 	if (!master)
 		return NULL;
@@ -365,7 +367,7 @@ spi_alloc_master(struct device *dev, unsigned size)
 	class_device_initialize(&master->cdev);
 	master->cdev.class = &spi_master_class;
 	master->cdev.dev = get_device(dev);
-	class_set_devdata(&master->cdev, &master[1]);
+	spi_master_set_devdata(master, &master[1]);
 
 	return master;
 }
@@ -387,6 +389,8 @@ EXPORT_SYMBOL_GPL(spi_alloc_master);
  *
  * This must be called from context that can sleep.  It returns zero on
  * success, else a negative error code (dropping the master's refcount).
+ * After a successful return, the caller is responsible for calling
+ * spi_unregister_master().
  */
 int __init_or_module
 spi_register_master(struct spi_master *master)
@@ -396,6 +400,9 @@ spi_register_master(struct spi_master *master)
 	int			status = -ENODEV;
 	int			dynamic = 0;
 
+	if (!dev)
+		return -ENODEV;
+
 	/* convention:  dynamically assigned bus IDs count down from the max */
 	if (master->bus_num == 0) {
 		master->bus_num = atomic_dec_return(&dyn_bus_id);
@@ -425,7 +432,7 @@ EXPORT_SYMBOL_GPL(spi_register_master);
 static int __unregister(struct device *dev, void *unused)
 {
 	/* note: before about 2.6.14-rc1 this would corrupt memory: */
-	device_unregister(dev);
+	spi_unregister_device(to_spi_device(dev));
 	return 0;
 }
 
@@ -440,8 +447,9 @@ static int __unregister(struct device *dev, void *unused)
  */
 void spi_unregister_master(struct spi_master *master)
 {
-	class_device_unregister(&master->cdev);
 	(void) device_for_each_child(master->cdev.dev, NULL, __unregister);
+	class_device_unregister(&master->cdev);
+	master->cdev.dev = NULL;
 }
 EXPORT_SYMBOL_GPL(spi_unregister_master);
 
@@ -487,6 +495,9 @@ EXPORT_SYMBOL_GPL(spi_busnum_to_master);
  * by leaving it selected in anticipation that the next message will go
  * to the same chip.  (That may increase power usage.)
  *
+ * Also, the caller is guaranteeing that the memory associated with the
+ * message will not be freed before this call returns.
+ *
  * The return value is a negative error code if the message could not be
  * submitted, else zero.  When the value is zero, then message->status is
  * also defined:  it's the completion code for the transfer, either zero
@@ -524,9 +535,9 @@ static u8	*buf;
  * is zero for success, else a negative errno status code.
  * This call may only be used from a context that may sleep.
  *
- * Parameters to this routine are always copied using a small buffer,
- * large transfers should use use spi_{async,sync}() calls with
- * dma-safe buffers.
+ * Parameters to this routine are always copied using a small buffer;
+ * performance-sensitive or bulk transfer code should instead use
+ * spi_{async,sync}() calls with dma-safe buffers.
  */
 int spi_write_then_read(struct spi_device *spi,
 		const u8 *txbuf, unsigned n_tx,
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index c851b3d13208..6a41e2650b2e 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -60,8 +60,8 @@ struct spi_device {
 	u8			mode;
 #define	SPI_CPHA	0x01			/* clock phase */
 #define	SPI_CPOL	0x02			/* clock polarity */
-#define	SPI_MODE_0	(0|0)
-#define	SPI_MODE_1	(0|SPI_CPHA)		/* (original MicroWire) */
+#define	SPI_MODE_0	(0|0)			/* (original MicroWire) */
+#define	SPI_MODE_1	(0|SPI_CPHA)
 #define	SPI_MODE_2	(SPI_CPOL|0)
 #define	SPI_MODE_3	(SPI_CPOL|SPI_CPHA)
 #define	SPI_CS_HIGH	0x04			/* chipselect active high? */
@@ -209,6 +209,30 @@ struct spi_master {
 	void			(*cleanup)(const struct spi_device *spi);
 };
 
+static inline void *spi_master_get_devdata(struct spi_master *master)
+{
+	return class_get_devdata(&master->cdev);
+}
+
+static inline void spi_master_set_devdata(struct spi_master *master, void *data)
+{
+	class_set_devdata(&master->cdev, data);
+}
+
+static inline struct spi_master *spi_master_get(struct spi_master *master)
+{
+	if (!master || !class_device_get(&master->cdev))
+		return NULL;
+	return master;
+}
+
+static inline void spi_master_put(struct spi_master *master)
+{
+	if (master)
+		class_device_put(&master->cdev);
+}
+
+
 /* the spi driver core manages memory for the spi_master classdev */
 extern struct spi_master *
 spi_alloc_master(struct device *host, unsigned size);
@@ -271,11 +295,17 @@ extern struct spi_master *spi_busnum_to_master(u16 busnum);
  * stay selected until the next transfer.  This is purely a performance
  * hint; the controller driver may need to select a different device
  * for the next message.
+ *
+ * The code that submits an spi_message (and its spi_transfers)
+ * to the lower layers is responsible for managing its memory.
+ * Zero-initialize every field you don't set up explicitly, to
+ * insulate against future API updates.
  */
 struct spi_transfer {
 	/* it's ok if tx_buf == rx_buf (right?)
 	 * for MicroWire, one buffer must be null
-	 * buffers must work with dma_*map_single() calls
+	 * buffers must work with dma_*map_single() calls, unless
+	 *   spi_message.is_dma_mapped reports a pre-existing mapping
 	 */
 	const void	*tx_buf;
 	void		*rx_buf;
@@ -302,6 +332,11 @@ struct spi_transfer {
  * @status: zero for success, else negative errno
  * @queue: for use by whichever driver currently owns the message
  * @state: for use by whichever driver currently owns the message
+ *
+ * The code that submits an spi_message (and its spi_transfers)
+ * to the lower layers is responsible for managing its memory.
+ * Zero-initialize every field you don't set up explicitly, to
+ * insulate against future API updates.
  */
 struct spi_message {
 	struct spi_transfer	*transfers;
@@ -336,6 +371,29 @@ struct spi_message {
 	void			*state;
 };
 
+/* It's fine to embed message and transaction structures in other data
+ * structures so long as you don't free them while they're in use.
+ */
+
+static inline struct spi_message *spi_message_alloc(unsigned ntrans, gfp_t flags)
+{
+	struct spi_message *m;
+
+	m = kzalloc(sizeof(struct spi_message)
+			+ ntrans * sizeof(struct spi_transfer),
+			flags);
+	if (m) {
+		m->transfers = (void *)(m + 1);
+		m->n_transfer = ntrans;
+	}
+	return m;
+}
+
+static inline void spi_message_free(struct spi_message *m)
+{
+	kfree(m);
+}
+
 /**
  * spi_setup -- setup SPI mode and clock rate
  * @spi: the device whose settings are being modified
@@ -363,7 +421,10 @@ spi_setup(struct spi_device *spi)
  * The completion callback is invoked in a context which can't sleep.
  * Before that invocation, the value of message->status is undefined.
  * When the callback is issued, message->status holds either zero (to
- * indicate complete success) or a negative error code.
+ * indicate complete success) or a negative error code.  After that
+ * callback returns, the driver which issued the transfer request may
+ * deallocate the associated memory; it's no longer in use by any SPI
+ * core or controller driver code.
  *
  * Note that although all messages to a spi_device are handled in
  * FIFO order, messages may go to different devices in other orders.
@@ -445,6 +506,7 @@ spi_read(struct spi_device *spi, u8 *buf, size_t len)
 	return spi_sync(spi, &m);
 }
 
+/* this copies txbuf and rxbuf data; for small transfers only! */
 extern int spi_write_then_read(struct spi_device *spi,
 		const u8 *txbuf, unsigned n_tx,
 		u8 *rxbuf, unsigned n_rx);
@@ -555,8 +617,9 @@ spi_register_board_info(struct spi_board_info const *info, unsigned n)
 
 
 /* If you're hotplugging an adapter with devices (parport, usb, etc)
- * use spi_new_device() to describe each device.  You would then call
- * spi_unregister_device() to start making that device vanish.
+ * use spi_new_device() to describe each device.  You can also call
+ * spi_unregister_device() to start making that device vanish, but
+ * normally that would be handled by spi_unregister_master().
  */
 extern struct spi_device *
 spi_new_device(struct spi_master *, struct spi_board_info *);
-- 
cgit v1.2.3-71-gd317


From 2e5a7bd978bf4118a0c8edf2e6ff81d0a72fee47 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Sun, 8 Jan 2006 13:34:25 -0800
Subject: [PATCH] spi: ads7836 uses spi_driver

This updates the ads7864 driver to use the new "spi_driver" struct, and
includes some minor unrelated cleanup.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/input/touchscreen/ads7846.c | 84 ++++++++++++++++++-------------------
 include/linux/spi/ads7846.h         |  2 +-
 2 files changed, 43 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c
index 24ff6c5a4021..c741776ef3bf 100644
--- a/drivers/input/touchscreen/ads7846.c
+++ b/drivers/input/touchscreen/ads7846.c
@@ -345,19 +345,15 @@ static irqreturn_t ads7846_irq(int irq, void *handle, struct pt_regs *regs)
 
 /*--------------------------------------------------------------------------*/
 
-/* non-empty "extra" is needed before 2.6.14-git5 or so */
-#define	EXTRA	//	, u32 level
-#define	EXTRA2	//	, 0
-
 static int
-ads7846_suspend(struct device *dev, pm_message_t message EXTRA)
+ads7846_suspend(struct spi_device *spi, pm_message_t message)
 {
-	struct ads7846 *ts = dev_get_drvdata(dev);
+	struct ads7846 *ts = dev_get_drvdata(&spi->dev);
 	unsigned long	flags;
 
 	spin_lock_irqsave(&ts->lock, flags);
 
-	ts->spi->dev.power.power_state = message;
+	spi->dev.power.power_state = message;
 
 	/* are we waiting for IRQ, or polling? */
 	if (!ts->pendown) {
@@ -387,36 +383,35 @@ ads7846_suspend(struct device *dev, pm_message_t message EXTRA)
 	return 0;
 }
 
-static int ads7846_resume(struct device *dev EXTRA)
+static int ads7846_resume(struct spi_device *spi)
 {
-	struct ads7846 *ts = dev_get_drvdata(dev);
+	struct ads7846 *ts = dev_get_drvdata(&spi->dev);
 
 	ts->irq_disabled = 0;
 	enable_irq(ts->spi->irq);
-	dev->power.power_state = PMSG_ON;
+	spi->dev.power.power_state = PMSG_ON;
 	return 0;
 }
 
-static int __init ads7846_probe(struct device *dev)
+static int __devinit ads7846_probe(struct spi_device *spi)
 {
-	struct spi_device		*spi = to_spi_device(dev);
 	struct ads7846			*ts;
-	struct ads7846_platform_data	*pdata = dev->platform_data;
+	struct ads7846_platform_data	*pdata = spi->dev.platform_data;
 	struct spi_transfer		*x;
 
 	if (!spi->irq) {
-		dev_dbg(dev, "no IRQ?\n");
+		dev_dbg(&spi->dev, "no IRQ?\n");
 		return -ENODEV;
 	}
 
 	if (!pdata) {
-		dev_dbg(dev, "no platform data?\n");
+		dev_dbg(&spi->dev, "no platform data?\n");
 		return -ENODEV;
 	}
 
 	/* don't exceed max specified sample rate */
 	if (spi->max_speed_hz > (125000 * 16)) {
-		dev_dbg(dev, "f(sample) %d KHz?\n",
+		dev_dbg(&spi->dev, "f(sample) %d KHz?\n",
 				(spi->max_speed_hz/16)/1000);
 		return -EINVAL;
 	}
@@ -430,7 +425,7 @@ static int __init ads7846_probe(struct device *dev)
 	if (!(ts = kzalloc(sizeof(struct ads7846), GFP_KERNEL)))
 		return -ENOMEM;
 
-	dev_set_drvdata(dev, ts);
+	dev_set_drvdata(&spi->dev, ts);
 
 	ts->spi = spi;
 	spi->dev.power.power_state = PMSG_ON;
@@ -445,9 +440,9 @@ static int __init ads7846_probe(struct device *dev)
 
 	init_input_dev(&ts->input);
 
-	ts->input.dev = dev;
+	ts->input.dev = &spi->dev;
 	ts->input.name = "ADS784x Touchscreen";
-	snprintf(ts->phys, sizeof ts->phys, "%s/input0", dev->bus_id);
+	snprintf(ts->phys, sizeof ts->phys, "%s/input0", spi->dev.bus_id);
 	ts->input.phys = ts->phys;
 
 	ts->input.evbit[0] = BIT(EV_KEY) | BIT(EV_ABS);
@@ -511,65 +506,68 @@ static int __init ads7846_probe(struct device *dev)
 	ts->msg.context = ts;
 
 	if (request_irq(spi->irq, ads7846_irq, SA_SAMPLE_RANDOM,
-				dev->bus_id, ts)) {
-		dev_dbg(dev, "irq %d busy?\n", spi->irq);
+				spi->dev.bus_id, ts)) {
+		dev_dbg(&spi->dev, "irq %d busy?\n", spi->irq);
 		input_unregister_device(&ts->input);
 		kfree(ts);
 		return -EBUSY;
 	}
 	set_irq_type(spi->irq, IRQT_FALLING);
 
-	dev_info(dev, "touchscreen, irq %d\n", spi->irq);
+	dev_info(&spi->dev, "touchscreen, irq %d\n", spi->irq);
 
 	/* take a first sample, leaving nPENIRQ active; avoid
 	 * the touchscreen, in case it's not connected.
 	 */
-	(void) ads7846_read12_ser(dev,
+	(void) ads7846_read12_ser(&spi->dev,
 			  READ_12BIT_SER(vaux) | ADS_PD10_ALL_ON);
 
 	/* ads7843/7845 don't have temperature sensors, and
 	 * use the other sensors a bit differently too
 	 */
 	if (ts->model == 7846) {
-		device_create_file(dev, &dev_attr_temp0);
-		device_create_file(dev, &dev_attr_temp1);
+		device_create_file(&spi->dev, &dev_attr_temp0);
+		device_create_file(&spi->dev, &dev_attr_temp1);
 	}
 	if (ts->model != 7845)
-		device_create_file(dev, &dev_attr_vbatt);
-	device_create_file(dev, &dev_attr_vaux);
+		device_create_file(&spi->dev, &dev_attr_vbatt);
+	device_create_file(&spi->dev, &dev_attr_vaux);
 
 	return 0;
 }
 
-static int __exit ads7846_remove(struct device *dev)
+static int __devexit ads7846_remove(struct spi_device *spi)
 {
-	struct ads7846		*ts = dev_get_drvdata(dev);
+	struct ads7846		*ts = dev_get_drvdata(&spi->dev);
 
-	ads7846_suspend(dev, PMSG_SUSPEND EXTRA2);
+	ads7846_suspend(spi, PMSG_SUSPEND);
 	free_irq(ts->spi->irq, ts);
 	if (ts->irq_disabled)
 		enable_irq(ts->spi->irq);
 
 	if (ts->model == 7846) {
-		device_remove_file(dev, &dev_attr_temp0);
-		device_remove_file(dev, &dev_attr_temp1);
+		device_remove_file(&spi->dev, &dev_attr_temp0);
+		device_remove_file(&spi->dev, &dev_attr_temp1);
 	}
 	if (ts->model != 7845)
-		device_remove_file(dev, &dev_attr_vbatt);
-	device_remove_file(dev, &dev_attr_vaux);
+		device_remove_file(&spi->dev, &dev_attr_vbatt);
+	device_remove_file(&spi->dev, &dev_attr_vaux);
 
 	input_unregister_device(&ts->input);
 	kfree(ts);
 
-	dev_dbg(dev, "unregistered touchscreen\n");
+	dev_dbg(&spi->dev, "unregistered touchscreen\n");
 	return 0;
 }
 
-static struct device_driver ads7846_driver = {
-	.name		= "ads7846",
-	.bus		= &spi_bus_type,
+static struct spi_driver ads7846_driver = {
+	.driver = {
+		.name	= "ads7846",
+		.bus	= &spi_bus_type,
+		.owner	= THIS_MODULE,
+	},
 	.probe		= ads7846_probe,
-	.remove		= __exit_p(ads7846_remove),
+	.remove		= __devexit_p(ads7846_remove),
 	.suspend	= ads7846_suspend,
 	.resume		= ads7846_resume,
 };
@@ -594,18 +592,20 @@ static int __init ads7846_init(void)
 	// PXA:
 	// also Dell Axim X50
 	// also HP iPaq H191x/H192x/H415x/H435x
-	// also Intel Lubbock (alternate to UCB1400)
+	// also Intel Lubbock (additional to UCB1400; as temperature sensor)
 	// also Sharp Zaurus C7xx, C8xx (corgi/sheperd/husky)
 
+	// Atmel at91sam9261-EK uses ads7843
+
 	// also various AMD Au1x00 devel boards
 
-	return driver_register(&ads7846_driver);
+	return spi_register_driver(&ads7846_driver);
 }
 module_init(ads7846_init);
 
 static void __exit ads7846_exit(void)
 {
-	driver_unregister(&ads7846_driver);
+	spi_unregister_driver(&ads7846_driver);
 
 #ifdef	CONFIG_ARCH_OMAP
 	if (machine_is_omap_osk()) {
diff --git a/include/linux/spi/ads7846.h b/include/linux/spi/ads7846.h
index 84a27013d399..72261e0f2ac1 100644
--- a/include/linux/spi/ads7846.h
+++ b/include/linux/spi/ads7846.h
@@ -1,7 +1,7 @@
 /* linux/spi/ads7846.h */
 
 /* Touchscreen characteristics vary between boards and models.  The
- * platform_data for the device's "struct device" holts this information.
+ * platform_data for the device's "struct device" holds this information.
  *
  * It's OK if the min/max values are zero.
  */
-- 
cgit v1.2.3-71-gd317


From 9904f22a7202c6b54e96b0cc9870817013c350a1 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Sun, 8 Jan 2006 13:34:26 -0800
Subject: [PATCH] spi: add spi_bitbang driver

This adds a bitbanging spi master, hooking up to board/adapter-specific glue
code which knows how to set and read the signals (gpios etc).

This code kicks in after the glue code creates a platform_device with the
right platform_data.  That data includes I/O loops, which will usually
come from expanding an inline function (provided in the header).  One goal
is that the I/O loops should be easily optimized down to a few GPIO register
accesses, in common cases, for speed and minimized overhead.

This understands all the currently defined protocol tweaking options in the
SPI framework, and might eventually serve as as reference implementation.

  - different word sizes (1..32 bits)
  - differing clock rates
  - SPI modes differing by CPOL (affecting chip select and I/O loops)
  - SPI modes differing by CPHA (affecting I/O loops)
  - delays (usecs) after transfers
  - temporarily deselecting chips in mid-transfer

A lot of hardware could work with this framework, though common types of
controller can't reach peak performance without switching to a driver
structure that supports pipelining of transfers (e.g.  DMA queues) and maybe
controllers (e.g.  IRQ driven).

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/spi/Kconfig             |  13 ++
 drivers/spi/Makefile            |   1 +
 drivers/spi/spi_bitbang.c       | 460 ++++++++++++++++++++++++++++++++++++++++
 include/linux/spi/spi_bitbang.h | 128 +++++++++++
 4 files changed, 602 insertions(+)
 create mode 100644 drivers/spi/spi_bitbang.c
 create mode 100644 include/linux/spi/spi_bitbang.h

(limited to 'include/linux')

diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig
index d3105104a297..9b21c5d77b4a 100644
--- a/drivers/spi/Kconfig
+++ b/drivers/spi/Kconfig
@@ -51,6 +51,19 @@ config SPI_MASTER
 comment "SPI Master Controller Drivers"
 	depends on SPI_MASTER
 
+config SPI_BITBANG
+	tristate "Bitbanging SPI master"
+	depends on SPI_MASTER && EXPERIMENTAL
+	help
+	  With a few GPIO pins, your system can bitbang the SPI protocol.
+	  Select this to get SPI support through I/O pins (GPIO, parallel
+	  port, etc).  Or, some systems' SPI master controller drivers use
+	  this code to manage the per-word or per-transfer accesses to the
+	  hardware shift registers.
+
+	  This is library code, and is automatically selected by drivers that
+	  need it.  You only need to select this explicitly to support driver
+	  modules that aren't part of this kernel tree.
 
 #
 # Add new SPI master controllers in alphabetical order above this line
diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile
index afd2321753b3..5da6a4df4012 100644
--- a/drivers/spi/Makefile
+++ b/drivers/spi/Makefile
@@ -11,6 +11,7 @@ endif
 obj-$(CONFIG_SPI_MASTER)		+= spi.o
 
 # SPI master controller drivers (bus)
+obj-$(CONFIG_SPI_BITBANG)		+= spi_bitbang.o
 # 	... add above this line ...
 
 # SPI protocol drivers (device/link on bus)
diff --git a/drivers/spi/spi_bitbang.c b/drivers/spi/spi_bitbang.c
new file mode 100644
index 000000000000..44aff198eb96
--- /dev/null
+++ b/drivers/spi/spi_bitbang.c
@@ -0,0 +1,460 @@
+/*
+ * spi_bitbang.c - polling/bitbanging SPI master controller driver utilities
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/platform_device.h>
+
+#include <linux/spi/spi.h>
+#include <linux/spi/spi_bitbang.h>
+
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * FIRST PART (OPTIONAL):  word-at-a-time spi_transfer support.
+ * Use this for GPIO or shift-register level hardware APIs.
+ *
+ * spi_bitbang_cs is in spi_device->controller_state, which is unavailable
+ * to glue code.  These bitbang setup() and cleanup() routines are always
+ * used, though maybe they're called from controller-aware code.
+ *
+ * chipselect() and friends may use use spi_device->controller_data and
+ * controller registers as appropriate.
+ *
+ *
+ * NOTE:  SPI controller pins can often be used as GPIO pins instead,
+ * which means you could use a bitbang driver either to get hardware
+ * working quickly, or testing for differences that aren't speed related.
+ */
+
+struct spi_bitbang_cs {
+	unsigned	nsecs;	/* (clock cycle time)/2 */
+	u32		(*txrx_word)(struct spi_device *spi, unsigned nsecs,
+					u32 word, u8 bits);
+	unsigned	(*txrx_bufs)(struct spi_device *,
+					u32 (*txrx_word)(
+						struct spi_device *spi,
+						unsigned nsecs,
+						u32 word, u8 bits),
+					unsigned, struct spi_transfer *);
+};
+
+static unsigned bitbang_txrx_8(
+	struct spi_device	*spi,
+	u32			(*txrx_word)(struct spi_device *spi,
+					unsigned nsecs,
+					u32 word, u8 bits),
+	unsigned		ns,
+	struct spi_transfer	*t
+) {
+	unsigned		bits = spi->bits_per_word;
+	unsigned		count = t->len;
+	const u8		*tx = t->tx_buf;
+	u8			*rx = t->rx_buf;
+
+	while (likely(count > 0)) {
+		u8		word = 0;
+
+		if (tx)
+			word = *tx++;
+		word = txrx_word(spi, ns, word, bits);
+		if (rx)
+			*rx++ = word;
+		count -= 1;
+	}
+	return t->len - count;
+}
+
+static unsigned bitbang_txrx_16(
+	struct spi_device	*spi,
+	u32			(*txrx_word)(struct spi_device *spi,
+					unsigned nsecs,
+					u32 word, u8 bits),
+	unsigned		ns,
+	struct spi_transfer	*t
+) {
+	unsigned		bits = spi->bits_per_word;
+	unsigned		count = t->len;
+	const u16		*tx = t->tx_buf;
+	u16			*rx = t->rx_buf;
+
+	while (likely(count > 1)) {
+		u16		word = 0;
+
+		if (tx)
+			word = *tx++;
+		word = txrx_word(spi, ns, word, bits);
+		if (rx)
+			*rx++ = word;
+		count -= 2;
+	}
+	return t->len - count;
+}
+
+static unsigned bitbang_txrx_32(
+	struct spi_device	*spi,
+	u32			(*txrx_word)(struct spi_device *spi,
+					unsigned nsecs,
+					u32 word, u8 bits),
+	unsigned		ns,
+	struct spi_transfer	*t
+) {
+	unsigned		bits = spi->bits_per_word;
+	unsigned		count = t->len;
+	const u32		*tx = t->tx_buf;
+	u32			*rx = t->rx_buf;
+
+	while (likely(count > 3)) {
+		u32		word = 0;
+
+		if (tx)
+			word = *tx++;
+		word = txrx_word(spi, ns, word, bits);
+		if (rx)
+			*rx++ = word;
+		count -= 4;
+	}
+	return t->len - count;
+}
+
+/**
+ * spi_bitbang_setup - default setup for per-word I/O loops
+ */
+int spi_bitbang_setup(struct spi_device *spi)
+{
+	struct spi_bitbang_cs	*cs = spi->controller_state;
+	struct spi_bitbang	*bitbang;
+
+	if (!cs) {
+		cs = kzalloc(sizeof *cs, SLAB_KERNEL);
+		if (!cs)
+			return -ENOMEM;
+		spi->controller_state = cs;
+	}
+	bitbang = spi_master_get_devdata(spi->master);
+
+	if (!spi->bits_per_word)
+		spi->bits_per_word = 8;
+
+	/* spi_transfer level calls that work per-word */
+	if (spi->bits_per_word <= 8)
+		cs->txrx_bufs = bitbang_txrx_8;
+	else if (spi->bits_per_word <= 16)
+		cs->txrx_bufs = bitbang_txrx_16;
+	else if (spi->bits_per_word <= 32)
+		cs->txrx_bufs = bitbang_txrx_32;
+	else
+		return -EINVAL;
+
+	/* per-word shift register access, in hardware or bitbanging */
+	cs->txrx_word = bitbang->txrx_word[spi->mode & (SPI_CPOL|SPI_CPHA)];
+	if (!cs->txrx_word)
+		return -EINVAL;
+
+	if (!spi->max_speed_hz)
+		spi->max_speed_hz = 500 * 1000;
+
+	/* nsecs = max(50, (clock period)/2), be optimistic */
+	cs->nsecs = (1000000000/2) / (spi->max_speed_hz);
+	if (cs->nsecs < 50)
+		cs->nsecs = 50;
+	if (cs->nsecs > MAX_UDELAY_MS * 1000)
+		return -EINVAL;
+
+	dev_dbg(&spi->dev, "%s, mode %d, %u bits/w, %u nsec\n",
+			__FUNCTION__, spi->mode & (SPI_CPOL | SPI_CPHA),
+			spi->bits_per_word, 2 * cs->nsecs);
+
+	/* NOTE we _need_ to call chipselect() early, ideally with adapter
+	 * setup, unless the hardware defaults cooperate to avoid confusion
+	 * between normal (active low) and inverted chipselects.
+	 */
+
+	/* deselect chip (low or high) */
+	spin_lock(&bitbang->lock);
+	if (!bitbang->busy) {
+		bitbang->chipselect(spi, 0);
+		ndelay(cs->nsecs);
+	}
+	spin_unlock(&bitbang->lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(spi_bitbang_setup);
+
+/**
+ * spi_bitbang_cleanup - default cleanup for per-word I/O loops
+ */
+void spi_bitbang_cleanup(const struct spi_device *spi)
+{
+	kfree(spi->controller_state);
+}
+EXPORT_SYMBOL_GPL(spi_bitbang_cleanup);
+
+static int spi_bitbang_bufs(struct spi_device *spi, struct spi_transfer *t)
+{
+	struct spi_bitbang_cs	*cs = spi->controller_state;
+	unsigned		nsecs = cs->nsecs;
+
+	return cs->txrx_bufs(spi, cs->txrx_word, nsecs, t);
+}
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * SECOND PART ... simple transfer queue runner.
+ *
+ * This costs a task context per controller, running the queue by
+ * performing each transfer in sequence.  Smarter hardware can queue
+ * several DMA transfers at once, and process several controller queues
+ * in parallel; this driver doesn't match such hardware very well.
+ *
+ * Drivers can provide word-at-a-time i/o primitives, or provide
+ * transfer-at-a-time ones to leverage dma or fifo hardware.
+ */
+static void bitbang_work(void *_bitbang)
+{
+	struct spi_bitbang	*bitbang = _bitbang;
+	unsigned long		flags;
+
+	spin_lock_irqsave(&bitbang->lock, flags);
+	bitbang->busy = 1;
+	while (!list_empty(&bitbang->queue)) {
+		struct spi_message	*m;
+		struct spi_device	*spi;
+		unsigned		nsecs;
+		struct spi_transfer	*t;
+		unsigned		tmp;
+		unsigned		chipselect;
+		int			status;
+
+		m = container_of(bitbang->queue.next, struct spi_message,
+				queue);
+		list_del_init(&m->queue);
+		spin_unlock_irqrestore(&bitbang->lock, flags);
+
+// FIXME this is made-up
+nsecs = 100;
+
+		spi = m->spi;
+		t = m->transfers;
+		tmp = 0;
+		chipselect = 0;
+		status = 0;
+
+		for (;;t++) {
+			if (bitbang->shutdown) {
+				status = -ESHUTDOWN;
+				break;
+			}
+
+			/* set up default clock polarity, and activate chip */
+			if (!chipselect) {
+				bitbang->chipselect(spi, 1);
+				ndelay(nsecs);
+			}
+			if (!t->tx_buf && !t->rx_buf && t->len) {
+				status = -EINVAL;
+				break;
+			}
+
+			/* transfer data */
+			if (t->len) {
+				/* FIXME if bitbang->use_dma, dma_map_single()
+				 * before the transfer, and dma_unmap_single()
+				 * afterwards, for either or both buffers...
+				 */
+				status = bitbang->txrx_bufs(spi, t);
+			}
+			if (status != t->len) {
+				if (status > 0)
+					status = -EMSGSIZE;
+				break;
+			}
+			m->actual_length += status;
+			status = 0;
+
+			/* protocol tweaks before next transfer */
+			if (t->delay_usecs)
+				udelay(t->delay_usecs);
+
+			tmp++;
+			if (tmp >= m->n_transfer)
+				break;
+
+			chipselect = !t->cs_change;
+			if (chipselect);
+				continue;
+
+			bitbang->chipselect(spi, 0);
+
+			/* REVISIT do we want the udelay here instead? */
+			msleep(1);
+		}
+
+		tmp = m->n_transfer - 1;
+		tmp = m->transfers[tmp].cs_change;
+
+		m->status = status;
+		m->complete(m->context);
+
+		ndelay(2 * nsecs);
+		bitbang->chipselect(spi, status == 0 && tmp);
+		ndelay(nsecs);
+
+		spin_lock_irqsave(&bitbang->lock, flags);
+	}
+	bitbang->busy = 0;
+	spin_unlock_irqrestore(&bitbang->lock, flags);
+}
+
+/**
+ * spi_bitbang_transfer - default submit to transfer queue
+ */
+int spi_bitbang_transfer(struct spi_device *spi, struct spi_message *m)
+{
+	struct spi_bitbang	*bitbang;
+	unsigned long		flags;
+
+	m->actual_length = 0;
+	m->status = -EINPROGRESS;
+
+	bitbang = spi_master_get_devdata(spi->master);
+	if (bitbang->shutdown)
+		return -ESHUTDOWN;
+
+	spin_lock_irqsave(&bitbang->lock, flags);
+	list_add_tail(&m->queue, &bitbang->queue);
+	queue_work(bitbang->workqueue, &bitbang->work);
+	spin_unlock_irqrestore(&bitbang->lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(spi_bitbang_transfer);
+
+/*----------------------------------------------------------------------*/
+
+/**
+ * spi_bitbang_start - start up a polled/bitbanging SPI master driver
+ * @bitbang: driver handle
+ *
+ * Caller should have zero-initialized all parts of the structure, and then
+ * provided callbacks for chip selection and I/O loops.  If the master has
+ * a transfer method, its final step should call spi_bitbang_transfer; or,
+ * that's the default if the transfer routine is not initialized.  It should
+ * also set up the bus number and number of chipselects.
+ *
+ * For i/o loops, provide callbacks either per-word (for bitbanging, or for
+ * hardware that basically exposes a shift register) or per-spi_transfer
+ * (which takes better advantage of hardware like fifos or DMA engines).
+ *
+ * Drivers using per-word I/O loops should use (or call) spi_bitbang_setup and
+ * spi_bitbang_cleanup to handle those spi master methods.  Those methods are
+ * the defaults if the bitbang->txrx_bufs routine isn't initialized.
+ *
+ * This routine registers the spi_master, which will process requests in a
+ * dedicated task, keeping IRQs unblocked most of the time.  To stop
+ * processing those requests, call spi_bitbang_stop().
+ */
+int spi_bitbang_start(struct spi_bitbang *bitbang)
+{
+	int	status;
+
+	if (!bitbang->master || !bitbang->chipselect)
+		return -EINVAL;
+
+	INIT_WORK(&bitbang->work, bitbang_work, bitbang);
+	spin_lock_init(&bitbang->lock);
+	INIT_LIST_HEAD(&bitbang->queue);
+
+	if (!bitbang->master->transfer)
+		bitbang->master->transfer = spi_bitbang_transfer;
+	if (!bitbang->txrx_bufs) {
+		bitbang->use_dma = 0;
+		bitbang->txrx_bufs = spi_bitbang_bufs;
+		if (!bitbang->master->setup) {
+			bitbang->master->setup = spi_bitbang_setup;
+			bitbang->master->cleanup = spi_bitbang_cleanup;
+		}
+	} else if (!bitbang->master->setup)
+		return -EINVAL;
+
+	/* this task is the only thing to touch the SPI bits */
+	bitbang->busy = 0;
+	bitbang->workqueue = create_singlethread_workqueue(
+			bitbang->master->cdev.dev->bus_id);
+	if (bitbang->workqueue == NULL) {
+		status = -EBUSY;
+		goto err1;
+	}
+
+	/* driver may get busy before register() returns, especially
+	 * if someone registered boardinfo for devices
+	 */
+	status = spi_register_master(bitbang->master);
+	if (status < 0)
+		goto err2;
+
+	return status;
+
+err2:
+	destroy_workqueue(bitbang->workqueue);
+err1:
+	return status;
+}
+EXPORT_SYMBOL_GPL(spi_bitbang_start);
+
+/**
+ * spi_bitbang_stop - stops the task providing spi communication
+ */
+int spi_bitbang_stop(struct spi_bitbang *bitbang)
+{
+	unsigned	limit = 500;
+
+	spin_lock_irq(&bitbang->lock);
+	bitbang->shutdown = 0;
+	while (!list_empty(&bitbang->queue) && limit--) {
+		spin_unlock_irq(&bitbang->lock);
+
+		dev_dbg(bitbang->master->cdev.dev, "wait for queue\n");
+		msleep(10);
+
+		spin_lock_irq(&bitbang->lock);
+	}
+	spin_unlock_irq(&bitbang->lock);
+	if (!list_empty(&bitbang->queue)) {
+		dev_err(bitbang->master->cdev.dev, "queue didn't empty\n");
+		return -EBUSY;
+	}
+
+	destroy_workqueue(bitbang->workqueue);
+
+	spi_unregister_master(bitbang->master);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(spi_bitbang_stop);
+
+MODULE_LICENSE("GPL");
+
diff --git a/include/linux/spi/spi_bitbang.h b/include/linux/spi/spi_bitbang.h
new file mode 100644
index 000000000000..8dfe61a445f4
--- /dev/null
+++ b/include/linux/spi/spi_bitbang.h
@@ -0,0 +1,128 @@
+#ifndef	__SPI_BITBANG_H
+#define	__SPI_BITBANG_H
+
+/*
+ * Mix this utility code with some glue code to get one of several types of
+ * simple SPI master driver.  Two do polled word-at-a-time I/O:
+ *
+ *   -	GPIO/parport bitbangers.  Provide chipselect() and txrx_word[](),
+ *	expanding the per-word routines from the inline templates below.
+ *
+ *   -	Drivers for controllers resembling bare shift registers.  Provide
+ *	chipselect() and txrx_word[](), with custom setup()/cleanup() methods
+ *	that use your controller's clock and chipselect registers.
+ *
+ * Some hardware works well with requests at spi_transfer scope:
+ *
+ *   -	Drivers leveraging smarter hardware, with fifos or DMA; or for half
+ *	duplex (MicroWire) controllers.  Provide chipslect() and txrx_bufs(),
+ *	and custom setup()/cleanup() methods.
+ */
+struct spi_bitbang {
+	struct workqueue_struct	*workqueue;
+	struct work_struct	work;
+
+	spinlock_t		lock;
+	struct list_head	queue;
+	u8			busy;
+	u8			shutdown;
+	u8			use_dma;
+
+	struct spi_master	*master;
+
+	void	(*chipselect)(struct spi_device *spi, int is_on);
+
+	int	(*txrx_bufs)(struct spi_device *spi, struct spi_transfer *t);
+	u32	(*txrx_word[4])(struct spi_device *spi,
+			unsigned nsecs,
+			u32 word, u8 bits);
+};
+
+/* you can call these default bitbang->master methods from your custom
+ * methods, if you like.
+ */
+extern int spi_bitbang_setup(struct spi_device *spi);
+extern void spi_bitbang_cleanup(const struct spi_device *spi);
+extern int spi_bitbang_transfer(struct spi_device *spi, struct spi_message *m);
+
+/* start or stop queue processing */
+extern int spi_bitbang_start(struct spi_bitbang *spi);
+extern int spi_bitbang_stop(struct spi_bitbang *spi);
+
+#endif	/* __SPI_BITBANG_H */
+
+/*-------------------------------------------------------------------------*/
+
+#ifdef	EXPAND_BITBANG_TXRX
+
+/*
+ * The code that knows what GPIO pins do what should have declared four
+ * functions, ideally as inlines, before #defining EXPAND_BITBANG_TXRX
+ * and including this header:
+ *
+ *  void setsck(struct spi_device *, int is_on);
+ *  void setmosi(struct spi_device *, int is_on);
+ *  int getmiso(struct spi_device *);
+ *  void spidelay(unsigned);
+ *
+ * A non-inlined routine would call bitbang_txrx_*() routines.  The
+ * main loop could easily compile down to a handful of instructions,
+ * especially if the delay is a NOP (to run at peak speed).
+ *
+ * Since this is software, the timings may not be exactly what your board's
+ * chips need ... there may be several reasons you'd need to tweak timings
+ * in these routines, not just make to make it faster or slower to match a
+ * particular CPU clock rate.
+ */
+
+static inline u32
+bitbang_txrx_be_cpha0(struct spi_device *spi,
+		unsigned nsecs, unsigned cpol,
+		u32 word, u8 bits)
+{
+	/* if (cpol == 0) this is SPI_MODE_0; else this is SPI_MODE_2 */
+
+	/* clock starts at inactive polarity */
+	for (word <<= (32 - bits); likely(bits); bits--) {
+
+		/* setup MSB (to slave) on trailing edge */
+		setmosi(spi, word & (1 << 31));
+		spidelay(nsecs);	/* T(setup) */
+
+		setsck(spi, !cpol);
+		spidelay(nsecs);
+
+		/* sample MSB (from slave) on leading edge */
+		word <<= 1;
+		word |= getmiso(spi);
+		setsck(spi, cpol);
+	}
+	return word;
+}
+
+static inline u32
+bitbang_txrx_be_cpha1(struct spi_device *spi,
+		unsigned nsecs, unsigned cpol,
+		u32 word, u8 bits)
+{
+	/* if (cpol == 0) this is SPI_MODE_1; else this is SPI_MODE_3 */
+
+	/* clock starts at inactive polarity */
+	for (word <<= (32 - bits); likely(bits); bits--) {
+
+		/* setup MSB (to slave) on leading edge */
+		setsck(spi, !cpol);
+		setmosi(spi, word & (1 << 31));
+		spidelay(nsecs); /* T(setup) */
+
+		setsck(spi, cpol);
+		spidelay(nsecs);
+
+		/* sample MSB (from slave) on trailing edge */
+		word <<= 1;
+		word |= getmiso(spi);
+	}
+	return word;
+}
+
+#endif	/* EXPAND_BITBANG_TXRX */
-- 
cgit v1.2.3-71-gd317


From 2f9f762879015d738a5ec2ac8a16be94b3a4a06d Mon Sep 17 00:00:00 2001
From: Mike Lavender <mike@steroidmicros.com>
Date: Sun, 8 Jan 2006 13:34:27 -0800
Subject: [PATCH] spi: M25 series SPI flash

This was originally a driver for the ST M25P80 SPI flash.  It's been
updated slightly to handle other M25P series chips.

For many of these chips, the specific type could be probed, but for now
this just requires static setup with flash_platform_data that lists the
chip type (size, format) and any default partitioning to use.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Cc: Mike Lavender <mike@steroidmicros.com>
Cc: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/mtd/devices/Kconfig  |   8 +
 drivers/mtd/devices/Makefile |   1 +
 drivers/mtd/devices/m25p80.c | 580 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/spi/flash.h    |   4 +
 4 files changed, 593 insertions(+)
 create mode 100644 drivers/mtd/devices/m25p80.c

(limited to 'include/linux')

diff --git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig
index 84f2eb1fae02..5038e90ceb12 100644
--- a/drivers/mtd/devices/Kconfig
+++ b/drivers/mtd/devices/Kconfig
@@ -55,6 +55,14 @@ config MTD_DATAFLASH
 	  Sometimes DataFlash chips are packaged inside MMC-format
 	  cards; at this writing, the MMC stack won't handle those.
 
+config MTD_M25P80
+	tristate "Support for M25 SPI Flash"
+	depends on MTD && SPI_MASTER && EXPERIMENTAL
+	help
+	  This enables access to ST M25P80 and similar SPI flash chips,
+	  used for program and data storage.  Set up your spi devices
+	  with the right board-specific platform data.
+
 config MTD_SLRAM
 	tristate "Uncached system RAM"
 	depends on MTD
diff --git a/drivers/mtd/devices/Makefile b/drivers/mtd/devices/Makefile
index cd8d8074b5b6..7c5ed2178380 100644
--- a/drivers/mtd/devices/Makefile
+++ b/drivers/mtd/devices/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_MTD_LART)		+= lart.o
 obj-$(CONFIG_MTD_BLKMTD)	+= blkmtd.o
 obj-$(CONFIG_MTD_BLOCK2MTD)	+= block2mtd.o
 obj-$(CONFIG_MTD_DATAFLASH)	+= mtd_dataflash.o
+obj-$(CONFIG_MTD_M25P80)	+= m25p80.o
diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c
new file mode 100644
index 000000000000..71a072103a7f
--- /dev/null
+++ b/drivers/mtd/devices/m25p80.c
@@ -0,0 +1,580 @@
+/*
+ * MTD SPI driver for ST M25Pxx flash chips
+ *
+ * Author: Mike Lavender, mike@steroidmicros.com
+ *
+ * Copyright (c) 2005, Intec Automation Inc.
+ *
+ * Some parts are based on lart.c by Abraham Van Der Merwe
+ *
+ * Cleaned up and generalized based on mtd_dataflash.c
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/interrupt.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/partitions.h>
+#include <linux/spi/spi.h>
+#include <linux/spi/flash.h>
+
+#include <asm/semaphore.h>
+
+
+/* NOTE: AT 25F and SST 25LF series are very similar,
+ * but commands for sector erase and chip id differ...
+ */
+
+#define FLASH_PAGESIZE		256
+
+/* Flash opcodes. */
+#define	OPCODE_WREN		6	/* Write enable */
+#define	OPCODE_RDSR		5	/* Read status register */
+#define	OPCODE_READ		3	/* Read data bytes */
+#define	OPCODE_PP		2	/* Page program */
+#define	OPCODE_SE		0xd8	/* Sector erase */
+#define	OPCODE_RES		0xab	/* Read Electronic Signature */
+#define	OPCODE_RDID		0x9f	/* Read JEDEC ID */
+
+/* Status Register bits. */
+#define	SR_WIP			1	/* Write in progress */
+#define	SR_WEL			2	/* Write enable latch */
+#define	SR_BP0			4	/* Block protect 0 */
+#define	SR_BP1			8	/* Block protect 1 */
+#define	SR_BP2			0x10	/* Block protect 2 */
+#define	SR_SRWD			0x80	/* SR write protect */
+
+/* Define max times to check status register before we give up. */
+#define	MAX_READY_WAIT_COUNT	100000
+
+
+#ifdef CONFIG_MTD_PARTITIONS
+#define	mtd_has_partitions()	(1)
+#else
+#define	mtd_has_partitions()	(0)
+#endif
+
+/****************************************************************************/
+
+struct m25p {
+	struct spi_device	*spi;
+	struct semaphore	lock;
+	struct mtd_info		mtd;
+	unsigned		partitioned;
+	u8			command[4];
+};
+
+static inline struct m25p *mtd_to_m25p(struct mtd_info *mtd)
+{
+	return container_of(mtd, struct m25p, mtd);
+}
+
+/****************************************************************************/
+
+/*
+ * Internal helper functions
+ */
+
+/*
+ * Read the status register, returning its value in the location
+ * Return the status register value.
+ * Returns negative if error occurred.
+ */
+static int read_sr(struct m25p *flash)
+{
+	ssize_t retval;
+	u8 code = OPCODE_RDSR;
+	u8 val;
+
+	retval = spi_write_then_read(flash->spi, &code, 1, &val, 1);
+
+	if (retval < 0) {
+		dev_err(&flash->spi->dev, "error %d reading SR\n",
+				(int) retval);
+		return retval;
+	}
+
+	return val;
+}
+
+
+/*
+ * Set write enable latch with Write Enable command.
+ * Returns negative if error occurred.
+ */
+static inline int write_enable(struct m25p *flash)
+{
+	u8	code = OPCODE_WREN;
+
+	return spi_write_then_read(flash->spi, &code, 1, NULL, 0);
+}
+
+
+/*
+ * Service routine to read status register until ready, or timeout occurs.
+ * Returns non-zero if error.
+ */
+static int wait_till_ready(struct m25p *flash)
+{
+	int count;
+	int sr;
+
+	/* one chip guarantees max 5 msec wait here after page writes,
+	 * but potentially three seconds (!) after page erase.
+	 */
+	for (count = 0; count < MAX_READY_WAIT_COUNT; count++) {
+		if ((sr = read_sr(flash)) < 0)
+			break;
+		else if (!(sr & SR_WIP))
+			return 0;
+
+		/* REVISIT sometimes sleeping would be best */
+	}
+
+	return 1;
+}
+
+
+/*
+ * Erase one sector of flash memory at offset ``offset'' which is any
+ * address within the sector which should be erased.
+ *
+ * Returns 0 if successful, non-zero otherwise.
+ */
+static int erase_sector(struct m25p *flash, u32 offset)
+{
+	DEBUG(MTD_DEBUG_LEVEL3, "%s: %s at 0x%08x\n", flash->spi->dev.bus_id,
+			__FUNCTION__, offset);
+
+	/* Wait until finished previous write command. */
+	if (wait_till_ready(flash))
+		return 1;
+
+	/* Send write enable, then erase commands. */
+	write_enable(flash);
+
+	/* Set up command buffer. */
+	flash->command[0] = OPCODE_SE;
+	flash->command[1] = offset >> 16;
+	flash->command[2] = offset >> 8;
+	flash->command[3] = offset;
+
+	spi_write(flash->spi, flash->command, sizeof(flash->command));
+
+	return 0;
+}
+
+/****************************************************************************/
+
+/*
+ * MTD implementation
+ */
+
+/*
+ * Erase an address range on the flash chip.  The address range may extend
+ * one or more erase sectors.  Return an error is there is a problem erasing.
+ */
+static int m25p80_erase(struct mtd_info *mtd, struct erase_info *instr)
+{
+	struct m25p *flash = mtd_to_m25p(mtd);
+	u32 addr,len;
+
+	DEBUG(MTD_DEBUG_LEVEL2, "%s: %s %s 0x%08x, len %zd\n",
+			flash->spi->dev.bus_id, __FUNCTION__, "at",
+			(u32)instr->addr, instr->len);
+
+	/* sanity checks */
+	if (instr->addr + instr->len > flash->mtd.size)
+		return -EINVAL;
+	if ((instr->addr % mtd->erasesize) != 0
+			|| (instr->len % mtd->erasesize) != 0) {
+		return -EINVAL;
+	}
+
+	addr = instr->addr;
+	len = instr->len;
+
+  	down(&flash->lock);
+
+	/* now erase those sectors */
+	while (len) {
+		if (erase_sector(flash, addr)) {
+			instr->state = MTD_ERASE_FAILED;
+			up(&flash->lock);
+			return -EIO;
+		}
+
+		addr += mtd->erasesize;
+		len -= mtd->erasesize;
+	}
+
+  	up(&flash->lock);
+
+	instr->state = MTD_ERASE_DONE;
+	mtd_erase_callback(instr);
+
+	return 0;
+}
+
+/*
+ * Read an address range from the flash chip.  The address range
+ * may be any size provided it is within the physical boundaries.
+ */
+static int m25p80_read(struct mtd_info *mtd, loff_t from, size_t len,
+	size_t *retlen, u_char *buf)
+{
+	struct m25p *flash = mtd_to_m25p(mtd);
+	struct spi_transfer t[2];
+	struct spi_message m;
+
+	DEBUG(MTD_DEBUG_LEVEL2, "%s: %s %s 0x%08x, len %zd\n",
+			flash->spi->dev.bus_id, __FUNCTION__, "from",
+			(u32)from, len);
+
+	/* sanity checks */
+	if (!len)
+		return 0;
+
+	if (from + len > flash->mtd.size)
+		return -EINVAL;
+
+	down(&flash->lock);
+
+	/* Wait till previous write/erase is done. */
+	if (wait_till_ready(flash)) {
+		/* REVISIT status return?? */
+		up(&flash->lock);
+		return 1;
+	}
+
+	memset(t, 0, (sizeof t));
+
+	/* NOTE:  OPCODE_FAST_READ (if available) is faster... */
+
+	/* Set up the write data buffer. */
+	flash->command[0] = OPCODE_READ;
+	flash->command[1] = from >> 16;
+	flash->command[2] = from >> 8;
+	flash->command[3] = from;
+
+	/* Byte count starts at zero. */
+	if (retlen)
+		*retlen = 0;
+
+	t[0].tx_buf = flash->command;
+	t[0].len = sizeof(flash->command);
+
+	t[1].rx_buf = buf;
+	t[1].len = len;
+
+	m.transfers = t;
+	m.n_transfer = 2;
+
+	spi_sync(flash->spi, &m);
+
+	*retlen = m.actual_length - sizeof(flash->command);
+
+  	up(&flash->lock);
+
+	return 0;
+}
+
+/*
+ * Write an address range to the flash chip.  Data must be written in
+ * FLASH_PAGESIZE chunks.  The address range may be any size provided
+ * it is within the physical boundaries.
+ */
+static int m25p80_write(struct mtd_info *mtd, loff_t to, size_t len,
+	size_t *retlen, const u_char *buf)
+{
+	struct m25p *flash = mtd_to_m25p(mtd);
+	u32 page_offset, page_size;
+	struct spi_transfer t[2];
+	struct spi_message m;
+
+	DEBUG(MTD_DEBUG_LEVEL2, "%s: %s %s 0x%08x, len %zd\n",
+			flash->spi->dev.bus_id, __FUNCTION__, "to",
+			(u32)to, len);
+
+	if (retlen)
+		*retlen = 0;
+
+	/* sanity checks */
+	if (!len)
+		return(0);
+
+	if (to + len > flash->mtd.size)
+		return -EINVAL;
+
+  	down(&flash->lock);
+
+	/* Wait until finished previous write command. */
+	if (wait_till_ready(flash))
+		return 1;
+
+	write_enable(flash);
+
+	memset(t, 0, (sizeof t));
+
+	/* Set up the opcode in the write buffer. */
+	flash->command[0] = OPCODE_PP;
+	flash->command[1] = to >> 16;
+	flash->command[2] = to >> 8;
+	flash->command[3] = to;
+
+	t[0].tx_buf = flash->command;
+	t[0].len = sizeof(flash->command);
+
+	m.transfers = t;
+	m.n_transfer = 2;
+
+	/* what page do we start with? */
+	page_offset = to % FLASH_PAGESIZE;
+
+	/* do all the bytes fit onto one page? */
+	if (page_offset + len <= FLASH_PAGESIZE) {
+		t[1].tx_buf = buf;
+		t[1].len = len;
+
+		spi_sync(flash->spi, &m);
+
+		*retlen = m.actual_length - sizeof(flash->command);
+	} else {
+		u32 i;
+
+		/* the size of data remaining on the first page */
+		page_size = FLASH_PAGESIZE - page_offset;
+
+		t[1].tx_buf = buf;
+		t[1].len = page_size;
+		spi_sync(flash->spi, &m);
+
+		*retlen = m.actual_length - sizeof(flash->command);
+
+		/* write everything in PAGESIZE chunks */
+		for (i = page_size; i < len; i += page_size) {
+			page_size = len - i;
+			if (page_size > FLASH_PAGESIZE)
+				page_size = FLASH_PAGESIZE;
+
+			/* write the next page to flash */
+			flash->command[1] = (to + i) >> 16;
+			flash->command[2] = (to + i) >> 8;
+			flash->command[3] = (to + i);
+
+			t[1].tx_buf = buf + i;
+			t[1].len = page_size;
+
+			wait_till_ready(flash);
+
+			write_enable(flash);
+
+			spi_sync(flash->spi, &m);
+
+			*retlen += m.actual_length - sizeof(flash->command);
+	        }
+ 	}
+
+	up(&flash->lock);
+
+	return 0;
+}
+
+
+/****************************************************************************/
+
+/*
+ * SPI device driver setup and teardown
+ */
+
+struct flash_info {
+	char		*name;
+	u8		id;
+	u16		jedec_id;
+	unsigned	sector_size;
+	unsigned	n_sectors;
+};
+
+static struct flash_info __devinitdata m25p_data [] = {
+	/* REVISIT: fill in JEDEC ids, for parts that have them */
+	{ "m25p05", 0x05, 0x0000, 32 * 1024, 2 },
+	{ "m25p10", 0x10, 0x0000, 32 * 1024, 4 },
+	{ "m25p20", 0x11, 0x0000, 64 * 1024, 4 },
+	{ "m25p40", 0x12, 0x0000, 64 * 1024, 8 },
+	{ "m25p80", 0x13, 0x0000, 64 * 1024, 16 },
+	{ "m25p16", 0x14, 0x0000, 64 * 1024, 32 },
+	{ "m25p32", 0x15, 0x0000, 64 * 1024, 64 },
+	{ "m25p64", 0x16, 0x2017, 64 * 1024, 128 },
+};
+
+/*
+ * board specific setup should have ensured the SPI clock used here
+ * matches what the READ command supports, at least until this driver
+ * understands FAST_READ (for clocks over 25 MHz).
+ */
+static int __devinit m25p_probe(struct spi_device *spi)
+{
+	struct flash_platform_data	*data;
+	struct m25p			*flash;
+	struct flash_info		*info;
+	unsigned			i;
+
+	/* Platform data helps sort out which chip type we have, as
+	 * well as how this board partitions it.
+	 */
+	data = spi->dev.platform_data;
+	if (!data || !data->type) {
+		/* FIXME some chips can identify themselves with RES
+		 * or JEDEC get-id commands.  Try them ...
+		 */
+		DEBUG(MTD_DEBUG_LEVEL1, "%s: no chip id\n",
+				flash->spi->dev.bus_id);
+		return -ENODEV;
+	}
+
+	for (i = 0, info = m25p_data; i < ARRAY_SIZE(m25p_data); i++, info++) {
+		if (strcmp(data->type, info->name) == 0)
+			break;
+	}
+	if (i == ARRAY_SIZE(m25p_data)) {
+		DEBUG(MTD_DEBUG_LEVEL1, "%s: unrecognized id %s\n",
+				flash->spi->dev.bus_id, data->type);
+		return -ENODEV;
+	}
+
+	flash = kzalloc(sizeof *flash, SLAB_KERNEL);
+	if (!flash)
+		return -ENOMEM;
+
+	flash->spi = spi;
+	init_MUTEX(&flash->lock);
+	dev_set_drvdata(&spi->dev, flash);
+
+	if (data->name)
+		flash->mtd.name = data->name;
+	else
+		flash->mtd.name = spi->dev.bus_id;
+
+	flash->mtd.type = MTD_NORFLASH;
+	flash->mtd.flags = MTD_CAP_NORFLASH;
+	flash->mtd.size = info->sector_size * info->n_sectors;
+	flash->mtd.erasesize = info->sector_size;
+	flash->mtd.erase = m25p80_erase;
+	flash->mtd.read = m25p80_read;
+	flash->mtd.write = m25p80_write;
+
+	dev_info(&spi->dev, "%s (%d Kbytes)\n", info->name,
+			flash->mtd.size / 1024);
+
+	DEBUG(MTD_DEBUG_LEVEL2,
+		"mtd .name = %s, .size = 0x%.8x (%uM) "
+			".erasesize = 0x%.8x (%uK) .numeraseregions = %d\n",
+		flash->mtd.name,
+		flash->mtd.size, flash->mtd.size / (1024*1024),
+		flash->mtd.erasesize, flash->mtd.erasesize / 1024,
+		flash->mtd.numeraseregions);
+
+	if (flash->mtd.numeraseregions)
+		for (i = 0; i < flash->mtd.numeraseregions; i++)
+			DEBUG(MTD_DEBUG_LEVEL2,
+				"mtd.eraseregions[%d] = { .offset = 0x%.8x, "
+				".erasesize = 0x%.8x (%uK), "
+				".numblocks = %d }\n",
+				i, flash->mtd.eraseregions[i].offset,
+				flash->mtd.eraseregions[i].erasesize,
+				flash->mtd.eraseregions[i].erasesize / 1024,
+				flash->mtd.eraseregions[i].numblocks);
+
+
+	/* partitions should match sector boundaries; and it may be good to
+	 * use readonly partitions for writeprotected sectors (BP2..BP0).
+	 */
+	if (mtd_has_partitions()) {
+		struct mtd_partition	*parts = NULL;
+		int			nr_parts = 0;
+
+#ifdef CONFIG_MTD_CMDLINE_PARTS
+		static const char *part_probes[] = { "cmdlinepart", NULL, };
+
+		nr_parts = parse_mtd_partitions(&flash->mtd,
+				part_probes, &parts, 0);
+#endif
+
+		if (nr_parts <= 0 && data && data->parts) {
+			parts = data->parts;
+			nr_parts = data->nr_parts;
+		}
+
+		if (nr_parts > 0) {
+			for (i = 0; i < data->nr_parts; i++) {
+				DEBUG(MTD_DEBUG_LEVEL2, "partitions[%d] = "
+					"{.name = %s, .offset = 0x%.8x, "
+						".size = 0x%.8x (%uK) }\n",
+					i, data->parts[i].name,
+					data->parts[i].offset,
+					data->parts[i].size,
+					data->parts[i].size / 1024);
+			}
+			flash->partitioned = 1;
+			return add_mtd_partitions(&flash->mtd, parts, nr_parts);
+		}
+	} else if (data->nr_parts)
+		dev_warn(&spi->dev, "ignoring %d default partitions on %s\n",
+				data->nr_parts, data->name);
+
+	return add_mtd_device(&flash->mtd) == 1 ? -ENODEV : 0;
+}
+
+
+static int __devexit m25p_remove(struct spi_device *spi)
+{
+	struct m25p	*flash = dev_get_drvdata(&spi->dev);
+	int		status;
+
+	/* Clean up MTD stuff. */
+	if (mtd_has_partitions() && flash->partitioned)
+		status = del_mtd_partitions(&flash->mtd);
+	else
+		status = del_mtd_device(&flash->mtd);
+	if (status == 0)
+		kfree(flash);
+	return 0;
+}
+
+
+static struct spi_driver m25p80_driver = {
+	.driver = {
+		.name	= "m25p80",
+		.bus	= &spi_bus_type,
+		.owner	= THIS_MODULE,
+	},
+	.probe	= m25p_probe,
+	.remove	= __devexit_p(m25p_remove),
+};
+
+
+static int m25p80_init(void)
+{
+	return spi_register_driver(&m25p80_driver);
+}
+
+
+static void m25p80_exit(void)
+{
+	spi_unregister_driver(&m25p80_driver);
+}
+
+
+module_init(m25p80_init);
+module_exit(m25p80_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Mike Lavender");
+MODULE_DESCRIPTION("MTD SPI driver for ST M25Pxx flash chips");
diff --git a/include/linux/spi/flash.h b/include/linux/spi/flash.h
index 2ce6558bf3f8..3f22932e67a4 100644
--- a/include/linux/spi/flash.h
+++ b/include/linux/spi/flash.h
@@ -8,6 +8,8 @@ struct mtd_partition;
  * @name: optional flash device name (eg, as used with mtdparts=)
  * @parts: optional array of mtd_partitions for static partitioning
  * @nr_parts: number of mtd_partitions for static partitoning
+ * @type: optional flash device type (e.g. m25p80 vs m25p64), for use
+ *	with chips that can't be queried for JEDEC or other IDs
  *
  * Board init code (in arch/.../mach-xxx/board-yyy.c files) can
  * provide information about SPI flash parts (such as DataFlash) to
@@ -21,6 +23,8 @@ struct flash_platform_data {
 	struct mtd_partition *parts;
 	unsigned int	nr_parts;
 
+	char		*type;
+
 	/* we'll likely add more ... use JEDEC IDs, etc */
 };
 
-- 
cgit v1.2.3-71-gd317


From 8275c642ccdce09a2146d0a9eb022e3698ee927e Mon Sep 17 00:00:00 2001
From: Vitaly Wool <vwool@ru.mvista.com>
Date: Sun, 8 Jan 2006 13:34:28 -0800
Subject: [PATCH] spi: use linked lists rather than an array

This makes the SPI core and its users access transfers in the SPI message
structure as linked list not as an array, as discussed on LKML.

From: David Brownell <dbrownell@users.sourceforge.net>

  Updates including doc, bugfixes to the list code, add
  spi_message_add_tail().  Plus, initialize things _before_ grabbing the
  locks in some cases (in case it grows more expensive).  This also merges
  some bitbang updates of mine that didn't yet make it into the mm tree.

Signed-off-by: Vitaly Wool <vwool@ru.mvista.com>
Signed-off-by: Dmitry Pervushin <dpervushin@gmail.com>
Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/input/touchscreen/ads7846.c | 12 +++--
 drivers/mtd/devices/m25p80.c        | 50 ++++++++++----------
 drivers/mtd/devices/mtd_dataflash.c | 28 ++++++-----
 drivers/spi/spi.c                   | 18 +++++---
 drivers/spi/spi_bitbang.c           | 86 +++++++++++++++++++---------------
 include/linux/spi/spi.h             | 92 +++++++++++++++++++++++++------------
 include/linux/spi/spi_bitbang.h     |  7 +++
 7 files changed, 180 insertions(+), 113 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c
index c741776ef3bf..dd8c6a9ffc76 100644
--- a/drivers/input/touchscreen/ads7846.c
+++ b/drivers/input/touchscreen/ads7846.c
@@ -155,10 +155,13 @@ static int ads7846_read12_ser(struct device *dev, unsigned command)
 	struct ser_req		*req = kzalloc(sizeof *req, SLAB_KERNEL);
 	int			status;
 	int			sample;
+	int 			i;
 
 	if (!req)
 		return -ENOMEM;
 
+	INIT_LIST_HEAD(&req->msg.transfers);
+
 	/* activate reference, so it has time to settle; */
 	req->xfer[0].tx_buf = &ref_on;
 	req->xfer[0].len = 1;
@@ -192,8 +195,8 @@ static int ads7846_read12_ser(struct device *dev, unsigned command)
 	/* group all the transfers together, so we can't interfere with
 	 * reading touchscreen state; disable penirq while sampling
 	 */
-	req->msg.transfers = req->xfer;
-	req->msg.n_transfer = 6;
+	for (i = 0; i < 6; i++)
+		spi_message_add_tail(&req->xfer[i], &req->msg);
 
 	disable_irq(spi->irq);
 	status = spi_sync(spi, &req->msg);
@@ -398,6 +401,7 @@ static int __devinit ads7846_probe(struct spi_device *spi)
 	struct ads7846			*ts;
 	struct ads7846_platform_data	*pdata = spi->dev.platform_data;
 	struct spi_transfer		*x;
+	int				i;
 
 	if (!spi->irq) {
 		dev_dbg(&spi->dev, "no IRQ?\n");
@@ -500,8 +504,8 @@ static int __devinit ads7846_probe(struct spi_device *spi)
 
 	CS_CHANGE(x[-1]);
 
-	ts->msg.transfers = ts->xfer;
-	ts->msg.n_transfer = x - ts->xfer;
+	for (i = 0; i < x - ts->xfer; i++)
+		spi_message_add_tail(&ts->xfer[i], &ts->msg);
 	ts->msg.complete = ads7846_rx;
 	ts->msg.context = ts;
 
diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c
index 71a072103a7f..45108ed85588 100644
--- a/drivers/mtd/devices/m25p80.c
+++ b/drivers/mtd/devices/m25p80.c
@@ -245,6 +245,21 @@ static int m25p80_read(struct mtd_info *mtd, loff_t from, size_t len,
 	if (from + len > flash->mtd.size)
 		return -EINVAL;
 
+	spi_message_init(&m);
+	memset(t, 0, (sizeof t));
+
+	t[0].tx_buf = flash->command;
+	t[0].len = sizeof(flash->command);
+	spi_message_add_tail(&t[0], &m);
+
+	t[1].rx_buf = buf;
+	t[1].len = len;
+	spi_message_add_tail(&t[1], &m);
+
+	/* Byte count starts at zero. */
+	if (retlen)
+		*retlen = 0;
+
 	down(&flash->lock);
 
 	/* Wait till previous write/erase is done. */
@@ -254,8 +269,6 @@ static int m25p80_read(struct mtd_info *mtd, loff_t from, size_t len,
 		return 1;
 	}
 
-	memset(t, 0, (sizeof t));
-
 	/* NOTE:  OPCODE_FAST_READ (if available) is faster... */
 
 	/* Set up the write data buffer. */
@@ -264,19 +277,6 @@ static int m25p80_read(struct mtd_info *mtd, loff_t from, size_t len,
 	flash->command[2] = from >> 8;
 	flash->command[3] = from;
 
-	/* Byte count starts at zero. */
-	if (retlen)
-		*retlen = 0;
-
-	t[0].tx_buf = flash->command;
-	t[0].len = sizeof(flash->command);
-
-	t[1].rx_buf = buf;
-	t[1].len = len;
-
-	m.transfers = t;
-	m.n_transfer = 2;
-
 	spi_sync(flash->spi, &m);
 
 	*retlen = m.actual_length - sizeof(flash->command);
@@ -313,6 +313,16 @@ static int m25p80_write(struct mtd_info *mtd, loff_t to, size_t len,
 	if (to + len > flash->mtd.size)
 		return -EINVAL;
 
+	spi_message_init(&m);
+	memset(t, 0, (sizeof t));
+
+	t[0].tx_buf = flash->command;
+	t[0].len = sizeof(flash->command);
+	spi_message_add_tail(&t[0], &m);
+
+	t[1].tx_buf = buf;
+	spi_message_add_tail(&t[1], &m);
+
   	down(&flash->lock);
 
 	/* Wait until finished previous write command. */
@@ -321,26 +331,17 @@ static int m25p80_write(struct mtd_info *mtd, loff_t to, size_t len,
 
 	write_enable(flash);
 
-	memset(t, 0, (sizeof t));
-
 	/* Set up the opcode in the write buffer. */
 	flash->command[0] = OPCODE_PP;
 	flash->command[1] = to >> 16;
 	flash->command[2] = to >> 8;
 	flash->command[3] = to;
 
-	t[0].tx_buf = flash->command;
-	t[0].len = sizeof(flash->command);
-
-	m.transfers = t;
-	m.n_transfer = 2;
-
 	/* what page do we start with? */
 	page_offset = to % FLASH_PAGESIZE;
 
 	/* do all the bytes fit onto one page? */
 	if (page_offset + len <= FLASH_PAGESIZE) {
-		t[1].tx_buf = buf;
 		t[1].len = len;
 
 		spi_sync(flash->spi, &m);
@@ -352,7 +353,6 @@ static int m25p80_write(struct mtd_info *mtd, loff_t to, size_t len,
 		/* the size of data remaining on the first page */
 		page_size = FLASH_PAGESIZE - page_offset;
 
-		t[1].tx_buf = buf;
 		t[1].len = page_size;
 		spi_sync(flash->spi, &m);
 
diff --git a/drivers/mtd/devices/mtd_dataflash.c b/drivers/mtd/devices/mtd_dataflash.c
index a39b3b6b266c..99d3a0320fc9 100644
--- a/drivers/mtd/devices/mtd_dataflash.c
+++ b/drivers/mtd/devices/mtd_dataflash.c
@@ -147,7 +147,7 @@ static int dataflash_erase(struct mtd_info *mtd, struct erase_info *instr)
 {
 	struct dataflash	*priv = (struct dataflash *)mtd->priv;
 	struct spi_device	*spi = priv->spi;
-	struct spi_transfer	x[1] = { { .tx_dma = 0, }, };
+	struct spi_transfer	x = { .tx_dma = 0, };
 	struct spi_message	msg;
 	unsigned		blocksize = priv->page_size << 3;
 	u8			*command;
@@ -162,10 +162,11 @@ static int dataflash_erase(struct mtd_info *mtd, struct erase_info *instr)
 			|| (instr->addr % priv->page_size) != 0)
 		return -EINVAL;
 
-	x[0].tx_buf = command = priv->command;
-	x[0].len = 4;
-	msg.transfers = x;
-	msg.n_transfer = 1;
+	spi_message_init(&msg);
+
+	x.tx_buf = command = priv->command;
+	x.len = 4;
+	spi_message_add_tail(&x, &msg);
 
 	down(&priv->lock);
 	while (instr->len > 0) {
@@ -256,12 +257,15 @@ static int dataflash_read(struct mtd_info *mtd, loff_t from, size_t len,
 	DEBUG(MTD_DEBUG_LEVEL3, "READ: (%x) %x %x %x\n",
 		command[0], command[1], command[2], command[3]);
 
+	spi_message_init(&msg);
+
 	x[0].tx_buf = command;
 	x[0].len = 8;
+	spi_message_add_tail(&x[0], &msg);
+
 	x[1].rx_buf = buf;
 	x[1].len = len;
-	msg.transfers = x;
-	msg.n_transfer = 2;
+	spi_message_add_tail(&x[1], &msg);
 
 	down(&priv->lock);
 
@@ -320,9 +324,11 @@ static int dataflash_write(struct mtd_info *mtd, loff_t to, size_t len,
 	if ((to + len) > mtd->size)
 		return -EINVAL;
 
+	spi_message_init(&msg);
+
 	x[0].tx_buf = command = priv->command;
 	x[0].len = 4;
-	msg.transfers = x;
+	spi_message_add_tail(&x[0], &msg);
 
 	pageaddr = ((unsigned)to / priv->page_size);
 	offset = ((unsigned)to % priv->page_size);
@@ -364,7 +370,6 @@ static int dataflash_write(struct mtd_info *mtd, loff_t to, size_t len,
 			DEBUG(MTD_DEBUG_LEVEL3, "TRANSFER: (%x) %x %x %x\n",
 				command[0], command[1], command[2], command[3]);
 
-			msg.n_transfer = 1;
 			status = spi_sync(spi, &msg);
 			if (status < 0)
 				DEBUG(MTD_DEBUG_LEVEL1, "%s: xfer %u -> %d \n",
@@ -385,14 +390,16 @@ static int dataflash_write(struct mtd_info *mtd, loff_t to, size_t len,
 
 		x[1].tx_buf = writebuf;
 		x[1].len = writelen;
-		msg.n_transfer = 2;
+		spi_message_add_tail(x + 1, &msg);
 		status = spi_sync(spi, &msg);
+		spi_transfer_del(x + 1);
 		if (status < 0)
 			DEBUG(MTD_DEBUG_LEVEL1, "%s: pgm %u/%u -> %d \n",
 				spi->dev.bus_id, addr, writelen, status);
 
 		(void) dataflash_waitready(priv->spi);
 
+
 #ifdef	CONFIG_DATAFLASH_WRITE_VERIFY
 
 		/* (3) Compare to Buffer1 */
@@ -405,7 +412,6 @@ static int dataflash_write(struct mtd_info *mtd, loff_t to, size_t len,
 		DEBUG(MTD_DEBUG_LEVEL3, "COMPARE: (%x) %x %x %x\n",
 			command[0], command[1], command[2], command[3]);
 
-		msg.n_transfer = 1;
 		status = spi_sync(spi, &msg);
 		if (status < 0)
 			DEBUG(MTD_DEBUG_LEVEL1, "%s: compare %u -> %d \n",
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 3ecedccdb96c..cdb242de901d 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -557,6 +557,17 @@ int spi_write_then_read(struct spi_device *spi,
 	if ((n_tx + n_rx) > SPI_BUFSIZ)
 		return -EINVAL;
 
+	spi_message_init(&message);
+	memset(x, 0, sizeof x);
+	if (n_tx) {
+		x[0].len = n_tx;
+		spi_message_add_tail(&x[0], &message);
+	}
+	if (n_rx) {
+		x[1].len = n_rx;
+		spi_message_add_tail(&x[1], &message);
+	}
+
 	/* ... unless someone else is using the pre-allocated buffer */
 	if (down_trylock(&lock)) {
 		local_buf = kmalloc(SPI_BUFSIZ, GFP_KERNEL);
@@ -565,18 +576,11 @@ int spi_write_then_read(struct spi_device *spi,
 	} else
 		local_buf = buf;
 
-	memset(x, 0, sizeof x);
-
 	memcpy(local_buf, txbuf, n_tx);
 	x[0].tx_buf = local_buf;
-	x[0].len = n_tx;
-
 	x[1].rx_buf = local_buf + n_tx;
-	x[1].len = n_rx;
 
 	/* do the i/o */
-	message.transfers = x;
-	message.n_transfer = ARRAY_SIZE(x);
 	status = spi_sync(spi, &message);
 	if (status == 0) {
 		memcpy(rxbuf, x[1].rx_buf, n_rx);
diff --git a/drivers/spi/spi_bitbang.c b/drivers/spi/spi_bitbang.c
index 44aff198eb96..f037e5593269 100644
--- a/drivers/spi/spi_bitbang.c
+++ b/drivers/spi/spi_bitbang.c
@@ -146,6 +146,9 @@ int spi_bitbang_setup(struct spi_device *spi)
 	struct spi_bitbang_cs	*cs = spi->controller_state;
 	struct spi_bitbang	*bitbang;
 
+	if (!spi->max_speed_hz)
+		return -EINVAL;
+
 	if (!cs) {
 		cs = kzalloc(sizeof *cs, SLAB_KERNEL);
 		if (!cs)
@@ -172,13 +175,8 @@ int spi_bitbang_setup(struct spi_device *spi)
 	if (!cs->txrx_word)
 		return -EINVAL;
 
-	if (!spi->max_speed_hz)
-		spi->max_speed_hz = 500 * 1000;
-
-	/* nsecs = max(50, (clock period)/2), be optimistic */
+	/* nsecs = (clock period)/2 */
 	cs->nsecs = (1000000000/2) / (spi->max_speed_hz);
-	if (cs->nsecs < 50)
-		cs->nsecs = 50;
 	if (cs->nsecs > MAX_UDELAY_MS * 1000)
 		return -EINVAL;
 
@@ -194,7 +192,7 @@ int spi_bitbang_setup(struct spi_device *spi)
 	/* deselect chip (low or high) */
 	spin_lock(&bitbang->lock);
 	if (!bitbang->busy) {
-		bitbang->chipselect(spi, 0);
+		bitbang->chipselect(spi, BITBANG_CS_INACTIVE);
 		ndelay(cs->nsecs);
 	}
 	spin_unlock(&bitbang->lock);
@@ -244,9 +242,9 @@ static void bitbang_work(void *_bitbang)
 		struct spi_message	*m;
 		struct spi_device	*spi;
 		unsigned		nsecs;
-		struct spi_transfer	*t;
+		struct spi_transfer	*t = NULL;
 		unsigned		tmp;
-		unsigned		chipselect;
+		unsigned		cs_change;
 		int			status;
 
 		m = container_of(bitbang->queue.next, struct spi_message,
@@ -254,37 +252,49 @@ static void bitbang_work(void *_bitbang)
 		list_del_init(&m->queue);
 		spin_unlock_irqrestore(&bitbang->lock, flags);
 
-// FIXME this is made-up
-nsecs = 100;
+		/* FIXME this is made-up ... the correct value is known to
+		 * word-at-a-time bitbang code, and presumably chipselect()
+		 * should enforce these requirements too?
+		 */
+		nsecs = 100;
 
 		spi = m->spi;
-		t = m->transfers;
 		tmp = 0;
-		chipselect = 0;
+		cs_change = 1;
 		status = 0;
 
-		for (;;t++) {
+		list_for_each_entry (t, &m->transfers, transfer_list) {
 			if (bitbang->shutdown) {
 				status = -ESHUTDOWN;
 				break;
 			}
 
-			/* set up default clock polarity, and activate chip */
-			if (!chipselect) {
-				bitbang->chipselect(spi, 1);
+			/* set up default clock polarity, and activate chip;
+			 * this implicitly updates clock and spi modes as
+			 * previously recorded for this device via setup().
+			 * (and also deselects any other chip that might be
+			 * selected ...)
+			 */
+			if (cs_change) {
+				bitbang->chipselect(spi, BITBANG_CS_ACTIVE);
 				ndelay(nsecs);
 			}
+			cs_change = t->cs_change;
 			if (!t->tx_buf && !t->rx_buf && t->len) {
 				status = -EINVAL;
 				break;
 			}
 
-			/* transfer data */
+			/* transfer data.  the lower level code handles any
+			 * new dma mappings it needs. our caller always gave
+			 * us dma-safe buffers.
+			 */
 			if (t->len) {
-				/* FIXME if bitbang->use_dma, dma_map_single()
-				 * before the transfer, and dma_unmap_single()
-				 * afterwards, for either or both buffers...
+				/* REVISIT dma API still needs a designated
+				 * DMA_ADDR_INVALID; ~0 might be better.
 				 */
+				if (!m->is_dma_mapped)
+					t->rx_dma = t->tx_dma = 0;
 				status = bitbang->txrx_bufs(spi, t);
 			}
 			if (status != t->len) {
@@ -299,29 +309,31 @@ nsecs = 100;
 			if (t->delay_usecs)
 				udelay(t->delay_usecs);
 
-			tmp++;
-			if (tmp >= m->n_transfer)
-				break;
-
-			chipselect = !t->cs_change;
-			if (chipselect);
+			if (!cs_change)
 				continue;
+			if (t->transfer_list.next == &m->transfers)
+				break;
 
-			bitbang->chipselect(spi, 0);
-
-			/* REVISIT do we want the udelay here instead? */
-			msleep(1);
+			/* sometimes a short mid-message deselect of the chip
+			 * may be needed to terminate a mode or command
+			 */
+			ndelay(nsecs);
+			bitbang->chipselect(spi, BITBANG_CS_INACTIVE);
+			ndelay(nsecs);
 		}
 
-		tmp = m->n_transfer - 1;
-		tmp = m->transfers[tmp].cs_change;
-
 		m->status = status;
 		m->complete(m->context);
 
-		ndelay(2 * nsecs);
-		bitbang->chipselect(spi, status == 0 && tmp);
-		ndelay(nsecs);
+		/* normally deactivate chipselect ... unless no error and
+		 * cs_change has hinted that the next message will probably
+		 * be for this chip too.
+		 */
+		if (!(status == 0 && cs_change)) {
+			ndelay(nsecs);
+			bitbang->chipselect(spi, BITBANG_CS_INACTIVE);
+			ndelay(nsecs);
+		}
 
 		spin_lock_irqsave(&bitbang->lock, flags);
 	}
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 6a41e2650b2e..939afd3a2e72 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -263,15 +263,16 @@ extern struct spi_master *spi_busnum_to_master(u16 busnum);
 
 /**
  * struct spi_transfer - a read/write buffer pair
- * @tx_buf: data to be written (dma-safe address), or NULL
- * @rx_buf: data to be read (dma-safe address), or NULL
- * @tx_dma: DMA address of buffer, if spi_message.is_dma_mapped
- * @rx_dma: DMA address of buffer, if spi_message.is_dma_mapped
+ * @tx_buf: data to be written (dma-safe memory), or NULL
+ * @rx_buf: data to be read (dma-safe memory), or NULL
+ * @tx_dma: DMA address of tx_buf, if spi_message.is_dma_mapped
+ * @rx_dma: DMA address of rx_buf, if spi_message.is_dma_mapped
  * @len: size of rx and tx buffers (in bytes)
  * @cs_change: affects chipselect after this transfer completes
  * @delay_usecs: microseconds to delay after this transfer before
  * 	(optionally) changing the chipselect status, then starting
  * 	the next transfer or completing this spi_message.
+ * @transfer_list: transfers are sequenced through spi_message.transfers
  *
  * SPI transfers always write the same number of bytes as they read.
  * Protocol drivers should always provide rx_buf and/or tx_buf.
@@ -279,11 +280,16 @@ extern struct spi_master *spi_busnum_to_master(u16 busnum);
  * the data being transferred; that may reduce overhead, when the
  * underlying driver uses dma.
  *
- * All SPI transfers start with the relevant chipselect active.  Drivers
- * can change behavior of the chipselect after the transfer finishes
- * (including any mandatory delay).  The normal behavior is to leave it
- * selected, except for the last transfer in a message.  Setting cs_change
- * allows two additional behavior options:
+ * If the transmit buffer is null, undefined data will be shifted out
+ * while filling rx_buf.  If the receive buffer is null, the data
+ * shifted in will be discarded.  Only "len" bytes shift out (or in).
+ * It's an error to try to shift out a partial word.  (For example, by
+ * shifting out three bytes with word size of sixteen or twenty bits;
+ * the former uses two bytes per word, the latter uses four bytes.)
+ *
+ * All SPI transfers start with the relevant chipselect active.  Normally
+ * it stays selected until after the last transfer in a message.  Drivers
+ * can affect the chipselect signal using cs_change:
  *
  * (i) If the transfer isn't the last one in the message, this flag is
  * used to make the chipselect briefly go inactive in the middle of the
@@ -299,7 +305,8 @@ extern struct spi_master *spi_busnum_to_master(u16 busnum);
  * The code that submits an spi_message (and its spi_transfers)
  * to the lower layers is responsible for managing its memory.
  * Zero-initialize every field you don't set up explicitly, to
- * insulate against future API updates.
+ * insulate against future API updates.  After you submit a message
+ * and its transfers, ignore them until its completion callback.
  */
 struct spi_transfer {
 	/* it's ok if tx_buf == rx_buf (right?)
@@ -316,12 +323,13 @@ struct spi_transfer {
 
 	unsigned	cs_change:1;
 	u16		delay_usecs;
+
+	struct list_head transfer_list;
 };
 
 /**
  * struct spi_message - one multi-segment SPI transaction
- * @transfers: the segements of the transaction
- * @n_transfer: how many segments
+ * @transfers: list of transfer segments in this transaction
  * @spi: SPI device to which the transaction is queued
  * @is_dma_mapped: if true, the caller provided both dma and cpu virtual
  *	addresses for each transfer buffer
@@ -333,14 +341,22 @@ struct spi_transfer {
  * @queue: for use by whichever driver currently owns the message
  * @state: for use by whichever driver currently owns the message
  *
+ * An spi_message is used to execute an atomic sequence of data transfers,
+ * each represented by a struct spi_transfer.  The sequence is "atomic"
+ * in the sense that no other spi_message may use that SPI bus until that
+ * sequence completes.  On some systems, many such sequences can execute as
+ * as single programmed DMA transfer.  On all systems, these messages are
+ * queued, and might complete after transactions to other devices.  Messages
+ * sent to a given spi_device are alway executed in FIFO order.
+ *
  * The code that submits an spi_message (and its spi_transfers)
  * to the lower layers is responsible for managing its memory.
  * Zero-initialize every field you don't set up explicitly, to
- * insulate against future API updates.
+ * insulate against future API updates.  After you submit a message
+ * and its transfers, ignore them until its completion callback.
  */
 struct spi_message {
-	struct spi_transfer	*transfers;
-	unsigned		n_transfer;
+	struct list_head 	transfers;
 
 	struct spi_device	*spi;
 
@@ -371,6 +387,24 @@ struct spi_message {
 	void			*state;
 };
 
+static inline void spi_message_init(struct spi_message *m)
+{
+	memset(m, 0, sizeof *m);
+	INIT_LIST_HEAD(&m->transfers);
+}
+
+static inline void
+spi_message_add_tail(struct spi_transfer *t, struct spi_message *m)
+{
+	list_add_tail(&t->transfer_list, &m->transfers);
+}
+
+static inline void
+spi_transfer_del(struct spi_transfer *t)
+{
+	list_del(&t->transfer_list);
+}
+
 /* It's fine to embed message and transaction structures in other data
  * structures so long as you don't free them while they're in use.
  */
@@ -383,8 +417,12 @@ static inline struct spi_message *spi_message_alloc(unsigned ntrans, gfp_t flags
 			+ ntrans * sizeof(struct spi_transfer),
 			flags);
 	if (m) {
-		m->transfers = (void *)(m + 1);
-		m->n_transfer = ntrans;
+		int i;
+		struct spi_transfer *t = (struct spi_transfer *)(m + 1);
+
+		INIT_LIST_HEAD(&m->transfers);
+		for (i = 0; i < ntrans; i++, t++)
+			spi_message_add_tail(t, m);
 	}
 	return m;
 }
@@ -402,6 +440,8 @@ static inline void spi_message_free(struct spi_message *m)
  * device doesn't work with the mode 0 default.  They may likewise need
  * to update clock rates or word sizes from initial values.  This function
  * changes those settings, and must be called from a context that can sleep.
+ * The changes take effect the next time the device is selected and data
+ * is transferred to or from it.
  */
 static inline int
 spi_setup(struct spi_device *spi)
@@ -468,15 +508,12 @@ spi_write(struct spi_device *spi, const u8 *buf, size_t len)
 {
 	struct spi_transfer	t = {
 			.tx_buf		= buf,
-			.rx_buf		= NULL,
 			.len		= len,
-			.cs_change	= 0,
-		};
-	struct spi_message	m = {
-			.transfers	= &t,
-			.n_transfer	= 1,
 		};
+	struct spi_message	m;
 
+	spi_message_init(&m);
+	spi_message_add_tail(&t, &m);
 	return spi_sync(spi, &m);
 }
 
@@ -493,16 +530,13 @@ static inline int
 spi_read(struct spi_device *spi, u8 *buf, size_t len)
 {
 	struct spi_transfer	t = {
-			.tx_buf		= NULL,
 			.rx_buf		= buf,
 			.len		= len,
-			.cs_change	= 0,
-		};
-	struct spi_message	m = {
-			.transfers	= &t,
-			.n_transfer	= 1,
 		};
+	struct spi_message	m;
 
+	spi_message_init(&m);
+	spi_message_add_tail(&t, &m);
 	return spi_sync(spi, &m);
 }
 
diff --git a/include/linux/spi/spi_bitbang.h b/include/linux/spi/spi_bitbang.h
index 8dfe61a445f4..c961fe9bf3eb 100644
--- a/include/linux/spi/spi_bitbang.h
+++ b/include/linux/spi/spi_bitbang.h
@@ -31,8 +31,15 @@ struct spi_bitbang {
 	struct spi_master	*master;
 
 	void	(*chipselect)(struct spi_device *spi, int is_on);
+#define	BITBANG_CS_ACTIVE	1	/* normally nCS, active low */
+#define	BITBANG_CS_INACTIVE	0
 
+	/* txrx_bufs() may handle dma mapping for transfers that don't
+	 * already have one (transfer.{tx,rx}_dma is zero), or use PIO
+	 */
 	int	(*txrx_bufs)(struct spi_device *spi, struct spi_transfer *t);
+
+	/* txrx_word[SPI_MODE_*]() just looks like a shift register */
 	u32	(*txrx_word[4])(struct spi_device *spi,
 			unsigned nsecs,
 			u32 word, u8 bits);
-- 
cgit v1.2.3-71-gd317


From 5d870c8e216f121307445c71caa72e7e10a20061 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 11 Jan 2006 11:23:49 -0800
Subject: [PATCH] spi: remove fastcall crap

gcc4 generates warnings when a non-FASTCALL function pointer is assigned to a
FASTCALL one.  Perhaps it has taste.

Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/spi/spi.c       | 7 ++++++-
 include/linux/spi/spi.h | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index cdb242de901d..791c4dc550ae 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -480,6 +480,11 @@ EXPORT_SYMBOL_GPL(spi_busnum_to_master);
 
 /*-------------------------------------------------------------------------*/
 
+static void spi_complete(void *arg)
+{
+	complete(arg);
+}
+
 /**
  * spi_sync - blocking/synchronous SPI data transfers
  * @spi: device with which data will be exchanged
@@ -508,7 +513,7 @@ int spi_sync(struct spi_device *spi, struct spi_message *message)
 	DECLARE_COMPLETION(done);
 	int status;
 
-	message->complete = (void (*)(void *)) complete;
+	message->complete = spi_complete;
 	message->context = &done;
 	status = spi_async(spi, message);
 	if (status == 0)
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 939afd3a2e72..b05f1463a267 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -374,7 +374,7 @@ struct spi_message {
 	 */
 
 	/* completion is reported through a callback */
-	void 			FASTCALL((*complete)(void *context));
+	void 			(*complete)(void *context);
 	void			*context;
 	unsigned		actual_length;
 	int			status;
-- 
cgit v1.2.3-71-gd317


From 6d5b0c315e0c14f8a0fe274eda7676d62cbd8584 Mon Sep 17 00:00:00 2001
From: "Moore, Eric" <Eric.Moore@lsil.com>
Date: Fri, 13 Jan 2006 16:25:26 -0700
Subject: [SCSI] fusion - adding support for FC949ES

Add software recognition for the new LSI Logic Fibre Channel controller.

Signed-off-by: Eric Moore <Eric.Moore@lsil.com>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 drivers/message/fusion/mptbase.c | 4 ++++
 drivers/message/fusion/mptfc.c   | 2 ++
 include/linux/pci_ids.h          | 1 +
 3 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c
index 8c26f093fb65..33213370027e 100644
--- a/drivers/message/fusion/mptbase.c
+++ b/drivers/message/fusion/mptbase.c
@@ -1378,6 +1378,10 @@ mpt_attach(struct pci_dev *pdev, const struct pci_device_id *id)
 		ioc->bus_type = FC;
 		ioc->errata_flag_1064 = 1;
 	}
+	else if (pdev->device == MPI_MANUFACTPAGE_DEVICEID_FC949E) {
+		ioc->prod_name = "LSIFC949E";
+		ioc->bus_type = FC;
+	}
 	else if (pdev->device == MPI_MANUFACTPAGE_DEVID_53C1030) {
 		ioc->prod_name = "LSI53C1030";
 		ioc->bus_type = SPI;
diff --git a/drivers/message/fusion/mptfc.c b/drivers/message/fusion/mptfc.c
index a380fe105a22..b102c7666d0e 100644
--- a/drivers/message/fusion/mptfc.c
+++ b/drivers/message/fusion/mptfc.c
@@ -145,6 +145,8 @@ static struct pci_device_id mptfc_pci_table[] = {
 		PCI_ANY_ID, PCI_ANY_ID },
 	{ PCI_VENDOR_ID_LSI_LOGIC, PCI_DEVICE_ID_LSI_FC949X,
 		PCI_ANY_ID, PCI_ANY_ID },
+	{ PCI_VENDOR_ID_LSI_LOGIC, PCI_DEVICE_ID_LSI_FC949ES,
+		PCI_ANY_ID, PCI_ANY_ID },
 	{0}	/* Terminating entry */
 };
 MODULE_DEVICE_TABLE(pci, mptfc_pci_table);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 7fb397e3f2d3..5403257ae3e7 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -181,6 +181,7 @@
 #define PCI_DEVICE_ID_LSI_FC929X	0x0626
 #define PCI_DEVICE_ID_LSI_FC939X	0x0642
 #define PCI_DEVICE_ID_LSI_FC949X	0x0640
+#define PCI_DEVICE_ID_LSI_FC949ES	0x0646
 #define PCI_DEVICE_ID_LSI_FC919X	0x0628
 #define PCI_DEVICE_ID_NCR_YELLOWFIN	0x0701
 #define PCI_DEVICE_ID_LSI_61C102	0x0901
-- 
cgit v1.2.3-71-gd317


From 3235798804ee75f09d45aee5003197930de57689 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 15 Jan 2006 02:12:54 +0100
Subject: Fix "stuct", "strut", "struc" typos

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 Documentation/DocBook/videobook.tmpl                         | 4 ++--
 Documentation/input/ff.txt                                   | 2 +-
 Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl | 2 +-
 drivers/net/wan/lmc/lmc_main.c                               | 2 +-
 drivers/scsi/FlashPoint.c                                    | 2 +-
 include/asm-v850/ptrace.h                                    | 2 +-
 include/linux/pfkeyv2.h                                      | 2 +-
 7 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/DocBook/videobook.tmpl b/Documentation/DocBook/videobook.tmpl
index 3ec6c875588a..fdff984a5161 100644
--- a/Documentation/DocBook/videobook.tmpl
+++ b/Documentation/DocBook/videobook.tmpl
@@ -229,7 +229,7 @@ int __init myradio_init(struct video_init *v)
 
 static int users = 0;
 
-static int radio_open(stuct video_device *dev, int flags)
+static int radio_open(struct video_device *dev, int flags)
 {
         if(users)
                 return -EBUSY;
@@ -949,7 +949,7 @@ int __init mycamera_init(struct video_init *v)
 
 static int users = 0;
 
-static int camera_open(stuct video_device *dev, int flags)
+static int camera_open(struct video_device *dev, int flags)
 {
         if(users)
                 return -EBUSY;
diff --git a/Documentation/input/ff.txt b/Documentation/input/ff.txt
index efa7dd6751f3..c7e10eaff203 100644
--- a/Documentation/input/ff.txt
+++ b/Documentation/input/ff.txt
@@ -120,7 +120,7 @@ to the unique id assigned by the driver. This data is required for performing
 some operations (removing an effect, controlling the playback).
 This if field must be set to -1 by the user in order to tell the driver to
 allocate a new effect.
-See <linux/input.h> for a description of the ff_effect stuct. You should also
+See <linux/input.h> for a description of the ff_effect struct. You should also
 find help in a few sketches, contained in files shape.fig and interactive.fig.
 You need xfig to visualize these files.
 
diff --git a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
index 4963d83d1511..e651ed8d1e6f 100644
--- a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
+++ b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
@@ -5577,7 +5577,7 @@ struct _snd_pcm_runtime {
       <informalexample>
         <programlisting>
 <![CDATA[
-  static int mychip_suspend(strut pci_dev *pci, pm_message_t state)
+  static int mychip_suspend(struct pci_dev *pci, pm_message_t state)
   {
           /* (1) */
           struct snd_card *card = pci_get_drvdata(pci);
diff --git a/drivers/net/wan/lmc/lmc_main.c b/drivers/net/wan/lmc/lmc_main.c
index 2b948ea397d5..40926d779161 100644
--- a/drivers/net/wan/lmc/lmc_main.c
+++ b/drivers/net/wan/lmc/lmc_main.c
@@ -641,7 +641,7 @@ static void lmc_watchdog (unsigned long data) /*fold00*/
     spin_lock_irqsave(&sc->lmc_lock, flags);
 
     if(sc->check != 0xBEAFCAFE){
-        printk("LMC: Corrupt net_device stuct, breaking out\n");
+        printk("LMC: Corrupt net_device struct, breaking out\n");
 	spin_unlock_irqrestore(&sc->lmc_lock, flags);
         return;
     }
diff --git a/drivers/scsi/FlashPoint.c b/drivers/scsi/FlashPoint.c
index 5beed4f6d985..8d64f0bed628 100644
--- a/drivers/scsi/FlashPoint.c
+++ b/drivers/scsi/FlashPoint.c
@@ -149,7 +149,7 @@ typedef SCCBMGR_INFO *      PSCCBMGR_INFO;
 #define PCI_BUS_CARD          0x03
 #define VESA_BUS_CARD         0x04
 
-/* SCCB struc used for both SCCB and UCB manager compiles! 
+/* SCCB struct used for both SCCB and UCB manager compiles! 
  * The UCB Manager treats the SCCB as it's 'native hardware structure' 
  */
 
diff --git a/include/asm-v850/ptrace.h b/include/asm-v850/ptrace.h
index 7bf72bb5078c..4f35cf2cd641 100644
--- a/include/asm-v850/ptrace.h
+++ b/include/asm-v850/ptrace.h
@@ -92,7 +92,7 @@ struct pt_regs
 /* The number of bytes used to store each register.  */
 #define _PT_REG_SIZE	4
 
-/* Offset of a general purpose register in a stuct pt_regs.  */
+/* Offset of a general purpose register in a struct pt_regs.  */
 #define PT_GPR(num)	((num) * _PT_REG_SIZE)
 
 /* Offsets of various special registers & fields in a struct pt_regs.  */
diff --git a/include/linux/pfkeyv2.h b/include/linux/pfkeyv2.h
index 6351c4055ace..bac0fb389cf1 100644
--- a/include/linux/pfkeyv2.h
+++ b/include/linux/pfkeyv2.h
@@ -104,7 +104,7 @@ struct sadb_prop {
 /* followed by:
 	struct sadb_comb sadb_combs[(sadb_prop_len +
 		sizeof(uint64_t) - sizeof(struct sadb_prop)) /
-		sizeof(strut sadb_comb)]; */
+		sizeof(struct sadb_comb)]; */
 
 struct sadb_comb {
 	uint8_t		sadb_comb_auth;
-- 
cgit v1.2.3-71-gd317


From 7170be5f586b59bdcdab082778a5d9203ba7b667 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@redhat.com>
Date: Sat, 14 Jan 2006 13:20:38 -0800
Subject: [PATCH] convert /proc/devices to use seq_file interface

A Christoph suggested that the /proc/devices file be converted to use the
seq_file interface.  This patch does that.

I've obxerved one or two installation that had sufficiently large sans that
they overran the 4k limit on /proc/devices.

Signed-off-by: Neil Horman <nhorman@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 block/genhd.c       | 106 +++++++++++++++++++++++++++-------
 fs/char_dev.c       |  96 +++++++++++++++++++++++--------
 fs/proc/proc_misc.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++++----
 include/linux/fs.h  |  11 ++++
 4 files changed, 320 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index f1ed83f3f083..db57546a709d 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -38,34 +38,100 @@ static inline int major_to_index(int major)
 	return major % MAX_PROBE_HASH;
 }
 
-#ifdef CONFIG_PROC_FS
-/* get block device names in somewhat random order */
-int get_blkdev_list(char *p, int used)
+struct blkdev_info {
+        int index;
+        struct blk_major_name *bd;
+};
+
+/*
+ * iterate over a list of blkdev_info structures.  allows
+ * the major_names array to be iterated over from outside this file
+ * must be called with the block_subsys_sem held
+ */
+void *get_next_blkdev(void *dev)
+{
+        struct blkdev_info *info;
+
+        if (dev == NULL) {
+                info = kmalloc(sizeof(*info), GFP_KERNEL);
+                if (!info)
+                        goto out;
+                info->index=0;
+                info->bd = major_names[info->index];
+                if (info->bd)
+                        goto out;
+        } else {
+                info = dev;
+        }
+
+        while (info->index < ARRAY_SIZE(major_names)) {
+                if (info->bd)
+                        info->bd = info->bd->next;
+                if (info->bd)
+                        goto out;
+                /*
+                 * No devices on this chain, move to the next
+                 */
+                info->index++;
+                info->bd = (info->index < ARRAY_SIZE(major_names)) ?
+			major_names[info->index] : NULL;
+                if (info->bd)
+                        goto out;
+        }
+
+out:
+        return info;
+}
+
+void *acquire_blkdev_list(void)
+{
+        down(&block_subsys_sem);
+        return get_next_blkdev(NULL);
+}
+
+void release_blkdev_list(void *dev)
+{
+        up(&block_subsys_sem);
+        kfree(dev);
+}
+
+
+/*
+ * Count the number of records in the blkdev_list.
+ * must be called with the block_subsys_sem held
+ */
+int count_blkdev_list(void)
 {
 	struct blk_major_name *n;
-	int i, len;
+	int i, count;
 
-	len = snprintf(p, (PAGE_SIZE-used), "\nBlock devices:\n");
+	count = 0;
 
-	down(&block_subsys_sem);
 	for (i = 0; i < ARRAY_SIZE(major_names); i++) {
-		for (n = major_names[i]; n; n = n->next) {
-			/*
-			 * If the curent string plus the 5 extra characters
-			 * in the line would run us off the page, then we're done
-			 */
-			if ((len + used + strlen(n->name) + 5) >= PAGE_SIZE)
-				goto page_full;
-			len += sprintf(p+len, "%3d %s\n",
-				       n->major, n->name);
-		}
+		for (n = major_names[i]; n; n = n->next)
+				count++;
 	}
-page_full:
-	up(&block_subsys_sem);
 
-	return len;
+	return count;
 }
-#endif
+
+/*
+ * extract the major and name values from a blkdev_info struct
+ * passed in as a void to *dev.  Must be called with
+ * block_subsys_sem held
+ */
+int get_blkdev_info(void *dev, int *major, char **name)
+{
+        struct blkdev_info *info = dev;
+
+        if (info->bd == NULL)
+                return 1;
+
+        *major = info->bd->major;
+        *name = info->bd->name;
+        return 0;
+}
+
 
 int register_blkdev(unsigned int major, const char *name)
 {
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 3b1b1eefdbb0..21195c481637 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -35,7 +35,7 @@ static struct char_device_struct {
 	unsigned int major;
 	unsigned int baseminor;
 	int minorct;
-	const char *name;
+	char name[64];
 	struct file_operations *fops;
 	struct cdev *cdev;		/* will die */
 } *chrdevs[MAX_PROBE_HASH];
@@ -46,34 +46,84 @@ static inline int major_to_index(int major)
 	return major % MAX_PROBE_HASH;
 }
 
-/* get char device names in somewhat random order */
-int get_chrdev_list(char *page)
-{
+struct chrdev_info {
+	int index;
 	struct char_device_struct *cd;
-	int i, len;
+};
 
-	len = sprintf(page, "Character devices:\n");
+void *get_next_chrdev(void *dev)
+{
+	struct chrdev_info *info;
 
+	if (dev == NULL) {
+		info = kmalloc(sizeof(*info), GFP_KERNEL);
+		if (!info)
+			goto out;
+		info->index=0;
+		info->cd = chrdevs[info->index];
+		if (info->cd)
+			goto out;
+	} else {
+		info = dev;
+	}
+
+	while (info->index < ARRAY_SIZE(chrdevs)) {
+		if (info->cd)
+			info->cd = info->cd->next;
+		if (info->cd)
+			goto out;
+		/*
+		 * No devices on this chain, move to the next
+		 */
+		info->index++;
+		info->cd = (info->index < ARRAY_SIZE(chrdevs)) ?
+			chrdevs[info->index] : NULL;
+		if (info->cd)
+			goto out;
+	}
+
+out:
+	return info;
+}
+
+void *acquire_chrdev_list(void)
+{
 	down(&chrdevs_lock);
+	return get_next_chrdev(NULL);
+}
+
+void release_chrdev_list(void *dev)
+{
+	up(&chrdevs_lock);
+	kfree(dev);
+}
+
+
+int count_chrdev_list(void)
+{
+	struct char_device_struct *cd;
+	int i, count;
+
+	count = 0;
+
 	for (i = 0; i < ARRAY_SIZE(chrdevs) ; i++) {
-		for (cd = chrdevs[i]; cd; cd = cd->next) {
-			/*
-			 * if the current name, plus the 5 extra characters
-			 * in the device line for this entry
-			 * would run us off the page, we're done
-			 */
-			if ((len+strlen(cd->name) + 5) >= PAGE_SIZE)
-				goto page_full;
-
-
-			len += sprintf(page+len, "%3d %s\n",
-				       cd->major, cd->name);
-		}
+		for (cd = chrdevs[i]; cd; cd = cd->next)
+			count++;
 	}
-page_full:
-	up(&chrdevs_lock);
 
-	return len;
+	return count;
+}
+
+int get_chrdev_info(void *dev, int *major, char **name)
+{
+	struct chrdev_info *info = dev;
+
+	if (info->cd == NULL)
+		return 1;
+
+	*major = info->cd->major;
+	*name = info->cd->name;
+	return 0;
 }
 
 /*
@@ -121,7 +171,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
 	cd->major = major;
 	cd->baseminor = baseminor;
 	cd->minorct = minorct;
-	cd->name = name;
+	strncpy(cd->name,name, 64);
 
 	i = major_to_index(major);
 
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 63bf6c00fa0c..8f8014285a34 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -20,6 +20,7 @@
 #include <linux/time.h>
 #include <linux/kernel.h>
 #include <linux/kernel_stat.h>
+#include <linux/fs.h>
 #include <linux/tty.h>
 #include <linux/string.h>
 #include <linux/mman.h>
@@ -62,7 +63,6 @@
  */
 extern int get_hardware_list(char *);
 extern int get_stram_list(char *);
-extern int get_chrdev_list(char *);
 extern int get_filesystem_list(char *);
 extern int get_exec_domain_list(char *);
 extern int get_dma_list(char *);
@@ -248,6 +248,154 @@ static int cpuinfo_open(struct inode *inode, struct file *file)
 {
 	return seq_open(file, &cpuinfo_op);
 }
+
+enum devinfo_states {
+	CHR_HDR,
+	CHR_LIST,
+	BLK_HDR,
+	BLK_LIST,
+	DEVINFO_DONE
+};
+
+struct devinfo_state {
+	void *chrdev;
+	void *blkdev;
+	unsigned int num_records;
+	unsigned int cur_record;
+	enum devinfo_states state;
+};
+
+static void *devinfo_start(struct seq_file *f, loff_t *pos)
+{
+	struct devinfo_state *info = f->private;
+
+	if (*pos) {
+		if ((info) && (*pos <= info->num_records))
+			return info;
+		return NULL;
+	}
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	f->private = info;
+	info->chrdev = acquire_chrdev_list();
+	info->blkdev = acquire_blkdev_list();
+	info->state = CHR_HDR;
+	info->num_records = count_chrdev_list();
+	info->num_records += count_blkdev_list();
+	info->num_records += 2; /* Character and Block headers */
+	*pos = 1;
+	info->cur_record = *pos;
+	return info;
+}
+
+static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	int idummy;
+	char *ndummy;
+	struct devinfo_state *info = f->private;
+
+	switch (info->state) {
+		case CHR_HDR:
+			info->state = CHR_LIST;
+			(*pos)++;
+			/*fallthrough*/
+		case CHR_LIST:
+			if (get_chrdev_info(info->chrdev,&idummy,&ndummy)) {
+				/*
+				 * The character dev list is complete
+				 */
+				info->state = BLK_HDR;
+			} else {
+				info->chrdev = get_next_chrdev(info->chrdev);
+			}
+			(*pos)++;
+			break;
+		case BLK_HDR:
+			info->state = BLK_LIST;
+			(*pos)++;
+			break;
+		case BLK_LIST:
+			if (get_blkdev_info(info->blkdev,&idummy,&ndummy)) {
+				/*
+				 * The block dev list is complete
+				 */
+				info->state = DEVINFO_DONE;
+			} else {
+				info->blkdev = get_next_blkdev(info->blkdev);
+			}
+			(*pos)++;
+			break;
+		case DEVINFO_DONE:
+			(*pos)++;
+			info->cur_record = *pos;
+			info = NULL;
+			break;
+		default:
+			break;
+	}
+	if (info)
+		info->cur_record = *pos;
+	return info;
+}
+
+static void devinfo_stop(struct seq_file *f, void *v)
+{
+	struct devinfo_state *info = f->private;
+
+	if (info) {
+		release_chrdev_list(info->chrdev);
+		release_blkdev_list(info->blkdev);
+		f->private = NULL;
+		kfree(info);
+	}
+}
+
+static int devinfo_show(struct seq_file *f, void *arg)
+{
+	int major;
+	char *name;
+	struct devinfo_state *info = f->private;
+
+	switch(info->state) {
+		case CHR_HDR:
+			seq_printf(f,"Character devices:\n");
+			/* fallthrough */
+		case CHR_LIST:
+			if (!get_chrdev_info(info->chrdev,&major,&name))
+				seq_printf(f,"%3d %s\n",major,name);
+			break;
+		case BLK_HDR:
+			seq_printf(f,"\nBlock devices:\n");
+			/* fallthrough */
+		case BLK_LIST:
+			if (!get_blkdev_info(info->blkdev,&major,&name))
+				seq_printf(f,"%3d %s\n",major,name);
+			break;
+		default:
+			break;
+	}
+
+	return 0;
+}
+
+static  struct seq_operations devinfo_op = {
+	.start  = devinfo_start,
+	.next   = devinfo_next,
+	.stop   = devinfo_stop,
+	.show   = devinfo_show,
+};
+
+static int devinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &devinfo_op);
+}
+
+static struct file_operations proc_devinfo_operations = {
+	.open		= devinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
 static struct file_operations proc_cpuinfo_operations = {
 	.open		= cpuinfo_open,
 	.read		= seq_read,
@@ -450,14 +598,6 @@ static struct file_operations proc_stat_operations = {
 	.release	= single_release,
 };
 
-static int devices_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int len = get_chrdev_list(page);
-	len += get_blkdev_list(page+len, len);
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-
 /*
  * /proc/interrupts
  */
@@ -582,7 +722,6 @@ void __init proc_misc_init(void)
 #ifdef CONFIG_STRAM_PROC
 		{"stram",	stram_read_proc},
 #endif
-		{"devices",	devices_read_proc},
 		{"filesystems",	filesystems_read_proc},
 		{"cmdline",	cmdline_read_proc},
 		{"locks",	locks_read_proc},
@@ -598,6 +737,7 @@ void __init proc_misc_init(void)
 	entry = create_proc_entry("kmsg", S_IRUSR, &proc_root);
 	if (entry)
 		entry->proc_fops = &proc_kmsg_operations;
+	create_seq_entry("devices", 0, &proc_devinfo_operations);
 	create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations);
 	create_seq_entry("partitions", 0, &proc_partitions_operations);
 	create_seq_entry("stat", 0, &proc_stat_operations);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d1e370d25f7b..552cedfa6064 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1383,6 +1383,12 @@ extern int register_chrdev(unsigned int, const char *,
 extern int unregister_chrdev(unsigned int, const char *);
 extern void unregister_chrdev_region(dev_t, unsigned);
 extern int chrdev_open(struct inode *, struct file *);
+extern int get_chrdev_list(char *);
+extern void *acquire_chrdev_list(void);
+extern int count_chrdev_list(void);
+extern void *get_next_chrdev(void *);
+extern int get_chrdev_info(void *, int *, char **);
+extern void release_chrdev_list(void *);
 
 /* fs/block_dev.c */
 #define BDEVNAME_SIZE	32	/* Largest string for a blockdev identifier */
@@ -1391,6 +1397,11 @@ extern const char *bdevname(struct block_device *bdev, char *buffer);
 extern struct block_device *lookup_bdev(const char *);
 extern struct block_device *open_bdev_excl(const char *, int, void *);
 extern void close_bdev_excl(struct block_device *);
+extern void *acquire_blkdev_list(void);
+extern int count_blkdev_list(void);
+extern void *get_next_blkdev(void *);
+extern int get_blkdev_info(void *, int *, char **);
+extern void release_blkdev_list(void *);
 
 extern void init_special_inode(struct inode *, umode_t, dev_t);
 
-- 
cgit v1.2.3-71-gd317


From 2d0cfb527944c2cfee2cffab14f52d483e329fcf Mon Sep 17 00:00:00 2001
From: Patrick Gefre <pfg@sgi.com>
Date: Sat, 14 Jan 2006 13:20:40 -0800
Subject: [PATCH] Altix: ioc3 serial support

Add driver support for a 2 port PCI IOC3-based serial card on Altix boxes:

This is a re-submission.  On the original submission I was asked to
organize the code so that the MIPS ioc3 ethernet and serial parts could be
used with this driver.  Stanislaw Skowronek was kind enough to provide the
shim layer for this - thanks Stanislaw.  This patch includes the shim layer
and the Altix PCI ioc3 serial driver.  The MIPS merged ioc3 ethernet and
serial support is forthcoming.

Signed-off-by: Patrick Gefre <pfg@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/configs/gensparse_defconfig |    2 +
 arch/ia64/configs/sn2_defconfig       |    2 +
 drivers/serial/Kconfig                |    9 +-
 drivers/serial/Makefile               |    1 +
 drivers/serial/ioc3_serial.c          | 2197 +++++++++++++++++++++++++++++++++
 drivers/sn/Kconfig                    |   14 +
 drivers/sn/Makefile                   |    1 +
 drivers/sn/ioc3.c                     |  851 +++++++++++++
 include/asm-ia64/sn/ioc3.h            |  241 ++++
 include/linux/ioc3.h                  |   93 ++
 10 files changed, 3410 insertions(+), 1 deletion(-)
 create mode 100644 drivers/serial/ioc3_serial.c
 create mode 100644 drivers/sn/ioc3.c
 create mode 100644 include/asm-ia64/sn/ioc3.h
 create mode 100644 include/linux/ioc3.h

(limited to 'include/linux')

diff --git a/arch/ia64/configs/gensparse_defconfig b/arch/ia64/configs/gensparse_defconfig
index 80f8663bc6d9..1d07d8072ec2 100644
--- a/arch/ia64/configs/gensparse_defconfig
+++ b/arch/ia64/configs/gensparse_defconfig
@@ -701,6 +701,7 @@ CONFIG_SERIAL_CORE_CONSOLE=y
 CONFIG_SERIAL_SGI_L1_CONSOLE=y
 # CONFIG_SERIAL_JSM is not set
 CONFIG_SERIAL_SGI_IOC4=y
+CONFIG_SERIAL_SGI_IOC3=y
 CONFIG_UNIX98_PTYS=y
 CONFIG_LEGACY_PTYS=y
 CONFIG_LEGACY_PTY_COUNT=256
@@ -1046,6 +1047,7 @@ CONFIG_INFINIBAND_IPOIB=m
 # SN Devices
 #
 CONFIG_SGI_IOC4=y
+CONFIG_SGI_IOC3=y
 
 #
 # File systems
diff --git a/arch/ia64/configs/sn2_defconfig b/arch/ia64/configs/sn2_defconfig
index ff8bb3770c9d..3cb503b659e6 100644
--- a/arch/ia64/configs/sn2_defconfig
+++ b/arch/ia64/configs/sn2_defconfig
@@ -659,6 +659,7 @@ CONFIG_SERIAL_CORE_CONSOLE=y
 CONFIG_SERIAL_SGI_L1_CONSOLE=y
 # CONFIG_SERIAL_JSM is not set
 CONFIG_SERIAL_SGI_IOC4=y
+CONFIG_SERIAL_SGI_IOC3=y
 CONFIG_UNIX98_PTYS=y
 CONFIG_LEGACY_PTYS=y
 CONFIG_LEGACY_PTY_COUNT=256
@@ -899,6 +900,7 @@ CONFIG_INFINIBAND_SRP=m
 # SN Devices
 #
 CONFIG_SGI_IOC4=y
+CONFIG_SGI_IOC3=y
 
 #
 # File systems
diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index 843717275d49..e0685a520514 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -190,7 +190,6 @@ config SERIAL_8250_BOCA
 	  To compile this driver as a module, choose M here: the module
 	  will be called 8250_boca.
 
-
 config SERIAL_8250_HUB6
 	tristate "Support Hub6 cards"
 	depends on SERIAL_8250 != n && ISA && SERIAL_8250_MANY_PORTS
@@ -917,4 +916,12 @@ config SERIAL_SGI_IOC4
 		and wish to use the serial ports on this card, say Y.
 		Otherwise, say N.
 
+config SERIAL_SGI_IOC3
+	tristate "SGI Altix IOC3 serial support"
+	depends on (IA64_GENERIC || IA64_SGI_SN2) && SGI_IOC3
+	select SERIAL_CORE
+	help
+	  If you have an SGI Altix with an IOC3 serial card,
+	  say Y or M.  Otherwise, say N.
+
 endmenu
diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile
index 24a583e482bb..eaf8e01db198 100644
--- a/drivers/serial/Makefile
+++ b/drivers/serial/Makefile
@@ -56,4 +56,5 @@ obj-$(CONFIG_SERIAL_JSM) += jsm/
 obj-$(CONFIG_SERIAL_TXX9) += serial_txx9.o
 obj-$(CONFIG_SERIAL_VR41XX) += vr41xx_siu.o
 obj-$(CONFIG_SERIAL_SGI_IOC4) += ioc4_serial.o
+obj-$(CONFIG_SERIAL_SGI_IOC3) += ioc3_serial.o
 obj-$(CONFIG_SERIAL_AT91) += at91_serial.o
diff --git a/drivers/serial/ioc3_serial.c b/drivers/serial/ioc3_serial.c
new file mode 100644
index 000000000000..8097cd91f16b
--- /dev/null
+++ b/drivers/serial/ioc3_serial.c
@@ -0,0 +1,2197 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2005 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+/*
+ * This file contains a module version of the ioc3 serial driver. This
+ * includes all the support functions needed (support functions, etc.)
+ * and the serial driver itself.
+ */
+#include <linux/errno.h>
+#include <linux/tty.h>
+#include <linux/serial.h>
+#include <linux/circ_buf.h>
+#include <linux/serial_reg.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/serial_core.h>
+#include <linux/ioc3.h>
+
+/*
+ * Interesting things about the ioc3
+ */
+
+#define LOGICAL_PORTS		2	/* rs232(0) and rs422(1) */
+#define PORTS_PER_CARD		2
+#define LOGICAL_PORTS_PER_CARD (PORTS_PER_CARD * LOGICAL_PORTS)
+#define MAX_CARDS		8
+#define MAX_LOGICAL_PORTS	(LOGICAL_PORTS_PER_CARD * MAX_CARDS)
+
+/* determine given the sio_ir what port it applies to */
+#define GET_PORT_FROM_SIO_IR(_x)	(_x & SIO_IR_SA) ? 0 : 1
+
+
+/*
+ * we have 2 logical ports (rs232, rs422) for each physical port
+ * evens are rs232, odds are rs422
+ */
+#define GET_PHYSICAL_PORT(_x)	((_x) >> 1)
+#define GET_LOGICAL_PORT(_x)	((_x) & 1)
+#define IS_PHYSICAL_PORT(_x)	!((_x) & 1)
+#define IS_RS232(_x)		!((_x) & 1)
+
+static unsigned int Num_of_ioc3_cards;
+static unsigned int Submodule_slot;
+
+/* defining this will get you LOTS of great debug info */
+//#define DEBUG_INTERRUPTS
+#define DPRINT_CONFIG(_x...)	;
+//#define DPRINT_CONFIG(_x...)  printk _x
+#define NOT_PROGRESS()	;
+//#define NOT_PROGRESS()	printk("%s : fails %d\n", __FUNCTION__, __LINE__)
+
+/* number of characters we want to transmit to the lower level at a time */
+#define MAX_CHARS		256
+#define FIFO_SIZE		(MAX_CHARS-1)	/* it's a uchar */
+
+/* Device name we're using */
+#define DEVICE_NAME		"ttySIOC"
+#define DEVICE_MAJOR		204
+#define DEVICE_MINOR		116
+
+/* flags for next_char_state */
+#define NCS_BREAK		0x1
+#define NCS_PARITY		0x2
+#define NCS_FRAMING		0x4
+#define NCS_OVERRUN		0x8
+
+/* cause we need SOME parameters ... */
+#define MIN_BAUD_SUPPORTED	1200
+#define MAX_BAUD_SUPPORTED	115200
+
+/* protocol types supported */
+#define PROTO_RS232		0
+#define PROTO_RS422		1
+
+/* Notification types */
+#define N_DATA_READY		0x01
+#define N_OUTPUT_LOWAT		0x02
+#define N_BREAK			0x04
+#define N_PARITY_ERROR		0x08
+#define N_FRAMING_ERROR		0x10
+#define N_OVERRUN_ERROR		0x20
+#define N_DDCD			0x40
+#define N_DCTS			0x80
+
+#define N_ALL_INPUT		(N_DATA_READY | N_BREAK			   \
+					| N_PARITY_ERROR | N_FRAMING_ERROR \
+					| N_OVERRUN_ERROR | N_DDCD | N_DCTS)
+
+#define N_ALL_OUTPUT		N_OUTPUT_LOWAT
+
+#define N_ALL_ERRORS		(N_PARITY_ERROR | N_FRAMING_ERROR \
+						| N_OVERRUN_ERROR)
+
+#define N_ALL			(N_DATA_READY | N_OUTPUT_LOWAT | N_BREAK    \
+					| N_PARITY_ERROR | N_FRAMING_ERROR  \
+					| N_OVERRUN_ERROR | N_DDCD | N_DCTS)
+
+#define SER_CLK_SPEED(prediv)	((22000000 << 1) / prediv)
+#define SER_DIVISOR(x, clk)	(((clk) + (x) * 8) / ((x) * 16))
+#define DIVISOR_TO_BAUD(div, clk) ((clk) / 16 / (div))
+
+/* Some masks */
+#define LCR_MASK_BITS_CHAR	(UART_LCR_WLEN5 | UART_LCR_WLEN6 \
+					| UART_LCR_WLEN7 | UART_LCR_WLEN8)
+#define LCR_MASK_STOP_BITS	(UART_LCR_STOP)
+
+#define PENDING(_a, _p)		(readl(&(_p)->vma->sio_ir) & (_a)->ic_enable)
+
+#define RING_BUF_SIZE		4096
+#define BUF_SIZE_BIT		SBBR_L_SIZE
+#define PROD_CONS_MASK		PROD_CONS_PTR_4K
+
+#define TOTAL_RING_BUF_SIZE	(RING_BUF_SIZE * 4)
+
+/* driver specific - one per card */
+struct ioc3_card {
+	struct {
+		/* uart ports are allocated here */
+		struct uart_port icp_uart_port[LOGICAL_PORTS];
+		/* the ioc3_port used for this port */
+		struct ioc3_port *icp_port;
+	} ic_port[PORTS_PER_CARD];
+	/* currently enabled interrupts */
+	uint32_t ic_enable;
+};
+
+/* Local port info for each IOC3 serial port */
+struct ioc3_port {
+	/* handy reference material */
+	struct uart_port *ip_port;
+	struct ioc3_card *ip_card;
+	struct ioc3_driver_data *ip_idd;
+	struct ioc3_submodule *ip_is;
+
+	/* pci mem addresses for this port */
+	struct ioc3_serialregs __iomem *ip_serial_regs;
+	struct ioc3_uartregs __iomem *ip_uart_regs;
+
+	/* Ring buffer page for this port */
+	dma_addr_t ip_dma_ringbuf;
+	/* vaddr of ring buffer */
+	struct ring_buffer *ip_cpu_ringbuf;
+
+	/* Rings for this port */
+	struct ring *ip_inring;
+	struct ring *ip_outring;
+
+	/* Hook to port specific values */
+	struct port_hooks *ip_hooks;
+
+	spinlock_t ip_lock;
+
+	/* Various rx/tx parameters */
+	int ip_baud;
+	int ip_tx_lowat;
+	int ip_rx_timeout;
+
+	/* Copy of notification bits */
+	int ip_notify;
+
+	/* Shadow copies of various registers so we don't need to PIO
+	 * read them constantly
+	 */
+	uint32_t ip_sscr;
+	uint32_t ip_tx_prod;
+	uint32_t ip_rx_cons;
+	unsigned char ip_flags;
+};
+
+/* tx low water mark.  We need to notify the driver whenever tx is getting
+ * close to empty so it can refill the tx buffer and keep things going.
+ * Let's assume that if we interrupt 1 ms before the tx goes idle, we'll
+ * have no trouble getting in more chars in time (I certainly hope so).
+ */
+#define TX_LOWAT_LATENCY      1000
+#define TX_LOWAT_HZ          (1000000 / TX_LOWAT_LATENCY)
+#define TX_LOWAT_CHARS(baud) (baud / 10 / TX_LOWAT_HZ)
+
+/* Flags per port */
+#define INPUT_HIGH		0x01
+	/* used to signify that we have turned off the rx_high
+	 * temporarily - we need to drain the fifo and don't
+	 * want to get blasted with interrupts.
+	 */
+#define DCD_ON			0x02
+	/* DCD state is on */
+#define LOWAT_WRITTEN		0x04
+#define READ_ABORTED		0x08
+	/* the read was aborted - used to avaoid infinate looping
+	 * in the interrupt handler
+	 */
+#define INPUT_ENABLE		0x10
+
+/* Since each port has different register offsets and bitmasks
+ * for everything, we'll store those that we need in tables so we
+ * don't have to be constantly checking the port we are dealing with.
+ */
+struct port_hooks {
+	uint32_t intr_delta_dcd;
+	uint32_t intr_delta_cts;
+	uint32_t intr_tx_mt;
+	uint32_t intr_rx_timer;
+	uint32_t intr_rx_high;
+	uint32_t intr_tx_explicit;
+	uint32_t intr_clear;
+	uint32_t intr_all;
+	char rs422_select_pin;
+};
+
+static struct port_hooks hooks_array[PORTS_PER_CARD] = {
+	/* values for port A */
+	{
+	.intr_delta_dcd = SIO_IR_SA_DELTA_DCD,
+	.intr_delta_cts = SIO_IR_SA_DELTA_CTS,
+	.intr_tx_mt = SIO_IR_SA_TX_MT,
+	.intr_rx_timer = SIO_IR_SA_RX_TIMER,
+	.intr_rx_high = SIO_IR_SA_RX_HIGH,
+	.intr_tx_explicit = SIO_IR_SA_TX_EXPLICIT,
+	.intr_clear = (SIO_IR_SA_TX_MT | SIO_IR_SA_RX_FULL
+				| SIO_IR_SA_RX_HIGH
+				| SIO_IR_SA_RX_TIMER
+				| SIO_IR_SA_DELTA_DCD
+				| SIO_IR_SA_DELTA_CTS
+				| SIO_IR_SA_INT
+				| SIO_IR_SA_TX_EXPLICIT
+				| SIO_IR_SA_MEMERR),
+	.intr_all =  SIO_IR_SA,
+	.rs422_select_pin = GPPR_UARTA_MODESEL_PIN,
+	 },
+
+	/* values for port B */
+	{
+	.intr_delta_dcd = SIO_IR_SB_DELTA_DCD,
+	.intr_delta_cts = SIO_IR_SB_DELTA_CTS,
+	.intr_tx_mt = SIO_IR_SB_TX_MT,
+	.intr_rx_timer = SIO_IR_SB_RX_TIMER,
+	.intr_rx_high = SIO_IR_SB_RX_HIGH,
+	.intr_tx_explicit = SIO_IR_SB_TX_EXPLICIT,
+	.intr_clear = (SIO_IR_SB_TX_MT | SIO_IR_SB_RX_FULL
+				| SIO_IR_SB_RX_HIGH
+				| SIO_IR_SB_RX_TIMER
+				| SIO_IR_SB_DELTA_DCD
+				| SIO_IR_SB_DELTA_CTS
+				| SIO_IR_SB_INT
+				| SIO_IR_SB_TX_EXPLICIT
+				| SIO_IR_SB_MEMERR),
+	.intr_all = SIO_IR_SB,
+	.rs422_select_pin = GPPR_UARTB_MODESEL_PIN,
+	 }
+};
+
+struct ring_entry {
+	union {
+		struct {
+			uint32_t alldata;
+			uint32_t allsc;
+		} all;
+		struct {
+			char data[4];	/* data bytes */
+			char sc[4];	/* status/control */
+		} s;
+	} u;
+};
+
+/* Test the valid bits in any of the 4 sc chars using "allsc" member */
+#define RING_ANY_VALID \
+	((uint32_t)(RXSB_MODEM_VALID | RXSB_DATA_VALID) * 0x01010101)
+
+#define ring_sc		u.s.sc
+#define ring_data	u.s.data
+#define ring_allsc	u.all.allsc
+
+/* Number of entries per ring buffer. */
+#define ENTRIES_PER_RING (RING_BUF_SIZE / (int) sizeof(struct ring_entry))
+
+/* An individual ring */
+struct ring {
+	struct ring_entry entries[ENTRIES_PER_RING];
+};
+
+/* The whole enchilada */
+struct ring_buffer {
+	struct ring TX_A;
+	struct ring RX_A;
+	struct ring TX_B;
+	struct ring RX_B;
+};
+
+/* Get a ring from a port struct */
+#define RING(_p, _wh)	&(((struct ring_buffer *)((_p)->ip_cpu_ringbuf))->_wh)
+
+/* for Infinite loop detection  */
+#define MAXITER		10000000
+
+
+/**
+ * set_baud - Baud rate setting code
+ * @port: port to set
+ * @baud: baud rate to use
+ */
+static int set_baud(struct ioc3_port *port, int baud)
+{
+	int divisor;
+	int actual_baud;
+	int diff;
+	int lcr, prediv;
+	struct ioc3_uartregs __iomem *uart;
+
+	for (prediv = 6; prediv < 64; prediv++) {
+		divisor = SER_DIVISOR(baud, SER_CLK_SPEED(prediv));
+		if (!divisor)
+			continue;	/* invalid divisor */
+		actual_baud = DIVISOR_TO_BAUD(divisor, SER_CLK_SPEED(prediv));
+
+		diff = actual_baud - baud;
+		if (diff < 0)
+			diff = -diff;
+
+		/* if we're within 1% we've found a match */
+		if (diff * 100 <= actual_baud)
+			break;
+	}
+
+	/* if the above loop completed, we didn't match
+	 * the baud rate.  give up.
+	 */
+	if (prediv == 64) {
+		NOT_PROGRESS();
+		return 1;
+	}
+
+	uart = port->ip_uart_regs;
+	lcr = readb(&uart->iu_lcr);
+
+	writeb(lcr | UART_LCR_DLAB, &uart->iu_lcr);
+	writeb((unsigned char)divisor, &uart->iu_dll);
+	writeb((unsigned char)(divisor >> 8), &uart->iu_dlm);
+	writeb((unsigned char)prediv, &uart->iu_scr);
+	writeb((unsigned char)lcr, &uart->iu_lcr);
+
+	return 0;
+}
+
+/**
+ * get_ioc3_port - given a uart port, return the control structure
+ * @the_port: uart port to find
+ */
+static struct ioc3_port *get_ioc3_port(struct uart_port *the_port)
+{
+	struct ioc3_driver_data *idd = dev_get_drvdata(the_port->dev);
+	struct ioc3_card *card_ptr = idd->data[Submodule_slot];
+	int ii, jj;
+
+	if (!card_ptr) {
+		NOT_PROGRESS();
+		return NULL;
+	}
+	for (ii = 0; ii < PORTS_PER_CARD; ii++) {
+		for (jj = 0; jj < LOGICAL_PORTS; jj++) {
+			if (the_port == &card_ptr->ic_port[ii].icp_uart_port[jj])
+				return card_ptr->ic_port[ii].icp_port;
+		}
+	}
+	NOT_PROGRESS();
+	return NULL;
+}
+
+/**
+ * port_init - Initialize the sio and ioc3 hardware for a given port
+ *			called per port from attach...
+ * @port: port to initialize
+ */
+static int inline port_init(struct ioc3_port *port)
+{
+	uint32_t sio_cr;
+	struct port_hooks *hooks = port->ip_hooks;
+	struct ioc3_uartregs __iomem *uart;
+	int reset_loop_counter = 0xfffff;
+	struct ioc3_driver_data *idd = port->ip_idd;
+
+	/* Idle the IOC3 serial interface */
+	writel(SSCR_RESET, &port->ip_serial_regs->sscr);
+
+	/* Wait until any pending bus activity for this port has ceased */
+	do {
+		sio_cr = readl(&idd->vma->sio_cr);
+		if (reset_loop_counter-- <= 0) {
+			printk(KERN_WARNING
+			       "IOC3 unable to come out of reset"
+				" scr 0x%x\n", sio_cr);
+			return -1;
+		}
+	} while (!(sio_cr & SIO_CR_ARB_DIAG_IDLE) &&
+	       (((sio_cr &= SIO_CR_ARB_DIAG) == SIO_CR_ARB_DIAG_TXA)
+		|| sio_cr == SIO_CR_ARB_DIAG_TXB
+		|| sio_cr == SIO_CR_ARB_DIAG_RXA
+		|| sio_cr == SIO_CR_ARB_DIAG_RXB));
+
+	/* Finish reset sequence */
+	writel(0, &port->ip_serial_regs->sscr);
+
+	/* Once RESET is done, reload cached tx_prod and rx_cons values
+	 * and set rings to empty by making prod == cons
+	 */
+	port->ip_tx_prod = readl(&port->ip_serial_regs->stcir) & PROD_CONS_MASK;
+	writel(port->ip_tx_prod, &port->ip_serial_regs->stpir);
+	port->ip_rx_cons = readl(&port->ip_serial_regs->srpir) & PROD_CONS_MASK;
+	writel(port->ip_rx_cons | SRCIR_ARM, &port->ip_serial_regs->srcir);
+
+	/* Disable interrupts for this 16550 */
+	uart = port->ip_uart_regs;
+	writeb(0, &uart->iu_lcr);
+	writeb(0, &uart->iu_ier);
+
+	/* Set the default baud */
+	set_baud(port, port->ip_baud);
+
+	/* Set line control to 8 bits no parity */
+	writeb(UART_LCR_WLEN8 | 0, &uart->iu_lcr);
+	/* UART_LCR_STOP == 1 stop */
+
+	/* Enable the FIFOs */
+	writeb(UART_FCR_ENABLE_FIFO, &uart->iu_fcr);
+	/* then reset 16550 FIFOs */
+	writeb(UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT,
+	       &uart->iu_fcr);
+
+	/* Clear modem control register */
+	writeb(0, &uart->iu_mcr);
+
+	/* Clear deltas in modem status register */
+	writel(0, &port->ip_serial_regs->shadow);
+
+	/* Only do this once per port pair */
+	if (port->ip_hooks == &hooks_array[0]) {
+		unsigned long ring_pci_addr;
+		uint32_t __iomem *sbbr_l, *sbbr_h;
+
+		sbbr_l = &idd->vma->sbbr_l;
+		sbbr_h = &idd->vma->sbbr_h;
+		ring_pci_addr = (unsigned long __iomem)port->ip_dma_ringbuf;
+		DPRINT_CONFIG(("%s: ring_pci_addr 0x%p\n",
+			       __FUNCTION__, (void *)ring_pci_addr));
+
+		writel((unsigned int)((uint64_t) ring_pci_addr >> 32), sbbr_h);
+		writel((unsigned int)ring_pci_addr | BUF_SIZE_BIT, sbbr_l);
+	}
+
+	/* Set the receive timeout value to 10 msec */
+	writel(SRTR_HZ / 100, &port->ip_serial_regs->srtr);
+
+	/* Set rx threshold, enable DMA */
+	/* Set high water mark at 3/4 of full ring */
+	port->ip_sscr = (ENTRIES_PER_RING * 3 / 4);
+
+	/* uart experiences pauses at high baud rate reducing actual
+	 * throughput by 10% or so unless we enable high speed polling
+	 * XXX when this hardware bug is resolved we should revert to
+	 * normal polling speed
+	 */
+	port->ip_sscr |= SSCR_HIGH_SPD;
+
+	writel(port->ip_sscr, &port->ip_serial_regs->sscr);
+
+	/* Disable and clear all serial related interrupt bits */
+	port->ip_card->ic_enable &= ~hooks->intr_clear;
+	ioc3_disable(port->ip_is, idd, hooks->intr_clear);
+	ioc3_ack(port->ip_is, idd, hooks->intr_clear);
+	return 0;
+}
+
+/**
+ * enable_intrs - enable interrupts
+ * @port: port to enable
+ * @mask: mask to use
+ */
+static void enable_intrs(struct ioc3_port *port, uint32_t mask)
+{
+	if ((port->ip_card->ic_enable & mask) != mask) {
+		port->ip_card->ic_enable |= mask;
+		ioc3_enable(port->ip_is, port->ip_idd, mask);
+	}
+}
+
+/**
+ * local_open - local open a port
+ * @port: port to open
+ */
+static inline int local_open(struct ioc3_port *port)
+{
+	int spiniter = 0;
+
+	port->ip_flags = INPUT_ENABLE;
+
+	/* Pause the DMA interface if necessary */
+	if (port->ip_sscr & SSCR_DMA_EN) {
+		writel(port->ip_sscr | SSCR_DMA_PAUSE,
+		       &port->ip_serial_regs->sscr);
+		while ((readl(&port->ip_serial_regs->sscr)
+			& SSCR_PAUSE_STATE) == 0) {
+			spiniter++;
+			if (spiniter > MAXITER) {
+				NOT_PROGRESS();
+				return -1;
+			}
+		}
+	}
+
+	/* Reset the input fifo.  If the uart received chars while the port
+	 * was closed and DMA is not enabled, the uart may have a bunch of
+	 * chars hanging around in its rx fifo which will not be discarded
+	 * by rclr in the upper layer. We must get rid of them here.
+	 */
+	writeb(UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR,
+	       &port->ip_uart_regs->iu_fcr);
+
+	writeb(UART_LCR_WLEN8, &port->ip_uart_regs->iu_lcr);
+	/* UART_LCR_STOP == 1 stop */
+
+	/* Re-enable DMA, set default threshold to intr whenever there is
+	 * data available.
+	 */
+	port->ip_sscr &= ~SSCR_RX_THRESHOLD;
+	port->ip_sscr |= 1;	/* default threshold */
+
+	/* Plug in the new sscr.  This implicitly clears the DMA_PAUSE
+	 * flag if it was set above
+	 */
+	writel(port->ip_sscr, &port->ip_serial_regs->sscr);
+	port->ip_tx_lowat = 1;
+	return 0;
+}
+
+/**
+ * set_rx_timeout - Set rx timeout and threshold values.
+ * @port: port to use
+ * @timeout: timeout value in ticks
+ */
+static inline int set_rx_timeout(struct ioc3_port *port, int timeout)
+{
+	int threshold;
+
+	port->ip_rx_timeout = timeout;
+
+	/* Timeout is in ticks.  Let's figure out how many chars we
+	 * can receive at the current baud rate in that interval
+	 * and set the rx threshold to that amount.  There are 4 chars
+	 * per ring entry, so we'll divide the number of chars that will
+	 * arrive in timeout by 4.
+	 * So .... timeout * baud / 10 / HZ / 4, with HZ = 100.
+	 */
+	threshold = timeout * port->ip_baud / 4000;
+	if (threshold == 0)
+		threshold = 1;	/* otherwise we'll intr all the time! */
+
+	if ((unsigned)threshold > (unsigned)SSCR_RX_THRESHOLD)
+		return 1;
+
+	port->ip_sscr &= ~SSCR_RX_THRESHOLD;
+	port->ip_sscr |= threshold;
+	writel(port->ip_sscr, &port->ip_serial_regs->sscr);
+
+	/* Now set the rx timeout to the given value
+	 * again timeout * SRTR_HZ / HZ
+	 */
+	timeout = timeout * SRTR_HZ / 100;
+	if (timeout > SRTR_CNT)
+		timeout = SRTR_CNT;
+	writel(timeout, &port->ip_serial_regs->srtr);
+	return 0;
+}
+
+/**
+ * config_port - config the hardware
+ * @port: port to config
+ * @baud: baud rate for the port
+ * @byte_size: data size
+ * @stop_bits: number of stop bits
+ * @parenb: parity enable ?
+ * @parodd: odd parity ?
+ */
+static inline int
+config_port(struct ioc3_port *port,
+	    int baud, int byte_size, int stop_bits, int parenb, int parodd)
+{
+	char lcr, sizebits;
+	int spiniter = 0;
+
+	DPRINT_CONFIG(("%s: line %d baud %d byte_size %d stop %d parenb %d "
+			"parodd %d\n",
+		       __FUNCTION__, ((struct uart_port *)port->ip_port)->line,
+			baud, byte_size, stop_bits, parenb, parodd));
+
+	if (set_baud(port, baud))
+		return 1;
+
+	switch (byte_size) {
+	case 5:
+		sizebits = UART_LCR_WLEN5;
+		break;
+	case 6:
+		sizebits = UART_LCR_WLEN6;
+		break;
+	case 7:
+		sizebits = UART_LCR_WLEN7;
+		break;
+	case 8:
+		sizebits = UART_LCR_WLEN8;
+		break;
+	default:
+		return 1;
+	}
+
+	/* Pause the DMA interface if necessary */
+	if (port->ip_sscr & SSCR_DMA_EN) {
+		writel(port->ip_sscr | SSCR_DMA_PAUSE,
+		       &port->ip_serial_regs->sscr);
+		while ((readl(&port->ip_serial_regs->sscr)
+			& SSCR_PAUSE_STATE) == 0) {
+			spiniter++;
+			if (spiniter > MAXITER)
+				return -1;
+		}
+	}
+
+	/* Clear relevant fields in lcr */
+	lcr = readb(&port->ip_uart_regs->iu_lcr);
+	lcr &= ~(LCR_MASK_BITS_CHAR | UART_LCR_EPAR |
+		 UART_LCR_PARITY | LCR_MASK_STOP_BITS);
+
+	/* Set byte size in lcr */
+	lcr |= sizebits;
+
+	/* Set parity */
+	if (parenb) {
+		lcr |= UART_LCR_PARITY;
+		if (!parodd)
+			lcr |= UART_LCR_EPAR;
+	}
+
+	/* Set stop bits */
+	if (stop_bits)
+		lcr |= UART_LCR_STOP /* 2 stop bits */ ;
+
+	writeb(lcr, &port->ip_uart_regs->iu_lcr);
+
+	/* Re-enable the DMA interface if necessary */
+	if (port->ip_sscr & SSCR_DMA_EN) {
+		writel(port->ip_sscr, &port->ip_serial_regs->sscr);
+	}
+	port->ip_baud = baud;
+
+	/* When we get within this number of ring entries of filling the
+	 * entire ring on tx, place an EXPLICIT intr to generate a lowat
+	 * notification when output has drained.
+	 */
+	port->ip_tx_lowat = (TX_LOWAT_CHARS(baud) + 3) / 4;
+	if (port->ip_tx_lowat == 0)
+		port->ip_tx_lowat = 1;
+
+	set_rx_timeout(port, 2);
+	return 0;
+}
+
+/**
+ * do_write - Write bytes to the port.  Returns the number of bytes
+ *			actually written. Called from transmit_chars
+ * @port: port to use
+ * @buf: the stuff to write
+ * @len: how many bytes in 'buf'
+ */
+static inline int do_write(struct ioc3_port *port, char *buf, int len)
+{
+	int prod_ptr, cons_ptr, total = 0;
+	struct ring *outring;
+	struct ring_entry *entry;
+	struct port_hooks *hooks = port->ip_hooks;
+
+	BUG_ON(!(len >= 0));
+
+	prod_ptr = port->ip_tx_prod;
+	cons_ptr = readl(&port->ip_serial_regs->stcir) & PROD_CONS_MASK;
+	outring = port->ip_outring;
+
+	/* Maintain a 1-entry red-zone.  The ring buffer is full when
+	 * (cons - prod) % ring_size is 1.  Rather than do this subtraction
+	 * in the body of the loop, I'll do it now.
+	 */
+	cons_ptr = (cons_ptr - (int)sizeof(struct ring_entry)) & PROD_CONS_MASK;
+
+	/* Stuff the bytes into the output */
+	while ((prod_ptr != cons_ptr) && (len > 0)) {
+		int xx;
+
+		/* Get 4 bytes (one ring entry) at a time */
+		entry = (struct ring_entry *)((caddr_t) outring + prod_ptr);
+
+		/* Invalidate all entries */
+		entry->ring_allsc = 0;
+
+		/* Copy in some bytes */
+		for (xx = 0; (xx < 4) && (len > 0); xx++) {
+			entry->ring_data[xx] = *buf++;
+			entry->ring_sc[xx] = TXCB_VALID;
+			len--;
+			total++;
+		}
+
+		/* If we are within some small threshold of filling up the
+		 * entire ring buffer, we must place an EXPLICIT intr here
+		 * to generate a lowat interrupt in case we subsequently
+		 * really do fill up the ring and the caller goes to sleep.
+		 * No need to place more than one though.
+		 */
+		if (!(port->ip_flags & LOWAT_WRITTEN) &&
+		    ((cons_ptr - prod_ptr) & PROD_CONS_MASK)
+		    <= port->ip_tx_lowat * (int)sizeof(struct ring_entry)) {
+			port->ip_flags |= LOWAT_WRITTEN;
+			entry->ring_sc[0] |= TXCB_INT_WHEN_DONE;
+		}
+
+		/* Go on to next entry */
+		prod_ptr += sizeof(struct ring_entry);
+		prod_ptr &= PROD_CONS_MASK;
+	}
+
+	/* If we sent something, start DMA if necessary */
+	if (total > 0 && !(port->ip_sscr & SSCR_DMA_EN)) {
+		port->ip_sscr |= SSCR_DMA_EN;
+		writel(port->ip_sscr, &port->ip_serial_regs->sscr);
+	}
+
+	/* Store the new producer pointer.  If tx is disabled, we stuff the
+	 * data into the ring buffer, but we don't actually start tx.
+	 */
+	if (!uart_tx_stopped(port->ip_port)) {
+		writel(prod_ptr, &port->ip_serial_regs->stpir);
+
+		/* If we are now transmitting, enable tx_mt interrupt so we
+		 * can disable DMA if necessary when the tx finishes.
+		 */
+		if (total > 0)
+			enable_intrs(port, hooks->intr_tx_mt);
+	}
+	port->ip_tx_prod = prod_ptr;
+
+	return total;
+}
+
+/**
+ * disable_intrs - disable interrupts
+ * @port: port to enable
+ * @mask: mask to use
+ */
+static inline void disable_intrs(struct ioc3_port *port, uint32_t mask)
+{
+	if (port->ip_card->ic_enable & mask) {
+		ioc3_disable(port->ip_is, port->ip_idd, mask);
+		port->ip_card->ic_enable &= ~mask;
+	}
+}
+
+/**
+ * set_notification - Modify event notification
+ * @port: port to use
+ * @mask: events mask
+ * @set_on: set ?
+ */
+static int set_notification(struct ioc3_port *port, int mask, int set_on)
+{
+	struct port_hooks *hooks = port->ip_hooks;
+	uint32_t intrbits, sscrbits;
+
+	BUG_ON(!mask);
+
+	intrbits = sscrbits = 0;
+
+	if (mask & N_DATA_READY)
+		intrbits |= (hooks->intr_rx_timer | hooks->intr_rx_high);
+	if (mask & N_OUTPUT_LOWAT)
+		intrbits |= hooks->intr_tx_explicit;
+	if (mask & N_DDCD) {
+		intrbits |= hooks->intr_delta_dcd;
+		sscrbits |= SSCR_RX_RING_DCD;
+	}
+	if (mask & N_DCTS)
+		intrbits |= hooks->intr_delta_cts;
+
+	if (set_on) {
+		enable_intrs(port, intrbits);
+		port->ip_notify |= mask;
+		port->ip_sscr |= sscrbits;
+	} else {
+		disable_intrs(port, intrbits);
+		port->ip_notify &= ~mask;
+		port->ip_sscr &= ~sscrbits;
+	}
+
+	/* We require DMA if either DATA_READY or DDCD notification is
+	 * currently requested. If neither of these is requested and
+	 * there is currently no tx in progress, DMA may be disabled.
+	 */
+	if (port->ip_notify & (N_DATA_READY | N_DDCD))
+		port->ip_sscr |= SSCR_DMA_EN;
+	else if (!(port->ip_card->ic_enable & hooks->intr_tx_mt))
+		port->ip_sscr &= ~SSCR_DMA_EN;
+
+	writel(port->ip_sscr, &port->ip_serial_regs->sscr);
+	return 0;
+}
+
+/**
+ * set_mcr - set the master control reg
+ * @the_port: port to use
+ * @mask1: mcr mask
+ * @mask2: shadow mask
+ */
+static inline int set_mcr(struct uart_port *the_port,
+			  int mask1, int mask2)
+{
+	struct ioc3_port *port = get_ioc3_port(the_port);
+	uint32_t shadow;
+	int spiniter = 0;
+	char mcr;
+
+	if (!port)
+		return -1;
+
+	/* Pause the DMA interface if necessary */
+	if (port->ip_sscr & SSCR_DMA_EN) {
+		writel(port->ip_sscr | SSCR_DMA_PAUSE,
+		       &port->ip_serial_regs->sscr);
+		while ((readl(&port->ip_serial_regs->sscr)
+			& SSCR_PAUSE_STATE) == 0) {
+			spiniter++;
+			if (spiniter > MAXITER)
+				return -1;
+		}
+	}
+	shadow = readl(&port->ip_serial_regs->shadow);
+	mcr = (shadow & 0xff000000) >> 24;
+
+	/* Set new value */
+	mcr |= mask1;
+	shadow |= mask2;
+	writeb(mcr, &port->ip_uart_regs->iu_mcr);
+	writel(shadow, &port->ip_serial_regs->shadow);
+
+	/* Re-enable the DMA interface if necessary */
+	if (port->ip_sscr & SSCR_DMA_EN) {
+		writel(port->ip_sscr, &port->ip_serial_regs->sscr);
+	}
+	return 0;
+}
+
+/**
+ * ioc3_set_proto - set the protocol for the port
+ * @port: port to use
+ * @proto: protocol to use
+ */
+static int ioc3_set_proto(struct ioc3_port *port, int proto)
+{
+	struct port_hooks *hooks = port->ip_hooks;
+
+	switch (proto) {
+	default:
+	case PROTO_RS232:
+		/* Clear the appropriate GIO pin */
+		DPRINT_CONFIG(("%s: rs232\n", __FUNCTION__));
+		writel(0, (&port->ip_idd->vma->gppr[0]
+					+ hooks->rs422_select_pin));
+		break;
+
+	case PROTO_RS422:
+		/* Set the appropriate GIO pin */
+		DPRINT_CONFIG(("%s: rs422\n", __FUNCTION__));
+		writel(1, (&port->ip_idd->vma->gppr[0]
+					+ hooks->rs422_select_pin));
+		break;
+	}
+	return 0;
+}
+
+/**
+ * transmit_chars - upper level write, called with the_port->lock
+ * @the_port: port to write
+ */
+static void transmit_chars(struct uart_port *the_port)
+{
+	int xmit_count, tail, head;
+	int result;
+	char *start;
+	struct tty_struct *tty;
+	struct ioc3_port *port = get_ioc3_port(the_port);
+	struct uart_info *info;
+
+	if (!the_port)
+		return;
+	if (!port)
+		return;
+
+	info = the_port->info;
+	tty = info->tty;
+
+	if (uart_circ_empty(&info->xmit) || uart_tx_stopped(the_port)) {
+		/* Nothing to do or hw stopped */
+		set_notification(port, N_ALL_OUTPUT, 0);
+		return;
+	}
+
+	head = info->xmit.head;
+	tail = info->xmit.tail;
+	start = (char *)&info->xmit.buf[tail];
+
+	/* write out all the data or until the end of the buffer */
+	xmit_count = (head < tail) ? (UART_XMIT_SIZE - tail) : (head - tail);
+	if (xmit_count > 0) {
+		result = do_write(port, start, xmit_count);
+		if (result > 0) {
+			/* booking */
+			xmit_count -= result;
+			the_port->icount.tx += result;
+			/* advance the pointers */
+			tail += result;
+			tail &= UART_XMIT_SIZE - 1;
+			info->xmit.tail = tail;
+			start = (char *)&info->xmit.buf[tail];
+		}
+	}
+	if (uart_circ_chars_pending(&info->xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(the_port);
+
+	if (uart_circ_empty(&info->xmit)) {
+		set_notification(port, N_OUTPUT_LOWAT, 0);
+	} else {
+		set_notification(port, N_OUTPUT_LOWAT, 1);
+	}
+}
+
+/**
+ * ioc3_change_speed - change the speed of the port
+ * @the_port: port to change
+ * @new_termios: new termios settings
+ * @old_termios: old termios settings
+ */
+static void
+ioc3_change_speed(struct uart_port *the_port,
+		  struct termios *new_termios, struct termios *old_termios)
+{
+	struct ioc3_port *port = get_ioc3_port(the_port);
+	unsigned int cflag;
+	int baud;
+	int new_parity = 0, new_parity_enable = 0, new_stop = 0, new_data = 8;
+	struct uart_info *info = the_port->info;
+
+	cflag = new_termios->c_cflag;
+
+	switch (cflag & CSIZE) {
+	case CS5:
+		new_data = 5;
+		break;
+	case CS6:
+		new_data = 6;
+		break;
+	case CS7:
+		new_data = 7;
+		break;
+	case CS8:
+		new_data = 8;
+		break;
+	default:
+		/* cuz we always need a default ... */
+		new_data = 5;
+		break;
+	}
+	if (cflag & CSTOPB) {
+		new_stop = 1;
+	}
+	if (cflag & PARENB) {
+		new_parity_enable = 1;
+		if (cflag & PARODD)
+			new_parity = 1;
+	}
+	baud = uart_get_baud_rate(the_port, new_termios, old_termios,
+				  MIN_BAUD_SUPPORTED, MAX_BAUD_SUPPORTED);
+	DPRINT_CONFIG(("%s: returned baud %d for line %d\n", __FUNCTION__, baud,
+				the_port->line));
+
+	if (!the_port->fifosize)
+		the_port->fifosize = FIFO_SIZE;
+	uart_update_timeout(the_port, cflag, baud);
+
+	the_port->ignore_status_mask = N_ALL_INPUT;
+
+	info->tty->low_latency = 1;
+
+	if (I_IGNPAR(info->tty))
+		the_port->ignore_status_mask &= ~(N_PARITY_ERROR
+						  | N_FRAMING_ERROR);
+	if (I_IGNBRK(info->tty)) {
+		the_port->ignore_status_mask &= ~N_BREAK;
+		if (I_IGNPAR(info->tty))
+			the_port->ignore_status_mask &= ~N_OVERRUN_ERROR;
+	}
+	if (!(cflag & CREAD)) {
+		/* ignore everything */
+		the_port->ignore_status_mask &= ~N_DATA_READY;
+	}
+
+	if (cflag & CRTSCTS) {
+		/* enable hardware flow control */
+		port->ip_sscr |= SSCR_HFC_EN;
+	}
+	else {
+		/* disable hardware flow control */
+		port->ip_sscr &= ~SSCR_HFC_EN;
+	}
+	writel(port->ip_sscr, &port->ip_serial_regs->sscr);
+
+	/* Set the configuration and proper notification call */
+	DPRINT_CONFIG(("%s : port 0x%p line %d cflag 0%o "
+		       "config_port(baud %d data %d stop %d penable %d "
+			" parity %d), notification 0x%x\n",
+		       __FUNCTION__, (void *)port, the_port->line, cflag, baud,
+		       new_data, new_stop, new_parity_enable, new_parity,
+		       the_port->ignore_status_mask));
+
+	if ((config_port(port, baud,	/* baud */
+			 new_data,	/* byte size */
+			 new_stop,	/* stop bits */
+			 new_parity_enable,	/* set parity */
+			 new_parity)) >= 0) {	/* parity 1==odd */
+		set_notification(port, the_port->ignore_status_mask, 1);
+	}
+}
+
+/**
+ * ic3_startup_local - Start up the serial port - returns >= 0 if no errors
+ * @the_port: Port to operate on
+ */
+static inline int ic3_startup_local(struct uart_port *the_port)
+{
+	struct ioc3_port *port;
+
+	if (!the_port) {
+		NOT_PROGRESS();
+		return -1;
+	}
+
+	port = get_ioc3_port(the_port);
+	if (!port) {
+		NOT_PROGRESS();
+		return -1;
+	}
+
+	local_open(port);
+
+	/* set the protocol */
+	ioc3_set_proto(port, IS_RS232(the_port->line) ? PROTO_RS232 :
+							PROTO_RS422);
+	return 0;
+}
+
+/*
+ * ioc3_cb_output_lowat - called when the output low water mark is hit
+ * @port: port to output
+ */
+static void ioc3_cb_output_lowat(struct ioc3_port *port)
+{
+	unsigned long pflags;
+
+	/* the_port->lock is set on the call here */
+	if (port->ip_port) {
+		spin_lock_irqsave(&port->ip_port->lock, pflags);
+		transmit_chars(port->ip_port);
+		spin_unlock_irqrestore(&port->ip_port->lock, pflags);
+	}
+}
+
+/*
+ * ioc3_cb_post_ncs - called for some basic errors
+ * @port: port to use
+ * @ncs: event
+ */
+static void ioc3_cb_post_ncs(struct uart_port *the_port, int ncs)
+{
+	struct uart_icount *icount;
+
+	icount = &the_port->icount;
+
+	if (ncs & NCS_BREAK)
+		icount->brk++;
+	if (ncs & NCS_FRAMING)
+		icount->frame++;
+	if (ncs & NCS_OVERRUN)
+		icount->overrun++;
+	if (ncs & NCS_PARITY)
+		icount->parity++;
+}
+
+/**
+ * do_read - Read in bytes from the port.  Return the number of bytes
+ *			actually read.
+ * @the_port: port to use
+ * @buf: place to put the stuff we read
+ * @len: how big 'buf' is
+ */
+
+static inline int do_read(struct uart_port *the_port, char *buf, int len)
+{
+	int prod_ptr, cons_ptr, total;
+	struct ioc3_port *port = get_ioc3_port(the_port);
+	struct ring *inring;
+	struct ring_entry *entry;
+	struct port_hooks *hooks = port->ip_hooks;
+	int byte_num;
+	char *sc;
+	int loop_counter;
+
+	BUG_ON(!(len >= 0));
+	BUG_ON(!port);
+
+	/* There is a nasty timing issue in the IOC3. When the rx_timer
+	 * expires or the rx_high condition arises, we take an interrupt.
+	 * At some point while servicing the interrupt, we read bytes from
+	 * the ring buffer and re-arm the rx_timer.  However the rx_timer is
+	 * not started until the first byte is received *after* it is armed,
+	 * and any bytes pending in the rx construction buffers are not drained
+	 * to memory until either there are 4 bytes available or the rx_timer
+	 * expires.  This leads to a potential situation where data is left
+	 * in the construction buffers forever - 1 to 3 bytes were received
+	 * after the interrupt was generated but before the rx_timer was
+	 * re-armed. At that point as long as no subsequent bytes are received
+	 * the timer will never be started and the bytes will remain in the
+	 * construction buffer forever.  The solution is to execute a DRAIN
+	 * command after rearming the timer.  This way any bytes received before
+	 * the DRAIN will be drained to memory, and any bytes received after
+	 * the DRAIN will start the TIMER and be drained when it expires.
+	 * Luckily, this only needs to be done when the DMA buffer is empty
+	 * since there is no requirement that this function return all
+	 * available data as long as it returns some.
+	 */
+	/* Re-arm the timer */
+
+	writel(port->ip_rx_cons | SRCIR_ARM, &port->ip_serial_regs->srcir);
+
+	prod_ptr = readl(&port->ip_serial_regs->srpir) & PROD_CONS_MASK;
+	cons_ptr = port->ip_rx_cons;
+
+	if (prod_ptr == cons_ptr) {
+		int reset_dma = 0;
+
+		/* Input buffer appears empty, do a flush. */
+
+		/* DMA must be enabled for this to work. */
+		if (!(port->ip_sscr & SSCR_DMA_EN)) {
+			port->ip_sscr |= SSCR_DMA_EN;
+			reset_dma = 1;
+		}
+
+		/* Potential race condition: we must reload the srpir after
+		 * issuing the drain command, otherwise we could think the rx
+		 * buffer is empty, then take a very long interrupt, and when
+		 * we come back it's full and we wait forever for the drain to
+		 * complete.
+		 */
+		writel(port->ip_sscr | SSCR_RX_DRAIN,
+		       &port->ip_serial_regs->sscr);
+		prod_ptr = readl(&port->ip_serial_regs->srpir) & PROD_CONS_MASK;
+
+		/* We must not wait for the DRAIN to complete unless there are
+		 * at least 8 bytes (2 ring entries) available to receive the
+		 * data otherwise the DRAIN will never complete and we'll
+		 * deadlock here.
+		 * In fact, to make things easier, I'll just ignore the flush if
+		 * there is any data at all now available.
+		 */
+		if (prod_ptr == cons_ptr) {
+			loop_counter = 0;
+			while (readl(&port->ip_serial_regs->sscr) &
+			       SSCR_RX_DRAIN) {
+				loop_counter++;
+				if (loop_counter > MAXITER)
+					return -1;
+			}
+
+			/* SIGH. We have to reload the prod_ptr *again* since
+			 * the drain may have caused it to change
+			 */
+			prod_ptr = readl(&port->ip_serial_regs->srpir)
+			    & PROD_CONS_MASK;
+		}
+		if (reset_dma) {
+			port->ip_sscr &= ~SSCR_DMA_EN;
+			writel(port->ip_sscr, &port->ip_serial_regs->sscr);
+		}
+	}
+	inring = port->ip_inring;
+	port->ip_flags &= ~READ_ABORTED;
+
+	total = 0;
+	loop_counter = 0xfffff;	/* to avoid hangs */
+
+	/* Grab bytes from the hardware */
+	while ((prod_ptr != cons_ptr) && (len > 0)) {
+		entry = (struct ring_entry *)((caddr_t) inring + cons_ptr);
+
+		if (loop_counter-- <= 0) {
+			printk(KERN_WARNING "IOC3 serial: "
+			       "possible hang condition/"
+			       "port stuck on read (line %d).\n",
+				the_port->line);
+			break;
+		}
+
+		/* According to the producer pointer, this ring entry
+		 * must contain some data.  But if the PIO happened faster
+		 * than the DMA, the data may not be available yet, so let's
+		 * wait until it arrives.
+		 */
+		if ((entry->ring_allsc & RING_ANY_VALID) == 0) {
+			/* Indicate the read is aborted so we don't disable
+			 * the interrupt thinking that the consumer is
+			 * congested.
+			 */
+			port->ip_flags |= READ_ABORTED;
+			len = 0;
+			break;
+		}
+
+		/* Load the bytes/status out of the ring entry */
+		for (byte_num = 0; byte_num < 4 && len > 0; byte_num++) {
+			sc = &(entry->ring_sc[byte_num]);
+
+			/* Check for change in modem state or overrun */
+			if ((*sc & RXSB_MODEM_VALID)
+			    && (port->ip_notify & N_DDCD)) {
+				/* Notify upper layer if DCD dropped */
+				if ((port->ip_flags & DCD_ON)
+				    && !(*sc & RXSB_DCD)) {
+					/* If we have already copied some data,
+					 * return it.  We'll pick up the carrier
+					 * drop on the next pass.  That way we
+					 * don't throw away the data that has
+					 * already been copied back to
+					 * the caller's buffer.
+					 */
+					if (total > 0) {
+						len = 0;
+						break;
+					}
+					port->ip_flags &= ~DCD_ON;
+
+					/* Turn off this notification so the
+					 * carrier drop protocol won't see it
+					 * again when it does a read.
+					 */
+					*sc &= ~RXSB_MODEM_VALID;
+
+					/* To keep things consistent, we need
+					 * to update the consumer pointer so
+					 * the next reader won't come in and
+					 * try to read the same ring entries
+					 * again. This must be done here before
+					 * the dcd change.
+					 */
+
+					if ((entry->ring_allsc & RING_ANY_VALID)
+					    == 0) {
+						cons_ptr += (int)sizeof
+						    (struct ring_entry);
+						cons_ptr &= PROD_CONS_MASK;
+					}
+					writel(cons_ptr,
+					       &port->ip_serial_regs->srcir);
+					port->ip_rx_cons = cons_ptr;
+
+					/* Notify upper layer of carrier drop */
+					if ((port->ip_notify & N_DDCD)
+					    && port->ip_port) {
+						uart_handle_dcd_change
+							(port->ip_port, 0);
+						wake_up_interruptible
+						    (&the_port->info->
+						     delta_msr_wait);
+					}
+
+					/* If we had any data to return, we
+					 * would have returned it above.
+					 */
+					return 0;
+				}
+			}
+			if (*sc & RXSB_MODEM_VALID) {
+				/* Notify that an input overrun occurred */
+				if ((*sc & RXSB_OVERRUN)
+				    && (port->ip_notify & N_OVERRUN_ERROR)) {
+					ioc3_cb_post_ncs(the_port, NCS_OVERRUN);
+				}
+				/* Don't look at this byte again */
+				*sc &= ~RXSB_MODEM_VALID;
+			}
+
+			/* Check for valid data or RX errors */
+			if ((*sc & RXSB_DATA_VALID) &&
+			    ((*sc & (RXSB_PAR_ERR
+				     | RXSB_FRAME_ERR | RXSB_BREAK))
+			     && (port->ip_notify & (N_PARITY_ERROR
+						    | N_FRAMING_ERROR
+						    | N_BREAK)))) {
+				/* There is an error condition on the next byte.
+				 * If we have already transferred some bytes,
+				 * we'll stop here. Otherwise if this is the
+				 * first byte to be read, we'll just transfer
+				 * it alone after notifying the
+				 * upper layer of its status.
+				 */
+				if (total > 0) {
+					len = 0;
+					break;
+				} else {
+					if ((*sc & RXSB_PAR_ERR) &&
+					    (port->
+					     ip_notify & N_PARITY_ERROR)) {
+						ioc3_cb_post_ncs(the_port,
+								 NCS_PARITY);
+					}
+					if ((*sc & RXSB_FRAME_ERR) &&
+					    (port->
+					     ip_notify & N_FRAMING_ERROR)) {
+						ioc3_cb_post_ncs(the_port,
+								 NCS_FRAMING);
+					}
+					if ((*sc & RXSB_BREAK)
+					    && (port->ip_notify & N_BREAK)) {
+						ioc3_cb_post_ncs
+						    (the_port, NCS_BREAK);
+					}
+					len = 1;
+				}
+			}
+			if (*sc & RXSB_DATA_VALID) {
+				*sc &= ~RXSB_DATA_VALID;
+				*buf = entry->ring_data[byte_num];
+				buf++;
+				len--;
+				total++;
+			}
+		}
+
+		/* If we used up this entry entirely, go on to the next one,
+		 * otherwise we must have run out of buffer space, so
+		 * leave the consumer pointer here for the next read in case
+		 * there are still unread bytes in this entry.
+		 */
+		if ((entry->ring_allsc & RING_ANY_VALID) == 0) {
+			cons_ptr += (int)sizeof(struct ring_entry);
+			cons_ptr &= PROD_CONS_MASK;
+		}
+	}
+
+	/* Update consumer pointer and re-arm rx timer interrupt */
+	writel(cons_ptr, &port->ip_serial_regs->srcir);
+	port->ip_rx_cons = cons_ptr;
+
+	/* If we have now dipped below the rx high water mark and we have
+	 * rx_high interrupt turned off, we can now turn it back on again.
+	 */
+	if ((port->ip_flags & INPUT_HIGH) && (((prod_ptr - cons_ptr)
+					       & PROD_CONS_MASK) <
+					      ((port->
+						ip_sscr &
+						SSCR_RX_THRESHOLD)
+					       << PROD_CONS_PTR_OFF))) {
+		port->ip_flags &= ~INPUT_HIGH;
+		enable_intrs(port, hooks->intr_rx_high);
+	}
+	return total;
+}
+
+/**
+ * receive_chars - upper level read.
+ * @the_port: port to read from
+ */
+static int receive_chars(struct uart_port *the_port)
+{
+	struct tty_struct *tty;
+	unsigned char ch[MAX_CHARS];
+	int read_count = 0, read_room, flip = 0;
+	struct uart_info *info = the_port->info;
+	struct ioc3_port *port = get_ioc3_port(the_port);
+	unsigned long pflags;
+
+	/* Make sure all the pointers are "good" ones */
+	if (!info)
+		return 0;
+	if (!info->tty)
+		return 0;
+
+	if (!(port->ip_flags & INPUT_ENABLE))
+		return 0;
+
+	spin_lock_irqsave(&the_port->lock, pflags);
+	tty = info->tty;
+
+	read_count = do_read(the_port, ch, MAX_CHARS);
+	if (read_count > 0) {
+		flip = 1;
+		read_room = tty_buffer_request_room(tty, read_count);
+		tty_insert_flip_string(tty, ch, read_room);
+		the_port->icount.rx += read_count;
+	}
+	spin_unlock_irqrestore(&the_port->lock, pflags);
+
+	if (flip)
+		tty_flip_buffer_push(tty);
+
+	return read_count;
+}
+
+/**
+ * ioc3uart_intr_one - lowest level (per port) interrupt handler.
+ * @is : submodule
+ * @idd: driver data
+ * @pending: interrupts to handle
+ * @regs: pt_regs
+ */
+
+static int inline
+ioc3uart_intr_one(struct ioc3_submodule *is,
+			struct ioc3_driver_data *idd,
+			unsigned int pending, struct pt_regs *regs)
+{
+	int port_num = GET_PORT_FROM_SIO_IR(pending);
+	struct port_hooks *hooks;
+	unsigned int rx_high_rd_aborted = 0;
+	unsigned long flags;
+	struct uart_port *the_port;
+	struct ioc3_port *port;
+	int loop_counter;
+	struct ioc3_card *card_ptr;
+	unsigned int sio_ir;
+
+	card_ptr = idd->data[is->id];
+	port = card_ptr->ic_port[port_num].icp_port;
+	hooks = port->ip_hooks;
+
+	/* Possible race condition here: The tx_mt interrupt bit may be
+	 * cleared without the intervention of the interrupt handler,
+	 * e.g. by a write.  If the top level interrupt handler reads a
+	 * tx_mt, then some other processor does a write, starting up
+	 * output, then we come in here, see the tx_mt and stop DMA, the
+	 * output started by the other processor will hang.  Thus we can
+	 * only rely on tx_mt being legitimate if it is read while the
+	 * port lock is held.  Therefore this bit must be ignored in the
+	 * passed in interrupt mask which was read by the top level
+	 * interrupt handler since the port lock was not held at the time
+	 * it was read.  We can only rely on this bit being accurate if it
+	 * is read while the port lock is held.  So we'll clear it for now,
+	 * and reload it later once we have the port lock.
+	 */
+
+	sio_ir = pending & ~(hooks->intr_tx_mt);
+	spin_lock_irqsave(&port->ip_lock, flags);
+
+	loop_counter = MAXITER;	/* to avoid hangs */
+
+	do {
+		uint32_t shadow;
+
+		if (loop_counter-- <= 0) {
+			printk(KERN_WARNING "IOC3 serial: "
+			       "possible hang condition/"
+			       "port stuck on interrupt (line %d).\n",
+				((struct uart_port *)port->ip_port)->line);
+			break;
+		}
+		/* Handle a DCD change */
+		if (sio_ir & hooks->intr_delta_dcd) {
+			ioc3_ack(is, idd, hooks->intr_delta_dcd);
+			shadow = readl(&port->ip_serial_regs->shadow);
+
+			if ((port->ip_notify & N_DDCD)
+			    && (shadow & SHADOW_DCD)
+			    && (port->ip_port)) {
+				the_port = port->ip_port;
+				uart_handle_dcd_change(the_port,
+						shadow & SHADOW_DCD);
+				wake_up_interruptible
+				    (&the_port->info->delta_msr_wait);
+			} else if ((port->ip_notify & N_DDCD)
+				   && !(shadow & SHADOW_DCD)) {
+				/* Flag delta DCD/no DCD */
+				uart_handle_dcd_change(port->ip_port,
+						shadow & SHADOW_DCD);
+				port->ip_flags |= DCD_ON;
+			}
+		}
+
+		/* Handle a CTS change */
+		if (sio_ir & hooks->intr_delta_cts) {
+			ioc3_ack(is, idd, hooks->intr_delta_cts);
+			shadow = readl(&port->ip_serial_regs->shadow);
+
+			if ((port->ip_notify & N_DCTS) && (port->ip_port)) {
+				the_port = port->ip_port;
+				uart_handle_cts_change(the_port, shadow
+						& SHADOW_CTS);
+				wake_up_interruptible
+				    (&the_port->info->delta_msr_wait);
+			}
+		}
+
+		/* rx timeout interrupt.  Must be some data available.  Put this
+		 * before the check for rx_high since servicing this condition
+		 * may cause that condition to clear.
+		 */
+		if (sio_ir & hooks->intr_rx_timer) {
+			ioc3_ack(is, idd, hooks->intr_rx_timer);
+			if ((port->ip_notify & N_DATA_READY)
+						&& (port->ip_port)) {
+				receive_chars(port->ip_port);
+			}
+		}
+
+		/* rx high interrupt. Must be after rx_timer.  */
+		else if (sio_ir & hooks->intr_rx_high) {
+			/* Data available, notify upper layer */
+			if ((port->ip_notify & N_DATA_READY) && port->ip_port) {
+				receive_chars(port->ip_port);
+			}
+
+			/* We can't ACK this interrupt.  If receive_chars didn't
+			 * cause the condition to clear, we'll have to disable
+			 * the interrupt until the data is drained.
+			 * If the read was aborted, don't disable the interrupt
+			 * as this may cause us to hang indefinitely.  An
+			 * aborted read generally means that this interrupt
+			 * hasn't been delivered to the cpu yet anyway, even
+			 * though we see it as asserted when we read the sio_ir.
+			 */
+			if ((sio_ir = PENDING(card_ptr, idd))
+					& hooks->intr_rx_high) {
+				if (port->ip_flags & READ_ABORTED) {
+					rx_high_rd_aborted++;
+				}
+				else {
+					card_ptr->ic_enable &= ~hooks->intr_rx_high;
+					port->ip_flags |= INPUT_HIGH;
+				}
+			}
+		}
+
+		/* We got a low water interrupt: notify upper layer to
+		 * send more data.  Must come before tx_mt since servicing
+		 * this condition may cause that condition to clear.
+		 */
+		if (sio_ir & hooks->intr_tx_explicit) {
+			port->ip_flags &= ~LOWAT_WRITTEN;
+			ioc3_ack(is, idd, hooks->intr_tx_explicit);
+			if (port->ip_notify & N_OUTPUT_LOWAT)
+				ioc3_cb_output_lowat(port);
+		}
+
+		/* Handle tx_mt.  Must come after tx_explicit.  */
+		else if (sio_ir & hooks->intr_tx_mt) {
+			/* If we are expecting a lowat notification
+			 * and we get to this point it probably means that for
+			 * some reason the tx_explicit didn't work as expected
+			 * (that can legitimately happen if the output buffer is
+			 * filled up in just the right way).
+			 * So send the notification now.
+			 */
+			if (port->ip_notify & N_OUTPUT_LOWAT) {
+				ioc3_cb_output_lowat(port);
+
+				/* We need to reload the sio_ir since the lowat
+				 * call may have caused another write to occur,
+				 * clearing the tx_mt condition.
+				 */
+				sio_ir = PENDING(card_ptr, idd);
+			}
+
+			/* If the tx_mt condition still persists even after the
+			 * lowat call, we've got some work to do.
+			 */
+			if (sio_ir & hooks->intr_tx_mt) {
+				/* If we are not currently expecting DMA input,
+				 * and the transmitter has just gone idle,
+				 * there is no longer any reason for DMA, so
+				 * disable it.
+				 */
+				if (!(port->ip_notify
+				      & (N_DATA_READY | N_DDCD))) {
+					BUG_ON(!(port->ip_sscr
+						 & SSCR_DMA_EN));
+					port->ip_sscr &= ~SSCR_DMA_EN;
+					writel(port->ip_sscr,
+					       &port->ip_serial_regs->sscr);
+				}
+				/* Prevent infinite tx_mt interrupt */
+				card_ptr->ic_enable &= ~hooks->intr_tx_mt;
+			}
+		}
+		sio_ir = PENDING(card_ptr, idd);
+
+		/* if the read was aborted and only hooks->intr_rx_high,
+		 * clear hooks->intr_rx_high, so we do not loop forever.
+		 */
+
+		if (rx_high_rd_aborted && (sio_ir == hooks->intr_rx_high)) {
+			sio_ir &= ~hooks->intr_rx_high;
+		}
+	} while (sio_ir & hooks->intr_all);
+
+	spin_unlock_irqrestore(&port->ip_lock, flags);
+	ioc3_enable(is, idd, card_ptr->ic_enable);
+	return 0;
+}
+
+/**
+ * ioc3uart_intr - field all serial interrupts
+ * @is : submodule
+ * @idd: driver data
+ * @pending: interrupts to handle
+ * @regs: pt_regs
+ *
+ */
+
+static int ioc3uart_intr(struct ioc3_submodule *is,
+			struct ioc3_driver_data *idd,
+			unsigned int pending, struct pt_regs *regs)
+{
+	int ret = 0;
+
+	/*
+	 * The upper level interrupt handler sends interrupts for both ports
+	 * here. So we need to call for each port with its interrupts.
+	 */
+
+	if (pending & SIO_IR_SA)
+		ret |= ioc3uart_intr_one(is, idd, pending & SIO_IR_SA, regs);
+	if (pending & SIO_IR_SB)
+		ret |= ioc3uart_intr_one(is, idd, pending & SIO_IR_SB, regs);
+
+	return ret;
+}
+
+/**
+ * ic3_type
+ * @port: Port to operate with (we ignore since we only have one port)
+ *
+ */
+static const char *ic3_type(struct uart_port *the_port)
+{
+	if (IS_RS232(the_port->line))
+		return "SGI IOC3 Serial [rs232]";
+	else
+		return "SGI IOC3 Serial [rs422]";
+}
+
+/**
+ * ic3_tx_empty - Is the transmitter empty?
+ * @port: Port to operate on
+ *
+ */
+static unsigned int ic3_tx_empty(struct uart_port *the_port)
+{
+	unsigned int ret = 0;
+	struct ioc3_port *port = get_ioc3_port(the_port);
+
+	if (readl(&port->ip_serial_regs->shadow) & SHADOW_TEMT)
+		ret = TIOCSER_TEMT;
+	return ret;
+}
+
+/**
+ * ic3_stop_tx - stop the transmitter
+ * @port: Port to operate on
+ *
+ */
+static void ic3_stop_tx(struct uart_port *the_port)
+{
+	struct ioc3_port *port = get_ioc3_port(the_port);
+
+	if (port)
+		set_notification(port, N_OUTPUT_LOWAT, 0);
+}
+
+/**
+ * ic3_stop_rx - stop the receiver
+ * @port: Port to operate on
+ *
+ */
+static void ic3_stop_rx(struct uart_port *the_port)
+{
+	struct ioc3_port *port = get_ioc3_port(the_port);
+
+	if (port)
+		port->ip_flags &= ~INPUT_ENABLE;
+}
+
+/**
+ * null_void_function
+ * @port: Port to operate on
+ *
+ */
+static void null_void_function(struct uart_port *the_port)
+{
+}
+
+/**
+ * ic3_shutdown - shut down the port - free irq and disable
+ * @port: port to shut down
+ *
+ */
+static void ic3_shutdown(struct uart_port *the_port)
+{
+	unsigned long port_flags;
+	struct ioc3_port *port;
+	struct uart_info *info;
+
+	port = get_ioc3_port(the_port);
+	if (!port)
+		return;
+
+	info = the_port->info;
+	wake_up_interruptible(&info->delta_msr_wait);
+
+	spin_lock_irqsave(&the_port->lock, port_flags);
+	set_notification(port, N_ALL, 0);
+	spin_unlock_irqrestore(&the_port->lock, port_flags);
+}
+
+/**
+ * ic3_set_mctrl - set control lines (dtr, rts, etc)
+ * @port: Port to operate on
+ * @mctrl: Lines to set/unset
+ *
+ */
+static void ic3_set_mctrl(struct uart_port *the_port, unsigned int mctrl)
+{
+	unsigned char mcr = 0;
+
+	if (mctrl & TIOCM_RTS)
+		mcr |= UART_MCR_RTS;
+	if (mctrl & TIOCM_DTR)
+		mcr |= UART_MCR_DTR;
+	if (mctrl & TIOCM_OUT1)
+		mcr |= UART_MCR_OUT1;
+	if (mctrl & TIOCM_OUT2)
+		mcr |= UART_MCR_OUT2;
+	if (mctrl & TIOCM_LOOP)
+		mcr |= UART_MCR_LOOP;
+
+	set_mcr(the_port, mcr, SHADOW_DTR);
+}
+
+/**
+ * ic3_get_mctrl - get control line info
+ * @port: port to operate on
+ *
+ */
+static unsigned int ic3_get_mctrl(struct uart_port *the_port)
+{
+	struct ioc3_port *port = get_ioc3_port(the_port);
+	uint32_t shadow;
+	unsigned int ret = 0;
+
+	if (!port)
+		return 0;
+
+	shadow = readl(&port->ip_serial_regs->shadow);
+	if (shadow & SHADOW_DCD)
+		ret |= TIOCM_CD;
+	if (shadow & SHADOW_DR)
+		ret |= TIOCM_DSR;
+	if (shadow & SHADOW_CTS)
+		ret |= TIOCM_CTS;
+	return ret;
+}
+
+/**
+ * ic3_start_tx - Start transmitter. Called with the_port->lock
+ * @port: Port to operate on
+ *
+ */
+static void ic3_start_tx(struct uart_port *the_port)
+{
+	struct ioc3_port *port = get_ioc3_port(the_port);
+
+	if (port) {
+		set_notification(port, N_OUTPUT_LOWAT, 1);
+		enable_intrs(port, port->ip_hooks->intr_tx_mt);
+	}
+}
+
+/**
+ * ic3_break_ctl - handle breaks
+ * @port: Port to operate on
+ * @break_state: Break state
+ *
+ */
+static void ic3_break_ctl(struct uart_port *the_port, int break_state)
+{
+}
+
+/**
+ * ic3_startup - Start up the serial port - always return 0 (We're always on)
+ * @port: Port to operate on
+ *
+ */
+static int ic3_startup(struct uart_port *the_port)
+{
+	int retval;
+	struct ioc3_port *port;
+	struct ioc3_card *card_ptr;
+	unsigned long port_flags;
+
+	if (!the_port) {
+		NOT_PROGRESS();
+		return -ENODEV;
+	}
+	port = get_ioc3_port(the_port);
+	if (!port) {
+		NOT_PROGRESS();
+		return -ENODEV;
+	}
+	card_ptr = port->ip_card;
+	port->ip_port = the_port;
+
+	if (!card_ptr) {
+		NOT_PROGRESS();
+		return -ENODEV;
+	}
+
+	/* Start up the serial port */
+	spin_lock_irqsave(&the_port->lock, port_flags);
+	retval = ic3_startup_local(the_port);
+	spin_unlock_irqrestore(&the_port->lock, port_flags);
+	return retval;
+}
+
+/**
+ * ic3_set_termios - set termios stuff
+ * @port: port to operate on
+ * @termios: New settings
+ * @termios: Old
+ *
+ */
+static void
+ic3_set_termios(struct uart_port *the_port,
+		struct termios *termios, struct termios *old_termios)
+{
+	unsigned long port_flags;
+
+	spin_lock_irqsave(&the_port->lock, port_flags);
+	ioc3_change_speed(the_port, termios, old_termios);
+	spin_unlock_irqrestore(&the_port->lock, port_flags);
+}
+
+/**
+ * ic3_request_port - allocate resources for port - no op....
+ * @port: port to operate on
+ *
+ */
+static int ic3_request_port(struct uart_port *port)
+{
+	return 0;
+}
+
+/* Associate the uart functions above - given to serial core */
+static struct uart_ops ioc3_ops = {
+	.tx_empty = ic3_tx_empty,
+	.set_mctrl = ic3_set_mctrl,
+	.get_mctrl = ic3_get_mctrl,
+	.stop_tx = ic3_stop_tx,
+	.start_tx = ic3_start_tx,
+	.stop_rx = ic3_stop_rx,
+	.enable_ms = null_void_function,
+	.break_ctl = ic3_break_ctl,
+	.startup = ic3_startup,
+	.shutdown = ic3_shutdown,
+	.set_termios = ic3_set_termios,
+	.type = ic3_type,
+	.release_port = null_void_function,
+	.request_port = ic3_request_port,
+};
+
+/*
+ * Boot-time initialization code
+ */
+
+static struct uart_driver ioc3_uart = {
+	.owner = THIS_MODULE,
+	.driver_name = "ioc3_serial",
+	.dev_name = DEVICE_NAME,
+	.major = DEVICE_MAJOR,
+	.minor = DEVICE_MINOR,
+	.nr = MAX_LOGICAL_PORTS
+};
+
+/**
+ * ioc3_serial_core_attach - register with serial core
+ *		This is done during pci probing
+ * @is: submodule struct for this
+ * @idd: handle for this card
+ */
+static inline int ioc3_serial_core_attach( struct ioc3_submodule *is,
+				struct ioc3_driver_data *idd)
+{
+	struct ioc3_port *port;
+	struct uart_port *the_port;
+	struct ioc3_card *card_ptr = idd->data[is->id];
+	int ii, phys_port;
+	struct pci_dev *pdev = idd->pdev;
+
+	DPRINT_CONFIG(("%s: attach pdev 0x%p - card_ptr 0x%p\n",
+		       __FUNCTION__, pdev, (void *)card_ptr));
+
+	if (!card_ptr)
+		return -ENODEV;
+
+	/* once around for each logical port on this card */
+	for (ii = 0; ii < LOGICAL_PORTS_PER_CARD; ii++) {
+		phys_port = GET_PHYSICAL_PORT(ii);
+		the_port = &card_ptr->ic_port[phys_port].
+				icp_uart_port[GET_LOGICAL_PORT(ii)];
+		port = card_ptr->ic_port[phys_port].icp_port;
+		port->ip_port = the_port;
+
+		DPRINT_CONFIG(("%s: attach the_port 0x%p / port 0x%p [%d/%d]\n",
+			__FUNCTION__, (void *)the_port, (void *)port,
+				phys_port, ii));
+
+		/* membase, iobase and mapbase just need to be non-0 */
+		the_port->membase = (unsigned char __iomem *)1;
+		the_port->iobase = (pdev->bus->number << 16) |  ii;
+		the_port->line = (Num_of_ioc3_cards << 2) | ii;
+		the_port->mapbase = 1;
+		the_port->type = PORT_16550A;
+		the_port->fifosize = FIFO_SIZE;
+		the_port->ops = &ioc3_ops;
+		the_port->irq = idd->irq_io;
+		the_port->dev = &pdev->dev;
+
+		if (uart_add_one_port(&ioc3_uart, the_port) < 0) {
+			printk(KERN_WARNING
+		          "%s: unable to add port %d bus %d\n",
+			       __FUNCTION__, the_port->line, pdev->bus->number);
+		} else {
+			DPRINT_CONFIG(("IOC3 serial port %d irq %d bus %d\n",
+		          the_port->line, the_port->irq, pdev->bus->number));
+		}
+
+		/* all ports are rs232 for now */
+		if (IS_PHYSICAL_PORT(ii))
+			ioc3_set_proto(port, PROTO_RS232);
+	}
+	return 0;
+}
+
+/**
+ * ioc3uart_remove - register detach function
+ * @is: submodule struct for this submodule
+ * @idd: ioc3 driver data for this submodule
+ */
+
+static int ioc3uart_remove(struct ioc3_submodule *is,
+			struct ioc3_driver_data *idd)
+{
+	struct ioc3_card *card_ptr = idd->data[is->id];
+	struct uart_port *the_port;
+	struct ioc3_port *port;
+	int ii;
+
+	if (card_ptr) {
+		for (ii = 0; ii < LOGICAL_PORTS_PER_CARD; ii++) {
+			the_port = &card_ptr->ic_port[GET_PHYSICAL_PORT(ii)].
+					icp_uart_port[GET_LOGICAL_PORT(ii)];
+			if (the_port)
+				uart_remove_one_port(&ioc3_uart, the_port);
+			port = card_ptr->ic_port[GET_PHYSICAL_PORT(ii)].icp_port;
+			if (port && IS_PHYSICAL_PORT(ii)
+					&& (GET_PHYSICAL_PORT(ii) == 0)) {
+				pci_free_consistent(port->ip_idd->pdev,
+					TOTAL_RING_BUF_SIZE,
+					(void *)port->ip_cpu_ringbuf,
+					port->ip_dma_ringbuf);
+				kfree(port);
+				card_ptr->ic_port[GET_PHYSICAL_PORT(ii)].
+							icp_port = NULL;
+			}
+		}
+		kfree(card_ptr);
+		idd->data[is->id] = NULL;
+	}
+	return 0;
+}
+
+/**
+ * ioc3uart_probe - card probe function called from shim driver
+ * @is: submodule struct for this submodule
+ * @idd: ioc3 driver data for this card
+ */
+
+static int __devinit
+ioc3uart_probe(struct ioc3_submodule *is, struct ioc3_driver_data *idd)
+{
+	struct pci_dev *pdev = idd->pdev;
+	struct ioc3_card *card_ptr;
+	int ret = 0;
+	struct ioc3_port *port;
+	struct ioc3_port *ports[PORTS_PER_CARD];
+	int phys_port;
+
+	DPRINT_CONFIG(("%s (0x%p, 0x%p)\n", __FUNCTION__, is, idd));
+
+	card_ptr = kmalloc(sizeof(struct ioc3_card), GFP_KERNEL);
+	if (!card_ptr) {
+		printk(KERN_WARNING "ioc3_attach_one"
+		       ": unable to get memory for the IOC3\n");
+		return -ENOMEM;
+	}
+	memset(card_ptr, 0, sizeof(struct ioc3_card));
+	idd->data[is->id] = card_ptr;
+	Submodule_slot = is->id;
+
+	writel(((UARTA_BASE >> 3) << SIO_CR_SER_A_BASE_SHIFT) |
+		((UARTB_BASE >> 3) << SIO_CR_SER_B_BASE_SHIFT) |
+		(0xf << SIO_CR_CMD_PULSE_SHIFT), &idd->vma->sio_cr);
+
+	pci_write_config_dword(pdev, PCI_LAT, 0xff00);
+
+	/* Enable serial port mode select generic PIO pins as outputs */
+	ioc3_gpcr_set(idd, GPCR_UARTA_MODESEL | GPCR_UARTB_MODESEL);
+
+	/* Create port structures for each port */
+	for (phys_port = 0; phys_port < PORTS_PER_CARD; phys_port++) {
+		port = kmalloc(sizeof(struct ioc3_port), GFP_KERNEL);
+		if (!port) {
+			printk(KERN_WARNING
+			       "IOC3 serial memory not available for port\n");
+			goto out4;
+		}
+		memset(port, 0, sizeof(struct ioc3_port));
+		spin_lock_init(&port->ip_lock);
+
+		/* we need to remember the previous ones, to point back to
+		 * them farther down - setting up the ring buffers.
+		 */
+		ports[phys_port] = port;
+
+		/* init to something useful */
+		card_ptr->ic_port[phys_port].icp_port = port;
+		port->ip_is = is;
+		port->ip_idd = idd;
+		port->ip_baud = 9600;
+		port->ip_card = card_ptr;
+		port->ip_hooks = &hooks_array[phys_port];
+
+		/* Setup each port */
+		if (phys_port == 0) {
+			port->ip_serial_regs = &idd->vma->port_a;
+			port->ip_uart_regs = &idd->vma->sregs.uarta;
+
+			DPRINT_CONFIG(("%s : Port A ip_serial_regs 0x%p "
+				       "ip_uart_regs 0x%p\n",
+				       __FUNCTION__,
+				       (void *)port->ip_serial_regs,
+				       (void *)port->ip_uart_regs));
+
+			/* setup ring buffers */
+			port->ip_cpu_ringbuf = pci_alloc_consistent(pdev,
+				TOTAL_RING_BUF_SIZE, &port->ip_dma_ringbuf);
+
+			BUG_ON(!((((int64_t) port->ip_dma_ringbuf) &
+				  (TOTAL_RING_BUF_SIZE - 1)) == 0));
+			port->ip_inring = RING(port, RX_A);
+			port->ip_outring = RING(port, TX_A);
+			DPRINT_CONFIG(("%s : Port A ip_cpu_ringbuf 0x%p "
+				       "ip_dma_ringbuf 0x%p, ip_inring 0x%p "
+					"ip_outring 0x%p\n",
+				       __FUNCTION__,
+				       (void *)port->ip_cpu_ringbuf,
+				       (void *)port->ip_dma_ringbuf,
+				       (void *)port->ip_inring,
+				       (void *)port->ip_outring));
+		}
+		else {
+			port->ip_serial_regs = &idd->vma->port_b;
+			port->ip_uart_regs = &idd->vma->sregs.uartb;
+
+			DPRINT_CONFIG(("%s : Port B ip_serial_regs 0x%p "
+				       "ip_uart_regs 0x%p\n",
+				       __FUNCTION__,
+				       (void *)port->ip_serial_regs,
+				       (void *)port->ip_uart_regs));
+
+			/* share the ring buffers */
+			port->ip_dma_ringbuf =
+			    ports[phys_port - 1]->ip_dma_ringbuf;
+			port->ip_cpu_ringbuf =
+			    ports[phys_port - 1]->ip_cpu_ringbuf;
+			port->ip_inring = RING(port, RX_B);
+			port->ip_outring = RING(port, TX_B);
+			DPRINT_CONFIG(("%s : Port B ip_cpu_ringbuf 0x%p "
+				       "ip_dma_ringbuf 0x%p, ip_inring 0x%p "
+					"ip_outring 0x%p\n",
+				       __FUNCTION__,
+				       (void *)port->ip_cpu_ringbuf,
+				       (void *)port->ip_dma_ringbuf,
+				       (void *)port->ip_inring,
+				       (void *)port->ip_outring));
+		}
+
+		DPRINT_CONFIG(("%s : port %d [addr 0x%p] card_ptr 0x%p",
+			       __FUNCTION__,
+			       phys_port, (void *)port, (void *)card_ptr));
+		DPRINT_CONFIG((" ip_serial_regs 0x%p ip_uart_regs 0x%p\n",
+			       (void *)port->ip_serial_regs,
+			       (void *)port->ip_uart_regs));
+
+		/* Initialize the hardware for IOC3 */
+		port_init(port);
+
+		DPRINT_CONFIG(("%s: phys_port %d port 0x%p inring 0x%p "
+			       "outring 0x%p\n",
+			       __FUNCTION__,
+			       phys_port, (void *)port,
+			       (void *)port->ip_inring,
+			       (void *)port->ip_outring));
+
+	}
+
+	/* register port with the serial core */
+
+	if ((ret = ioc3_serial_core_attach(is, idd)))
+		goto out4;
+
+	Num_of_ioc3_cards++;
+
+	return ret;
+
+	/* error exits that give back resources */
+out4:
+	kfree(card_ptr);
+	return ret;
+}
+
+static struct ioc3_submodule ioc3uart_submodule = {
+	.name = "IOC3uart",
+	.probe = ioc3uart_probe,
+	.remove = ioc3uart_remove,
+	/* call .intr for both ports initially */
+	.irq_mask = SIO_IR_SA | SIO_IR_SB,
+	.intr = ioc3uart_intr,
+	.owner = THIS_MODULE,
+};
+
+/**
+ * ioc3_detect - module init called,
+ */
+static int __devinit ioc3uart_init(void)
+{
+	int ret;
+
+	/* register with serial core */
+	if ((ret = uart_register_driver(&ioc3_uart)) < 0) {
+		printk(KERN_WARNING
+		       "%s: Couldn't register IOC3 uart serial driver\n",
+		       __FUNCTION__);
+		return ret;
+	}
+	ret = ioc3_register_submodule(&ioc3uart_submodule);
+	if (ret)
+		uart_unregister_driver(&ioc3_uart);
+	return ret;
+}
+
+static void __devexit ioc3uart_exit(void)
+{
+	ioc3_unregister_submodule(&ioc3uart_submodule);
+	uart_unregister_driver(&ioc3_uart);
+}
+
+module_init(ioc3uart_init);
+module_exit(ioc3uart_exit);
+
+MODULE_AUTHOR("Pat Gefre - Silicon Graphics Inc. (SGI) <pfg@sgi.com>");
+MODULE_DESCRIPTION("Serial PCI driver module for SGI IOC3 card");
+MODULE_LICENSE("GPL");
diff --git a/drivers/sn/Kconfig b/drivers/sn/Kconfig
index 13b8d249da5c..d95265b187a3 100644
--- a/drivers/sn/Kconfig
+++ b/drivers/sn/Kconfig
@@ -17,4 +17,18 @@ config SGI_IOC4
 	If you have an SGI Altix with an IOC4-based
 	I/O controller say Y.  Otherwise say N.
 
+config SGI_IOC3
+	tristate "SGI IOC3 Base IO support"
+	depends on (IA64_GENERIC || IA64_SGI_SN2)
+	default m
+	---help---
+	This option enables basic support for the SGI IOC3-based Base IO
+	controller card.  This option does not enable any specific
+	functions on such a card, but provides necessary infrastructure
+	for other drivers to utilize.
+
+	If you have an SGI Altix with an IOC3-based
+	I/O controller or a PCI IOC3 serial card say Y.
+	Otherwise say N.
+
 endmenu
diff --git a/drivers/sn/Makefile b/drivers/sn/Makefile
index c2a284185372..2cda011597c0 100644
--- a/drivers/sn/Makefile
+++ b/drivers/sn/Makefile
@@ -4,3 +4,4 @@
 #
 
 obj-$(CONFIG_SGI_IOC4) += ioc4.o
+obj-$(CONFIG_SGI_IOC3) += ioc3.o
diff --git a/drivers/sn/ioc3.c b/drivers/sn/ioc3.c
new file mode 100644
index 000000000000..aaa009f4a7bf
--- /dev/null
+++ b/drivers/sn/ioc3.c
@@ -0,0 +1,851 @@
+/*
+ * SGI IOC3 master driver and IRQ demuxer
+ *
+ * Copyright (c) 2005 Stanislaw Skowronek <skylark@linux-mips.org>
+ * Heavily based on similar work by:
+ *   Brent Casavant <bcasavan@sgi.com> - IOC4 master driver
+ *   Pat Gefre <pfg@sgi.com> - IOC3 serial port IRQ demuxer
+ */
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/ioc3.h>
+#include <linux/rwsem.h>
+
+#define IOC3_PCI_SIZE 0x100000
+
+static LIST_HEAD(ioc3_devices);
+static int ioc3_counter;
+static DECLARE_RWSEM(ioc3_devices_rwsem);
+
+static struct ioc3_submodule *ioc3_submodules[IOC3_MAX_SUBMODULES];
+static struct ioc3_submodule *ioc3_ethernet;
+static rwlock_t ioc3_submodules_lock = RW_LOCK_UNLOCKED;
+
+/* NIC probing code */
+
+#define GPCR_MLAN_EN    0x00200000      /* enable MCR to pin 8 */
+
+static inline unsigned mcr_pack(unsigned pulse, unsigned sample)
+{
+	return (pulse << 10) | (sample << 2);
+}
+
+static int nic_wait(struct ioc3_driver_data *idd)
+{
+	volatile unsigned mcr;
+
+        do {
+                mcr = (volatile unsigned)idd->vma->mcr;
+        } while (!(mcr & 2));
+
+        return mcr & 1;
+}
+
+static int nic_reset(struct ioc3_driver_data *idd)
+{
+        int presence;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	idd->vma->mcr = mcr_pack(500, 65);
+	presence = nic_wait(idd);
+	local_irq_restore(flags);
+
+	udelay(500);
+
+        return presence;
+}
+
+static inline int nic_read_bit(struct ioc3_driver_data *idd)
+{
+	int result;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	idd->vma->mcr = mcr_pack(6, 13);
+	result = nic_wait(idd);
+	local_irq_restore(flags);
+
+	udelay(500);
+
+	return result;
+}
+
+static inline void nic_write_bit(struct ioc3_driver_data *idd, int bit)
+{
+	if (bit)
+		idd->vma->mcr = mcr_pack(6, 110);
+	else
+		idd->vma->mcr = mcr_pack(80, 30);
+
+	nic_wait(idd);
+}
+
+static unsigned nic_read_byte(struct ioc3_driver_data *idd)
+{
+	unsigned result = 0;
+	int i;
+
+	for (i = 0; i < 8; i++)
+		result = (result >> 1) | (nic_read_bit(idd) << 7);
+
+	return result;
+}
+
+static void nic_write_byte(struct ioc3_driver_data *idd, int byte)
+{
+	int i, bit;
+
+	for (i = 8; i; i--) {
+		bit = byte & 1;
+		byte >>= 1;
+
+		nic_write_bit(idd, bit);
+	}
+}
+
+static unsigned long
+nic_find(struct ioc3_driver_data *idd, int *last, unsigned long addr)
+{
+	int a, b, index, disc;
+
+	nic_reset(idd);
+
+	/* Search ROM.  */
+	nic_write_byte(idd, 0xF0);
+
+	/* Algorithm from ``Book of iButton Standards''.  */
+	for (index = 0, disc = 0; index < 64; index++) {
+		a = nic_read_bit(idd);
+		b = nic_read_bit(idd);
+
+		if (a && b) {
+			printk(KERN_WARNING "IOC3 NIC search failed.\n");
+			*last = 0;
+			return 0;
+		}
+
+		if (!a && !b) {
+			if (index == *last) {
+				addr |= 1UL << index;
+			} else if (index > *last) {
+				addr &= ~(1UL << index);
+				disc = index;
+			} else if ((addr & (1UL << index)) == 0)
+				disc = index;
+			nic_write_bit(idd, (addr>>index)&1);
+			continue;
+		} else {
+			if (a)
+				addr |= 1UL << index;
+			else
+				addr &= ~(1UL << index);
+			nic_write_bit(idd, a);
+			continue;
+		}
+	}
+	*last = disc;
+	return addr;
+}
+
+static void nic_addr(struct ioc3_driver_data *idd, unsigned long addr)
+{
+	int index;
+
+	nic_reset(idd);
+	nic_write_byte(idd, 0xF0);
+	for (index = 0; index < 64; index++) {
+		nic_read_bit(idd);
+		nic_read_bit(idd);
+		nic_write_bit(idd, (addr>>index)&1);
+	}
+}
+
+static void crc16_byte(unsigned int *crc, unsigned char db)
+{
+	int i;
+
+	for(i=0;i<8;i++) {
+		*crc <<= 1;
+		if((db^(*crc>>16)) & 1)
+			*crc ^= 0x8005;
+		db >>= 1;
+	}
+	*crc &= 0xFFFF;
+}
+
+static unsigned int crc16_area(unsigned char *dbs, int size, unsigned int crc)
+{
+	while(size--)
+		crc16_byte(&crc, *(dbs++));
+	return crc;
+}
+
+static void crc8_byte(unsigned int *crc, unsigned char db)
+{
+	int i,f;
+
+	for(i=0;i<8;i++) {
+		f = (*crc ^ db) & 1;
+		*crc >>= 1;
+		db >>= 1;
+		if(f)
+			*crc ^= 0x8c;
+	}
+	*crc &= 0xff;
+}
+
+static unsigned int crc8_addr(unsigned long addr)
+{
+	int i;
+	unsigned int crc = 0x00;
+
+	for(i=0;i<8;i++)
+		crc8_byte(&crc, addr>>(i<<3));
+	return crc;
+}
+
+static void
+read_redir_page(struct ioc3_driver_data *idd, unsigned long addr, int page,
+			unsigned char *redir, unsigned char *data)
+{
+	int loops = 16, i;
+
+	while(redir[page] != 0xFF) {
+		page = redir[page]^0xFF;
+		loops--;
+		if(loops<0) {
+			printk(KERN_ERR "IOC3: NIC circular redirection\n");
+			return;
+		}
+	}
+	loops = 3;
+	while(loops>0) {
+		nic_addr(idd, addr);
+		nic_write_byte(idd, 0xF0);
+		nic_write_byte(idd, (page << 5) & 0xE0);
+		nic_write_byte(idd, (page >> 3) & 0x1F);
+		for(i=0;i<0x20;i++)
+			data[i] = nic_read_byte(idd);
+		if(crc16_area(data, 0x20, 0x0000) == 0x800d)
+			return;
+		loops--;
+	}
+	printk(KERN_ERR "IOC3: CRC error in data page\n");
+	for(i=0;i<0x20;i++)
+		data[i] = 0x00;
+}
+
+static void
+read_redir_map(struct ioc3_driver_data *idd, unsigned long addr,
+					 unsigned char *redir)
+{
+	int i,j,loops = 3,crc_ok;
+	unsigned int crc;
+
+	while(loops>0) {
+		crc_ok = 1;
+		nic_addr(idd, addr);
+		nic_write_byte(idd, 0xAA);
+		nic_write_byte(idd, 0x00);
+		nic_write_byte(idd, 0x01);
+		for(i=0;i<64;i+=8) {
+			for(j=0;j<8;j++)
+				redir[i+j] = nic_read_byte(idd);
+			crc = crc16_area(redir+i, 8, (i==0)?0x8707:0x0000);
+			crc16_byte(&crc, nic_read_byte(idd));
+			crc16_byte(&crc, nic_read_byte(idd));
+			if(crc != 0x800d)
+				crc_ok = 0;
+		}
+		if(crc_ok)
+			return;
+		loops--;
+	}
+	printk(KERN_ERR "IOC3: CRC error in redirection page\n");
+	for(i=0;i<64;i++)
+		redir[i] = 0xFF;
+}
+
+static void read_nic(struct ioc3_driver_data *idd, unsigned long addr)
+{
+	unsigned char redir[64];
+	unsigned char data[64],part[32];
+	int i,j;
+
+	/* read redirections */
+	read_redir_map(idd, addr, redir);
+	/* read data pages */
+	read_redir_page(idd, addr, 0, redir, data);
+	read_redir_page(idd, addr, 1, redir, data+32);
+	/* assemble the part # */
+	j=0;
+	for(i=0;i<19;i++)
+		if(data[i+11] != ' ')
+			part[j++] = data[i+11];
+	for(i=0;i<6;i++)
+		if(data[i+32] != ' ')
+			part[j++] = data[i+32];
+	part[j] = 0;
+	/* skip Octane power supplies */
+	if(!strncmp(part, "060-0035-", 9))
+		return;
+	if(!strncmp(part, "060-0038-", 9))
+		return;
+	strcpy(idd->nic_part, part);
+	/* assemble the serial # */
+	j=0;
+	for(i=0;i<10;i++)
+		if(data[i+1] != ' ')
+			idd->nic_serial[j++] = data[i+1];
+	idd->nic_serial[j] = 0;
+}
+
+static void read_mac(struct ioc3_driver_data *idd, unsigned long addr)
+{
+	int i, loops = 3;
+	unsigned char data[13];
+
+	while(loops>0) {
+		nic_addr(idd, addr);
+		nic_write_byte(idd, 0xF0);
+		nic_write_byte(idd, 0x00);
+		nic_write_byte(idd, 0x00);
+		nic_read_byte(idd);
+		for(i=0;i<13;i++)
+			data[i] = nic_read_byte(idd);
+		if(crc16_area(data, 13, 0x0000) == 0x800d) {
+			for(i=10;i>4;i--)
+				idd->nic_mac[10-i] = data[i];
+			return;
+		}
+		loops--;
+	}
+	printk(KERN_ERR "IOC3: CRC error in MAC address\n");
+	for(i=0;i<6;i++)
+		idd->nic_mac[i] = 0x00;
+}
+
+static void probe_nic(struct ioc3_driver_data *idd)
+{
+        int save = 0, loops = 3;
+        unsigned long first, addr;
+
+        idd->vma->gpcr_s = GPCR_MLAN_EN;
+
+        while(loops>0) {
+                idd->nic_part[0] = 0;
+                idd->nic_serial[0] = 0;
+                addr = first = nic_find(idd, &save, 0);
+                if(!first)
+                        return;
+                while(1) {
+                        if(crc8_addr(addr))
+                                break;
+                        else {
+                                switch(addr & 0xFF) {
+                                case 0x0B:
+                                        read_nic(idd, addr);
+                                        break;
+                                case 0x09:
+                                case 0x89:
+                                case 0x91:
+                                        read_mac(idd, addr);
+                                        break;
+                                }
+                        }
+                        addr = nic_find(idd, &save, addr);
+                        if(addr == first)
+                                return;
+                }
+                loops--;
+        }
+        printk(KERN_ERR "IOC3: CRC error in NIC address\n");
+}
+
+/* Interrupts */
+
+static inline void
+write_ireg(struct ioc3_driver_data *idd, uint32_t val, int which)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&idd->ir_lock, flags);
+	switch (which) {
+	case IOC3_W_IES:
+		writel(val, &idd->vma->sio_ies);
+		break;
+	case IOC3_W_IEC:
+		writel(val, &idd->vma->sio_iec);
+		break;
+	}
+	spin_unlock_irqrestore(&idd->ir_lock, flags);
+}
+static inline uint32_t get_pending_intrs(struct ioc3_driver_data *idd)
+{
+	unsigned long flag;
+	uint32_t intrs = 0;
+
+	spin_lock_irqsave(&idd->ir_lock, flag);
+	intrs = readl(&idd->vma->sio_ir);
+	intrs &= readl(&idd->vma->sio_ies);
+	spin_unlock_irqrestore(&idd->ir_lock, flag);
+	return intrs;
+}
+
+static irqreturn_t ioc3_intr_io(int irq, void *arg, struct pt_regs *regs)
+{
+	unsigned long flags;
+	struct ioc3_driver_data *idd = (struct ioc3_driver_data *)arg;
+	int handled = 1, id;
+	unsigned int pending;
+
+	read_lock_irqsave(&ioc3_submodules_lock, flags);
+
+	if(idd->dual_irq && idd->vma->eisr) {
+		/* send Ethernet IRQ to the driver */
+		if(ioc3_ethernet && idd->active[ioc3_ethernet->id] &&
+						ioc3_ethernet->intr) {
+			handled = handled && !ioc3_ethernet->intr(ioc3_ethernet,
+							idd, 0, regs);
+		}
+	}
+	pending = get_pending_intrs(idd);	/* look at the IO IRQs */
+
+	for(id=0;id<IOC3_MAX_SUBMODULES;id++) {
+		if(idd->active[id] && ioc3_submodules[id]
+				&& (pending & ioc3_submodules[id]->irq_mask)
+				&& ioc3_submodules[id]->intr) {
+			write_ireg(idd, ioc3_submodules[id]->irq_mask,
+							IOC3_W_IEC);
+			if(!ioc3_submodules[id]->intr(ioc3_submodules[id],
+				   idd, pending & ioc3_submodules[id]->irq_mask,
+					regs))
+				pending &= ~ioc3_submodules[id]->irq_mask;
+			if (ioc3_submodules[id]->reset_mask)
+				write_ireg(idd, ioc3_submodules[id]->irq_mask,
+							IOC3_W_IES);
+		}
+	}
+	read_unlock_irqrestore(&ioc3_submodules_lock, flags);
+	if(pending) {
+		printk(KERN_WARNING
+		  "IOC3: Pending IRQs 0x%08x discarded and disabled\n",pending);
+		write_ireg(idd, pending, IOC3_W_IEC);
+		handled = 1;
+	}
+	return handled?IRQ_HANDLED:IRQ_NONE;
+}
+
+static irqreturn_t ioc3_intr_eth(int irq, void *arg, struct pt_regs *regs)
+{
+	unsigned long flags;
+	struct ioc3_driver_data *idd = (struct ioc3_driver_data *)arg;
+	int handled = 1;
+
+	if(!idd->dual_irq)
+		return IRQ_NONE;
+	read_lock_irqsave(&ioc3_submodules_lock, flags);
+	if(ioc3_ethernet && idd->active[ioc3_ethernet->id]
+				&& ioc3_ethernet->intr)
+		handled = handled && !ioc3_ethernet->intr(ioc3_ethernet, idd, 0,
+								regs);
+	read_unlock_irqrestore(&ioc3_submodules_lock, flags);
+	return handled?IRQ_HANDLED:IRQ_NONE;
+}
+
+void ioc3_enable(struct ioc3_submodule *is,
+				struct ioc3_driver_data *idd, unsigned int irqs)
+{
+	write_ireg(idd, irqs & is->irq_mask, IOC3_W_IES);
+}
+
+void ioc3_ack(struct ioc3_submodule *is, struct ioc3_driver_data *idd,
+				unsigned int irqs)
+{
+	writel(irqs & is->irq_mask, &idd->vma->sio_ir);
+}
+
+void ioc3_disable(struct ioc3_submodule *is,
+				struct ioc3_driver_data *idd, unsigned int irqs)
+{
+	write_ireg(idd, irqs & is->irq_mask, IOC3_W_IEC);
+}
+
+void ioc3_gpcr_set(struct ioc3_driver_data *idd, unsigned int val)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&idd->gpio_lock, flags);
+	writel(val, &idd->vma->gpcr_s);
+	spin_unlock_irqrestore(&idd->gpio_lock, flags);
+}
+
+/* Keep it simple, stupid! */
+static int find_slot(void **tab, int max)
+{
+	int i;
+	for(i=0;i<max;i++)
+		if(!(tab[i]))
+			return i;
+	return -1;
+}
+
+/* Register an IOC3 submodule */
+int ioc3_register_submodule(struct ioc3_submodule *is)
+{
+	struct ioc3_driver_data *idd;
+	int alloc_id;
+	unsigned long flags;
+
+	write_lock_irqsave(&ioc3_submodules_lock, flags);
+	alloc_id = find_slot((void **)ioc3_submodules, IOC3_MAX_SUBMODULES);
+	if(alloc_id != -1) {
+		ioc3_submodules[alloc_id] = is;
+		if(is->ethernet) {
+			if(ioc3_ethernet==NULL)
+				ioc3_ethernet=is;
+			else
+				printk(KERN_WARNING
+				  "IOC3 Ethernet module already registered!\n");
+		}
+	}
+	write_unlock_irqrestore(&ioc3_submodules_lock, flags);
+
+	if(alloc_id == -1) {
+		printk(KERN_WARNING "Increase IOC3_MAX_SUBMODULES!\n");
+		return -ENOMEM;
+	}
+
+	is->id=alloc_id;
+
+	/* Initialize submodule for each IOC3 */
+	if (!is->probe)
+		return 0;
+
+	down_read(&ioc3_devices_rwsem);
+	list_for_each_entry(idd, &ioc3_devices, list) {
+		/* set to 1 for IRQs in probe */
+		idd->active[alloc_id] = 1;
+		idd->active[alloc_id] = !is->probe(is, idd);
+	}
+	up_read(&ioc3_devices_rwsem);
+
+	return 0;
+}
+
+/* Unregister an IOC3 submodule */
+void ioc3_unregister_submodule(struct ioc3_submodule *is)
+{
+	struct ioc3_driver_data *idd;
+	unsigned long flags;
+
+	write_lock_irqsave(&ioc3_submodules_lock, flags);
+	if(ioc3_submodules[is->id]==is)
+		ioc3_submodules[is->id]=NULL;
+	else
+		printk(KERN_WARNING
+			"IOC3 submodule %s has wrong ID.\n",is->name);
+	if(ioc3_ethernet==is)
+		ioc3_ethernet = NULL;
+	write_unlock_irqrestore(&ioc3_submodules_lock, flags);
+
+	/* Remove submodule for each IOC3 */
+	down_read(&ioc3_devices_rwsem);
+	list_for_each_entry(idd, &ioc3_devices, list)
+		if(idd->active[is->id]) {
+			if(is->remove)
+				if(is->remove(is, idd))
+					printk(KERN_WARNING
+					       "%s: IOC3 submodule %s remove failed "
+					       "for pci_dev %s.\n",
+					       __FUNCTION__, module_name(is->owner),
+					       pci_name(idd->pdev));
+			idd->active[is->id] = 0;
+			if(is->irq_mask)
+				write_ireg(idd, is->irq_mask, IOC3_W_IEC);
+		}
+	up_read(&ioc3_devices_rwsem);
+}
+
+/*********************
+ * Device management *
+ *********************/
+
+static char *
+ioc3_class_names[]={"unknown", "IP27 BaseIO", "IP30 system", "MENET 1/2/3",
+			"MENET 4", "CADduo", "Altix Serial"};
+
+static int ioc3_class(struct ioc3_driver_data *idd)
+{
+	int res = IOC3_CLASS_NONE;
+	/* NIC-based logic */
+	if(!strncmp(idd->nic_part, "030-0891-", 9))
+		res = IOC3_CLASS_BASE_IP30;
+	if(!strncmp(idd->nic_part, "030-1155-", 9))
+		res = IOC3_CLASS_CADDUO;
+	if(!strncmp(idd->nic_part, "030-1657-", 9))
+		res = IOC3_CLASS_SERIAL;
+	if(!strncmp(idd->nic_part, "030-1664-", 9))
+		res = IOC3_CLASS_SERIAL;
+	/* total random heuristics */
+#ifdef CONFIG_SGI_IP27
+	if(!idd->nic_part[0])
+		res = IOC3_CLASS_BASE_IP27;
+#endif
+	/* print educational message */
+	printk(KERN_INFO "IOC3 part: [%s], serial: [%s] => class %s\n",
+			idd->nic_part, idd->nic_serial, ioc3_class_names[res]);
+	return res;
+}
+/* Adds a new instance of an IOC3 card */
+static int ioc3_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id)
+{
+	struct ioc3_driver_data *idd;
+	uint32_t pcmd;
+	int ret, id;
+
+	/* Enable IOC3 and take ownership of it */
+	if ((ret = pci_enable_device(pdev))) {
+		printk(KERN_WARNING
+		       "%s: Failed to enable IOC3 device for pci_dev %s.\n",
+		       __FUNCTION__, pci_name(pdev));
+		goto out;
+	}
+	pci_set_master(pdev);
+
+#ifdef USE_64BIT_DMA
+        ret = pci_set_dma_mask(pdev, 0xffffffffffffffffULL);
+        if (!ret) {
+                ret = pci_set_consistent_dma_mask(pdev, 0xffffffffffffffffULL);
+                if (ret < 0) {
+                        printk(KERN_WARNING "%s: Unable to obtain 64 bit DMA "
+                               "for consistent allocations\n",
+				__FUNCTION__);
+                }
+	}
+#endif
+
+	/* Set up per-IOC3 data */
+	idd = kmalloc(sizeof(struct ioc3_driver_data), GFP_KERNEL);
+	if (!idd) {
+		printk(KERN_WARNING
+		       "%s: Failed to allocate IOC3 data for pci_dev %s.\n",
+		       __FUNCTION__, pci_name(pdev));
+		ret = -ENODEV;
+		goto out_idd;
+	}
+	memset(idd, 0, sizeof(struct ioc3_driver_data));
+	spin_lock_init(&idd->ir_lock);
+	spin_lock_init(&idd->gpio_lock);
+	idd->pdev = pdev;
+
+	/* Map all IOC3 registers.  These are shared between subdevices
+	 * so the main IOC3 module manages them.
+	 */
+	idd->pma = pci_resource_start(pdev, 0);
+	if (!idd->pma) {
+		printk(KERN_WARNING
+		       "%s: Unable to find IOC3 resource "
+		       "for pci_dev %s.\n",
+		       __FUNCTION__, pci_name(pdev));
+		ret = -ENODEV;
+		goto out_pci;
+	}
+	if (!request_region(idd->pma, IOC3_PCI_SIZE, "ioc3")) {
+		printk(KERN_WARNING
+		       "%s: Unable to request IOC3 region "
+		       "for pci_dev %s.\n",
+		       __FUNCTION__, pci_name(pdev));
+		ret = -ENODEV;
+		goto out_pci;
+	}
+	idd->vma = ioremap(idd->pma, IOC3_PCI_SIZE);
+	if (!idd->vma) {
+		printk(KERN_WARNING
+		       "%s: Unable to remap IOC3 region "
+		       "for pci_dev %s.\n",
+		       __FUNCTION__, pci_name(pdev));
+		ret = -ENODEV;
+		goto out_misc_region;
+	}
+
+	/* Track PCI-device specific data */
+	pci_set_drvdata(pdev, idd);
+	down_write(&ioc3_devices_rwsem);
+	list_add(&idd->list, &ioc3_devices);
+	idd->id = ioc3_counter++;
+	up_write(&ioc3_devices_rwsem);
+
+	idd->gpdr_shadow = idd->vma->gpdr;
+
+	/* Read IOC3 NIC contents */
+	probe_nic(idd);
+
+	/* Detect IOC3 class */
+	idd->class = ioc3_class(idd);
+
+	/* Initialize IOC3 */
+       pci_read_config_dword(pdev, PCI_COMMAND, &pcmd);
+       pci_write_config_dword(pdev, PCI_COMMAND,
+                               pcmd | PCI_COMMAND_MEMORY |
+                               PCI_COMMAND_PARITY | PCI_COMMAND_SERR |
+                               PCI_SCR_DROP_MODE_EN);
+
+	write_ireg(idd, ~0, IOC3_W_IEC);
+	writel(~0, &idd->vma->sio_ir);
+
+	/* Set up IRQs */
+	if(idd->class == IOC3_CLASS_BASE_IP30
+				|| idd->class == IOC3_CLASS_BASE_IP27) {
+		writel(0, &idd->vma->eier);
+		writel(~0, &idd->vma->eisr);
+
+		idd->dual_irq = 1;
+		if (!request_irq(pdev->irq, ioc3_intr_eth, SA_SHIRQ,
+				 "ioc3-eth", (void *)idd)) {
+			idd->irq_eth = pdev->irq;
+		} else {
+			printk(KERN_WARNING
+			       "%s : request_irq fails for IRQ 0x%x\n ",
+			       __FUNCTION__, pdev->irq);
+		}
+		if (!request_irq(pdev->irq+2, ioc3_intr_io, SA_SHIRQ,
+				 "ioc3-io", (void *)idd)) {
+			idd->irq_io = pdev->irq+2;
+		} else {
+			printk(KERN_WARNING
+			       "%s : request_irq fails for IRQ 0x%x\n ",
+			       __FUNCTION__, pdev->irq+2);
+		}
+	} else {
+		if (!request_irq(pdev->irq, ioc3_intr_io, SA_SHIRQ,
+				 "ioc3", (void *)idd)) {
+			idd->irq_io = pdev->irq;
+		} else {
+			printk(KERN_WARNING
+			       "%s : request_irq fails for IRQ 0x%x\n ",
+			       __FUNCTION__, pdev->irq);
+		}
+	}
+
+	/* Add this IOC3 to all submodules */
+	read_lock(&ioc3_submodules_lock);
+	for(id=0;id<IOC3_MAX_SUBMODULES;id++)
+		if(ioc3_submodules[id] && ioc3_submodules[id]->probe) {
+			idd->active[id] = 1;
+			idd->active[id] = !ioc3_submodules[id]->probe
+						(ioc3_submodules[id], idd);
+		}
+	read_unlock(&ioc3_submodules_lock);
+
+	printk(KERN_INFO "IOC3 Master Driver loaded for %s\n", pci_name(pdev));
+
+	return 0;
+
+out_misc_region:
+	release_region(idd->pma, IOC3_PCI_SIZE);
+out_pci:
+	kfree(idd);
+out_idd:
+	pci_disable_device(pdev);
+out:
+	return ret;
+}
+
+/* Removes a particular instance of an IOC3 card. */
+static void ioc3_remove(struct pci_dev *pdev)
+{
+	int id;
+	struct ioc3_driver_data *idd;
+
+	idd = pci_get_drvdata(pdev);
+
+	/* Remove this IOC3 from all submodules */
+	read_lock(&ioc3_submodules_lock);
+	for(id=0;id<IOC3_MAX_SUBMODULES;id++)
+		if(idd->active[id]) {
+			if(ioc3_submodules[id] && ioc3_submodules[id]->remove)
+				if(ioc3_submodules[id]->remove(ioc3_submodules[id],
+								idd))
+					printk(KERN_WARNING
+					       "%s: IOC3 submodule 0x%s remove failed "
+					       "for pci_dev %s.\n",
+					        __FUNCTION__,
+						module_name(ioc3_submodules[id]->owner),
+					        pci_name(pdev));
+			idd->active[id] = 0;
+		}
+	read_unlock(&ioc3_submodules_lock);
+
+	/* Clear and disable all IRQs */
+	write_ireg(idd, ~0, IOC3_W_IEC);
+	writel(~0, &idd->vma->sio_ir);
+
+	/* Release resources */
+	free_irq(idd->irq_io, (void *)idd);
+	if(idd->dual_irq)
+		free_irq(idd->irq_eth, (void *)idd);
+	iounmap(idd->vma);
+	release_region(idd->pma, IOC3_PCI_SIZE);
+
+	/* Disable IOC3 and relinquish */
+	pci_disable_device(pdev);
+
+	/* Remove and free driver data */
+	down_write(&ioc3_devices_rwsem);
+	list_del(&idd->list);
+	up_write(&ioc3_devices_rwsem);
+	kfree(idd);
+}
+
+static struct pci_device_id ioc3_id_table[] = {
+	{PCI_VENDOR_ID_SGI, PCI_DEVICE_ID_SGI_IOC3, PCI_ANY_ID, PCI_ANY_ID},
+	{0}
+};
+
+static struct pci_driver ioc3_driver = {
+	.name = "IOC3",
+	.id_table = ioc3_id_table,
+	.probe = ioc3_probe,
+	.remove = ioc3_remove,
+};
+
+MODULE_DEVICE_TABLE(pci, ioc3_id_table);
+
+/*********************
+ * Module management *
+ *********************/
+
+/* Module load */
+static int __devinit ioc3_init(void)
+{
+	if (ia64_platform_is("sn2"))
+		return pci_register_driver(&ioc3_driver);
+	return 0;
+}
+
+/* Module unload */
+static void __devexit ioc3_exit(void)
+{
+	pci_unregister_driver(&ioc3_driver);
+}
+
+module_init(ioc3_init);
+module_exit(ioc3_exit);
+
+MODULE_AUTHOR("Stanislaw Skowronek <skylark@linux-mips.org>");
+MODULE_DESCRIPTION("PCI driver for SGI IOC3");
+MODULE_LICENSE("GPL");
+
+EXPORT_SYMBOL(ioc3_register_submodule);
+EXPORT_SYMBOL(ioc3_unregister_submodule);
+EXPORT_SYMBOL(ioc3_ack);
+EXPORT_SYMBOL(ioc3_gpcr_set);
+EXPORT_SYMBOL(ioc3_disable);
+EXPORT_SYMBOL(ioc3_enable);
diff --git a/include/asm-ia64/sn/ioc3.h b/include/asm-ia64/sn/ioc3.h
new file mode 100644
index 000000000000..95ed6cc83cf1
--- /dev/null
+++ b/include/asm-ia64/sn/ioc3.h
@@ -0,0 +1,241 @@
+/*
+ * Copyright (C) 2005 Silicon Graphics, Inc.
+ */
+#ifndef IA64_SN_IOC3_H
+#define IA64_SN_IOC3_H
+
+/* serial port register map */
+struct ioc3_serialregs {
+	uint32_t sscr;
+	uint32_t stpir;
+	uint32_t stcir;
+	uint32_t srpir;
+	uint32_t srcir;
+	uint32_t srtr;
+	uint32_t shadow;
+};
+
+/* SUPERIO uart register map */
+struct ioc3_uartregs {
+	char iu_lcr;
+	union {
+		char iir;	/* read only */
+		char fcr;	/* write only */
+	} u3;
+	union {
+		char ier;	/* DLAB == 0 */
+		char dlm;	/* DLAB == 1 */
+	} u2;
+	union {
+		char rbr;	/* read only, DLAB == 0 */
+		char thr;	/* write only, DLAB == 0 */
+		char dll;	/* DLAB == 1 */
+	} u1;
+	char iu_scr;
+	char iu_msr;
+	char iu_lsr;
+	char iu_mcr;
+};
+
+#define iu_rbr u1.rbr
+#define iu_thr u1.thr
+#define iu_dll u1.dll
+#define iu_ier u2.ier
+#define iu_dlm u2.dlm
+#define iu_iir u3.iir
+#define iu_fcr u3.fcr
+
+struct ioc3_sioregs {
+	char fill[0x170];
+	struct ioc3_uartregs uartb;
+	struct ioc3_uartregs uarta;
+};
+
+/* PCI IO/mem space register map */
+struct ioc3 {
+	uint32_t pci_id;
+	uint32_t pci_scr;
+	uint32_t pci_rev;
+	uint32_t pci_lat;
+	uint32_t pci_addr;
+	uint32_t pci_err_addr_l;
+	uint32_t pci_err_addr_h;
+
+	uint32_t sio_ir;
+	/* these registers are read-only for general kernel code. To
+	 * modify them use the functions in ioc3.c
+	 */
+	uint32_t sio_ies;
+	uint32_t sio_iec;
+	uint32_t sio_cr;
+	uint32_t int_out;
+	uint32_t mcr;
+	uint32_t gpcr_s;
+	uint32_t gpcr_c;
+	uint32_t gpdr;
+	uint32_t gppr[9];
+	char fill[0x4c];
+
+	/* serial port registers */
+	uint32_t sbbr_h;
+	uint32_t sbbr_l;
+
+	struct ioc3_serialregs port_a;
+	struct ioc3_serialregs port_b;
+	char fill1[0x1ff10];
+	/* superio registers */
+	struct ioc3_sioregs sregs;
+};
+
+/* These don't exist on the ioc3 serial card... */
+#define eier	fill1[8]
+#define eisr	fill1[4]
+
+#define PCI_LAT			0xc	/* Latency Timer */
+#define PCI_SCR_DROP_MODE_EN	0x00008000 /* drop pios on parity err */
+#define UARTA_BASE		0x178
+#define UARTB_BASE		0x170
+
+
+/* bitmasks for serial RX status byte */
+#define RXSB_OVERRUN		0x01	/* char(s) lost */
+#define RXSB_PAR_ERR		0x02	/* parity error */
+#define RXSB_FRAME_ERR		0x04	/* framing error */
+#define RXSB_BREAK		0x08	/* break character */
+#define RXSB_CTS		0x10	/* state of CTS */
+#define RXSB_DCD		0x20	/* state of DCD */
+#define RXSB_MODEM_VALID	0x40	/* DCD, CTS and OVERRUN are valid */
+#define RXSB_DATA_VALID		0x80	/* FRAME_ERR PAR_ERR & BREAK valid */
+
+/* bitmasks for serial TX control byte */
+#define TXCB_INT_WHEN_DONE	0x20	/* interrupt after this byte is sent */
+#define TXCB_INVALID		0x00	/* byte is invalid */
+#define TXCB_VALID		0x40	/* byte is valid */
+#define TXCB_MCR		0x80	/* data<7:0> to modem cntrl register */
+#define TXCB_DELAY		0xc0	/* delay data<7:0> mSec */
+
+/* bitmasks for SBBR_L */
+#define SBBR_L_SIZE		0x00000001	/* 0 1KB rings, 1 4KB rings */
+
+/* bitmasks for SSCR_<A:B> */
+#define SSCR_RX_THRESHOLD	0x000001ff	/* hiwater mark */
+#define SSCR_TX_TIMER_BUSY	0x00010000	/* TX timer in progress */
+#define SSCR_HFC_EN		0x00020000	/* h/w flow cntrl enabled */
+#define SSCR_RX_RING_DCD	0x00040000	/* postRX record on delta-DCD */
+#define SSCR_RX_RING_CTS	0x00080000	/* postRX record on delta-CTS */
+#define SSCR_HIGH_SPD		0x00100000	/* 4X speed */
+#define SSCR_DIAG		0x00200000	/* bypass clock divider */
+#define SSCR_RX_DRAIN		0x08000000	/* drain RX buffer to memory */
+#define SSCR_DMA_EN		0x10000000	/* enable ring buffer DMA */
+#define SSCR_DMA_PAUSE		0x20000000	/* pause DMA */
+#define SSCR_PAUSE_STATE	0x40000000	/* set when PAUSE takes effect*/
+#define SSCR_RESET		0x80000000	/* reset DMA channels */
+
+/* all producer/comsumer pointers are the same bitfield */
+#define PROD_CONS_PTR_4K	0x00000ff8	/* for 4K buffers */
+#define PROD_CONS_PTR_1K	0x000003f8	/* for 1K buffers */
+#define PROD_CONS_PTR_OFF	3
+
+/* bitmasks for SRCIR_<A:B> */
+#define SRCIR_ARM		0x80000000	/* arm RX timer */
+
+/* bitmasks for SHADOW_<A:B> */
+#define SHADOW_DR		0x00000001	/* data ready */
+#define SHADOW_OE		0x00000002	/* overrun error */
+#define SHADOW_PE		0x00000004	/* parity error */
+#define SHADOW_FE		0x00000008	/* framing error */
+#define SHADOW_BI		0x00000010	/* break interrupt */
+#define SHADOW_THRE		0x00000020	/* transmit holding reg empty */
+#define SHADOW_TEMT		0x00000040	/* transmit shift reg empty */
+#define SHADOW_RFCE		0x00000080	/* char in RX fifo has error */
+#define SHADOW_DCTS		0x00010000	/* delta clear to send */
+#define SHADOW_DDCD		0x00080000	/* delta data carrier detect */
+#define SHADOW_CTS		0x00100000	/* clear to send */
+#define SHADOW_DCD		0x00800000	/* data carrier detect */
+#define SHADOW_DTR		0x01000000	/* data terminal ready */
+#define SHADOW_RTS		0x02000000	/* request to send */
+#define SHADOW_OUT1		0x04000000	/* 16550 OUT1 bit */
+#define SHADOW_OUT2		0x08000000	/* 16550 OUT2 bit */
+#define SHADOW_LOOP		0x10000000	/* loopback enabled */
+
+/* bitmasks for SRTR_<A:B> */
+#define SRTR_CNT		0x00000fff	/* reload value for RX timer */
+#define SRTR_CNT_VAL		0x0fff0000	/* current value of RX timer */
+#define SRTR_CNT_VAL_SHIFT	16
+#define SRTR_HZ			16000		/* SRTR clock frequency */
+
+/* bitmasks for SIO_IR, SIO_IEC and SIO_IES  */
+#define SIO_IR_SA_TX_MT		0x00000001	/* Serial port A TX empty */
+#define SIO_IR_SA_RX_FULL	0x00000002	/* port A RX buf full */
+#define SIO_IR_SA_RX_HIGH	0x00000004	/* port A RX hiwat */
+#define SIO_IR_SA_RX_TIMER	0x00000008	/* port A RX timeout */
+#define SIO_IR_SA_DELTA_DCD	0x00000010	/* port A delta DCD */
+#define SIO_IR_SA_DELTA_CTS	0x00000020	/* port A delta CTS */
+#define SIO_IR_SA_INT		0x00000040	/* port A pass-thru intr */
+#define SIO_IR_SA_TX_EXPLICIT	0x00000080	/* port A explicit TX thru */
+#define SIO_IR_SA_MEMERR	0x00000100	/* port A PCI error */
+#define SIO_IR_SB_TX_MT		0x00000200
+#define SIO_IR_SB_RX_FULL	0x00000400
+#define SIO_IR_SB_RX_HIGH	0x00000800
+#define SIO_IR_SB_RX_TIMER	0x00001000
+#define SIO_IR_SB_DELTA_DCD	0x00002000
+#define SIO_IR_SB_DELTA_CTS	0x00004000
+#define SIO_IR_SB_INT		0x00008000
+#define SIO_IR_SB_TX_EXPLICIT	0x00010000
+#define SIO_IR_SB_MEMERR	0x00020000
+#define SIO_IR_PP_INT		0x00040000	/* P port pass-thru intr */
+#define SIO_IR_PP_INTA		0x00080000	/* PP context A thru */
+#define SIO_IR_PP_INTB		0x00100000	/* PP context B thru */
+#define SIO_IR_PP_MEMERR	0x00200000	/* PP PCI error */
+#define SIO_IR_KBD_INT		0x00400000	/* kbd/mouse intr */
+#define SIO_IR_RT_INT		0x08000000	/* RT output pulse */
+#define SIO_IR_GEN_INT1		0x10000000	/* RT input pulse */
+#define SIO_IR_GEN_INT_SHIFT	28
+
+/* per device interrupt masks */
+#define SIO_IR_SA		(SIO_IR_SA_TX_MT | \
+				 SIO_IR_SA_RX_FULL | \
+				 SIO_IR_SA_RX_HIGH | \
+				 SIO_IR_SA_RX_TIMER | \
+				 SIO_IR_SA_DELTA_DCD | \
+				 SIO_IR_SA_DELTA_CTS | \
+				 SIO_IR_SA_INT | \
+				 SIO_IR_SA_TX_EXPLICIT | \
+				 SIO_IR_SA_MEMERR)
+
+#define SIO_IR_SB		(SIO_IR_SB_TX_MT | \
+				 SIO_IR_SB_RX_FULL | \
+				 SIO_IR_SB_RX_HIGH | \
+				 SIO_IR_SB_RX_TIMER | \
+				 SIO_IR_SB_DELTA_DCD | \
+				 SIO_IR_SB_DELTA_CTS | \
+				 SIO_IR_SB_INT | \
+				 SIO_IR_SB_TX_EXPLICIT | \
+				 SIO_IR_SB_MEMERR)
+
+#define SIO_IR_PP		(SIO_IR_PP_INT | SIO_IR_PP_INTA | \
+				 SIO_IR_PP_INTB | SIO_IR_PP_MEMERR)
+#define SIO_IR_RT		(SIO_IR_RT_INT | SIO_IR_GEN_INT1)
+
+/* bitmasks for SIO_CR */
+#define SIO_CR_CMD_PULSE_SHIFT 15
+#define SIO_CR_SER_A_BASE_SHIFT 1
+#define SIO_CR_SER_B_BASE_SHIFT 8
+#define SIO_CR_ARB_DIAG		0x00380000	/* cur !enet PCI requet (ro) */
+#define SIO_CR_ARB_DIAG_TXA	0x00000000
+#define SIO_CR_ARB_DIAG_RXA	0x00080000
+#define SIO_CR_ARB_DIAG_TXB	0x00100000
+#define SIO_CR_ARB_DIAG_RXB	0x00180000
+#define SIO_CR_ARB_DIAG_PP	0x00200000
+#define SIO_CR_ARB_DIAG_IDLE	0x00400000	/* 0 -> active request (ro) */
+
+/* defs for some of the generic I/O pins */
+#define GPCR_PHY_RESET		0x20	/* pin is output to PHY reset */
+#define GPCR_UARTB_MODESEL	0x40	/* pin is output to port B mode sel */
+#define GPCR_UARTA_MODESEL	0x80	/* pin is output to port A mode sel */
+
+#define GPPR_PHY_RESET_PIN	5	/* GIO pin controlling phy reset */
+#define GPPR_UARTB_MODESEL_PIN	6	/* GIO pin cntrling uartb modeselect */
+#define GPPR_UARTA_MODESEL_PIN	7	/* GIO pin cntrling uarta modeselect */
+
+#endif /* IA64_SN_IOC3_H */
diff --git a/include/linux/ioc3.h b/include/linux/ioc3.h
new file mode 100644
index 000000000000..e7906a72a4f1
--- /dev/null
+++ b/include/linux/ioc3.h
@@ -0,0 +1,93 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2005 Stanislaw Skowronek <skylark@linux-mips.org>
+ */
+
+#ifndef _LINUX_IOC3_H
+#define _LINUX_IOC3_H
+
+#include <asm/sn/ioc3.h>
+
+#define IOC3_MAX_SUBMODULES	32
+
+#define IOC3_CLASS_NONE		0
+#define IOC3_CLASS_BASE_IP27	1
+#define IOC3_CLASS_BASE_IP30	2
+#define IOC3_CLASS_MENET_123	3
+#define IOC3_CLASS_MENET_4	4
+#define IOC3_CLASS_CADDUO	5
+#define IOC3_CLASS_SERIAL	6
+
+/* One of these per IOC3 */
+struct ioc3_driver_data {
+	struct list_head list;
+	int id;				/* IOC3 sequence number */
+	/* PCI mapping */
+	unsigned long pma;		/* physical address */
+	struct __iomem ioc3 *vma;	/* pointer to registers */
+	struct pci_dev *pdev;		/* PCI device */
+	/* IRQ stuff */
+	int dual_irq;			/* set if separate IRQs are used */
+	int irq_io, irq_eth;		/* IRQ numbers */
+	/* GPIO magic */
+	spinlock_t gpio_lock;
+	unsigned int gpdr_shadow;
+	/* NIC identifiers */
+	char nic_part[32];
+	char nic_serial[16];
+	char nic_mac[6];
+	/* submodule set */
+	int class;
+	void *data[IOC3_MAX_SUBMODULES];	/* for submodule use */
+	int active[IOC3_MAX_SUBMODULES];	/* set if probe succeeds */
+	/* is_ir_lock must be held while
+	 * modifying sio_ie values, so
+	 * we can be sure that sio_ie is
+	 * not changing when we read it
+	 * along with sio_ir.
+	 */
+	spinlock_t ir_lock;	/* SIO_IE[SC] mod lock */
+};
+
+/* One per submodule */
+struct ioc3_submodule {
+	char *name;		/* descriptive submodule name */
+	struct module *owner;	/* owning kernel module */
+	int ethernet;		/* set for ethernet drivers */
+	int (*probe) (struct ioc3_submodule *, struct ioc3_driver_data *);
+	int (*remove) (struct ioc3_submodule *, struct ioc3_driver_data *);
+	int id;			/* assigned by IOC3, index for the "data" array */
+	/* IRQ stuff */
+	unsigned int irq_mask;	/* IOC3 IRQ mask, leave clear for Ethernet */
+	int reset_mask;		/* non-zero if you want the ioc3.c module to reset interrupts */
+	int (*intr) (struct ioc3_submodule *, struct ioc3_driver_data *, unsigned int, struct pt_regs *);
+	/* private submodule data */
+	void *data;		/* assigned by submodule */
+};
+
+/**********************************
+ * Functions needed by submodules *
+ **********************************/
+
+#define IOC3_W_IES		0
+#define IOC3_W_IEC		1
+
+/* registers a submodule for all existing and future IOC3 chips */
+extern int ioc3_register_submodule(struct ioc3_submodule *);
+/* unregisters a submodule */
+extern void ioc3_unregister_submodule(struct ioc3_submodule *);
+/* enables IRQs indicated by irq_mask for a specified IOC3 chip */
+extern void ioc3_enable(struct ioc3_submodule *, struct ioc3_driver_data *, unsigned int);
+/* ackowledges specified IRQs */
+extern void ioc3_ack(struct ioc3_submodule *, struct ioc3_driver_data *, unsigned int);
+/* disables IRQs indicated by irq_mask for a specified IOC3 chip */
+extern void ioc3_disable(struct ioc3_submodule *, struct ioc3_driver_data *, unsigned int);
+/* atomically sets GPCR bits */
+extern void ioc3_gpcr_set(struct ioc3_driver_data *, unsigned int);
+/* general ireg writer */
+extern void ioc3_write_ireg(struct ioc3_driver_data *idd, uint32_t value, int reg);
+
+#endif
-- 
cgit v1.2.3-71-gd317


From b0a9499c3dd50d333e2aedb7e894873c58da3785 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 14 Jan 2006 13:20:41 -0800
Subject: [PATCH] sched: add new SCHED_BATCH policy

Add a new SCHED_BATCH (3) scheduling policy: such tasks are presumed
CPU-intensive, and will acquire a constant +5 priority level penalty.  Such
policy is nice for workloads that are non-interactive, but which do not
want to give up their nice levels.  The policy is also useful for workloads
that want a deterministic scheduling policy without interactivity causing
extra preemptions (between that workload's tasks).

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  7 ++++---
 kernel/exit.c         |  4 +++-
 kernel/sched.c        | 48 +++++++++++++++++++++++++++++++++---------------
 3 files changed, 40 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a72e17135421..2df1a1a2fee5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -160,6 +160,7 @@ extern unsigned long nr_iowait(void);
 #define SCHED_NORMAL		0
 #define SCHED_FIFO		1
 #define SCHED_RR		2
+#define SCHED_BATCH		3
 
 struct sched_param {
 	int sched_priority;
@@ -470,9 +471,9 @@ struct signal_struct {
 
 /*
  * Priority of a process goes from 0..MAX_PRIO-1, valid RT
- * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are
- * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values
- * are inverted: lower p->prio value means higher priority.
+ * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
+ * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
+ * values are inverted: lower p->prio value means higher priority.
  *
  * The MAX_USER_RT_PRIO value allows the actual maximum
  * RT priority to be separate from the value exported to
diff --git a/kernel/exit.c b/kernel/exit.c
index f8e609ff1893..7fb541cb8d69 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -244,7 +244,9 @@ static inline void reparent_to_init(void)
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
 	current->exit_signal = SIGCHLD;
 
-	if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0))
+	if ((current->policy == SCHED_NORMAL ||
+			current->policy == SCHED_BATCH)
+				&& (task_nice(current) < 0))
 		set_user_nice(current, 0);
 	/* cpus_allowed? */
 	/* rt_priority? */
diff --git a/kernel/sched.c b/kernel/sched.c
index c9dec2aa1976..e1dc903d5a75 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -748,10 +748,14 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
 	unsigned long long __sleep_time = now - p->timestamp;
 	unsigned long sleep_time;
 
-	if (__sleep_time > NS_MAX_SLEEP_AVG)
-		sleep_time = NS_MAX_SLEEP_AVG;
-	else
-		sleep_time = (unsigned long)__sleep_time;
+	if (unlikely(p->policy == SCHED_BATCH))
+		sleep_time = 0;
+	else {
+		if (__sleep_time > NS_MAX_SLEEP_AVG)
+			sleep_time = NS_MAX_SLEEP_AVG;
+		else
+			sleep_time = (unsigned long)__sleep_time;
+	}
 
 	if (likely(sleep_time > 0)) {
 		/*
@@ -3560,7 +3564,7 @@ void set_user_nice(task_t *p, long nice)
 	 * The RT priorities are set via sched_setscheduler(), but we still
 	 * allow the 'normal' nice value to be set - but as expected
 	 * it wont have any effect on scheduling until the task is
-	 * not SCHED_NORMAL:
+	 * not SCHED_NORMAL/SCHED_BATCH:
 	 */
 	if (rt_task(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
@@ -3706,10 +3710,16 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
 	BUG_ON(p->array);
 	p->policy = policy;
 	p->rt_priority = prio;
-	if (policy != SCHED_NORMAL)
+	if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
 		p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-	else
+	} else {
 		p->prio = p->static_prio;
+		/*
+		 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+		 */
+		if (policy == SCHED_BATCH)
+			p->sleep_avg = 0;
+	}
 }
 
 /**
@@ -3733,29 +3743,35 @@ recheck:
 	if (policy < 0)
 		policy = oldpolicy = p->policy;
 	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-				policy != SCHED_NORMAL)
-			return -EINVAL;
+			policy != SCHED_NORMAL && policy != SCHED_BATCH)
+		return -EINVAL;
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
-	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
+	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
+	 * SCHED_BATCH is 0.
 	 */
 	if (param->sched_priority < 0 ||
 	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
 	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
-	if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
+	if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
+					!= (param->sched_priority == 0))
 		return -EINVAL;
 
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
 	if (!capable(CAP_SYS_NICE)) {
-		/* can't change policy */
-		if (policy != p->policy &&
-			!p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+		/*
+		 * can't change policy, except between SCHED_NORMAL
+		 * and SCHED_BATCH:
+		 */
+		if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
+			(policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
+				!p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
 			return -EPERM;
 		/* can't increase priority */
-		if (policy != SCHED_NORMAL &&
+		if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
 		    param->sched_priority > p->rt_priority &&
 		    param->sched_priority >
 				p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
@@ -4233,6 +4249,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
 		ret = MAX_USER_RT_PRIO-1;
 		break;
 	case SCHED_NORMAL:
+	case SCHED_BATCH:
 		ret = 0;
 		break;
 	}
@@ -4256,6 +4273,7 @@ asmlinkage long sys_sched_get_priority_min(int policy)
 		ret = 1;
 		break;
 	case SCHED_NORMAL:
+	case SCHED_BATCH:
 		ret = 0;
 	}
 	return ret;
-- 
cgit v1.2.3-71-gd317


From 852cf918dcf2ae46468b425e679fbcbf0ea8fdbb Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Sat, 14 Jan 2006 13:20:46 -0800
Subject: [PATCH] Fix for CONFIG_NUMA without CONFIG_SWAP

Some people apparently run CONFIG_NUMA without CONFIG_SWAP.  The migration
code currently depends on swap.  This patch provides a set of inline
fallback functions so that the kernel properly compiles.  However, calls to
migration functions will fail.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 389d1c382e20..e92054d6530b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -180,6 +180,11 @@ extern int isolate_lru_page(struct page *p);
 extern int putback_lru_pages(struct list_head *l);
 extern int migrate_pages(struct list_head *l, struct list_head *t,
 		struct list_head *moved, struct list_head *failed);
+#else
+static inline int isolate_lru_page(struct page *p) { return -ENOSYS; }
+static inline int putback_lru_pages(struct list_head *l) { return 0; }
+static inline int migrate_pages(struct list_head *l, struct list_head *t,
+	struct list_head *moved, struct list_head *failed) { return -ENOSYS; }
 #endif
 
 #ifdef CONFIG_MMU
-- 
cgit v1.2.3-71-gd317


From 7339ff8302fd70aabf5f1ae26e0c4905fa74a495 Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Sat, 14 Jan 2006 13:20:48 -0800
Subject: [PATCH] Add tmpfs options for memory placement policies

Anything that writes into a tmpfs filesystem is liable to disproportionately
decrease the available memory on a particular node.  Since there's no telling
what sort of application (e.g.  dd/cp/cat) might be dropping large files
there, this lets the admin choose the appropriate default behavior for their
site's situation.

Introduce a tmpfs mount option which allows specifying a memory policy and
a second option to specify the nodelist for that policy.  With the default
policy, tmpfs will behave as it does today.  This patch adds support for
preferred, bind, and interleave policies.

The default policy will cause pages to be added to tmpfs files on the node
which is doing the writing.  Some jobs expect a single process to create
and manage the tmpfs files.  This results in a node which has a
significantly reduced number of free pages.

With this patch, the administrator can specify the policy and nodes for
that policy where they would prefer allocations.

This patch was originally written by Brent Casavant and Hugh Dickins.  I
added support for the bind and preferred policies and the mpol_nodelist
mount option.

Signed-off-by: Brent Casavant <bcasavan@sgi.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Robin Holt <holt@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/filesystems/tmpfs.txt | 12 ++++++++++++
 fs/hugetlbfs/inode.c                |  2 +-
 include/linux/mempolicy.h           | 11 ++++-------
 include/linux/shmem_fs.h            |  2 ++
 mm/mempolicy.c                      | 24 +++++++++++++++++++++++
 mm/shmem.c                          | 39 ++++++++++++++++++++++++++++++-------
 6 files changed, 75 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index 0d783c504ead..dbe4d87d2615 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -78,6 +78,18 @@ use up all the memory on the machine; but enhances the scalability of
 that instance in a system with many cpus making intensive use of it.
 
 
+tmpfs has a mount option to set the NUMA memory allocation policy for
+all files in that instance:
+mpol=interleave		prefers to allocate memory from each node in turn
+mpol=default		prefers to allocate memory from the local node
+mpol=bind		prefers to allocate from mpol_nodelist
+mpol=preferred		prefers to allocate from first node in mpol_nodelist
+
+The following mount option is used in conjunction with mpol=interleave,
+mpol=bind or mpol=preferred:
+mpol_nodelist:	nodelist suitable for parsing with nodelist_parse.
+
+
 To specify the initial root directory you can use the following mount
 options:
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ab4c3a9d51b8..f568102da1e8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -402,7 +402,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 		inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		info = HUGETLBFS_I(inode);
-		mpol_shared_policy_init(&info->policy);
+		mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
 		switch (mode & S_IFMT) {
 		default:
 			init_special_inode(inode, mode, dev);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index c7ac77e873b3..d6a53ed6ab6c 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -132,12 +132,8 @@ struct shared_policy {
 	spinlock_t lock;
 };
 
-static inline void mpol_shared_policy_init(struct shared_policy *info)
-{
-	info->root = RB_ROOT;
-	spin_lock_init(&info->lock);
-}
-
+void mpol_shared_policy_init(struct shared_policy *info, int policy,
+				nodemask_t *nodes);
 int mpol_set_shared_policy(struct shared_policy *info,
 				struct vm_area_struct *vma,
 				struct mempolicy *new);
@@ -211,7 +207,8 @@ static inline int mpol_set_shared_policy(struct shared_policy *info,
 	return -EINVAL;
 }
 
-static inline void mpol_shared_policy_init(struct shared_policy *info)
+static inline void mpol_shared_policy_init(struct shared_policy *info,
+					int policy, nodemask_t *nodes)
 {
 }
 
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index c3e598276e78..c057f0b32318 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -26,6 +26,8 @@ struct shmem_sb_info {
 	unsigned long free_blocks;  /* How many are left for allocation */
 	unsigned long max_inodes;   /* How many inodes are allowed */
 	unsigned long free_inodes;  /* How many are left for allocation */
+	int policy;		    /* Default NUMA memory alloc policy */
+	nodemask_t policy_nodes;    /* nodemask for preferred and bind */
 	spinlock_t    stat_lock;
 };
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b62cab575a84..3171f884d245 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1359,6 +1359,30 @@ restart:
 	return 0;
 }
 
+void mpol_shared_policy_init(struct shared_policy *info, int policy,
+				nodemask_t *policy_nodes)
+{
+	info->root = RB_ROOT;
+	spin_lock_init(&info->lock);
+
+	if (policy != MPOL_DEFAULT) {
+		struct mempolicy *newpol;
+
+		/* Falls back to MPOL_DEFAULT on any error */
+		newpol = mpol_new(policy, policy_nodes);
+		if (!IS_ERR(newpol)) {
+			/* Create pseudo-vma that contains just the policy */
+			struct vm_area_struct pvma;
+
+			memset(&pvma, 0, sizeof(struct vm_area_struct));
+			/* Policy covers entire file */
+			pvma.vm_end = TASK_SIZE;
+			mpol_set_shared_policy(info, &pvma, newpol);
+			mpol_free(newpol);
+		}
+	}
+}
+
 int mpol_set_shared_policy(struct shared_policy *info,
 			struct vm_area_struct *vma, struct mempolicy *npol)
 {
diff --git a/mm/shmem.c b/mm/shmem.c
index 343b3c0937e5..ce501bce1c2e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1316,7 +1316,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 		case S_IFREG:
 			inode->i_op = &shmem_inode_operations;
 			inode->i_fop = &shmem_file_operations;
-			mpol_shared_policy_init(&info->policy);
+			mpol_shared_policy_init(&info->policy, sbinfo->policy,
+							&sbinfo->policy_nodes);
 			break;
 		case S_IFDIR:
 			inode->i_nlink++;
@@ -1330,7 +1331,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 			 * Must not load anything in the rbtree,
 			 * mpol_free_shared_policy will not be called.
 			 */
-			mpol_shared_policy_init(&info->policy);
+			mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
+						NULL);
 			break;
 		}
 	} else if (sbinfo->max_inodes) {
@@ -1843,7 +1845,9 @@ static struct inode_operations shmem_symlink_inode_operations = {
 	.put_link	= shmem_put_link,
 };
 
-static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes)
+static int shmem_parse_options(char *options, int *mode, uid_t *uid,
+	gid_t *gid, unsigned long *blocks, unsigned long *inodes,
+	int *policy, nodemask_t *policy_nodes)
 {
 	char *this_char, *value, *rest;
 
@@ -1897,6 +1901,19 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid,
 			*gid = simple_strtoul(value,&rest,0);
 			if (*rest)
 				goto bad_val;
+		} else if (!strcmp(this_char,"mpol")) {
+			if (!strcmp(value,"default"))
+				*policy = MPOL_DEFAULT;
+			else if (!strcmp(value,"preferred"))
+				*policy = MPOL_PREFERRED;
+			else if (!strcmp(value,"bind"))
+				*policy = MPOL_BIND;
+			else if (!strcmp(value,"interleave"))
+				*policy = MPOL_INTERLEAVE;
+			else
+				goto bad_val;
+		} else if (!strcmp(this_char,"mpol_nodelist")) {
+			nodelist_parse(value, *policy_nodes);
 		} else {
 			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
 			       this_char);
@@ -1917,12 +1934,14 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 	unsigned long max_blocks = sbinfo->max_blocks;
 	unsigned long max_inodes = sbinfo->max_inodes;
+	int policy = sbinfo->policy;
+	nodemask_t policy_nodes = sbinfo->policy_nodes;
 	unsigned long blocks;
 	unsigned long inodes;
 	int error = -EINVAL;
 
-	if (shmem_parse_options(data, NULL, NULL, NULL,
-				&max_blocks, &max_inodes))
+	if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks,
+				&max_inodes, &policy, &policy_nodes))
 		return error;
 
 	spin_lock(&sbinfo->stat_lock);
@@ -1948,6 +1967,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 	sbinfo->free_blocks = max_blocks - blocks;
 	sbinfo->max_inodes  = max_inodes;
 	sbinfo->free_inodes = max_inodes - inodes;
+	sbinfo->policy = policy;
+	sbinfo->policy_nodes = policy_nodes;
 out:
 	spin_unlock(&sbinfo->stat_lock);
 	return error;
@@ -1972,6 +1993,8 @@ static int shmem_fill_super(struct super_block *sb,
 	struct shmem_sb_info *sbinfo;
 	unsigned long blocks = 0;
 	unsigned long inodes = 0;
+	int policy = MPOL_DEFAULT;
+	nodemask_t policy_nodes = node_online_map;
 
 #ifdef CONFIG_TMPFS
 	/*
@@ -1984,8 +2007,8 @@ static int shmem_fill_super(struct super_block *sb,
 		inodes = totalram_pages - totalhigh_pages;
 		if (inodes > blocks)
 			inodes = blocks;
-		if (shmem_parse_options(data, &mode, &uid, &gid,
-					&blocks, &inodes))
+		if (shmem_parse_options(data, &mode, &uid, &gid, &blocks,
+					&inodes, &policy, &policy_nodes))
 			return -EINVAL;
 	}
 #else
@@ -2003,6 +2026,8 @@ static int shmem_fill_super(struct super_block *sb,
 	sbinfo->free_blocks = blocks;
 	sbinfo->max_inodes = inodes;
 	sbinfo->free_inodes = inodes;
+	sbinfo->policy = policy;
+	sbinfo->policy_nodes = policy_nodes;
 
 	sb->s_fs_info = sbinfo;
 	sb->s_maxbytes = SHMEM_MAX_BYTES;
-- 
cgit v1.2.3-71-gd317


From 1f1c12afe5c3e0ef901eec12dee09df4947462ee Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Sat, 14 Jan 2006 13:21:03 -0800
Subject: [PATCH] s390: cputime misaccounting

finish_arch_switch needs to update the user cpu time as well, not just the
system cpu time.  Otherwise the partial user cpu time of a process that is
stored in the lowcore will be (mis-)accounted to the next process.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/kernel/time.c   |  2 +-
 arch/s390/kernel/vtime.c  | 27 ++++++++++++++++++++++++++-
 include/asm-s390/system.h |  5 +++--
 include/linux/hardirq.h   |  4 ----
 4 files changed, 30 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index b0d8ca8e5eeb..7c0fe152a111 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -214,7 +214,7 @@ void account_ticks(struct pt_regs *regs)
 #endif
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
-	account_user_vtime(current);
+	account_tick_vtime(current);
 #else
 	while (ticks--)
 		update_process_times(user_mode(regs));
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 22a895ecb7a4..dfe6f0856617 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -32,7 +32,7 @@ DEFINE_PER_CPU(struct vtimer_queue, virt_cpu_timer);
  * Update process times based on virtual cpu times stored by entry.S
  * to the lowcore fields user_timer, system_timer & steal_clock.
  */
-void account_user_vtime(struct task_struct *tsk)
+void account_tick_vtime(struct task_struct *tsk)
 {
 	cputime_t cputime;
 	__u64 timer, clock;
@@ -72,6 +72,31 @@ void account_user_vtime(struct task_struct *tsk)
  	run_posix_cpu_timers(tsk);
 }
 
+/*
+ * Update process times based on virtual cpu times stored by entry.S
+ * to the lowcore fields user_timer, system_timer & steal_clock.
+ */
+void account_vtime(struct task_struct *tsk)
+{
+	cputime_t cputime;
+	__u64 timer;
+
+	timer = S390_lowcore.last_update_timer;
+	asm volatile ("  STPT %0"    /* Store current cpu timer value */
+		      : "=m" (S390_lowcore.last_update_timer) );
+	S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer;
+
+	cputime = S390_lowcore.user_timer >> 12;
+	S390_lowcore.user_timer -= cputime << 12;
+	S390_lowcore.steal_clock -= cputime << 12;
+	account_user_time(tsk, cputime);
+
+	cputime =  S390_lowcore.system_timer >> 12;
+	S390_lowcore.system_timer -= cputime << 12;
+	S390_lowcore.steal_clock -= cputime << 12;
+	account_system_time(tsk, 0, cputime);
+}
+
 /*
  * Update process times based on virtual cpu times stored by entry.S
  * to the lowcore fields user_timer, system_timer & steal_clock.
diff --git a/include/asm-s390/system.h b/include/asm-s390/system.h
index c7c3a9ad593f..b2e65e8bf812 100644
--- a/include/asm-s390/system.h
+++ b/include/asm-s390/system.h
@@ -115,13 +115,14 @@ static inline void sched_cacheflush(void)
 }
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
-extern void account_user_vtime(struct task_struct *);
+extern void account_vtime(struct task_struct *);
+extern void account_tick_vtime(struct task_struct *);
 extern void account_system_vtime(struct task_struct *);
 #endif
 
 #define finish_arch_switch(prev) do {					     \
 	set_fs(current->thread.mm_segment);				     \
-	account_system_vtime(prev);					     \
+	account_vtime(prev);						     \
 } while (0)
 
 #define nop() __asm__ __volatile__ ("nop")
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 71d2b8a723b9..eab537091f2a 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -93,10 +93,6 @@ extern void synchronize_irq(unsigned int irq);
 struct task_struct;
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
-static inline void account_user_vtime(struct task_struct *tsk)
-{
-}
-
 static inline void account_system_vtime(struct task_struct *tsk)
 {
 }
-- 
cgit v1.2.3-71-gd317


From 505970b96e3b7d22177c38e03435a68376628e7a Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sat, 14 Jan 2006 13:21:06 -0800
Subject: [PATCH] cpuset oom lock fix

The problem, reported in:

  http://bugzilla.kernel.org/show_bug.cgi?id=5859

and by various other email messages and lkml posts is that the cpuset hook
in the oom (out of memory) code can try to take a cpuset semaphore while
holding the tasklist_lock (a spinlock).

One must not sleep while holding a spinlock.

The fix seems easy enough - move the cpuset semaphore region outside the
tasklist_lock region.

This required a few lines of mechanism to implement.  The oom code where
the locking needs to be changed does not have access to the cpuset locks,
which are internal to kernel/cpuset.c only.  So I provided a couple more
cpuset interface routines, available to the rest of the kernel, which
simple take and drop the lock needed here (cpusets callback_sem).

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpuset.h |  6 ++++++
 kernel/cpuset.c        | 33 ++++++++++++++++++++++++++++-----
 mm/oom_kill.c          |  3 +++
 3 files changed, 37 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index c472f972bd6d..3bc606927116 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -48,6 +48,9 @@ extern void __cpuset_memory_pressure_bump(void);
 extern struct file_operations proc_cpuset_operations;
 extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 
+extern void cpuset_lock(void);
+extern void cpuset_unlock(void);
+
 #else /* !CONFIG_CPUSETS */
 
 static inline int cpuset_init_early(void) { return 0; }
@@ -93,6 +96,9 @@ static inline char *cpuset_task_status_allowed(struct task_struct *task,
 	return buffer;
 }
 
+static inline void cpuset_lock(void) {}
+static inline void cpuset_unlock(void) {}
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d4b6bd7d74e5..fe2f71f92ae0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2149,6 +2149,33 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 	return allowed;
 }
 
+/**
+ * cpuset_lock - lock out any changes to cpuset structures
+ *
+ * The out of memory (oom) code needs to lock down cpusets
+ * from being changed while it scans the tasklist looking for a
+ * task in an overlapping cpuset.  Expose callback_sem via this
+ * cpuset_lock() routine, so the oom code can lock it, before
+ * locking the task list.  The tasklist_lock is a spinlock, so
+ * must be taken inside callback_sem.
+ */
+
+void cpuset_lock(void)
+{
+	down(&callback_sem);
+}
+
+/**
+ * cpuset_unlock - release lock on cpuset changes
+ *
+ * Undo the lock taken in a previous cpuset_lock() call.
+ */
+
+void cpuset_unlock(void)
+{
+	up(&callback_sem);
+}
+
 /**
  * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
  * @p: pointer to task_struct of some other task.
@@ -2158,7 +2185,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
  * determine if task @p's memory usage might impact the memory
  * available to the current task.
  *
- * Acquires callback_sem - not suitable for calling from a fast path.
+ * Call while holding callback_sem.
  **/
 
 int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -2166,8 +2193,6 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
 	const struct cpuset *cs1, *cs2;	/* my and p's cpuset ancestors */
 	int overlap = 0;		/* do cpusets overlap? */
 
-	down(&callback_sem);
-
 	task_lock(current);
 	if (current->flags & PF_EXITING) {
 		task_unlock(current);
@@ -2186,8 +2211,6 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
 
 	overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
 done:
-	up(&callback_sem);
-
 	return overlap;
 }
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4748b906aff2..14bd4ec79597 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -274,6 +274,7 @@ void out_of_memory(gfp_t gfp_mask, int order)
 		show_mem();
 	}
 
+	cpuset_lock();
 	read_lock(&tasklist_lock);
 retry:
 	p = select_bad_process();
@@ -284,6 +285,7 @@ retry:
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
 		read_unlock(&tasklist_lock);
+		cpuset_unlock();
 		panic("Out of memory and no killable processes...\n");
 	}
 
@@ -293,6 +295,7 @@ retry:
 
  out:
 	read_unlock(&tasklist_lock);
+	cpuset_unlock();
 	if (mm)
 		mmput(mm);
 
-- 
cgit v1.2.3-71-gd317


From 44db77f33cc42c49f55ddb360f5e9a05581ffdc0 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sat, 14 Jan 2006 13:21:12 -0800
Subject: [PATCH] ncpfs: remove kmalloc wrapper

Remove remaining kmalloc wrapper bits from fs/ncpfs/.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ncpfs/inode.c       | 19 ++-----------------
 fs/ncpfs/ioctl.c       | 20 +++++++++++---------
 include/linux/ncp_fs.h | 28 ----------------------------
 3 files changed, 13 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 8c8839203cd5..d277a58bd128 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -716,10 +716,8 @@ static void ncp_put_super(struct super_block *sb)
 	fput(server->ncp_filp);
 	kill_proc(server->m.wdog_pid, SIGTERM, 1);
 
-	if (server->priv.data) 
-		ncp_kfree_s(server->priv.data, server->priv.len);
-	if (server->auth.object_name)
-		ncp_kfree_s(server->auth.object_name, server->auth.object_name_len);
+	kfree(server->priv.data);
+	kfree(server->auth.object_name);
 	vfree(server->packet);
 	sb->s_fs_info = NULL;
 	kfree(server);
@@ -958,11 +956,6 @@ out:
 	return result;
 }
 
-#ifdef DEBUG_NCP_MALLOC
-int ncp_malloced;
-int ncp_current_malloced;
-#endif
-
 static struct super_block *ncp_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
@@ -981,10 +974,6 @@ static int __init init_ncp_fs(void)
 	int err;
 	DPRINTK("ncpfs: init_module called\n");
 
-#ifdef DEBUG_NCP_MALLOC
-	ncp_malloced = 0;
-	ncp_current_malloced = 0;
-#endif
 	err = init_inodecache();
 	if (err)
 		goto out1;
@@ -1003,10 +992,6 @@ static void __exit exit_ncp_fs(void)
 	DPRINTK("ncpfs: cleanup_module called\n");
 	unregister_filesystem(&ncp_fs_type);
 	destroy_inodecache();
-#ifdef DEBUG_NCP_MALLOC
-	PRINTK("ncp_malloced: %d\n", ncp_malloced);
-	PRINTK("ncp_current_malloced: %d\n", ncp_current_malloced);
-#endif
 }
 
 module_init(init_ncp_fs)
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index d6e0c089e1b1..eb3813ad136f 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -518,10 +518,11 @@ outrel:
 			if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN)
 				return -ENOMEM;
 			if (user.object_name_len) {
-				newname = ncp_kmalloc(user.object_name_len, GFP_USER);
-				if (!newname) return -ENOMEM;
+				newname = kmalloc(user.object_name_len, GFP_USER);
+				if (!newname)
+					return -ENOMEM;
 				if (copy_from_user(newname, user.object_name, user.object_name_len)) {
-					ncp_kfree_s(newname, user.object_name_len);
+					kfree(newname);
 					return -EFAULT;
 				}
 			} else {
@@ -540,8 +541,8 @@ outrel:
 			server->priv.len = 0;
 			server->priv.data = NULL;
 			/* leave critical section */
-			if (oldprivate) ncp_kfree_s(oldprivate, oldprivatelen);
-			if (oldname) ncp_kfree_s(oldname, oldnamelen);
+			kfree(oldprivate);
+			kfree(oldname);
 			return 0;
 		}
 	case NCP_IOC_GETPRIVATEDATA:
@@ -581,10 +582,11 @@ outrel:
 			if (user.len > NCP_PRIVATE_DATA_MAX_LEN)
 				return -ENOMEM;
 			if (user.len) {
-				new = ncp_kmalloc(user.len, GFP_USER);
-				if (!new) return -ENOMEM;
+				new = kmalloc(user.len, GFP_USER);
+				if (!new)
+					return -ENOMEM;
 				if (copy_from_user(new, user.data, user.len)) {
-					ncp_kfree_s(new, user.len);
+					kfree(new);
 					return -EFAULT;
 				}
 			} else {
@@ -596,7 +598,7 @@ outrel:
 			server->priv.len = user.len;
 			server->priv.data = new;
 			/* leave critical section */
-			if (old) ncp_kfree_s(old, oldlen);
+			kfree(old);
 			return 0;
 		}
 
diff --git a/include/linux/ncp_fs.h b/include/linux/ncp_fs.h
index 7297e4372c0f..e01342568530 100644
--- a/include/linux/ncp_fs.h
+++ b/include/linux/ncp_fs.h
@@ -201,34 +201,6 @@ static inline struct ncp_inode_info *NCP_FINFO(struct inode *inode)
 	return container_of(inode, struct ncp_inode_info, vfs_inode);
 }
 
-#ifdef DEBUG_NCP_MALLOC
-
-#include <linux/slab.h>
-
-extern int ncp_malloced;
-extern int ncp_current_malloced;
-
-static inline void *
- ncp_kmalloc(unsigned int size, int priority)
-{
-	ncp_malloced += 1;
-	ncp_current_malloced += 1;
-	return kmalloc(size, priority);
-}
-
-static inline void ncp_kfree_s(void *obj, int size)
-{
-	ncp_current_malloced -= 1;
-	kfree(obj);
-}
-
-#else				/* DEBUG_NCP_MALLOC */
-
-#define ncp_kmalloc(s,p) kmalloc(s,p)
-#define ncp_kfree_s(o,s) kfree(o)
-
-#endif				/* DEBUG_NCP_MALLOC */
-
 /* linux/fs/ncpfs/inode.c */
 int ncp_notify_change(struct dentry *, struct iattr *);
 struct inode *ncp_iget(struct super_block *, struct ncp_entry_info *);
-- 
cgit v1.2.3-71-gd317


From d063389ecf20e5c20be91a0007656deb9fc38a1c Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sat, 14 Jan 2006 13:21:13 -0800
Subject: [PATCH] smbfs: remove kmalloc wrapper

Remove the remaining kmalloc() wrapper bits from fs/smbfs/.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/smbfs/Makefile      |  1 -
 fs/smbfs/inode.c       | 32 +++++++-------------------------
 fs/smbfs/request.c     | 13 +++++--------
 include/linux/smb_fs.h | 47 -----------------------------------------------
 4 files changed, 12 insertions(+), 81 deletions(-)

(limited to 'include/linux')

diff --git a/fs/smbfs/Makefile b/fs/smbfs/Makefile
index 93246b7dd6fb..6673ee82cb4c 100644
--- a/fs/smbfs/Makefile
+++ b/fs/smbfs/Makefile
@@ -13,7 +13,6 @@ smbfs-objs := proc.o dir.o cache.o sock.o inode.o file.o ioctl.o getopt.o \
 EXTRA_CFLAGS += -DSMBFS_PARANOIA
 #EXTRA_CFLAGS += -DSMBFS_DEBUG
 #EXTRA_CFLAGS += -DSMBFS_DEBUG_VERBOSE
-#EXTRA_CFLAGS += -DDEBUG_SMB_MALLOC
 #EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP
 #EXTRA_CFLAGS += -Werror
 
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 6ec88bf59b2d..02e3e82d465c 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -487,11 +487,11 @@ smb_put_super(struct super_block *sb)
 	if (server->conn_pid)
 		kill_proc(server->conn_pid, SIGTERM, 1);
 
-	smb_kfree(server->ops);
+	kfree(server->ops);
 	smb_unload_nls(server);
 	sb->s_fs_info = NULL;
 	smb_unlock_server(server);
-	smb_kfree(server);
+	kfree(server);
 }
 
 static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
@@ -519,11 +519,10 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
 	sb->s_op = &smb_sops;
 	sb->s_time_gran = 100;
 
-	server = smb_kmalloc(sizeof(struct smb_sb_info), GFP_KERNEL);
+	server = kzalloc(sizeof(struct smb_sb_info), GFP_KERNEL);
 	if (!server)
 		goto out_no_server;
 	sb->s_fs_info = server;
-	memset(server, 0, sizeof(struct smb_sb_info));
 
 	server->super_block = sb;
 	server->mnt = NULL;
@@ -542,8 +541,8 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
 	/* FIXME: move these to the smb_sb_info struct */
 	VERBOSE("alloc chunk = %d\n", sizeof(struct smb_ops) +
 		sizeof(struct smb_mount_data_kernel));
-	mem = smb_kmalloc(sizeof(struct smb_ops) +
-			  sizeof(struct smb_mount_data_kernel), GFP_KERNEL);
+	mem = kmalloc(sizeof(struct smb_ops) +
+		      sizeof(struct smb_mount_data_kernel), GFP_KERNEL);
 	if (!mem)
 		goto out_no_mem;
 
@@ -621,12 +620,12 @@ out_no_root:
 out_no_smbiod:
 	smb_unload_nls(server);
 out_bad_option:
-	smb_kfree(mem);
+	kfree(mem);
 out_no_mem:
 	if (!server->mnt)
 		printk(KERN_ERR "smb_fill_super: allocation failure\n");
 	sb->s_fs_info = NULL;
-	smb_kfree(server);
+	kfree(server);
 	goto out_fail;
 out_wrong_data:
 	printk(KERN_ERR "smbfs: mount_data version %d is not supported\n", ver);
@@ -782,12 +781,6 @@ out:
 	return error;
 }
 
-#ifdef DEBUG_SMB_MALLOC
-int smb_malloced;
-int smb_current_kmalloced;
-int smb_current_vmalloced;
-#endif
-
 static struct super_block *smb_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
@@ -807,12 +800,6 @@ static int __init init_smb_fs(void)
 	int err;
 	DEBUG1("registering ...\n");
 
-#ifdef DEBUG_SMB_MALLOC
-	smb_malloced = 0;
-	smb_current_kmalloced = 0;
-	smb_current_vmalloced = 0;
-#endif
-
 	err = init_inodecache();
 	if (err)
 		goto out_inode;
@@ -837,11 +824,6 @@ static void __exit exit_smb_fs(void)
 	unregister_filesystem(&smb_fs_type);
 	smb_destroy_request_cache();
 	destroy_inodecache();
-#ifdef DEBUG_SMB_MALLOC
-	printk(KERN_DEBUG "smb_malloced: %d\n", smb_malloced);
-	printk(KERN_DEBUG "smb_current_kmalloced: %d\n",smb_current_kmalloced);
-	printk(KERN_DEBUG "smb_current_vmalloced: %d\n",smb_current_vmalloced);
-#endif
 }
 
 module_init(init_smb_fs)
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
index a0f296d9928a..c71c375863cc 100644
--- a/fs/smbfs/request.c
+++ b/fs/smbfs/request.c
@@ -68,7 +68,7 @@ static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server,
 		goto out;
 
 	if (bufsize > 0) {
-		buf = smb_kmalloc(bufsize, GFP_NOFS);
+		buf = kmalloc(bufsize, GFP_NOFS);
 		if (!buf) {
 			kmem_cache_free(req_cachep, req);
 			return NULL;
@@ -124,9 +124,8 @@ static void smb_free_request(struct smb_request *req)
 {
 	atomic_dec(&req->rq_server->nr_requests);
 	if (req->rq_buffer && !(req->rq_flags & SMB_REQ_STATIC))
-		smb_kfree(req->rq_buffer);
-	if (req->rq_trans2buffer)
-		smb_kfree(req->rq_trans2buffer);
+		kfree(req->rq_buffer);
+	kfree(req->rq_trans2buffer);
 	kmem_cache_free(req_cachep, req);
 }
 
@@ -183,8 +182,7 @@ static int smb_setup_request(struct smb_request *req)
 	req->rq_err = 0;
 	req->rq_errno = 0;
 	req->rq_fragment = 0;
-	if (req->rq_trans2buffer)
-		smb_kfree(req->rq_trans2buffer);
+	kfree(req->rq_trans2buffer);
 
 	return 0;
 }
@@ -647,10 +645,9 @@ static int smb_recv_trans2(struct smb_sb_info *server, struct smb_request *req)
 			goto out_too_long;
 
 		req->rq_trans2bufsize = buf_len;
-		req->rq_trans2buffer = smb_kmalloc(buf_len, GFP_NOFS);
+		req->rq_trans2buffer = kzalloc(buf_len, GFP_NOFS);
 		if (!req->rq_trans2buffer)
 			goto out_no_mem;
-		memset(req->rq_trans2buffer, 0, buf_len);
 
 		req->rq_parm = req->rq_trans2buffer;
 		req->rq_data = req->rq_trans2buffer + parm_tot;
diff --git a/include/linux/smb_fs.h b/include/linux/smb_fs.h
index c4153120ade6..621a3d3662f3 100644
--- a/include/linux/smb_fs.h
+++ b/include/linux/smb_fs.h
@@ -58,53 +58,6 @@ static inline struct smb_inode_info *SMB_I(struct inode *inode)
 /* where to find the base of the SMB packet proper */
 #define smb_base(buf) ((u8 *)(((u8 *)(buf))+4))
 
-#ifdef DEBUG_SMB_MALLOC
-
-#include <linux/slab.h>
-
-extern int smb_malloced;
-extern int smb_current_vmalloced;
-extern int smb_current_kmalloced;
-
-static inline void *
-smb_vmalloc(unsigned int size)
-{
-        smb_malloced += 1;
-        smb_current_vmalloced += 1;
-        return vmalloc(size);
-}
-
-static inline void
-smb_vfree(void *obj)
-{
-        smb_current_vmalloced -= 1;
-        vfree(obj);
-}
-
-static inline void *
-smb_kmalloc(size_t size, int flags)
-{
-	smb_malloced += 1;
-	smb_current_kmalloced += 1;
-	return kmalloc(size, flags);
-}
-
-static inline void
-smb_kfree(void *obj)
-{
-	smb_current_kmalloced -= 1;
-	kfree(obj);
-}
-
-#else /* DEBUG_SMB_MALLOC */
-
-#define smb_kmalloc(s,p)	kmalloc(s,p)
-#define smb_kfree(o)		kfree(o)
-#define smb_vmalloc(s)		vmalloc(s)
-#define smb_vfree(o)		vfree(o)
-
-#endif /* DEBUG_SMB_MALLOC */
-
 /*
  * Flags for the in-memory inode
  */
-- 
cgit v1.2.3-71-gd317


From 67a6680d64e18c7a1901f31ef747ea53b6cd986d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 14 Jan 2006 13:21:25 -0800
Subject: [PATCH] fbdev: Sanitize ->fb_ioctl prototype

The ioctl and file arguments to ->fb_mmap are totally unused and there's not
reason a driver should need them.

Also update the ->fb_compat_ioctl prototype to be the same as ->fb_mmap.

Signed-off-by: Antonino Daplas <adaplas@pol.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/video/amifb.c                 |  9 +++------
 drivers/video/arcfb.c                 |  5 ++---
 drivers/video/atafb.c                 |  3 +--
 drivers/video/aty/aty128fb.c          |  6 ++----
 drivers/video/aty/atyfb_base.c        |  6 ++----
 drivers/video/aty/radeon_base.c       |  4 ++--
 drivers/video/bw2.c                   |  6 ++----
 drivers/video/cg14.c                  |  6 ++----
 drivers/video/cg3.c                   |  6 ++----
 drivers/video/cg6.c                   |  6 ++----
 drivers/video/fbmem.c                 |  4 ++--
 drivers/video/ffb.c                   |  6 ++----
 drivers/video/imsttfb.c               |  3 +--
 drivers/video/intelfb/intelfbdrv.c    |  8 +++-----
 drivers/video/kyro/fbdev.c            |  5 ++---
 drivers/video/leo.c                   |  6 ++----
 drivers/video/matrox/matroxfb_base.c  |  5 ++---
 drivers/video/matrox/matroxfb_crtc2.c |  9 ++++-----
 drivers/video/p9100.c                 |  7 +++----
 drivers/video/pm3fb.c                 |  8 ++------
 drivers/video/pmag-aa-fb.c            |  3 +--
 drivers/video/radeonfb.c              |  4 ++--
 drivers/video/sis/sis_main.c          | 28 ++++++++--------------------
 drivers/video/sis/sis_main.h          |  5 +++++
 drivers/video/sstfb.c                 |  3 +--
 drivers/video/tcx.c                   |  7 +++----
 include/linux/fb.h                    |  8 ++++----
 27 files changed, 67 insertions(+), 109 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/amifb.c b/drivers/video/amifb.c
index 2c42a812655a..3033c72dea20 100644
--- a/drivers/video/amifb.c
+++ b/drivers/video/amifb.c
@@ -1131,9 +1131,7 @@ static void amifb_copyarea(struct fb_info *info,
 			   const struct fb_copyarea *region);
 static void amifb_imageblit(struct fb_info *info,
 			    const struct fb_image *image);
-static int amifb_ioctl(struct inode *inode, struct file *file,
-		       unsigned int cmd, unsigned long arg,
-		       struct fb_info *info);
+static int amifb_ioctl(struct fb_info *info, unsigned int cmd, unsigned long arg);
 
 
 	/*
@@ -2172,9 +2170,8 @@ static void amifb_imageblit(struct fb_info *info, const struct fb_image *image)
 	 * Amiga Frame Buffer Specific ioctls
 	 */
 
-static int amifb_ioctl(struct inode *inode, struct file *file,
-		       unsigned int cmd, unsigned long arg,
-		       struct fb_info *info)
+static int amifb_ioctl(struct fb_info *info,
+		       unsigned int cmd, unsigned long arg)
 {
 	union {
 		struct fb_fix_cursorinfo fix;
diff --git a/drivers/video/arcfb.c b/drivers/video/arcfb.c
index 89060b2db8e5..df8e5667b348 100644
--- a/drivers/video/arcfb.c
+++ b/drivers/video/arcfb.c
@@ -399,9 +399,8 @@ static void arcfb_imageblit(struct fb_info *info, const struct fb_image *image)
 				image->height);
 }
 
-static int arcfb_ioctl(struct inode *inode, struct file *file,
-			  unsigned int cmd, unsigned long arg,
-			  struct fb_info *info)
+static int arcfb_ioctl(struct fb_info *info,
+			  unsigned int cmd, unsigned long arg)
 {
 	void __user *argp = (void __user *)arg;
 	struct arcfb_par *par = info->par;
diff --git a/drivers/video/atafb.c b/drivers/video/atafb.c
index 15ec1295bc29..e69ab65f7843 100644
--- a/drivers/video/atafb.c
+++ b/drivers/video/atafb.c
@@ -2571,8 +2571,7 @@ atafb_pan_display(struct fb_var_screeninfo *var, int con, struct fb_info *info)
 }
 
 static int
-atafb_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-	       unsigned long arg, int con, struct fb_info *info)
+atafb_ioctl(struct fb_info *info, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
 #ifdef FBCMD_GET_CURRENTPAR
diff --git a/drivers/video/aty/aty128fb.c b/drivers/video/aty/aty128fb.c
index e686185a076d..bfc8a93b2c73 100644
--- a/drivers/video/aty/aty128fb.c
+++ b/drivers/video/aty/aty128fb.c
@@ -431,8 +431,7 @@ static int aty128fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
 static int aty128fb_pan_display(struct fb_var_screeninfo *var,
 			   struct fb_info *fb);
 static int aty128fb_blank(int blank, struct fb_info *fb);
-static int aty128fb_ioctl(struct inode *inode, struct file *file, u_int cmd,
-			  u_long arg, struct fb_info *info);
+static int aty128fb_ioctl(struct fb_info *info, u_int cmd, unsigned long arg);
 static int aty128fb_sync(struct fb_info *info);
 
     /*
@@ -2108,8 +2107,7 @@ static int aty128fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
 /* in param: u32*	backlight value: 0 to 15 */
 #define FBIO_ATY128_SET_MIRROR	_IOW('@', 2, __u32)
 
-static int aty128fb_ioctl(struct inode *inode, struct file *file, u_int cmd,
-			  u_long arg, struct fb_info *info)
+static int aty128fb_ioctl(struct fb_info *info, u_int cmd, u_long arg)
 {
 	struct aty128fb_par *par = info->par;
 	u32 value;
diff --git a/drivers/video/aty/atyfb_base.c b/drivers/video/aty/atyfb_base.c
index ed81005cbdba..75b463ad4c75 100644
--- a/drivers/video/aty/atyfb_base.c
+++ b/drivers/video/aty/atyfb_base.c
@@ -238,8 +238,7 @@ static int atyfb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
 	u_int transp, struct fb_info *info);
 static int atyfb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info);
 static int atyfb_blank(int blank, struct fb_info *info);
-static int atyfb_ioctl(struct inode *inode, struct file *file, u_int cmd,
-	u_long arg, struct fb_info *info);
+static int atyfb_ioctl(struct fb_info *info, u_int cmd, u_long arg);
 extern void atyfb_fillrect(struct fb_info *info, const struct fb_fillrect *rect);
 extern void atyfb_copyarea(struct fb_info *info, const struct fb_copyarea *area);
 extern void atyfb_imageblit(struct fb_info *info, const struct fb_image *image);
@@ -1739,8 +1738,7 @@ struct atyclk {
 #define FBIO_WAITFORVSYNC _IOW('F', 0x20, __u32)
 #endif
 
-static int atyfb_ioctl(struct inode *inode, struct file *file, u_int cmd,
-	u_long arg, struct fb_info *info)
+static int atyfb_ioctl(struct fb_info *info, u_int cmd, u_long arg)
 {
 	struct atyfb_par *par = (struct atyfb_par *) info->par;
 #ifdef __sparc__
diff --git a/drivers/video/aty/radeon_base.c b/drivers/video/aty/radeon_base.c
index 156db84cb363..c9f0c5a07e6e 100644
--- a/drivers/video/aty/radeon_base.c
+++ b/drivers/video/aty/radeon_base.c
@@ -864,8 +864,8 @@ static int radeonfb_pan_display (struct fb_var_screeninfo *var,
 }
 
 
-static int radeonfb_ioctl (struct inode *inode, struct file *file, unsigned int cmd,
-                           unsigned long arg, struct fb_info *info)
+static int radeonfb_ioctl (struct fb_info *info, unsigned int cmd,
+                           unsigned long arg)
 {
         struct radeonfb_info *rinfo = info->par;
 	unsigned int tmp;
diff --git a/drivers/video/bw2.c b/drivers/video/bw2.c
index 9248fe1fbb1a..641a6c1fbf22 100644
--- a/drivers/video/bw2.c
+++ b/drivers/video/bw2.c
@@ -36,8 +36,7 @@
 static int bw2_blank(int, struct fb_info *);
 
 static int bw2_mmap(struct fb_info *, struct file *, struct vm_area_struct *);
-static int bw2_ioctl(struct inode *, struct file *, unsigned int,
-		     unsigned long, struct fb_info *);
+static int bw2_ioctl(struct fb_info *, unsigned int, unsigned long);
 
 /*
  *  Frame buffer operations
@@ -181,8 +180,7 @@ static int bw2_mmap(struct fb_info *info, struct file *file, struct vm_area_stru
 				  vma);
 }
 
-static int bw2_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		     unsigned long arg, struct fb_info *info)
+static int bw2_ioctl(struct fb_info *info, unsigned int cmd, unsigned long arg)
 {
 	struct bw2_par *par = (struct bw2_par *) info->par;
 
diff --git a/drivers/video/cg14.c b/drivers/video/cg14.c
index a56147102abb..edb9936673c8 100644
--- a/drivers/video/cg14.c
+++ b/drivers/video/cg14.c
@@ -32,8 +32,7 @@ static int cg14_setcolreg(unsigned, unsigned, unsigned, unsigned,
 			 unsigned, struct fb_info *);
 
 static int cg14_mmap(struct fb_info *, struct file *, struct vm_area_struct *);
-static int cg14_ioctl(struct inode *, struct file *, unsigned int,
-		      unsigned long, struct fb_info *);
+static int cg14_ioctl(struct fb_info *, unsigned int, unsigned long);
 static int cg14_pan_display(struct fb_var_screeninfo *, struct fb_info *);
 
 /*
@@ -277,8 +276,7 @@ static int cg14_mmap(struct fb_info *info, struct file *file, struct vm_area_str
 				  par->iospace, vma);
 }
 
-static int cg14_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		      unsigned long arg, struct fb_info *info)
+static int cg14_ioctl(struct fb_info *info, unsigned int cmd, unsigned long arg)
 {
 	struct cg14_par *par = (struct cg14_par *) info->par;
 	struct cg14_regs __iomem *regs = par->regs;
diff --git a/drivers/video/cg3.c b/drivers/video/cg3.c
index 9fcd89608ed7..027f707fa5db 100644
--- a/drivers/video/cg3.c
+++ b/drivers/video/cg3.c
@@ -34,8 +34,7 @@ static int cg3_setcolreg(unsigned, unsigned, unsigned, unsigned,
 static int cg3_blank(int, struct fb_info *);
 
 static int cg3_mmap(struct fb_info *, struct file *, struct vm_area_struct *);
-static int cg3_ioctl(struct inode *, struct file *, unsigned int,
-		     unsigned long, struct fb_info *);
+static int cg3_ioctl(struct fb_info *, unsigned int, unsigned long);
 
 /*
  *  Frame buffer operations
@@ -240,8 +239,7 @@ static int cg3_mmap(struct fb_info *info, struct file *file, struct vm_area_stru
 				  vma);
 }
 
-static int cg3_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		     unsigned long arg, struct fb_info *info)
+static int cg3_ioctl(struct fb_info *info, unsigned int cmd, unsigned long arg)
 {
 	struct cg3_par *par = (struct cg3_par *) info->par;
 
diff --git a/drivers/video/cg6.c b/drivers/video/cg6.c
index 050835e39aa3..678e335d2b0f 100644
--- a/drivers/video/cg6.c
+++ b/drivers/video/cg6.c
@@ -37,8 +37,7 @@ static void cg6_imageblit(struct fb_info *, const struct fb_image *);
 static void cg6_fillrect(struct fb_info *, const struct fb_fillrect *);
 static int cg6_sync(struct fb_info *);
 static int cg6_mmap(struct fb_info *, struct file *, struct vm_area_struct *);
-static int cg6_ioctl(struct inode *, struct file *, unsigned int,
-		     unsigned long, struct fb_info *);
+static int cg6_ioctl(struct fb_info *, unsigned int, unsigned long);
 
 /*
  *  Frame buffer operations
@@ -534,8 +533,7 @@ static int cg6_mmap(struct fb_info *info, struct file *file, struct vm_area_stru
 				  vma);
 }
 
-static int cg6_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		     unsigned long arg, struct fb_info *info)
+static int cg6_ioctl(struct fb_info *info, unsigned int cmd, unsigned long arg)
 {
 	struct cg6_par *par = (struct cg6_par *) info->par;
 
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index 32a9b69becc5..80c16fb61e1c 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -957,7 +957,7 @@ fb_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
 	default:
 		if (fb->fb_ioctl == NULL)
 			return -EINVAL;
-		return fb->fb_ioctl(inode, file, cmd, arg, info);
+		return fb->fb_ioctl(info, cmd, arg);
 	}
 }
 
@@ -1107,7 +1107,7 @@ fb_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 	default:
 		if (fb->fb_compat_ioctl)
-			ret = fb->fb_compat_ioctl(file, cmd, arg, info);
+			ret = fb->fb_compat_ioctl(info, cmd, arg);
 		break;
 	}
 	unlock_kernel();
diff --git a/drivers/video/ffb.c b/drivers/video/ffb.c
index c4870d559afc..d096c129d20b 100644
--- a/drivers/video/ffb.c
+++ b/drivers/video/ffb.c
@@ -38,8 +38,7 @@ static void ffb_fillrect(struct fb_info *, const struct fb_fillrect *);
 static void ffb_copyarea(struct fb_info *, const struct fb_copyarea *);
 static int ffb_sync(struct fb_info *);
 static int ffb_mmap(struct fb_info *, struct file *, struct vm_area_struct *);
-static int ffb_ioctl(struct inode *, struct file *, unsigned int,
-		     unsigned long, struct fb_info *);
+static int ffb_ioctl(struct fb_info *, unsigned int, unsigned long);
 static int ffb_pan_display(struct fb_var_screeninfo *, struct fb_info *);
 
 /*
@@ -848,8 +847,7 @@ static int ffb_mmap(struct fb_info *info, struct file *file, struct vm_area_stru
 				  0, vma);
 }
 
-static int ffb_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		     unsigned long arg, struct fb_info *info)
+static int ffb_ioctl(struct fb_info *info, unsigned int cmd, unsigned long arg)
 {
 	struct ffb_par *par = (struct ffb_par *) info->par;
 
diff --git a/drivers/video/imsttfb.c b/drivers/video/imsttfb.c
index a5d813050db5..ad416ae47596 100644
--- a/drivers/video/imsttfb.c
+++ b/drivers/video/imsttfb.c
@@ -1267,8 +1267,7 @@ imsttfb_cursor(struct fb_info *info, struct fb_cursor *cursor)
 #define FBIMSTT_GETIDXREG	0x545406
 
 static int
-imsttfb_ioctl(struct inode *inode, struct file *file, u_int cmd,
-	      u_long arg, struct fb_info *info)
+imsttfb_ioctl(struct fb_info *info, u_int cmd, u_long arg)
 {
 	struct imstt_par *par = info->par;
 	void __user *argp = (void __user *)arg;
diff --git a/drivers/video/intelfb/intelfbdrv.c b/drivers/video/intelfb/intelfbdrv.c
index 0090544842f5..6b8bd3cdf9c0 100644
--- a/drivers/video/intelfb/intelfbdrv.c
+++ b/drivers/video/intelfb/intelfbdrv.c
@@ -157,9 +157,8 @@ static int intelfb_cursor(struct fb_info *info,
 
 static int intelfb_sync(struct fb_info *info);
 
-static int intelfb_ioctl(struct inode *inode, struct file *file,
-			 unsigned int cmd, unsigned long arg,
-			 struct fb_info *info);
+static int intelfb_ioctl(struct fb_info *info,
+			 unsigned int cmd, unsigned long arg);
 
 static int __devinit intelfb_pci_register(struct pci_dev *pdev,
 					  const struct pci_device_id *ent);
@@ -1380,8 +1379,7 @@ intelfb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info)
 
 /* When/if we have our own ioctls. */
 static int
-intelfb_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-	      unsigned long arg, struct fb_info *info)
+intelfb_ioctl(struct fb_info *info, unsigned int cmd, unsigned long arg)
 {
 	int retval = 0;
 
diff --git a/drivers/video/kyro/fbdev.c b/drivers/video/kyro/fbdev.c
index bcd359b6d4ff..477ad297de4e 100644
--- a/drivers/video/kyro/fbdev.c
+++ b/drivers/video/kyro/fbdev.c
@@ -586,9 +586,8 @@ static int __init kyrofb_setup(char *options)
 }
 #endif
 
-static int kyrofb_ioctl(struct inode *inode, struct file *file,
-			unsigned int cmd, unsigned long arg,
-			struct fb_info *info)
+static int kyrofb_ioctl(struct fb_info *info,
+			unsigned int cmd, unsigned long arg)
 {
 	overlay_create ol_create;
 	overlay_viewport_set ol_viewport_set;
diff --git a/drivers/video/leo.c b/drivers/video/leo.c
index 494287f8f8bf..fb14787e13b9 100644
--- a/drivers/video/leo.c
+++ b/drivers/video/leo.c
@@ -33,8 +33,7 @@ static int leo_setcolreg(unsigned, unsigned, unsigned, unsigned,
 static int leo_blank(int, struct fb_info *);
 
 static int leo_mmap(struct fb_info *, struct file *, struct vm_area_struct *);
-static int leo_ioctl(struct inode *, struct file *, unsigned int,
-		     unsigned long, struct fb_info *);
+static int leo_ioctl(struct fb_info *, unsigned int, unsigned long);
 static int leo_pan_display(struct fb_var_screeninfo *, struct fb_info *);
 
 /*
@@ -373,8 +372,7 @@ static int leo_mmap(struct fb_info *info, struct file *file, struct vm_area_stru
 				  vma);
 }
 
-static int leo_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		     unsigned long arg, struct fb_info *info)
+static int leo_ioctl(struct fb_info *info, unsigned int cmd, unsigned long arg)
 {
 	struct leo_par *par = (struct leo_par *) info->par;
 
diff --git a/drivers/video/matrox/matroxfb_base.c b/drivers/video/matrox/matroxfb_base.c
index 1e74f4cca53b..4055ff6f5a81 100644
--- a/drivers/video/matrox/matroxfb_base.c
+++ b/drivers/video/matrox/matroxfb_base.c
@@ -865,9 +865,8 @@ static struct matrox_altout panellink_output = {
 	.name	 = "Panellink output",
 };
 
-static int matroxfb_ioctl(struct inode *inode, struct file *file,
-			  unsigned int cmd, unsigned long arg,
-			  struct fb_info *info)
+static int matroxfb_ioctl(struct fb_info *info,
+			  unsigned int cmd, unsigned long arg)
 {
 	void __user *argp = (void __user *)arg;
 	MINFO_FROM_INFO(info);
diff --git a/drivers/video/matrox/matroxfb_crtc2.c b/drivers/video/matrox/matroxfb_crtc2.c
index d52d7d825c41..27eb4bb4f89f 100644
--- a/drivers/video/matrox/matroxfb_crtc2.c
+++ b/drivers/video/matrox/matroxfb_crtc2.c
@@ -419,11 +419,10 @@ static int matroxfb_dh_get_vblank(const struct matroxfb_dh_fb_info* m2info, stru
 	return 0;
 }
 
-static int matroxfb_dh_ioctl(struct inode* inode,
-		struct file* file,
+static int matroxfb_dh_ioctl(struct fb_info *info,
 		unsigned int cmd,
-		unsigned long arg,
-		struct fb_info* info) {
+		unsigned long arg)
+{
 #define m2info (container_of(info, struct matroxfb_dh_fb_info, fbcon))
 	MINFO_FROM(m2info->primary_dev);
 
@@ -457,7 +456,7 @@ static int matroxfb_dh_ioctl(struct inode* inode,
 		case MATROXFB_GET_OUTPUT_MODE:
 		case MATROXFB_GET_ALL_OUTPUTS:
 			{
-				return ACCESS_FBINFO(fbcon.fbops)->fb_ioctl(inode, file, cmd, arg, &ACCESS_FBINFO(fbcon));
+				return ACCESS_FBINFO(fbcon.fbops)->fb_ioctl(&ACCESS_FBINFO(fbcon), cmd, arg);
 			}
 		case MATROXFB_SET_OUTPUT_CONNECTION:
 			{
diff --git a/drivers/video/p9100.c b/drivers/video/p9100.c
index b251e754e16c..a1025e9606f1 100644
--- a/drivers/video/p9100.c
+++ b/drivers/video/p9100.c
@@ -32,8 +32,7 @@ static int p9100_setcolreg(unsigned, unsigned, unsigned, unsigned,
 static int p9100_blank(int, struct fb_info *);
 
 static int p9100_mmap(struct fb_info *, struct file *, struct vm_area_struct *);
-static int p9100_ioctl(struct inode *, struct file *, unsigned int,
-		       unsigned long, struct fb_info *);
+static int p9100_ioctl(struct fb_info *, unsigned int, unsigned long);
 
 /*
  *  Frame buffer operations
@@ -232,8 +231,8 @@ static int p9100_mmap(struct fb_info *info, struct file *file, struct vm_area_st
 				  vma);
 }
 
-static int p9100_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		       unsigned long arg, struct fb_info *info)
+static int p9100_ioctl(struct fb_info *info, unsigned int cmd,
+		       unsigned long arg)
 {
 	struct p9100_par *par = (struct p9100_par *) info->par;
 
diff --git a/drivers/video/pm3fb.c b/drivers/video/pm3fb.c
index 2e11b601c488..0e78ddc81583 100644
--- a/drivers/video/pm3fb.c
+++ b/drivers/video/pm3fb.c
@@ -657,9 +657,7 @@ static void pm3fb_set_disp(const void *par, struct display *disp,
 static void pm3fb_detect(void);
 static int pm3fb_pan_display(const struct fb_var_screeninfo *var,
 			     struct fb_info_gen *info);
-static int pm3fb_ioctl(struct inode *inode, struct file *file,
-                       u_int cmd, u_long arg, int con,
-		       struct fb_info *info);
+static int pm3fb_ioctl(struct fb_info *info, u_int cmd, u_long arg);
 
 
 /* the struct that hold them together */
@@ -3438,9 +3436,7 @@ static int pm3fb_pan_display(const struct fb_var_screeninfo *var,
 	return 0;
 }
 
-static int pm3fb_ioctl(struct inode *inode, struct file *file,
-                       u_int cmd, u_long arg, int con,
-		       struct fb_info *info)
+static int pm3fb_ioctl(struct fb_info *info, u_int cmd, u_long arg)
 {
 	struct pm3fb_info *l_fb_info = (struct pm3fb_info *) info;
 	u32 cm, i;
diff --git a/drivers/video/pmag-aa-fb.c b/drivers/video/pmag-aa-fb.c
index 28d1fe5fe340..d92f352211ef 100644
--- a/drivers/video/pmag-aa-fb.c
+++ b/drivers/video/pmag-aa-fb.c
@@ -299,8 +299,7 @@ static int aafb_set_cmap(struct fb_cmap *cmap, int kspc, int con,
 		return -EINVAL;
 }
 
-static int aafb_ioctl(struct inode *inode, struct file *file, u32 cmd,
-		      unsigned long arg, int con, struct fb_info *info)
+static int aafb_ioctl(struct fb_info *info, u32 cmd, unsigned long arg)
 {
 	/* TODO: Not yet implemented */
 	return -ENOIOCTLCMD;
diff --git a/drivers/video/radeonfb.c b/drivers/video/radeonfb.c
index 600318f708f2..db9fb9074dbc 100644
--- a/drivers/video/radeonfb.c
+++ b/drivers/video/radeonfb.c
@@ -1497,8 +1497,8 @@ static int radeonfb_pan_display (struct fb_var_screeninfo *var,
 }
 
 
-static int radeonfb_ioctl (struct inode *inode, struct file *file, unsigned int cmd,
-                           unsigned long arg, struct fb_info *info)
+static int radeonfb_ioctl (struct fb_info *info, unsigned int cmd,
+                           unsigned long arg)
 {
         struct radeonfb_info *rinfo = (struct radeonfb_info *) info;
 	unsigned int tmp;
diff --git a/drivers/video/sis/sis_main.c b/drivers/video/sis/sis_main.c
index dea1a46c67c4..8adf5bf91eee 100644
--- a/drivers/video/sis/sis_main.c
+++ b/drivers/video/sis/sis_main.c
@@ -1743,13 +1743,14 @@ sisfb_blank(int blank, struct fb_info *info)
 
 /* ----------- FBDev related routines for all series ---------- */
 
-static int
-sisfb_ioctl(struct inode *inode, struct file *file,
-            unsigned int cmd, unsigned long arg,
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
-	    int con,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,15)
+static int	sisfb_ioctl(struct fb_info *info, unsigned int cmd,
+			    unsigned long arg)
+#else
+static int	sisfb_ioctl(struct inode *inode, struct file *file,
+				unsigned int cmd, unsigned long arg,
+				struct fb_info *info)
 #endif
-	    struct fb_info *info)
 {
 	struct sis_video_info	*ivideo = (struct sis_video_info *)info->par;
 	struct sis_memreq	sismemreq;
@@ -1924,19 +1925,6 @@ sisfb_ioctl(struct inode *inode, struct file *file,
 	return 0;
 }
 
-#ifdef SIS_NEW_CONFIG_COMPAT
-static long
-sisfb_compat_ioctl(struct file *f, unsigned int cmd, unsigned long arg, struct fb_info *info)
-{
-	int ret;
-
-	lock_kernel();
-	ret = sisfb_ioctl(NULL, f, cmd, arg, info);
-	unlock_kernel();
-	return ret;
-}
-#endif
-
 static int
 sisfb_get_fix(struct fb_fix_screeninfo *fix, int con, struct fb_info *info)
 {
@@ -2007,7 +1995,7 @@ static struct fb_ops sisfb_ops = {
 #endif
 	.fb_sync	= fbcon_sis_sync,
 #ifdef SIS_NEW_CONFIG_COMPAT
-	.fb_compat_ioctl= sisfb_compat_ioctl,
+	.fb_compat_ioctl= sisfb_ioctl,
 #endif
 	.fb_ioctl	= sisfb_ioctl
 };
diff --git a/drivers/video/sis/sis_main.h b/drivers/video/sis/sis_main.h
index 445bcbba03ae..70b6df371b8e 100644
--- a/drivers/video/sis/sis_main.h
+++ b/drivers/video/sis/sis_main.h
@@ -727,9 +727,14 @@ static int	sisfb_ioctl(struct inode *inode, struct file *file,
 #endif
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,15)
+static int	sisfb_ioctl(struct fb_info *info, unsigned int cmd,
+			    unsigned long arg);
+#else
 static int	sisfb_ioctl(struct inode *inode, struct file *file,
 				unsigned int cmd, unsigned long arg,
 				struct fb_info *info);
+#endif
 static int	sisfb_set_par(struct fb_info *info);
 static int	sisfb_blank(int blank,
 				struct fb_info *info);
diff --git a/drivers/video/sstfb.c b/drivers/video/sstfb.c
index 8a5ce210bb27..99921df35474 100644
--- a/drivers/video/sstfb.c
+++ b/drivers/video/sstfb.c
@@ -771,8 +771,7 @@ static int sstfb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
 	return 0;
 }
 
-static int sstfb_ioctl(struct inode *inode, struct file *file,
-                       u_int cmd, u_long arg, struct fb_info *info )
+static int sstfb_ioctl(struct fb_info *info, u_int cmd, u_long arg)
 {
 	struct sstfb_par *par = info->par;
 	struct pci_dev *sst_dev = par->dev;
diff --git a/drivers/video/tcx.c b/drivers/video/tcx.c
index 2b27b4474001..cb940771ec58 100644
--- a/drivers/video/tcx.c
+++ b/drivers/video/tcx.c
@@ -34,8 +34,7 @@ static int tcx_setcolreg(unsigned, unsigned, unsigned, unsigned,
 static int tcx_blank(int, struct fb_info *);
 
 static int tcx_mmap(struct fb_info *, struct file *, struct vm_area_struct *);
-static int tcx_ioctl(struct inode *, struct file *, unsigned int,
-		     unsigned long, struct fb_info *);
+static int tcx_ioctl(struct fb_info *, unsigned int, unsigned long);
 static int tcx_pan_display(struct fb_var_screeninfo *, struct fb_info *);
 
 /*
@@ -312,8 +311,8 @@ static int tcx_mmap(struct fb_info *info, struct file *file, struct vm_area_stru
 				  vma);
 }
 
-static int tcx_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		     unsigned long arg, struct fb_info *info)
+static int tcx_ioctl(struct fb_info *info, unsigned int cmd,
+		     unsigned long arg)
 {
 	struct tcx_par *par = (struct tcx_par *) info->par;
 
diff --git a/include/linux/fb.h b/include/linux/fb.h
index a973be2cfe61..6da8a80a2662 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -608,12 +608,12 @@ struct fb_ops {
 	int (*fb_sync)(struct fb_info *info);
 
 	/* perform fb specific ioctl (optional) */
-	int (*fb_ioctl)(struct inode *inode, struct file *file, unsigned int cmd,
-			unsigned long arg, struct fb_info *info);
+	int (*fb_ioctl)(struct fb_info *info, unsigned int cmd,
+			unsigned long arg);
 
 	/* Handle 32bit compat ioctl (optional) */
-	long (*fb_compat_ioctl)(struct file *f, unsigned cmd, unsigned long arg,
-			       struct fb_info *info);
+	int (*fb_compat_ioctl)(struct fb_info *info, unsigned cmd,
+			unsigned long arg);
 
 	/* perform fb specific mmap */
 	int (*fb_mmap)(struct fb_info *info, struct file *file, struct vm_area_struct *vma);
-- 
cgit v1.2.3-71-gd317


From 216d526c89d144928f095f2800bc6c67e968d628 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 14 Jan 2006 13:21:25 -0800
Subject: [PATCH] fbdev: Sanitize ->fb_mmap prototype

No need for a file argument.  If we'd really need it it's in vma->vm_file
already.  gbefb and sgivwfb used to set vma->vm_file to the file argument, but
the kernel alrady did that.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Antonino Daplas <adaplas@pol.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/video/68328fb.c        | 6 ++----
 drivers/video/acornfb.c        | 2 +-
 drivers/video/amba-clcd.c      | 2 +-
 drivers/video/aty/atyfb_base.c | 4 ++--
 drivers/video/au1100fb.c       | 2 +-
 drivers/video/bw2.c            | 4 ++--
 drivers/video/cg14.c           | 4 ++--
 drivers/video/cg3.c            | 4 ++--
 drivers/video/cg6.c            | 4 ++--
 drivers/video/controlfb.c      | 4 ++--
 drivers/video/fbmem.c          | 2 +-
 drivers/video/ffb.c            | 4 ++--
 drivers/video/gbefb.c          | 3 +--
 drivers/video/igafb.c          | 2 +-
 drivers/video/leo.c            | 4 ++--
 drivers/video/p9100.c          | 4 ++--
 drivers/video/pxafb.c          | 2 +-
 drivers/video/sa1100fb.c       | 2 +-
 drivers/video/sgivwfb.c        | 5 ++---
 drivers/video/tcx.c            | 4 ++--
 drivers/video/vfb.c            | 4 ++--
 include/linux/fb.h             | 2 +-
 22 files changed, 35 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/68328fb.c b/drivers/video/68328fb.c
index 3b0ddc55236b..78488bb41aeb 100644
--- a/drivers/video/68328fb.c
+++ b/drivers/video/68328fb.c
@@ -102,8 +102,7 @@ static int mc68x328fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
 			 u_int transp, struct fb_info *info);
 static int mc68x328fb_pan_display(struct fb_var_screeninfo *var,
 			   struct fb_info *info);
-static int mc68x328fb_mmap(struct fb_info *info, struct file *file,
-		    struct vm_area_struct *vma);
+static int mc68x328fb_mmap(struct fb_info *info, struct vm_area_struct *vma);
 
 static struct fb_ops mc68x328fb_ops = {
 	.fb_check_var	= mc68x328fb_check_var,
@@ -398,8 +397,7 @@ static int mc68x328fb_pan_display(struct fb_var_screeninfo *var,
      *  Most drivers don't need their own mmap function 
      */
 
-static int mc68x328fb_mmap(struct fb_info *info, struct file *file,
-		    struct vm_area_struct *vma)
+static int mc68x328fb_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
 #ifndef MMU
 	/* this is uClinux (no MMU) specific code */
diff --git a/drivers/video/acornfb.c b/drivers/video/acornfb.c
index 750cebb18306..b058273527bb 100644
--- a/drivers/video/acornfb.c
+++ b/drivers/video/acornfb.c
@@ -883,7 +883,7 @@ acornfb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info)
  * Note that we are entered with the kernel locked.
  */
 static int
-acornfb_mmap(struct fb_info *info, struct file *file, struct vm_area_struct *vma)
+acornfb_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
 	unsigned long off, start;
 	u32 len;
diff --git a/drivers/video/amba-clcd.c b/drivers/video/amba-clcd.c
index 0da4083ba908..b2187175d03f 100644
--- a/drivers/video/amba-clcd.c
+++ b/drivers/video/amba-clcd.c
@@ -307,7 +307,7 @@ static int clcdfb_blank(int blank_mode, struct fb_info *info)
 	return 0;
 }
 
-static int clcdfb_mmap(struct fb_info *info, struct file *file,
+static int clcdfb_mmap(struct fb_info *info,
 		       struct vm_area_struct *vma)
 {
 	struct clcd_fb *fb = to_clcd(info);
diff --git a/drivers/video/aty/atyfb_base.c b/drivers/video/aty/atyfb_base.c
index 75b463ad4c75..485be386a8ff 100644
--- a/drivers/video/aty/atyfb_base.c
+++ b/drivers/video/aty/atyfb_base.c
@@ -243,7 +243,7 @@ extern void atyfb_fillrect(struct fb_info *info, const struct fb_fillrect *rect)
 extern void atyfb_copyarea(struct fb_info *info, const struct fb_copyarea *area);
 extern void atyfb_imageblit(struct fb_info *info, const struct fb_image *image);
 #ifdef __sparc__
-static int atyfb_mmap(struct fb_info *info, struct file *file, struct vm_area_struct *vma);
+static int atyfb_mmap(struct fb_info *info, struct vm_area_struct *vma);
 #endif
 static int atyfb_sync(struct fb_info *info);
 
@@ -1843,7 +1843,7 @@ static int atyfb_sync(struct fb_info *info)
 }
 
 #ifdef __sparc__
-static int atyfb_mmap(struct fb_info *info, struct file *file, struct vm_area_struct *vma)
+static int atyfb_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
 	struct atyfb_par *par = (struct atyfb_par *) info->par;
 	unsigned int size, page, map_size = 0;
diff --git a/drivers/video/au1100fb.c b/drivers/video/au1100fb.c
index a5129806172f..2406899f1207 100644
--- a/drivers/video/au1100fb.c
+++ b/drivers/video/au1100fb.c
@@ -379,7 +379,7 @@ void au1100fb_fb_rotate(struct fb_info *fbi, int angle)
  * Map video memory in user space. We don't use the generic fb_mmap method mainly
  * to allow the use of the TLB streaming flag (CCA=6)
  */
-int au1100fb_fb_mmap(struct fb_info *fbi, struct file *file, struct vm_area_struct *vma)
+int au1100fb_fb_mmap(struct fb_info *fbi, struct vm_area_struct *vma)
 {
 	struct au1100fb_device *fbdev = to_au1100fb_device(fbi);
 	unsigned int len;
diff --git a/drivers/video/bw2.c b/drivers/video/bw2.c
index 641a6c1fbf22..c029db4646f6 100644
--- a/drivers/video/bw2.c
+++ b/drivers/video/bw2.c
@@ -35,7 +35,7 @@
 
 static int bw2_blank(int, struct fb_info *);
 
-static int bw2_mmap(struct fb_info *, struct file *, struct vm_area_struct *);
+static int bw2_mmap(struct fb_info *, struct vm_area_struct *);
 static int bw2_ioctl(struct fb_info *, unsigned int, unsigned long);
 
 /*
@@ -168,7 +168,7 @@ static struct sbus_mmap_map bw2_mmap_map[] = {
 	{ .size = 0 }
 };
 
-static int bw2_mmap(struct fb_info *info, struct file *file, struct vm_area_struct *vma)
+static int bw2_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
 	struct bw2_par *par = (struct bw2_par *)info->par;
 
diff --git a/drivers/video/cg14.c b/drivers/video/cg14.c
index edb9936673c8..63b6c79c8a0a 100644
--- a/drivers/video/cg14.c
+++ b/drivers/video/cg14.c
@@ -31,7 +31,7 @@
 static int cg14_setcolreg(unsigned, unsigned, unsigned, unsigned,
 			 unsigned, struct fb_info *);
 
-static int cg14_mmap(struct fb_info *, struct file *, struct vm_area_struct *);
+static int cg14_mmap(struct fb_info *, struct vm_area_struct *);
 static int cg14_ioctl(struct fb_info *, unsigned int, unsigned long);
 static int cg14_pan_display(struct fb_var_screeninfo *, struct fb_info *);
 
@@ -267,7 +267,7 @@ static int cg14_setcolreg(unsigned regno,
 	return 0;
 }
 
-static int cg14_mmap(struct fb_info *info, struct file *file, struct vm_area_struct *vma)
+static int cg14_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
 	struct cg14_par *par = (struct cg14_par *) info->par;
 
diff --git a/drivers/video/cg3.c b/drivers/video/cg3.c
index 027f707fa5db..3de6e1b5ab2f 100644
--- a/drivers/video/cg3.c
+++ b/drivers/video/cg3.c
@@ -33,7 +33,7 @@ static int cg3_setcolreg(unsigned, unsigned, unsigned, unsigned,
 			 unsigned, struct fb_info *);
 static int cg3_blank(int, struct fb_info *);
 
-static int cg3_mmap(struct fb_info *, struct file *, struct vm_area_struct *);
+static int cg3_mmap(struct fb_info *, struct vm_area_struct *);
 static int cg3_ioctl(struct fb_info *, unsigned int, unsigned long);
 
 /*
@@ -229,7 +229,7 @@ static struct sbus_mmap_map cg3_mmap_map[] = {
 	{ .size = 0 }
 };
 
-static int cg3_mmap(struct fb_info *info, struct file *file, struct vm_area_struct *vma)
+static int cg3_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
 	struct cg3_par *par = (struct cg3_par *)info->par;
 
diff --git a/drivers/video/cg6.c b/drivers/video/cg6.c
index 678e335d2b0f..7aab91ead681 100644
--- a/drivers/video/cg6.c
+++ b/drivers/video/cg6.c
@@ -36,7 +36,7 @@ static int cg6_blank(int, struct fb_info *);
 static void cg6_imageblit(struct fb_info *, const struct fb_image *);
 static void cg6_fillrect(struct fb_info *, const struct fb_fillrect *);
 static int cg6_sync(struct fb_info *);
-static int cg6_mmap(struct fb_info *, struct file *, struct vm_area_struct *);
+static int cg6_mmap(struct fb_info *, struct vm_area_struct *);
 static int cg6_ioctl(struct fb_info *, unsigned int, unsigned long);
 
 /*
@@ -523,7 +523,7 @@ static struct sbus_mmap_map cg6_mmap_map[] = {
 	{ .size	= 0 }
 };
 
-static int cg6_mmap(struct fb_info *info, struct file *file, struct vm_area_struct *vma)
+static int cg6_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
 	struct cg6_par *par = (struct cg6_par *)info->par;
 
diff --git a/drivers/video/controlfb.c b/drivers/video/controlfb.c
index 03798e9c882d..655301a8671c 100644
--- a/drivers/video/controlfb.c
+++ b/drivers/video/controlfb.c
@@ -128,7 +128,7 @@ static int controlfb_pan_display(struct fb_var_screeninfo *var,
 static int controlfb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
 	u_int transp, struct fb_info *info);
 static int controlfb_blank(int blank_mode, struct fb_info *info);
-static int controlfb_mmap(struct fb_info *info, struct file *file,
+static int controlfb_mmap(struct fb_info *info,
 	struct vm_area_struct *vma);
 static int controlfb_set_par (struct fb_info *info);
 static int controlfb_check_var (struct fb_var_screeninfo *var, struct fb_info *info);
@@ -280,7 +280,7 @@ static int controlfb_pan_display(struct fb_var_screeninfo *var,
  * for controlfb.
  * Note there's no locking in here; it's done in fb_mmap() in fbmem.c.
  */
-static int controlfb_mmap(struct fb_info *info, struct file *file,
+static int controlfb_mmap(struct fb_info *info,
                        struct vm_area_struct *vma)
 {
        unsigned long off, start;
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index 80c16fb61e1c..d2dede6ed3e5 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1135,7 +1135,7 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
 	if (fb->fb_mmap) {
 		int res;
 		lock_kernel();
-		res = fb->fb_mmap(info, file, vma);
+		res = fb->fb_mmap(info, vma);
 		unlock_kernel();
 		return res;
 	}
diff --git a/drivers/video/ffb.c b/drivers/video/ffb.c
index d096c129d20b..9c9b21d469a1 100644
--- a/drivers/video/ffb.c
+++ b/drivers/video/ffb.c
@@ -37,7 +37,7 @@ static void ffb_imageblit(struct fb_info *, const struct fb_image *);
 static void ffb_fillrect(struct fb_info *, const struct fb_fillrect *);
 static void ffb_copyarea(struct fb_info *, const struct fb_copyarea *);
 static int ffb_sync(struct fb_info *);
-static int ffb_mmap(struct fb_info *, struct file *, struct vm_area_struct *);
+static int ffb_mmap(struct fb_info *, struct vm_area_struct *);
 static int ffb_ioctl(struct fb_info *, unsigned int, unsigned long);
 static int ffb_pan_display(struct fb_var_screeninfo *, struct fb_info *);
 
@@ -838,7 +838,7 @@ static struct sbus_mmap_map ffb_mmap_map[] = {
 	{ .size = 0 }
 };
 
-static int ffb_mmap(struct fb_info *info, struct file *file, struct vm_area_struct *vma)
+static int ffb_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
 	struct ffb_par *par = (struct ffb_par *)info->par;
 
diff --git a/drivers/video/gbefb.c b/drivers/video/gbefb.c
index d744c51807b7..38d22729b129 100644
--- a/drivers/video/gbefb.c
+++ b/drivers/video/gbefb.c
@@ -979,7 +979,7 @@ static int gbefb_check_var(struct fb_var_screeninfo *var, struct fb_info *info)
 	return 0;
 }
 
-static int gbefb_mmap(struct fb_info *info, struct file *file,
+static int gbefb_mmap(struct fb_info *info,
 			struct vm_area_struct *vma)
 {
 	unsigned long size = vma->vm_end - vma->vm_start;
@@ -1000,7 +1000,6 @@ static int gbefb_mmap(struct fb_info *info, struct file *file,
 		pgprot_fb(pgprot_val(vma->vm_page_prot));
 
 	vma->vm_flags |= VM_IO | VM_RESERVED;
-	vma->vm_file = file;
 
 	/* look for the starting tile */
 	tile = &gbe_tiles.cpu[offset >> TILE_SHIFT];
diff --git a/drivers/video/igafb.c b/drivers/video/igafb.c
index e326f44f652d..6b88050d21bf 100644
--- a/drivers/video/igafb.c
+++ b/drivers/video/igafb.c
@@ -219,7 +219,7 @@ static void iga_blank_border(struct iga_par *par)
 }
 
 #ifdef __sparc__
-static int igafb_mmap(struct fb_info *info, struct file *file,
+static int igafb_mmap(struct fb_info *info,
 		      struct vm_area_struct *vma)
 {
 	struct iga_par *par = (struct iga_par *)info->par;
diff --git a/drivers/video/leo.c b/drivers/video/leo.c
index fb14787e13b9..a23cfdb9d826 100644
--- a/drivers/video/leo.c
+++ b/drivers/video/leo.c
@@ -32,7 +32,7 @@ static int leo_setcolreg(unsigned, unsigned, unsigned, unsigned,
 			 unsigned, struct fb_info *);
 static int leo_blank(int, struct fb_info *);
 
-static int leo_mmap(struct fb_info *, struct file *, struct vm_area_struct *);
+static int leo_mmap(struct fb_info *, struct vm_area_struct *);
 static int leo_ioctl(struct fb_info *, unsigned int, unsigned long);
 static int leo_pan_display(struct fb_var_screeninfo *, struct fb_info *);
 
@@ -362,7 +362,7 @@ static struct sbus_mmap_map leo_mmap_map[] = {
 	{ .size = 0 }
 };
 
-static int leo_mmap(struct fb_info *info, struct file *file, struct vm_area_struct *vma)
+static int leo_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
 	struct leo_par *par = (struct leo_par *)info->par;
 
diff --git a/drivers/video/p9100.c b/drivers/video/p9100.c
index a1025e9606f1..0d1957505359 100644
--- a/drivers/video/p9100.c
+++ b/drivers/video/p9100.c
@@ -31,7 +31,7 @@ static int p9100_setcolreg(unsigned, unsigned, unsigned, unsigned,
 			   unsigned, struct fb_info *);
 static int p9100_blank(int, struct fb_info *);
 
-static int p9100_mmap(struct fb_info *, struct file *, struct vm_area_struct *);
+static int p9100_mmap(struct fb_info *, struct vm_area_struct *);
 static int p9100_ioctl(struct fb_info *, unsigned int, unsigned long);
 
 /*
@@ -221,7 +221,7 @@ static struct sbus_mmap_map p9100_mmap_map[] = {
 	{ 0,			0,		0		    }
 };
 
-static int p9100_mmap(struct fb_info *info, struct file *file, struct vm_area_struct *vma)
+static int p9100_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
 	struct p9100_par *par = (struct p9100_par *)info->par;
 
diff --git a/drivers/video/pxafb.c b/drivers/video/pxafb.c
index 9fc10b9e6f57..53ad61f1038c 100644
--- a/drivers/video/pxafb.c
+++ b/drivers/video/pxafb.c
@@ -395,7 +395,7 @@ static int pxafb_blank(int blank, struct fb_info *info)
 	return 0;
 }
 
-static int pxafb_mmap(struct fb_info *info, struct file *file,
+static int pxafb_mmap(struct fb_info *info,
 		      struct vm_area_struct *vma)
 {
 	struct pxafb_info *fbi = (struct pxafb_info *)info;
diff --git a/drivers/video/sa1100fb.c b/drivers/video/sa1100fb.c
index 087e58689e4c..8a893ce7040d 100644
--- a/drivers/video/sa1100fb.c
+++ b/drivers/video/sa1100fb.c
@@ -815,7 +815,7 @@ static int sa1100fb_blank(int blank, struct fb_info *info)
 	return 0;
 }
 
-static int sa1100fb_mmap(struct fb_info *info, struct file *file,
+static int sa1100fb_mmap(struct fb_info *info,
 			 struct vm_area_struct *vma)
 {
 	struct sa1100fb_info *fbi = (struct sa1100fb_info *)info;
diff --git a/drivers/video/sgivwfb.c b/drivers/video/sgivwfb.c
index 7054660767e4..2e6df1fcb2b9 100644
--- a/drivers/video/sgivwfb.c
+++ b/drivers/video/sgivwfb.c
@@ -115,7 +115,7 @@ static int sgivwfb_set_par(struct fb_info *info);
 static int sgivwfb_setcolreg(u_int regno, u_int red, u_int green,
 			     u_int blue, u_int transp,
 			     struct fb_info *info);
-static int sgivwfb_mmap(struct fb_info *info, struct file *file,
+static int sgivwfb_mmap(struct fb_info *info,
 			struct vm_area_struct *vma);
 
 static struct fb_ops sgivwfb_ops = {
@@ -706,7 +706,7 @@ static int sgivwfb_setcolreg(u_int regno, u_int red, u_int green,
 	return 0;
 }
 
-static int sgivwfb_mmap(struct fb_info *info, struct file *file,
+static int sgivwfb_mmap(struct fb_info *info,
 			struct vm_area_struct *vma)
 {
 	unsigned long size = vma->vm_end - vma->vm_start;
@@ -723,7 +723,6 @@ static int sgivwfb_mmap(struct fb_info *info, struct file *file,
 	if (remap_pfn_range(vma, vma->vm_start, offset >> PAGE_SHIFT,
 						size, vma->vm_page_prot))
 		return -EAGAIN;
-	vma->vm_file = file;
 	printk(KERN_DEBUG "sgivwfb: mmap framebuffer P(%lx)->V(%lx)\n",
 	       offset, vma->vm_start);
 	return 0;
diff --git a/drivers/video/tcx.c b/drivers/video/tcx.c
index cb940771ec58..95b918229d9b 100644
--- a/drivers/video/tcx.c
+++ b/drivers/video/tcx.c
@@ -33,7 +33,7 @@ static int tcx_setcolreg(unsigned, unsigned, unsigned, unsigned,
 			 unsigned, struct fb_info *);
 static int tcx_blank(int, struct fb_info *);
 
-static int tcx_mmap(struct fb_info *, struct file *, struct vm_area_struct *);
+static int tcx_mmap(struct fb_info *, struct vm_area_struct *);
 static int tcx_ioctl(struct fb_info *, unsigned int, unsigned long);
 static int tcx_pan_display(struct fb_var_screeninfo *, struct fb_info *);
 
@@ -301,7 +301,7 @@ static struct sbus_mmap_map __tcx_mmap_map[TCX_MMAP_ENTRIES] = {
 	{ .size = 0 }
 };
 
-static int tcx_mmap(struct fb_info *info, struct file *file, struct vm_area_struct *vma)
+static int tcx_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
 	struct tcx_par *par = (struct tcx_par *)info->par;
 
diff --git a/drivers/video/vfb.c b/drivers/video/vfb.c
index ffa1ad474226..53208cb58396 100644
--- a/drivers/video/vfb.c
+++ b/drivers/video/vfb.c
@@ -81,7 +81,7 @@ static int vfb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
 			 u_int transp, struct fb_info *info);
 static int vfb_pan_display(struct fb_var_screeninfo *var,
 			   struct fb_info *info);
-static int vfb_mmap(struct fb_info *info, struct file *file,
+static int vfb_mmap(struct fb_info *info,
 		    struct vm_area_struct *vma);
 
 static struct fb_ops vfb_ops = {
@@ -368,7 +368,7 @@ static int vfb_pan_display(struct fb_var_screeninfo *var,
      *  Most drivers don't need their own mmap function 
      */
 
-static int vfb_mmap(struct fb_info *info, struct file *file,
+static int vfb_mmap(struct fb_info *info,
 		    struct vm_area_struct *vma)
 {
 	return -EINVAL;
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 6da8a80a2662..2cb19e6503aa 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -616,7 +616,7 @@ struct fb_ops {
 			unsigned long arg);
 
 	/* perform fb specific mmap */
-	int (*fb_mmap)(struct fb_info *info, struct file *file, struct vm_area_struct *vma);
+	int (*fb_mmap)(struct fb_info *info, struct vm_area_struct *vma);
 
 	/* save current hardware state */
 	void (*fb_save_state)(struct fb_info *info);
-- 
cgit v1.2.3-71-gd317


From 40fc55cb69c0386504ab5184e9bea0a7aecb2bd3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.he>
Date: Sat, 14 Jan 2006 13:21:28 -0800
Subject: [PATCH] Make __always_inline actually force always inlining

This patch is the first in a series that tries to optimize the kernel in terms
of size (and thus cache behavior, both cpu and pagecache).

This first patch changes __always_inline to be a forced inline instead of the
"regular" inline it was on everything except alpha.  This forced inline
matches the intention of the define better as a matter of documentation.
There is no change in behavior by this patch, since "inline" currently is
mapped to a forced inline anyway.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/compiler-gcc3.h | 1 +
 include/linux/compiler-gcc4.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h
index 4209082ee934..1698b845761f 100644
--- a/include/linux/compiler-gcc3.h
+++ b/include/linux/compiler-gcc3.h
@@ -13,3 +13,4 @@
 #define __must_check		__attribute__((warn_unused_result))
 #endif
 
+#define __always_inline		inline __attribute__((always_inline))
diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index e913e9beaf69..8249115a1f73 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -6,4 +6,4 @@
 #define __attribute_used__	__attribute__((__used__))
 #define __must_check 		__attribute__((warn_unused_result))
 #define __compiler_offsetof(a,b) __builtin_offsetof(a,b)
-
+#define __always_inline		inline __attribute__((always_inline))
-- 
cgit v1.2.3-71-gd317


From 652050aec936fdd70ed9cbce1cd1ef30a7c9d117 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 14 Jan 2006 13:21:30 -0800
Subject: [PATCH] mark several functions __always_inline

      Arjan van de Ven <arjan@infradead.org>

Mark a number of functions as 'must inline'.  The functions affected by this
patch need to be inlined because they use knowledge that their arguments are
constant so that most of the function optimizes away.  At this point this
patch does not change behavior, it's for documentation only (and for future
patches in the inline series)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/bitops.h    | 2 +-
 include/asm-i386/current.h   | 2 +-
 include/asm-i386/string.h    | 8 ++++----
 include/asm-i386/uaccess.h   | 8 ++++----
 include/asm-x86_64/fixmap.h  | 2 +-
 include/asm-x86_64/uaccess.h | 6 +++---
 include/linux/mm.h           | 2 +-
 7 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-i386/bitops.h b/include/asm-i386/bitops.h
index fe0819fe9c64..88e6ca248cd7 100644
--- a/include/asm-i386/bitops.h
+++ b/include/asm-i386/bitops.h
@@ -247,7 +247,7 @@ static inline int test_and_change_bit(int nr, volatile unsigned long* addr)
 static int test_bit(int nr, const volatile void * addr);
 #endif
 
-static inline int constant_test_bit(int nr, const volatile unsigned long *addr)
+static __always_inline int constant_test_bit(int nr, const volatile unsigned long *addr)
 {
 	return ((1UL << (nr & 31)) & (addr[nr >> 5])) != 0;
 }
diff --git a/include/asm-i386/current.h b/include/asm-i386/current.h
index d97328951f5f..3cbbecd79016 100644
--- a/include/asm-i386/current.h
+++ b/include/asm-i386/current.h
@@ -5,7 +5,7 @@
 
 struct task_struct;
 
-static inline struct task_struct * get_current(void)
+static __always_inline struct task_struct * get_current(void)
 {
 	return current_thread_info()->task;
 }
diff --git a/include/asm-i386/string.h b/include/asm-i386/string.h
index 02c8f5d22065..bb5f88a27f7a 100644
--- a/include/asm-i386/string.h
+++ b/include/asm-i386/string.h
@@ -201,7 +201,7 @@ __asm__ __volatile__(
 return __res;
 }
 
-static inline void * __memcpy(void * to, const void * from, size_t n)
+static __always_inline void * __memcpy(void * to, const void * from, size_t n)
 {
 int d0, d1, d2;
 __asm__ __volatile__(
@@ -223,7 +223,7 @@ return (to);
  * This looks ugly, but the compiler can optimize it totally,
  * as the count is constant.
  */
-static inline void * __constant_memcpy(void * to, const void * from, size_t n)
+static __always_inline void * __constant_memcpy(void * to, const void * from, size_t n)
 {
 	long esi, edi;
 	if (!n) return to;
@@ -367,7 +367,7 @@ return s;
  * things 32 bits at a time even when we don't know the size of the
  * area at compile-time..
  */
-static inline void * __constant_c_memset(void * s, unsigned long c, size_t count)
+static __always_inline void * __constant_c_memset(void * s, unsigned long c, size_t count)
 {
 int d0, d1;
 __asm__ __volatile__(
@@ -416,7 +416,7 @@ extern char *strstr(const char *cs, const char *ct);
  * This looks horribly ugly, but the compiler can optimize it totally,
  * as we by now know that both pattern and count is constant..
  */
-static inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count)
+static __always_inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count)
 {
 	switch (count) {
 		case 0:
diff --git a/include/asm-i386/uaccess.h b/include/asm-i386/uaccess.h
index 89ab7e2bc5aa..3f1337c34208 100644
--- a/include/asm-i386/uaccess.h
+++ b/include/asm-i386/uaccess.h
@@ -411,7 +411,7 @@ unsigned long __must_check __copy_from_user_ll(void *to,
  * Returns number of bytes that could not be copied.
  * On success, this will be zero.
  */
-static inline unsigned long __must_check
+static __always_inline unsigned long __must_check
 __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
 {
 	if (__builtin_constant_p(n)) {
@@ -432,7 +432,7 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
 	return __copy_to_user_ll(to, from, n);
 }
 
-static inline unsigned long __must_check
+static __always_inline unsigned long __must_check
 __copy_to_user(void __user *to, const void *from, unsigned long n)
 {
        might_sleep();
@@ -456,7 +456,7 @@ __copy_to_user(void __user *to, const void *from, unsigned long n)
  * If some data could not be copied, this function will pad the copied
  * data to the requested size using zero bytes.
  */
-static inline unsigned long
+static __always_inline unsigned long
 __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
 {
 	if (__builtin_constant_p(n)) {
@@ -477,7 +477,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
 	return __copy_from_user_ll(to, from, n);
 }
 
-static inline unsigned long
+static __always_inline unsigned long
 __copy_from_user(void *to, const void __user *from, unsigned long n)
 {
        might_sleep();
diff --git a/include/asm-x86_64/fixmap.h b/include/asm-x86_64/fixmap.h
index a582cfcf2231..7b286bd21d1d 100644
--- a/include/asm-x86_64/fixmap.h
+++ b/include/asm-x86_64/fixmap.h
@@ -76,7 +76,7 @@ extern void __this_fixmap_does_not_exist(void);
  * directly without translation, we catch the bug with a NULL-deference
  * kernel oops. Illegal ranges of incoming indices are caught too.
  */
-static inline unsigned long fix_to_virt(const unsigned int idx)
+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
 {
 	/*
 	 * this branch gets completely eliminated after inlining,
diff --git a/include/asm-x86_64/uaccess.h b/include/asm-x86_64/uaccess.h
index 2892c4b7a28b..bddffcb591b8 100644
--- a/include/asm-x86_64/uaccess.h
+++ b/include/asm-x86_64/uaccess.h
@@ -244,7 +244,7 @@ extern unsigned long copy_to_user(void __user *to, const void *from, unsigned le
 extern unsigned long copy_from_user(void *to, const void __user *from, unsigned len); 
 extern unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len); 
 
-static inline int __copy_from_user(void *dst, const void __user *src, unsigned size) 
+static __always_inline int __copy_from_user(void *dst, const void __user *src, unsigned size)
 { 
        int ret = 0;
 	if (!__builtin_constant_p(size))
@@ -273,7 +273,7 @@ static inline int __copy_from_user(void *dst, const void __user *src, unsigned s
 	}
 }	
 
-static inline int __copy_to_user(void __user *dst, const void *src, unsigned size) 
+static __always_inline int __copy_to_user(void __user *dst, const void *src, unsigned size)
 { 
        int ret = 0;
 	if (!__builtin_constant_p(size))
@@ -305,7 +305,7 @@ static inline int __copy_to_user(void __user *dst, const void *src, unsigned siz
 }	
 
 
-static inline int __copy_in_user(void __user *dst, const void __user *src, unsigned size) 
+static __always_inline int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
 { 
        int ret = 0;
 	if (!__builtin_constant_p(size))
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c643016499a1..85854b867463 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -512,7 +512,7 @@ static inline void set_page_links(struct page *page, unsigned long zone,
 extern struct page *mem_map;
 #endif
 
-static inline void *lowmem_page_address(struct page *page)
+static __always_inline void *lowmem_page_address(struct page *page)
 {
 	return __va(page_to_pfn(page) << PAGE_SHIFT);
 }
-- 
cgit v1.2.3-71-gd317


From a9df3d0f312f4b1aefec76ae5ee86cccbf7cd4e0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 14 Jan 2006 13:21:33 -0800
Subject: [PATCH] When CONFIG_CC_OPTIMIZE_FOR_SIZE, allow gcc4 to control
 inlining

If optimizing for size (CONFIG_CC_OPTIMIZE_FOR_SIZE), allow gcc4 compilers
to decide what to inline and what not - instead of the kernel forcing gcc
to inline all the time.  This requires several places that require to be
inlined to be marked as such, previous patches in this series do that.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/feature-removal-schedule.txt |  9 +++++++++
 include/linux/compiler-gcc4.h              |  9 +++++++++
 lib/Kconfig.debug                          | 14 ++++++++++++++
 3 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 9474501dd6cc..b4a1ea762698 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -123,6 +123,15 @@ Who:	Christoph Hellwig <hch@lst.de>
 
 ---------------------------
 
+What:	CONFIG_FORCED_INLINING
+When:	June 2006
+Why:	Config option is there to see if gcc is good enough. (in january
+        2006). If it is, the behavior should just be the default. If it's not,
+	the option should just go away entirely.
+Who:    Arjan van de Ven
+
+---------------------------
+
 What:	START_ARRAY ioctl for md
 When:	July 2006
 Files:	drivers/md/md.c
diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index 8249115a1f73..6f5cc6f0e7a6 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -3,6 +3,15 @@
 /* These definitions are for GCC v4.x.  */
 #include <linux/compiler-gcc.h>
 
+#ifdef CONFIG_FORCED_INLINING
+# undef inline
+# undef __inline__
+# undef __inline
+# define inline			inline		__attribute__((always_inline))
+# define __inline__		__inline__	__attribute__((always_inline))
+# define __inline		__inline	__attribute__((always_inline))
+#endif
+
 #define __attribute_used__	__attribute__((__used__))
 #define __must_check 		__attribute__((warn_unused_result))
 #define __compiler_offsetof(a,b) __builtin_offsetof(a,b)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index a609235a517f..a314e663d517 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -195,6 +195,20 @@ config FRAME_POINTER
 	  some architectures or if you use external debuggers.
 	  If you don't debug the kernel, you can say N.
 
+config FORCED_INLINING
+	bool "Force gcc to inline functions marked 'inline'"
+	depends on DEBUG_KERNEL
+	default y
+	help
+	  This option determines if the kernel forces gcc to inline the functions
+	  developers have marked 'inline'. Doing so takes away freedom from gcc to
+	  do what it thinks is best, which is desirable for the gcc 3.x series of
+	  compilers. The gcc 4.x series have a rewritten inlining algorithm and
+	  disabling this option will generate a smaller kernel there. Hopefully
+	  this algorithm is so good that allowing gcc4 to make the decision can
+	  become the default in the future, until then this option is there to
+	  test gcc for this.
+
 config RCU_TORTURE_TEST
 	tristate "torture tests for RCU"
 	depends on DEBUG_KERNEL
-- 
cgit v1.2.3-71-gd317


From 436eddd035c0ff807f4c64551a5a6edc7fb299d0 Mon Sep 17 00:00:00 2001
From: Tyler Trafford <tatrafford@comcast.net>
Date: Fri, 13 Jan 2006 14:38:18 -0200
Subject: V4L/DVB (3365): i2c ids for upd64031a saa717x upd64083 wm8739

- Add i2c ids for drivers: upd64031a saa717x upd64083 wm8739

Signed-off-by: Tyler Trafford <tatrafford@comcast.net>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 include/linux/i2c-id.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h
index 6ff2d365895f..474c8f4f5d4f 100644
--- a/include/linux/i2c-id.h
+++ b/include/linux/i2c-id.h
@@ -104,6 +104,10 @@
 #define I2C_DRIVERID_AKITAIOEXP	74	/* IO Expander on Sharp SL-C1000 */
 #define I2C_DRIVERID_INFRARED	75	/* I2C InfraRed on Video boards */
 #define I2C_DRIVERID_TVP5150	76	/* TVP5150 video decoder        */
+#define I2C_DRIVERID_WM8739	77	/* wm8739 audio processor	*/
+#define I2C_DRIVERID_UPD64083	78	/* upd64083 video processor	*/
+#define I2C_DRIVERID_UPD64031A	79	/* upd64031a video processor	*/
+#define I2C_DRIVERID_SAA717X	80	/* saa717x video encoder	*/
 
 #define I2C_DRIVERID_I2CDEV	900
 #define I2C_DRIVERID_ARP        902    /* SMBus ARP Client              */
-- 
cgit v1.2.3-71-gd317


From 9d44190eae97ad4c9ce30f1084e1b0dabd646df5 Mon Sep 17 00:00:00 2001
From: kogiidena <kogiidena@eggplant.ddo.jp>
Date: Mon, 16 Jan 2006 22:14:10 -0800
Subject: [PATCH] sh: kexec() support

This adds kexec() support for SH.

Signed-off-by: kogiidena <kogiidena@eggplant.ddo.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Cc: <fastboot@lists.osdl.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/sh/kernel/Makefile          |   4 +-
 arch/sh/kernel/machine_kexec.c   | 112 +++++++++++++++++++++++++++++++++++++++
 arch/sh/kernel/process.c         |  10 ++++
 arch/sh/kernel/relocate_kernel.S | 102 +++++++++++++++++++++++++++++++++++
 include/asm-sh/kexec.h           |  33 ++++++++++++
 include/linux/kexec.h            |   1 +
 6 files changed, 259 insertions(+), 3 deletions(-)
 create mode 100644 arch/sh/kernel/machine_kexec.c
 create mode 100644 arch/sh/kernel/relocate_kernel.S
 create mode 100644 include/asm-sh/kexec.h

(limited to 'include/linux')

diff --git a/arch/sh/kernel/Makefile b/arch/sh/kernel/Makefile
index 8b819698df14..7a86eeb22655 100644
--- a/arch/sh/kernel/Makefile
+++ b/arch/sh/kernel/Makefile
@@ -17,6 +17,4 @@ obj-$(CONFIG_SH_KGDB)		+= kgdb_stub.o kgdb_jmp.o
 obj-$(CONFIG_SH_CPU_FREQ)	+= cpufreq.o
 obj-$(CONFIG_MODULES)		+= module.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
-
-USE_STANDARD_AS_RULE := true
-
+obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o
diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c
new file mode 100644
index 000000000000..43546525f28f
--- /dev/null
+++ b/arch/sh/kernel/machine_kexec.c
@@ -0,0 +1,112 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2003 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz
+ * LANDISK/sh4 supported by kogiidena
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+#include <asm/cacheflush.h>
+
+typedef NORET_TYPE void (*relocate_new_kernel_t)(
+				unsigned long indirection_page,
+				unsigned long reboot_code_buffer,
+				unsigned long start_address,
+				unsigned long vbr_reg) ATTRIB_NORET;
+
+const extern unsigned char relocate_new_kernel[];
+const extern unsigned int relocate_new_kernel_size;
+extern void *gdb_vbr_vector;
+
+/*
+ * Provide a dummy crash_notes definition while crash dump arrives to ppc.
+ * This prevents breakage of crash_notes attribute in kernel/ksysfs.c.
+ */
+void *crash_notes = NULL;
+
+void machine_shutdown(void)
+{
+}
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+}
+
+/*
+ * Do what every setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+	return 0;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+}
+
+static void kexec_info(struct kimage *image)
+{
+        int i;
+	printk("kexec information\n");
+	for (i = 0; i < image->nr_segments; i++) {
+	        printk("  segment[%d]: 0x%08x - 0x%08x (0x%08x)\n",
+		       i,
+		       (unsigned int)image->segment[i].mem,
+		       (unsigned int)image->segment[i].mem + image->segment[i].memsz,
+		       (unsigned int)image->segment[i].memsz);
+ 	}
+	printk("  start     : 0x%08x\n\n", (unsigned int)image->start);
+}
+
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+
+	unsigned long page_list;
+	unsigned long reboot_code_buffer;
+	unsigned long vbr_reg;
+	relocate_new_kernel_t rnk;
+
+#if defined(CONFIG_SH_STANDARD_BIOS)
+	vbr_reg = ((unsigned long )gdb_vbr_vector) - 0x100;
+#else
+	vbr_reg = 0x80000000;  // dummy
+#endif
+	/* Interrupts aren't acceptable while we reboot */
+	local_irq_disable();
+
+	page_list = image->head;
+
+	/* we need both effective and real address here */
+	reboot_code_buffer =
+			(unsigned long)page_address(image->control_code_page);
+
+	/* copy our kernel relocation code to the control code page */
+	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
+						relocate_new_kernel_size);
+
+        kexec_info(image);
+	flush_cache_all();
+
+	/* now call it */
+	rnk = (relocate_new_kernel_t) reboot_code_buffer;
+       	(*rnk)(page_list, reboot_code_buffer, image->start, vbr_reg);
+}
+
diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c
index aac15e42d03b..a4dc2b532e10 100644
--- a/arch/sh/kernel/process.c
+++ b/arch/sh/kernel/process.c
@@ -71,6 +71,16 @@ void cpu_idle(void)
 
 void machine_restart(char * __unused)
 {
+
+#ifdef CONFIG_KEXEC
+	struct kimage *image;
+	image = xchg(&kexec_image, 0);
+	if (image) {
+		machine_shutdown();
+		machine_kexec(image);
+	}
+#endif
+
 	/* SR.BL=1 and invoke address error to let CPU reset (manual reset) */
 	asm volatile("ldc %0, sr\n\t"
 		     "mov.l @%1, %0" : : "r" (0x10000000), "r" (0x80000001));
diff --git a/arch/sh/kernel/relocate_kernel.S b/arch/sh/kernel/relocate_kernel.S
new file mode 100644
index 000000000000..b0695cffec6e
--- /dev/null
+++ b/arch/sh/kernel/relocate_kernel.S
@@ -0,0 +1,102 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * 2005.9.17 kogiidena@eggplant.ddo.jp
+ *
+ * LANDISK/sh4 is supported. Maybe, SH archtecture works well.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/config.h>
+#include <linux/linkage.h>
+
+#define PAGE_SIZE      4096 /* must be same value as in <asm/page.h> */
+
+
+		.globl relocate_new_kernel
+relocate_new_kernel:
+	/* r4 = indirection_page   */
+	/* r5 = reboot_code_buffer */
+	/* r6 = start_address      */
+	/* r7 = vbr_reg            */
+
+	mov.l	10f,r8	  /* 4096 */
+	mov.l	11f,r9    /* 0xa0000000 */
+
+	/*  stack setting */
+	add	r8,r5
+	mov	r5,r15
+
+	bra	1f
+	mov	r4,r0	  /* cmd = indirection_page */
+0:
+	mov.l	@r4+,r0	  /* cmd = *ind++ */
+
+1:	/* addr = (cmd | 0xa0000000) & 0xfffffff0 */
+	mov	r0,r2
+	or	r9,r2
+	mov	#-16,r1
+	and	r1,r2
+
+	/* if(cmd & IND_DESTINATION) dst = addr  */
+	tst	#1,r0
+	bt	2f
+	bra	0b
+	mov	r2,r5
+
+2:	/* else if(cmd & IND_INDIRECTION) ind = addr  */
+	tst	#2,r0
+	bt	3f
+	bra	0b
+	mov	r2,r4
+
+3:	/* else if(cmd & IND_DONE) goto 6  */
+	tst	#4,r0
+	bt	4f
+	bra	6f
+	nop
+
+4:	/* else if(cmd & IND_SOURCE) memcpy(dst,addr,PAGE_SIZE) */
+	tst	#8,r0
+	bt	0b
+
+	mov	r8,r3
+	shlr2	r3
+	shlr2	r3
+5:
+	dt	r3
+	mov.l	@r2+,r1   /*  16n+0 */
+	mov.l	r1,@r5
+	add	#4,r5
+	mov.l	@r2+,r1	  /*  16n+4 */
+	mov.l	r1,@r5
+	add	#4,r5
+	mov.l	@r2+,r1   /*  16n+8 */
+	mov.l	r1,@r5
+	add	#4,r5
+	mov.l	@r2+,r1   /*  16n+12 */
+	mov.l	r1,@r5
+	add	#4,r5
+	bf	5b
+
+	bra	0b
+	nop
+6:
+#ifdef CONFIG_SH_STANDARD_BIOS
+	ldc   r7, vbr
+#endif
+	jmp @r6
+	nop
+
+	.align 2
+10:
+	.long	PAGE_SIZE
+11:
+	.long	0xa0000000
+
+relocate_new_kernel_end:
+
+	.globl relocate_new_kernel_size
+relocate_new_kernel_size:
+	.long relocate_new_kernel_end - relocate_new_kernel
diff --git a/include/asm-sh/kexec.h b/include/asm-sh/kexec.h
new file mode 100644
index 000000000000..9dfe59f6fcb5
--- /dev/null
+++ b/include/asm-sh/kexec.h
@@ -0,0 +1,33 @@
+#ifndef _SH_KEXEC_H
+#define _SH_KEXEC_H
+
+/*
+ * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
+ * I.e. Maximum page that is mapped directly into kernel memory,
+ * and kmap is not required.
+ *
+ * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct
+ * calculation for the amount of memory directly mappable into the
+ * kernel memory space.
+ */
+
+/* Maximum physical address we can use pages from */
+#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+/* Maximum address we can reach in physical address mode */
+#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+/* Maximum address we can use for the control code buffer */
+#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
+
+#define KEXEC_CONTROL_CODE_SIZE	4096
+
+/* The native architecture */
+#define KEXEC_ARCH KEXEC_ARCH_SH
+
+#ifndef __ASSEMBLY__
+
+extern void machine_shutdown(void);
+extern void *crash_notes;
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _SH_KEXEC_H */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 94abc07cb164..a311f58c8a7c 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -119,6 +119,7 @@ extern struct kimage *kexec_image;
 #define KEXEC_ARCH_PPC64   (21 << 16)
 #define KEXEC_ARCH_IA_64   (50 << 16)
 #define KEXEC_ARCH_S390    (22 << 16)
+#define KEXEC_ARCH_SH      (42 << 16)
 
 #define KEXEC_FLAGS    (KEXEC_ON_CRASH)  /* List of defined/legal kexec flags */
 
-- 
cgit v1.2.3-71-gd317


From f87fd4c2a0c4f3baad28057360b36a59591ef751 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Mon, 16 Jan 2006 22:14:23 -0800
Subject: [PATCH] add /sys/fs

This patch adds an empty /sys/fs, which filesystems can use.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c     | 5 +++++
 include/linux/fs.h | 3 +++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 8bc15b362d23..ce97becff461 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -48,6 +48,10 @@ static int hash_mask __read_mostly, hash_bits __read_mostly;
 static kmem_cache_t *mnt_cache;
 static struct rw_semaphore namespace_sem;
 
+/* /sys/fs */
+decl_subsys(fs, NULL, NULL);
+EXPORT_SYMBOL_GPL(fs_subsys);
+
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
 	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
@@ -1725,6 +1729,7 @@ void __init mnt_init(unsigned long mempages)
 		i--;
 	} while (i);
 	sysfs_init();
+	subsystem_register(&fs_subsys);
 	init_rootfs();
 	init_mount_tree();
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 552cedfa6064..b77f2608eef9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1290,6 +1290,9 @@ extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
 
 extern int vfs_statfs(struct super_block *, struct kstatfs *);
 
+/* /sys/fs */
+extern struct subsystem fs_subsys;
+
 #define FLOCK_VERIFY_READ  1
 #define FLOCK_VERIFY_WRITE 2
 
-- 
cgit v1.2.3-71-gd317


From c09b42404d29c8a9266f8186632330dc8474bf2e Mon Sep 17 00:00:00 2001
From: Matt Tolentino <metolent@cs.vt.edu>
Date: Tue, 17 Jan 2006 07:03:44 +0100
Subject: [PATCH] x86_64: add __meminit for memory hotplug

Add __meminit to the __init lineup to ensure functions default
to __init when memory hotplug is not enabled.  Replace __devinit
with __meminit on functions that were changed when the memory
hotplug code was introduced.

Signed-off-by: Matt Tolentino <matthew.e.tolentino@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/init.c  |  2 +-
 include/linux/init.h | 12 ++++++++++++
 mm/page_alloc.c      | 14 +++++++-------
 3 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 7df494b51a5b..2700f01994ba 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -268,7 +268,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 	pkmap_page_table = pte;	
 }
 
-static void __devinit free_new_highpage(struct page *page)
+static void __meminit free_new_highpage(struct page *page)
 {
 	set_page_count(page, 1);
 	__free_page(page);
diff --git a/include/linux/init.h b/include/linux/init.h
index 59008c3826cf..ff8d8b8632f4 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -241,6 +241,18 @@ void __init parse_early_param(void);
 #define __cpuexitdata	__exitdata
 #endif
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+#define __meminit
+#define __meminitdata
+#define __memexit
+#define __memexitdata
+#else
+#define __meminit	__init
+#define __meminitdata __initdata
+#define __memexit __exit
+#define __memexitdata	__exitdata
+#endif
+
 /* Functions marked as __devexit may be discarded at kernel link time, depending
    on config options.  Newer versions of binutils detect references from
    retained sections to discarded sections and flag an error.  Pointers to
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8c960b469593..c2e29743a8d1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1735,7 +1735,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
  * up by free_all_bootmem() once the early boot process is
  * done. Non-atomic initialization, single-pass.
  */
-void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		unsigned long start_pfn)
 {
 	struct page *page;
@@ -1788,7 +1788,7 @@ void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
 	memmap_init_zone((size), (nid), (zone), (start_pfn))
 #endif
 
-static int __devinit zone_batchsize(struct zone *zone)
+static int __meminit zone_batchsize(struct zone *zone)
 {
 	int batch;
 
@@ -1882,7 +1882,7 @@ static struct per_cpu_pageset
  * Dynamically allocate memory for the
  * per cpu pageset array in struct zone.
  */
-static int __devinit process_zones(int cpu)
+static int __meminit process_zones(int cpu)
 {
 	struct zone *zone, *dzone;
 
@@ -1923,7 +1923,7 @@ static inline void free_zone_pagesets(int cpu)
 	}
 }
 
-static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+static int __meminit pageset_cpuup_callback(struct notifier_block *nfb,
 		unsigned long action,
 		void *hcpu)
 {
@@ -1963,7 +1963,7 @@ void __init setup_per_cpu_pageset(void)
 
 #endif
 
-static __devinit
+static __meminit
 void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 {
 	int i;
@@ -1983,7 +1983,7 @@ void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 		init_waitqueue_head(zone->wait_table + i);
 }
 
-static __devinit void zone_pcp_init(struct zone *zone)
+static __meminit void zone_pcp_init(struct zone *zone)
 {
 	int cpu;
 	unsigned long batch = zone_batchsize(zone);
@@ -2001,7 +2001,7 @@ static __devinit void zone_pcp_init(struct zone *zone)
 		zone->name, zone->present_pages, batch);
 }
 
-static __devinit void init_currently_empty_zone(struct zone *zone,
+static __meminit void init_currently_empty_zone(struct zone *zone,
 		unsigned long zone_start_pfn, unsigned long size)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
-- 
cgit v1.2.3-71-gd317


From 9343e79a7bb2d3268d68997163608b87d58d8098 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Tue, 17 Jan 2006 02:10:53 -0800
Subject: [IPV6]: Preserve procfs IPV6 address output format

Procfs always output IPV6 addresses without the colon
characters, and we cannot change that.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/kernel.h   | 1 +
 net/ipv6/addrconf.c      | 2 +-
 net/ipv6/anycast.c       | 2 +-
 net/ipv6/ip6_flowlabel.c | 4 ++--
 net/ipv6/mcast.c         | 6 +++---
 5 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 323924edb26a..a5363324cf95 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -228,6 +228,7 @@ extern void dump_stack(void);
 	ntohs((addr).s6_addr16[6]), \
 	ntohs((addr).s6_addr16[7])
 #define NIP6_FMT "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x"
+#define NIP6_SEQFMT "%04x%04x%04x%04x%04x%04x%04x%04x"
 
 #if defined(__LITTLE_ENDIAN)
 #define HIPQUAD(addr) \
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index dfb4f145a139..d328d5986143 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2644,7 +2644,7 @@ static int if6_seq_show(struct seq_file *seq, void *v)
 {
 	struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v;
 	seq_printf(seq,
-		   NIP6_FMT " %02x %02x %02x %02x %8s\n",
+		   NIP6_SEQFMT " %02x %02x %02x %02x %8s\n",
 		   NIP6(ifp->addr),
 		   ifp->idev->dev->ifindex,
 		   ifp->prefix_len,
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 72bd08af2dfb..840a33d33296 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -532,7 +532,7 @@ static int ac6_seq_show(struct seq_file *seq, void *v)
 	struct ac6_iter_state *state = ac6_seq_private(seq);
 
 	seq_printf(seq,
-		   "%-4d %-15s " NIP6_FMT " %5d\n",
+		   "%-4d %-15s " NIP6_SEQFMT " %5d\n",
 		   state->dev->ifindex, state->dev->name,
 		   NIP6(im->aca_addr),
 		   im->aca_users);
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 4183c8dac7f6..69cbe8a66d02 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -629,7 +629,7 @@ static void ip6fl_fl_seq_show(struct seq_file *seq, struct ip6_flowlabel *fl)
 {
 	while(fl) {
 		seq_printf(seq,
-			   "%05X %-1d %-6d %-6d %-6ld %-8ld " NIP6_FMT " %-4d\n",
+			   "%05X %-1d %-6d %-6d %-6ld %-8ld " NIP6_SEQFMT " %-4d\n",
 			   (unsigned)ntohl(fl->label),
 			   fl->share,
 			   (unsigned)fl->owner,
@@ -645,7 +645,7 @@ static void ip6fl_fl_seq_show(struct seq_file *seq, struct ip6_flowlabel *fl)
 static int ip6fl_seq_show(struct seq_file *seq, void *v)
 {
 	if (v == SEQ_START_TOKEN)
-		seq_printf(seq, "%-5s %-1s %-6s %-6s %-6s %-8s %-39s %s\n",
+		seq_printf(seq, "%-5s %-1s %-6s %-6s %-6s %-8s %-32s %s\n",
 			   "Label", "S", "Owner", "Users", "Linger", "Expires", "Dst", "Opt");
 	else
 		ip6fl_fl_seq_show(seq, v);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 0e03eabfb9da..6c05c7978bef 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -2373,7 +2373,7 @@ static int igmp6_mc_seq_show(struct seq_file *seq, void *v)
 	struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
 
 	seq_printf(seq,
-		   "%-4d %-15s " NIP6_FMT " %5d %08X %ld\n", 
+		   "%-4d %-15s " NIP6_SEQFMT " %5d %08X %ld\n", 
 		   state->dev->ifindex, state->dev->name,
 		   NIP6(im->mca_addr),
 		   im->mca_users, im->mca_flags,
@@ -2542,12 +2542,12 @@ static int igmp6_mcf_seq_show(struct seq_file *seq, void *v)
 	if (v == SEQ_START_TOKEN) {
 		seq_printf(seq, 
 			   "%3s %6s "
-			   "%39s %39s %6s %6s\n", "Idx",
+			   "%32s %32s %6s %6s\n", "Idx",
 			   "Device", "Multicast Address",
 			   "Source Address", "INC", "EXC");
 	} else {
 		seq_printf(seq,
-			   "%3d %6.6s " NIP6_FMT " " NIP6_FMT " %6lu %6lu\n",
+			   "%3d %6.6s " NIP6_SEQFMT " " NIP6_SEQFMT " %6lu %6lu\n",
 			   state->dev->ifindex, state->dev->name,
 			   NIP6(state->im->mca_addr),
 			   NIP6(psf->sf_addr),
-- 
cgit v1.2.3-71-gd317


From e0069caede8387c585060b7e2e87729e9efcebc6 Mon Sep 17 00:00:00 2001
From: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Date: Tue, 17 Jan 2006 02:39:19 -0800
Subject: [NETFILTER] ip6tables: remove unused definitions

These definitions ware used for only internal use in kernel <= 2.6.13,
which had not introduced the unified parser of IPv6 extension header yet.

Signed-off-by: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv6/ip6t_ah.h   | 9 ---------
 include/linux/netfilter_ipv6/ip6t_esp.h  | 9 ---------
 include/linux/netfilter_ipv6/ip6t_frag.h | 9 ---------
 include/linux/netfilter_ipv6/ip6t_opts.h | 9 ---------
 include/linux/netfilter_ipv6/ip6t_rt.h   | 9 ---------
 5 files changed, 45 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6/ip6t_ah.h b/include/linux/netfilter_ipv6/ip6t_ah.h
index c4f0793a0a98..8531879eb464 100644
--- a/include/linux/netfilter_ipv6/ip6t_ah.h
+++ b/include/linux/netfilter_ipv6/ip6t_ah.h
@@ -18,13 +18,4 @@ struct ip6t_ah
 #define IP6T_AH_INV_LEN		0x02	/* Invert the sense of length. */
 #define IP6T_AH_INV_MASK	0x03	/* All possible flags. */
 
-#define MASK_HOPOPTS    128
-#define MASK_DSTOPTS    64
-#define MASK_ROUTING    32
-#define MASK_FRAGMENT   16
-#define MASK_AH         8
-#define MASK_ESP        4
-#define MASK_NONE       2
-#define MASK_PROTO      1
-
 #endif /*_IP6T_AH_H*/
diff --git a/include/linux/netfilter_ipv6/ip6t_esp.h b/include/linux/netfilter_ipv6/ip6t_esp.h
index 01142b98a231..a91b6abc8079 100644
--- a/include/linux/netfilter_ipv6/ip6t_esp.h
+++ b/include/linux/netfilter_ipv6/ip6t_esp.h
@@ -7,15 +7,6 @@ struct ip6t_esp
 	u_int8_t  invflags;			/* Inverse flags */
 };
 
-#define MASK_HOPOPTS    128
-#define MASK_DSTOPTS    64
-#define MASK_ROUTING    32
-#define MASK_FRAGMENT   16
-#define MASK_AH         8
-#define MASK_ESP        4
-#define MASK_NONE       2
-#define MASK_PROTO      1
-
 /* Values for "invflags" field in struct ip6t_esp. */
 #define IP6T_ESP_INV_SPI		0x01	/* Invert the sense of spi. */
 #define IP6T_ESP_INV_MASK	0x01	/* All possible flags. */
diff --git a/include/linux/netfilter_ipv6/ip6t_frag.h b/include/linux/netfilter_ipv6/ip6t_frag.h
index 449a57eca7dd..66070a0d6dfc 100644
--- a/include/linux/netfilter_ipv6/ip6t_frag.h
+++ b/include/linux/netfilter_ipv6/ip6t_frag.h
@@ -21,13 +21,4 @@ struct ip6t_frag
 #define IP6T_FRAG_INV_LEN	0x02	/* Invert the sense of length. */
 #define IP6T_FRAG_INV_MASK	0x03	/* All possible flags. */
 
-#define MASK_HOPOPTS    128
-#define MASK_DSTOPTS    64
-#define MASK_ROUTING    32
-#define MASK_FRAGMENT   16
-#define MASK_AH         8
-#define MASK_ESP        4
-#define MASK_NONE       2
-#define MASK_PROTO      1
-
 #endif /*_IP6T_FRAG_H*/
diff --git a/include/linux/netfilter_ipv6/ip6t_opts.h b/include/linux/netfilter_ipv6/ip6t_opts.h
index e259b6275bd2..a07e36380ae8 100644
--- a/include/linux/netfilter_ipv6/ip6t_opts.h
+++ b/include/linux/netfilter_ipv6/ip6t_opts.h
@@ -20,13 +20,4 @@ struct ip6t_opts
 #define IP6T_OPTS_INV_LEN	0x01	/* Invert the sense of length. */
 #define IP6T_OPTS_INV_MASK	0x01	/* All possible flags. */
 
-#define MASK_HOPOPTS    128
-#define MASK_DSTOPTS    64
-#define MASK_ROUTING    32
-#define MASK_FRAGMENT   16
-#define MASK_AH         8
-#define MASK_ESP        4
-#define MASK_NONE       2
-#define MASK_PROTO      1
-
 #endif /*_IP6T_OPTS_H*/
diff --git a/include/linux/netfilter_ipv6/ip6t_rt.h b/include/linux/netfilter_ipv6/ip6t_rt.h
index f1070fbf2757..52156023e8db 100644
--- a/include/linux/netfilter_ipv6/ip6t_rt.h
+++ b/include/linux/netfilter_ipv6/ip6t_rt.h
@@ -30,13 +30,4 @@ struct ip6t_rt
 #define IP6T_RT_INV_LEN		0x04	/* Invert the sense of length. */
 #define IP6T_RT_INV_MASK	0x07	/* All possible flags. */
 
-#define MASK_HOPOPTS    128
-#define MASK_DSTOPTS    64
-#define MASK_ROUTING    32
-#define MASK_FRAGMENT   16
-#define MASK_AH         8
-#define MASK_ESP        4
-#define MASK_NONE       2
-#define MASK_PROTO      1
-
 #endif /*_IP6T_RT_H*/
-- 
cgit v1.2.3-71-gd317


From 8243126c5e29030bf1a3fb75187a513966dcba62 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Tue, 17 Jan 2006 02:54:21 -0800
Subject: [NET]: Make second arg to skb_reserved() signed.

Some subsystems, such as PPP, can send negative values
here.  It just happened to work correctly on 32-bit with
an unsigned value, but on 64-bit this explodes.

Figured out by Paul Mackerras based upon several PPP crash
reports.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e5fd66c5650b..ad7cc22bd424 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -926,7 +926,7 @@ static inline int skb_tailroom(const struct sk_buff *skb)
  *	Increase the headroom of an empty &sk_buff by reducing the tail
  *	room. This is only allowed for an empty buffer.
  */
-static inline void skb_reserve(struct sk_buff *skb, unsigned int len)
+static inline void skb_reserve(struct sk_buff *skb, int len)
 {
 	skb->data += len;
 	skb->tail += len;
-- 
cgit v1.2.3-71-gd317


From 1bc4ccfff8675adc3d96f91245eb7e2dc0043ca9 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Mon, 9 Jan 2006 17:18:14 +0000
Subject: [PATCH] libata: add a function to decide if we need iordy

This ought to be simple but for PIO2 we have to poke around the drive
data to get it 100% correct.

Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 34 ++++++++++++++++++++++++++++++++++
 include/linux/libata.h     |  2 ++
 2 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index bc48d88fef5a..b42acbe0e9a5 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1168,6 +1168,39 @@ ata_exec_internal(struct ata_port *ap, struct ata_device *dev,
 	return AC_ERR_OTHER;
 }
 
+/**
+ *	ata_pio_need_iordy	-	check if iordy needed
+ *	@adev: ATA device
+ *
+ *	Check if the current speed of the device requires IORDY. Used
+ *	by various controllers for chip configuration.
+ */
+
+unsigned int ata_pio_need_iordy(const struct ata_device *adev)
+{
+	int pio;
+	int speed = adev->pio_mode - XFER_PIO_0;
+
+	if (speed < 2)
+		return 0;
+	if (speed > 2)
+		return 1;
+		
+	/* If we have no drive specific rule, then PIO 2 is non IORDY */
+
+	if (adev->id[ATA_ID_FIELD_VALID] & 2) {	/* EIDE */
+		pio = adev->id[ATA_ID_EIDE_PIO];
+		/* Is the speed faster than the drive allows non IORDY ? */
+		if (pio) {
+			/* This is cycle times not frequency - watch the logic! */
+			if (pio > 240)	/* PIO2 is 240nS per cycle */
+				return 1;
+			return 0;
+		}
+	}
+	return 0;
+}
+
 /**
  *	ata_dev_identify - obtain IDENTIFY x DEVICE page
  *	@ap: port on which device we wish to probe resides
@@ -5126,6 +5159,7 @@ EXPORT_SYMBOL_GPL(ata_dev_id_string);
 EXPORT_SYMBOL_GPL(ata_dev_config);
 EXPORT_SYMBOL_GPL(ata_scsi_simulate);
 
+EXPORT_SYMBOL_GPL(ata_pio_need_iordy);
 EXPORT_SYMBOL_GPL(ata_timing_compute);
 EXPORT_SYMBOL_GPL(ata_timing_merge);
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index a43c95f8f968..af6624450f65 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -499,6 +499,8 @@ extern int ata_scsi_slave_config(struct scsi_device *sdev);
 /*
  * Timing helpers
  */
+
+extern unsigned int ata_pio_need_iordy(const struct ata_device *);
 extern int ata_timing_compute(struct ata_device *, unsigned short,
 			      struct ata_timing *, int, int);
 extern void ata_timing_merge(const struct ata_timing *,
-- 
cgit v1.2.3-71-gd317


From 16cb4b333c9e7a00ce3b1d74ec0c9b4c2e956910 Mon Sep 17 00:00:00 2001
From: Per Liden <per.liden@nospam.ericsson.com>
Date: Fri, 13 Jan 2006 22:22:22 +0100
Subject: [TIPC] Updated link priority macros

Added macros for min/default/max link priority in tipc_config.h.
Also renamed TIPC_NUM_LINK_PRI to TIPC_MEDIA_LINK_PRI since that
is a more accurate description of what it is used for.

Signed-off-by: Per Liden <per.liden@ericsson.com>
---
 include/linux/tipc_config.h |  7 +++++--
 net/tipc/bcast.c            |  4 ++--
 net/tipc/bearer.c           | 19 +++++++++++++------
 net/tipc/eth_media.c        |  6 +++---
 net/tipc/link.c             |  3 ++-
 5 files changed, 25 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tipc_config.h b/include/linux/tipc_config.h
index a52c8c64a5a3..33a653913d94 100644
--- a/include/linux/tipc_config.h
+++ b/include/linux/tipc_config.h
@@ -168,10 +168,13 @@
 #define TIPC_MAX_LINK_NAME	60	/* format = Z.C.N:interface-Z.C.N:interface */
 
 /*
- * Link priority limits (range from 0 to # priorities - 1)
+ * Link priority limits (min, default, max, media default)
  */
 
-#define TIPC_NUM_LINK_PRI 32
+#define TIPC_MIN_LINK_PRI	0
+#define TIPC_DEF_LINK_PRI	10
+#define TIPC_MAX_LINK_PRI	31
+#define TIPC_MEDIA_LINK_PRI	(TIPC_MAX_LINK_PRI + 1)
 
 /*
  * Link tolerance limits (min, default, max), in ms
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 9713d622efb8..af9743a52d6c 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -82,7 +82,7 @@ struct bcbearer {
 	struct bearer bearer;
 	struct media media;
 	struct bcbearer_pair bpairs[MAX_BEARERS];
-	struct bcbearer_pair bpairs_temp[TIPC_NUM_LINK_PRI];
+	struct bcbearer_pair bpairs_temp[TIPC_MAX_LINK_PRI + 1];
 };
 
 /**
@@ -630,7 +630,7 @@ void bcbearer_sort(void)
 	bp_curr = bcbearer->bpairs;
 	memset(bcbearer->bpairs, 0, sizeof(bcbearer->bpairs));
 
-	for (pri = (TIPC_NUM_LINK_PRI - 1); pri >= 0; pri--) {
+	for (pri = TIPC_MAX_LINK_PRI; pri >= 0; pri--) {
 
 		if (!bp_temp[pri].primary)
 			continue;
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 3dd19fdc5a2c..02b6cf6ab7a4 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -119,7 +119,8 @@ int  tipc_register_media(u32 media_type,
 		warn("Media registration error: no broadcast address supplied\n");
 		goto exit;
 	}
-	if (bearer_priority >= TIPC_NUM_LINK_PRI) {
+	if ((bearer_priority < TIPC_MIN_LINK_PRI) &&
+	    (bearer_priority > TIPC_MAX_LINK_PRI)) {
 		warn("Media registration error: priority %u\n", bearer_priority);
 		goto exit;
 	}
@@ -476,10 +477,15 @@ int tipc_enable_bearer(const char *name, u32 bcast_scope, u32 priority)
 
 	if (tipc_mode != TIPC_NET_MODE)
 		return -ENOPROTOOPT;
+
 	if (!bearer_name_validate(name, &b_name) ||
 	    !addr_domain_valid(bcast_scope) ||
-	    !in_scope(bcast_scope, tipc_own_addr) ||
-	    (priority > TIPC_NUM_LINK_PRI))
+	    !in_scope(bcast_scope, tipc_own_addr))
+		return -EINVAL;
+
+	if ((priority < TIPC_MIN_LINK_PRI ||
+	     priority > TIPC_MAX_LINK_PRI) &&
+	    (priority != TIPC_MEDIA_LINK_PRI))
 		return -EINVAL;
 
 	write_lock_bh(&net_lock);
@@ -491,7 +497,8 @@ int tipc_enable_bearer(const char *name, u32 bcast_scope, u32 priority)
 		warn("No media <%s>\n", b_name.media_name);
 		goto failed;
 	}
-	if (priority == TIPC_NUM_LINK_PRI)
+
+	if (priority == TIPC_MEDIA_LINK_PRI)
 		priority = m_ptr->priority;
 
 restart:
@@ -547,8 +554,8 @@ restart:
 	}
 	b_ptr->publ.lock = SPIN_LOCK_UNLOCKED;
 	write_unlock_bh(&net_lock);
-	info("Enabled bearer <%s>, discovery domain %s\n",
-	     name, addr_string_fill(addr_string, bcast_scope));
+	info("Enabled bearer <%s>, discovery domain %s, priority %u\n",
+	     name, addr_string_fill(addr_string, bcast_scope), priority);
 	return 0;
 failed:
 	write_unlock_bh(&net_lock);
diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c
index 34d0462db3aa..2ab6c16280e6 100644
--- a/net/tipc/eth_media.c
+++ b/net/tipc/eth_media.c
@@ -42,9 +42,9 @@
 
 #define MAX_ETH_BEARERS		2
 #define TIPC_PROTOCOL		0x88ca
-#define ETH_LINK_PRIORITY	10
+#define ETH_LINK_PRIORITY	TIPC_DEF_LINK_PRI
 #define ETH_LINK_TOLERANCE	TIPC_DEF_LINK_TOL
-
+#define ETH_LINK_WINDOW		TIPC_DEF_LINK_WIN
 
 /**
  * struct eth_bearer - Ethernet bearer data structure
@@ -260,7 +260,7 @@ int eth_media_start(void)
 	res = tipc_register_media(TIPC_MEDIA_TYPE_ETH, "eth",
 				  enable_bearer, disable_bearer, send_msg, 
 				  eth_addr2str, &bcast_addr, ETH_LINK_PRIORITY, 
-				  ETH_LINK_TOLERANCE, TIPC_DEF_LINK_WIN);
+				  ETH_LINK_TOLERANCE, ETH_LINK_WINDOW);
 	if (res)
 		return res;
 
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 7265f4be4766..d1e1ae66464a 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -2812,7 +2812,8 @@ struct sk_buff *link_cmd_config(const void *req_tlv_area, int req_tlv_space,
 		}
 		break;
 	case TIPC_CMD_SET_LINK_PRI: 
-		if (new_value < TIPC_NUM_LINK_PRI) {
+		if ((new_value >= TIPC_MIN_LINK_PRI) &&
+		    (new_value <= TIPC_MAX_LINK_PRI)) {
 			l_ptr->priority = new_value;
 			link_send_proto_msg(l_ptr, STATE_MSG, 
 					    0, 0, 0, new_value, 0);
-- 
cgit v1.2.3-71-gd317


From 33a9c4da5ab16192ef1e961d4c4e45c18031cd67 Mon Sep 17 00:00:00 2001
From: Per Liden <per.liden@ericsson.com>
Date: Mon, 16 Jan 2006 11:42:12 +0100
Subject: [TIPC] Move ethernet protocol id to linux/if_ether.h

Signed-off-by: Per Liden <per.liden@ericsson.com>
---
 include/linux/if_ether.h | 1 +
 net/tipc/eth_media.c     | 5 ++---
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h
index fe26d431de87..7a92c1ce1457 100644
--- a/include/linux/if_ether.h
+++ b/include/linux/if_ether.h
@@ -72,6 +72,7 @@
 					 * over Ethernet
 					 */
 #define ETH_P_AOE	0x88A2		/* ATA over Ethernet		*/
+#define ETH_P_TIPC	0x88CA		/* TIPC 			*/
 
 /*
  *	Non DIX types. Won't clash for 1500 types.
diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c
index 2ab6c16280e6..64a880b5935a 100644
--- a/net/tipc/eth_media.c
+++ b/net/tipc/eth_media.c
@@ -41,7 +41,6 @@
 #include <linux/version.h>
 
 #define MAX_ETH_BEARERS		2
-#define TIPC_PROTOCOL		0x88ca
 #define ETH_LINK_PRIORITY	TIPC_DEF_LINK_PRI
 #define ETH_LINK_TOLERANCE	TIPC_DEF_LINK_TOL
 #define ETH_LINK_WINDOW		TIPC_DEF_LINK_WIN
@@ -78,7 +77,7 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr,
 		clone->nh.raw = clone->data;
 		dev = ((struct eth_bearer *)(tb_ptr->usr_handle))->dev;
 		clone->dev = dev;
-		dev->hard_header(clone, dev, TIPC_PROTOCOL, 
+		dev->hard_header(clone, dev, ETH_P_TIPC, 
 				 &dest->dev_addr.eth_addr,
 				 dev->dev_addr, clone->len);
 		dev_queue_xmit(clone);
@@ -141,7 +140,7 @@ static int enable_bearer(struct tipc_bearer *tb_ptr)
 		return -EDQUOT;
 	if (!eb_ptr->dev) {
 		eb_ptr->dev = dev;
-		eb_ptr->tipc_packet_type.type = __constant_htons(TIPC_PROTOCOL);
+		eb_ptr->tipc_packet_type.type = __constant_htons(ETH_P_TIPC);
 		eb_ptr->tipc_packet_type.dev = dev;
 		eb_ptr->tipc_packet_type.func = recv_msg;
 		eb_ptr->tipc_packet_type.af_packet_priv = eb_ptr;
-- 
cgit v1.2.3-71-gd317


From 8d238e012469a9a332c78d6a69a8a46ac4b1e9c2 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Tue, 17 Jan 2006 20:50:31 +0000
Subject: [PATCH] libata: Fix heuristic typos add LBA48PIO flag and support
 code, add IRQ flag for next diff

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 4 ++++
 include/linux/libata.h     | 9 ++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index b42acbe0e9a5..e6044455ca45 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -611,6 +611,10 @@ int ata_rwcmd_protocol(struct ata_queued_cmd *qc)
 	if (dev->flags & ATA_DFLAG_PIO) {
 		tf->protocol = ATA_PROT_PIO;
 		index = dev->multi_count ? 0 : 8;
+	} else if (lba48 && (qc->ap->flags & ATA_FLAG_PIO_LBA48)) {
+		/* Unable to use DMA due to host limitation */
+		tf->protocol = ATA_PROT_PIO;
+		index = dev->multi_count ? 0 : 4;
 	} else {
 		tf->protocol = ATA_PROT_DMA;
 		index = 16;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index af6624450f65..9e5db2949c58 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -126,16 +126,19 @@ enum {
 
 	ATA_FLAG_SUSPENDED	= (1 << 12), /* port is suspended */
 
+	ATA_FLAG_PIO_LBA48	= (1 << 13), /* Host DMA engine is LBA28 only */
+	ATA_FLAG_IRQ_MASK	= (1 << 14), /* Mask IRQ in PIO xfers */
+
 	ATA_QCFLAG_ACTIVE	= (1 << 1), /* cmd not yet ack'd to scsi lyer */
 	ATA_QCFLAG_SG		= (1 << 3), /* have s/g table? */
 	ATA_QCFLAG_SINGLE	= (1 << 4), /* no s/g, just a single buffer */
 	ATA_QCFLAG_DMAMAP	= ATA_QCFLAG_SG | ATA_QCFLAG_SINGLE,
 
 	/* various lengths of time */
-	ATA_TMOUT_EDD		= 5 * HZ,	/* hueristic */
+	ATA_TMOUT_EDD		= 5 * HZ,	/* heuristic */
 	ATA_TMOUT_PIO		= 30 * HZ,
-	ATA_TMOUT_BOOT		= 30 * HZ,	/* hueristic */
-	ATA_TMOUT_BOOT_QUICK	= 7 * HZ,	/* hueristic */
+	ATA_TMOUT_BOOT		= 30 * HZ,	/* heuristic */
+	ATA_TMOUT_BOOT_QUICK	= 7 * HZ,	/* heuristic */
 	ATA_TMOUT_CDB		= 30 * HZ,
 	ATA_TMOUT_CDB_QUICK	= 5 * HZ,
 	ATA_TMOUT_INTERNAL	= 30 * HZ,
-- 
cgit v1.2.3-71-gd317


From d9004eb466d03b7900ed432fecec6819012b4ed3 Mon Sep 17 00:00:00 2001
From: Alon Bar-Lev <alon.barlev@gmail.com>
Date: Wed, 18 Jan 2006 11:47:33 +0000
Subject: [SERIAL] Add 8250 support for Decision Computer International Co.
 PCCOM2

There is a new device which is look like:

	Serial controller: Decision Computer International Co. PCCOM2 (rev 02) (prog-if 02 [16550])
	0700: 6666:0004 (rev 02) (prog-if 02)
	Flags: medium devsel, IRQ 177
	Memory at fe000000 (32-bit, non-prefetchable) [size=128]
	I/O ports at e880 [size=128]
	I/O ports at e400 [size=256]

It has two 16550A, and is not listed in kernel, although the
manufacturer clams that it is supported...

I've created the following patch, it only add the new PCI id and the
card to the repository, it seems to work.

Signed-off-by: Alon Bar-Lev <alon.barlev@gmail.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/serial/8250_pci.c | 10 ++++++++++
 include/linux/pci_ids.h   |  1 +
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c
index 589fb076654a..2a912153321e 100644
--- a/drivers/serial/8250_pci.c
+++ b/drivers/serial/8250_pci.c
@@ -940,6 +940,7 @@ enum pci_board_num_t {
 	pbn_b2_bt_2_921600,
 	pbn_b2_bt_4_921600,
 
+	pbn_b3_2_115200,
 	pbn_b3_4_115200,
 	pbn_b3_8_115200,
 
@@ -1311,6 +1312,12 @@ static struct pciserial_board pci_boards[] __devinitdata = {
 		.uart_offset	= 8,
 	},
 
+	[pbn_b3_2_115200] = {
+		.flags		= FL_BASE3,
+		.num_ports	= 2,
+		.base_baud	= 115200,
+		.uart_offset	= 8,
+	},
 	[pbn_b3_4_115200] = {
 		.flags		= FL_BASE3,
 		.num_ports	= 4,
@@ -2272,6 +2279,9 @@ static struct pci_device_id serial_pci_tbl[] = {
 		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
 		pbn_nec_nile4 },
 
+	{	PCI_VENDOR_ID_DCI, PCI_DEVICE_ID_DCI_PCCOM2,
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_b3_2_115200 },
 	{	PCI_VENDOR_ID_DCI, PCI_DEVICE_ID_DCI_PCCOM4,
 		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
 		pbn_b3_4_115200 },
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 5403257ae3e7..ecc1fc1f0f04 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1992,6 +1992,7 @@
 #define PCI_VENDOR_ID_DCI		0x6666
 #define PCI_DEVICE_ID_DCI_PCCOM4	0x0001
 #define PCI_DEVICE_ID_DCI_PCCOM8	0x0002
+#define PCI_DEVICE_ID_DCI_PCCOM2	0x0004
 
 #define PCI_VENDOR_ID_INTEL		0x8086
 #define PCI_DEVICE_ID_INTEL_EESSC	0x0008
-- 
cgit v1.2.3-71-gd317


From 053837fce7aa79025ed57656855df09f80175527 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 18 Jan 2006 17:42:27 -0800
Subject: [PATCH] mm: migration page refcounting fix

Migration code currently does not take a reference to target page
properly, so between unlocking the pte and trying to take a new
reference to the page with isolate_lru_page, anything could happen to
it.

Fix this by holding the pte lock until we get a chance to elevate the
refcount.

Other small cleanups while we're here.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm_inline.h | 21 --------------
 include/linux/swap.h      |  1 +
 mm/filemap.c              |  1 +
 mm/mempolicy.c            | 29 +++++++++++--------
 mm/rmap.c                 |  2 +-
 mm/swap.c                 | 26 +++++++++++++++++
 mm/vmscan.c               | 71 ++++++++++++++++++++---------------------------
 7 files changed, 76 insertions(+), 75 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 49cc68af01f8..8ac854f7f190 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -39,24 +39,3 @@ del_page_from_lru(struct zone *zone, struct page *page)
 	}
 }
 
-/*
- * Isolate one page from the LRU lists.
- *
- * - zone->lru_lock must be held
- */
-static inline int __isolate_lru_page(struct page *page)
-{
-	if (unlikely(!TestClearPageLRU(page)))
-		return 0;
-
-	if (get_page_testone(page)) {
-		/*
-		 * It is being freed elsewhere
-		 */
-		__put_page(page);
-		SetPageLRU(page);
-		return -ENOENT;
-	}
-
-	return 1;
-}
diff --git a/include/linux/swap.h b/include/linux/swap.h
index e92054d6530b..d01f7efb0f2c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -167,6 +167,7 @@ extern void FASTCALL(lru_cache_add_active(struct page *));
 extern void FASTCALL(activate_page(struct page *));
 extern void FASTCALL(mark_page_accessed(struct page *));
 extern void lru_add_drain(void);
+extern int lru_add_drain_all(void);
 extern int rotate_reclaimable_page(struct page *page);
 extern void swap_setup(void);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index a965b6b35f26..44da3d476994 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -94,6 +94,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *    ->private_lock		(try_to_unmap_one)
  *    ->tree_lock		(try_to_unmap_one)
  *    ->zone.lru_lock		(follow_page->mark_page_accessed)
+ *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
  *    ->private_lock		(page_remove_rmap->set_page_dirty)
  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
  *    ->inode_lock		(page_remove_rmap->set_page_dirty)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3171f884d245..551cde40520b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -208,6 +208,17 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		page = vm_normal_page(vma, addr, *pte);
 		if (!page)
 			continue;
+		/*
+		 * The check for PageReserved here is important to avoid
+		 * handling zero pages and other pages that may have been
+		 * marked special by the system.
+		 *
+		 * If the PageReserved would not be checked here then f.e.
+		 * the location of the zero page could have an influence
+		 * on MPOL_MF_STRICT, zero pages would be counted for
+		 * the per node stats, and there would be useless attempts
+		 * to put zero pages on the migration list.
+		 */
 		if (PageReserved(page))
 			continue;
 		nid = page_to_nid(page);
@@ -216,11 +227,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 		if (flags & MPOL_MF_STATS)
 			gather_stats(page, private);
-		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-			spin_unlock(ptl);
+		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 			migrate_page_add(vma, page, private, flags);
-			spin_lock(ptl);
-		}
 		else
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -309,6 +317,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
 
+	/* Clear the LRU lists so pages can be isolated */
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		lru_add_drain_all();
+
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
@@ -555,15 +567,8 @@ static void migrate_page_add(struct vm_area_struct *vma,
 	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
 	    mapping_writably_mapped(page->mapping) ||
 	    single_mm_mapping(vma->vm_mm, page->mapping)) {
-		int rc = isolate_lru_page(page);
-
-		if (rc == 1)
+		if (isolate_lru_page(page))
 			list_add(&page->lru, pagelist);
-		/*
-		 * If the isolate attempt was not successful then we just
-		 * encountered an unswappable page. Something must be wrong.
-	 	 */
-		WARN_ON(rc == 0);
 	}
 }
 
diff --git a/mm/rmap.c b/mm/rmap.c
index dfbb89f99a15..d85a99d28c03 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -33,7 +33,7 @@
  *     mapping->i_mmap_lock
  *       anon_vma->lock
  *         mm->page_table_lock or pte_lock
- *           zone->lru_lock (in mark_page_accessed)
+ *           zone->lru_lock (in mark_page_accessed, isolate_lru_page)
  *           swap_lock (in swap_duplicate, swap_info_get)
  *             mmlist_lock (in mmput, drain_mmlist and others)
  *             mapping->private_lock (in __set_page_dirty_buffers)
diff --git a/mm/swap.c b/mm/swap.c
index cbb48e721ab9..bc2442a7b0ee 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -174,6 +174,32 @@ void lru_add_drain(void)
 	put_cpu();
 }
 
+#ifdef CONFIG_NUMA
+static void lru_add_drain_per_cpu(void *dummy)
+{
+	lru_add_drain();
+}
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+	return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+}
+
+#else
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+	lru_add_drain();
+	return 0;
+}
+#endif
+
 /*
  * This path almost never happens for VM activity - pages are normally
  * freed via pagevecs.  But it gets used by networking.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bf903b2d198f..827bf674577a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -586,7 +586,7 @@ static inline void move_to_lru(struct page *page)
 }
 
 /*
- * Add isolated pages on the list back to the LRU
+ * Add isolated pages on the list back to the LRU.
  *
  * returns the number of pages put back.
  */
@@ -760,46 +760,33 @@ next:
 	return nr_failed + retry;
 }
 
-static void lru_add_drain_per_cpu(void *dummy)
-{
-	lru_add_drain();
-}
-
 /*
  * Isolate one page from the LRU lists and put it on the
- * indicated list. Do necessary cache draining if the
- * page is not on the LRU lists yet.
+ * indicated list with elevated refcount.
  *
  * Result:
  *  0 = page not on LRU list
  *  1 = page removed from LRU list and added to the specified list.
- * -ENOENT = page is being freed elsewhere.
  */
 int isolate_lru_page(struct page *page)
 {
-	int rc = 0;
-	struct zone *zone = page_zone(page);
+	int ret = 0;
 
-redo:
-	spin_lock_irq(&zone->lru_lock);
-	rc = __isolate_lru_page(page);
-	if (rc == 1) {
-		if (PageActive(page))
-			del_page_from_active_list(zone, page);
-		else
-			del_page_from_inactive_list(zone, page);
-	}
-	spin_unlock_irq(&zone->lru_lock);
-	if (rc == 0) {
-		/*
-		 * Maybe this page is still waiting for a cpu to drain it
-		 * from one of the lru lists?
-		 */
-		rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
-		if (rc == 0 && PageLRU(page))
-			goto redo;
+	if (PageLRU(page)) {
+		struct zone *zone = page_zone(page);
+		spin_lock_irq(&zone->lru_lock);
+		if (TestClearPageLRU(page)) {
+			ret = 1;
+			get_page(page);
+			if (PageActive(page))
+				del_page_from_active_list(zone, page);
+			else
+				del_page_from_inactive_list(zone, page);
+		}
+		spin_unlock_irq(&zone->lru_lock);
 	}
-	return rc;
+
+	return ret;
 }
 #endif
 
@@ -831,18 +818,20 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
-		switch (__isolate_lru_page(page)) {
-		case 1:
-			/* Succeeded to isolate page */
-			list_move(&page->lru, dst);
-			nr_taken++;
-			break;
-		case -ENOENT:
-			/* Not possible to isolate */
-			list_move(&page->lru, src);
-			break;
-		default:
+		if (!TestClearPageLRU(page))
 			BUG();
+		list_del(&page->lru);
+		if (get_page_testone(page)) {
+			/*
+			 * It is being freed elsewhere
+			 */
+			__put_page(page);
+			SetPageLRU(page);
+			list_add(&page->lru, src);
+			continue;
+		} else {
+			list_add(&page->lru, dst);
+			nr_taken++;
 		}
 	}
 
-- 
cgit v1.2.3-71-gd317


From 9eeff2395e3cfd05c9b2e6074ff943a34b0c5c21 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 18 Jan 2006 17:42:31 -0800
Subject: [PATCH] Zone reclaim: Reclaim logic

Some bits for zone reclaim exists in 2.6.15 but they are not usable.  This
patch fixes them up, removes unused code and makes zone reclaim usable.

Zone reclaim allows the reclaiming of pages from a zone if the number of
free pages falls below the watermarks even if other zones still have enough
pages available.  Zone reclaim is of particular importance for NUMA
machines.  It can be more beneficial to reclaim a page than taking the
performance penalties that come with allocating a page on a remote zone.

Zone reclaim is enabled if the maximum distance to another node is higher
than RECLAIM_DISTANCE, which may be defined by an arch.  By default
RECLAIM_DISTANCE is 20.  20 is the distance to another node in the same
component (enclosure or motherboard) on IA64.  The meaning of the NUMA
distance information seems to vary by arch.

If zone reclaim is not successful then no further reclaim attempts will
occur for a certain time period (ZONE_RECLAIM_INTERVAL).

This patch was discussed before. See

http://marc.theaimsgroup.com/?l=linux-kernel&m=113519961504207&w=2
http://marc.theaimsgroup.com/?l=linux-kernel&m=113408418232531&w=2
http://marc.theaimsgroup.com/?l=linux-kernel&m=113389027420032&w=2
http://marc.theaimsgroup.com/?l=linux-kernel&m=113380938612205&w=2

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h   | 12 +++++----
 include/linux/swap.h     | 11 ++++++++
 include/linux/topology.h |  8 ++++++
 mm/page_alloc.c          | 17 +++++++++---
 mm/vmscan.c              | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 108 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 34cbefd2ebde..93a849f742db 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -149,14 +149,16 @@ struct zone {
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	int			all_unreclaimable; /* All pages pinned */
 
-	/*
-	 * Does the allocator try to reclaim pages from the zone as soon
-	 * as it fails a watermark_ok() in __alloc_pages?
-	 */
-	int			reclaim_pages;
 	/* A count of how many reclaimers are scanning this zone */
 	atomic_t		reclaim_in_progress;
 
+	/*
+	 * timestamp (in jiffies) of the last zone reclaim that did not
+	 * result in freeing of pages. This is used to avoid repeated scans
+	 * if all memory in the zone is in use.
+	 */
+	unsigned long		last_unsuccessful_zone_reclaim;
+
 	/*
 	 * prev_priority holds the scanning priority for this zone.  It is
 	 * defined as the scanning priority at which we achieved our reclaim
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d01f7efb0f2c..4a99e4a7fbf3 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -176,6 +176,17 @@ extern int try_to_free_pages(struct zone **, gfp_t);
 extern int shrink_all_memory(int);
 extern int vm_swappiness;
 
+#ifdef CONFIG_NUMA
+extern int zone_reclaim_mode;
+extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
+#else
+#define zone_reclaim_mode 0
+static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p);
 extern int putback_lru_pages(struct list_head *l);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 315a5163d6a0..e8eb0040ce3a 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -56,6 +56,14 @@
 #define REMOTE_DISTANCE		20
 #define node_distance(from,to)	((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
 #endif
+#ifndef RECLAIM_DISTANCE
+/*
+ * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
+ * (in whatever arch specific measurement units returned by node_distance())
+ * then switch on zone reclaim on boot.
+ */
+#define RECLAIM_DISTANCE 20
+#endif
 #ifndef PENALTY_FOR_NODE_WITH_CPUS
 #define PENALTY_FOR_NODE_WITH_CPUS	(1)
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c2e29743a8d1..df54e2fc8ee0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -878,7 +878,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 				mark = (*z)->pages_high;
 			if (!zone_watermark_ok(*z, order, mark,
 				    classzone_idx, alloc_flags))
-				continue;
+				if (!zone_reclaim_mode ||
+				    !zone_reclaim(*z, gfp_mask, order))
+					continue;
 		}
 
 		page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
@@ -1595,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat)
 	prev_node = local_node;
 	nodes_clear(used_mask);
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+		int distance = node_distance(local_node, node);
+
+		/*
+		 * If another node is sufficiently far away then it is better
+		 * to reclaim pages in a zone before going off node.
+		 */
+		if (distance > RECLAIM_DISTANCE)
+			zone_reclaim_mode = 1;
+
 		/*
 		 * We don't want to pressure a particular node.
 		 * So adding penalty to the first node in same
 		 * distance group to make it round-robin.
 		 */
-		if (node_distance(local_node, node) !=
-				node_distance(local_node, prev_node))
+
+		if (distance != node_distance(local_node, prev_node))
 			node_load[node] += load;
 		prev_node = node;
 		load--;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5117b6897a9..2e34b61a70c7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1572,3 +1572,71 @@ static int __init kswapd_init(void)
 }
 
 module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
+ */
+int zone_reclaim_mode __read_mostly;
+
+/*
+ * Mininum time between zone reclaim scans
+ */
+#define ZONE_RECLAIM_INTERVAL HZ/2
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+	int nr_pages = 1 << order;
+	struct task_struct *p = current;
+	struct reclaim_state reclaim_state;
+	struct scan_control sc = {
+		.gfp_mask	= gfp_mask,
+		.may_writepage	= 0,
+		.may_swap	= 0,
+		.nr_mapped	= read_page_state(nr_mapped),
+		.nr_scanned	= 0,
+		.nr_reclaimed	= 0,
+		.priority	= 0
+	};
+
+	if (!(gfp_mask & __GFP_WAIT) ||
+		zone->zone_pgdat->node_id != numa_node_id() ||
+		zone->all_unreclaimable ||
+		atomic_read(&zone->reclaim_in_progress) > 0)
+			return 0;
+
+	if (time_before(jiffies,
+		zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
+			return 0;
+
+	disable_swap_token();
+
+	if (nr_pages > SWAP_CLUSTER_MAX)
+		sc.swap_cluster_max = nr_pages;
+	else
+		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+	cond_resched();
+	p->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+	shrink_zone(zone, &sc);
+	p->reclaim_state = NULL;
+	current->flags &= ~PF_MEMALLOC;
+
+	if (sc.nr_reclaimed == 0)
+		zone->last_unsuccessful_zone_reclaim = jiffies;
+
+	return sc.nr_reclaimed > nr_pages;
+}
+#endif
+
-- 
cgit v1.2.3-71-gd317


From 1743660b911bfb849b1fb33830522254561b9f9b Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 18 Jan 2006 17:42:32 -0800
Subject: [PATCH] Zone reclaim: proc override

proc support for zone reclaim

This patch creates a proc entry /proc/sys/vm/zone_reclaim_mode that may be
used to override the automatic determination of the zone reclaim made on
bootup.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/sysctl/vm.txt | 18 ++++++++++++++++++
 include/linux/sysctl.h      |  1 +
 kernel/sysctl.c             | 11 +++++++++++
 3 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 6910c0136f8d..391dd64363e7 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -27,6 +27,7 @@ Currently, these files are in /proc/sys/vm:
 - laptop_mode
 - block_dump
 - drop-caches
+- zone_reclaim_mode
 
 ==============================================================
 
@@ -120,3 +121,20 @@ set to pcp->high/4.  The upper limit of batch is (PAGE_SHIFT * 8)
 
 The initial value is zero.  Kernel does not use this value at boot time to set
 the high water marks for each per cpu page list.
+
+===============================================================
+
+zone_reclaim_mode:
+
+This is set during bootup to 1 if it is determined that pages from
+remote zones will cause a significant performance reduction. The
+page allocator will then reclaim easily reusable pages (those page
+cache pages that are currently not used) before going off node.
+
+The user can override this setting. It may be beneficial to switch
+off zone reclaim if the system is used for a file server and all
+of memory should be used for caching files from disk.
+
+It may be beneficial to switch this on if one wants to do zone
+reclaim regardless of the numa distances in the system.
+
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 7f472127b7b5..8352a7ce5895 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -182,6 +182,7 @@ enum
 	VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
 	VM_DROP_PAGECACHE=29,	/* int: nuke lots of pagecache */
 	VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
+	VM_ZONE_RECLAIM_MODE=31,/* reclaim local zone memory before going off node */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f5d69b6e29f5..cb99a42f8b37 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -869,6 +869,17 @@ static ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec_jiffies,
 		.strategy	= &sysctl_jiffies,
 	},
+#endif
+#ifdef CONFIG_NUMA
+	{
+		.ctl_name	= VM_ZONE_RECLAIM_MODE,
+		.procname	= "zone_reclaim_mode",
+		.data		= &zone_reclaim_mode,
+		.maxlen		= sizeof(zone_reclaim_mode),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &zero,
+	},
 #endif
 	{ .ctl_name = 0 }
 };
-- 
cgit v1.2.3-71-gd317


From dc85da15d42b0efc792b0f5eab774dc5dbc1ceec Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 18 Jan 2006 17:42:36 -0800
Subject: [PATCH] NUMA policies in the slab allocator V2

This patch fixes a regression in 2.6.14 against 2.6.13 that causes an
imbalance in memory allocation during bootup.

The slab allocator in 2.6.13 is not numa aware and simply calls
alloc_pages().  This means that memory policies may control the behavior of
alloc_pages().  During bootup the memory policy is set to MPOL_INTERLEAVE
resulting in the spreading out of allocations during bootup over all
available nodes.  The slab allocator in 2.6.13 has only a single list of
slab pages.  As a result the per cpu slab cache and the spinlock controlled
page lists may contain slab entries from off node memory.  The slab
allocator in 2.6.13 makes no effort to discern the locality of an entry on
its lists.

The NUMA aware slab allocator in 2.6.14 controls locality of the slab pages
explicitly by calling alloc_pages_node().  The NUMA slab allocator manages
slab entries by having lists of available slab pages for each node.  The
per cpu slab cache can only contain slab entries associated with the node
local to the processor.  This guarantees that the default allocation mode
of the slab allocator always assigns local memory if available.

Setting MPOL_INTERLEAVE as a default policy during bootup has no effect
anymore.  In 2.6.14 all node unspecific slab allocations are performed on
the boot processor.  This means that most of key data structures are
allocated on one node.  Most processors will have to refer to these
structures making the boot node a potential bottleneck.  This may reduce
performance and cause unnecessary memory pressure on the boot node.

This patch implements NUMA policies in the slab layer.  There is the need
of explicit application of NUMA memory policies by the slab allcator itself
since the NUMA slab allocator does no longer let the page_allocator control
locality.

The check for policies is made directly at the beginning of __cache_alloc
using current->mempolicy.  The memory policy is already frequently checked
by the page allocator (alloc_page_vma() and alloc_page_current()).  So it
is highly likely that the cacheline is present.  For MPOL_INTERLEAVE
kmalloc() will spread out each request to one node after another so that an
equal distribution of allocations can be obtained during bootup.

It is not possible to push the policy check to lower layers of the NUMA
slab allocator since the per cpu caches are now only containing slab
entries from the current node.  If the policy says that the local node is
not to be preferred or forbidden then there is no point in checking the
slab cache or local list of slab pages.  The allocation better be directed
immediately to the lists containing slab entries for the allowed set of
nodes.

This way of applying policy also fixes another strange behavior in 2.6.13.
alloc_pages() is controlled by the memory allocation policy of the current
process.  It could therefore be that one process is running with
MPOL_INTERLEAVE and would f.e.  obtain a new page following that policy
since no slab entries are in the lists anymore.  A page can typically be
used for multiple slab entries but lets say that the current process is
only using one.  The other entries are then added to the slab lists.  These
are now non local entries in the slab lists despite of the possible
availability of local pages that would provide faster access and increase
the performance of the application.

Another process without MPOL_INTERLEAVE may now run and expect a local slab
entry from kmalloc().  However, there are still these free slab entries
from the off node page obtained from the other process via MPOL_INTERLEAVE
in the cache.  The process will then get an off node slab entry although
other slab entries may be available that are local to that process.  This
means that the policy if one process may contaminate the locality of the
slab caches for other processes.

This patch in effect insures that a per process policy is followed for the
allocation of slab entries and that there cannot be a memory policy
influence from one process to another.  A process with default policy will
always get a local slab entry if one is available.  And the process using
memory policies will get its memory arranged as requested.  Off-node slab
allocation will require the use of spinlocks and will make the use of per
cpu caches not possible.  A process using memory policies to redirect
allocations offnode will have to cope with additional lock overhead in
addition to the latency added by the need to access a remote slab entry.

Changes V1->V2
- Remove #ifdef CONFIG_NUMA by moving forward declaration into
  prior #ifdef CONFIG_NUMA section.

- Give the function determining the node number to use a saner
  name.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h |  1 +
 mm/mempolicy.c            | 30 ++++++++++++++++++++++++++++++
 mm/slab.c                 | 12 ++++++++++++
 3 files changed, 43 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d6a53ed6ab6c..bbd2221923c3 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -159,6 +159,7 @@ extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
 extern struct mempolicy default_policy;
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 		unsigned long addr);
+extern unsigned slab_node(struct mempolicy *policy);
 
 extern int policy_zone;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a683a66599b1..71430d440822 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -976,6 +976,36 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	return nid;
 }
 
+/*
+ * Depending on the memory policy provide a node from which to allocate the
+ * next slab entry.
+ */
+unsigned slab_node(struct mempolicy *policy)
+{
+	if (in_interrupt())
+		return numa_node_id();
+
+	switch (policy->policy) {
+	case MPOL_INTERLEAVE:
+		return interleave_nodes(policy);
+
+	case MPOL_BIND:
+		/*
+		 * Follow bind policy behavior and start allocation at the
+		 * first node.
+		 */
+		return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+
+	case MPOL_PREFERRED:
+		if (policy->v.preferred_node >= 0)
+			return policy->v.preferred_node;
+		/* Fall through */
+
+	default:
+		return numa_node_id();
+	}
+}
+
 /* Do static interleaving for a VMA with known offset. */
 static unsigned offset_il_node(struct mempolicy *pol,
 		struct vm_area_struct *vma, unsigned long off)
diff --git a/mm/slab.c b/mm/slab.c
index bd0317f1e06c..9025608696ec 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -103,6 +103,7 @@
 #include	<linux/rcupdate.h>
 #include	<linux/string.h>
 #include	<linux/nodemask.h>
+#include	<linux/mempolicy.h>
 #include	<linux/mutex.h>
 
 #include	<asm/uaccess.h>
@@ -773,6 +774,8 @@ static struct array_cache *alloc_arraycache(int node, int entries,
 }
 
 #ifdef CONFIG_NUMA
+static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int);
+
 static inline struct array_cache **alloc_alien_cache(int node, int limit)
 {
 	struct array_cache **ac_ptr;
@@ -2570,6 +2573,15 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
 	void *objp;
 	struct array_cache *ac;
 
+#ifdef CONFIG_NUMA
+	if (current->mempolicy) {
+		int nid = slab_node(current->mempolicy);
+
+		if (nid != numa_node_id())
+			return __cache_alloc_node(cachep, flags, nid);
+	}
+#endif
+
 	check_irq_off();
 	ac = ac_data(cachep);
 	if (likely(ac->avail)) {
-- 
cgit v1.2.3-71-gd317


From 5131cf154ad1c6e584efa58d17a469d0b80f49bd Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 18 Jan 2006 17:43:04 -0800
Subject: [PATCH] add missing syscall declarations

All standard system calls should be declared in include/linux/syscalls.h.

Add some of the new additions that were previously missed.

Signed-off-by: Arnd Bergmann <arndb@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inotify.c             |  1 +
 include/linux/syscalls.h | 19 +++++++++++++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inotify.c b/fs/inotify.c
index 2fecb7af4a77..878ccca61213 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -33,6 +33,7 @@
 #include <linux/list.h>
 #include <linux/writeback.h>
 #include <linux/inotify.h>
+#include <linux/syscalls.h>
 
 #include <asm/ioctls.h>
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3eed47347013..e666d6070569 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -510,9 +510,24 @@ asmlinkage long sys_keyctl(int cmd, unsigned long arg2, unsigned long arg3,
 asmlinkage long sys_ioprio_set(int which, int who, int ioprio);
 asmlinkage long sys_ioprio_get(int which, int who);
 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
-					unsigned long maxnode);
+				unsigned long maxnode);
 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
-			const unsigned long __user *from, const unsigned long __user *to);
+				const unsigned long __user *from,
+				const unsigned long __user *to);
+asmlinkage long sys_mbind(unsigned long start, unsigned long len,
+				unsigned long mode,
+				unsigned long __user *nmask,
+				unsigned long maxnode,
+				unsigned flags);
+asmlinkage long sys_get_mempolicy(int __user *policy,
+				unsigned long __user *nmask,
+				unsigned long maxnode,
+				unsigned long addr, unsigned long flags);
+
+asmlinkage long sys_inotify_init(void);
+asmlinkage long sys_inotify_add_watch(int fd, const char __user *path,
+					u32 mask);
+asmlinkage long sys_inotify_rm_watch(int fd, u32 wd);
 
 asmlinkage long sys_spu_run(int fd, __u32 __user *unpc,
 				 __u32 __user *ustatus);
-- 
cgit v1.2.3-71-gd317


From f193fbab2e4710f629e7c1859d4908646b47b126 Mon Sep 17 00:00:00 2001
From: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Date: Wed, 18 Jan 2006 17:43:13 -0800
Subject: [PATCH] nfsd: check error status from nfsd_sync_dir

Change nfsd_sync_dir to return an error if ->sync fails, and pass that error
up through the stack.  This involves a number of rearrangements of error
paths, and care to distinguish between Linux -errno numbers and NFSERR
numbers.

In the 'create' routines, we continue with the 'setattr' even if a previous
sync_dir failed.

This patch is quite different from Takashi's in a few ways, but there is still
a strong lineage.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/vfs.c             | 80 ++++++++++++++++++++++++++---------------------
 include/linux/nfsd/nfsd.h |  2 +-
 2 files changed, 45 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index eef0576a7785..acf637869ccf 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -710,14 +710,15 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
 {
 	struct inode *inode = dp->d_inode;
 	int (*fsync) (struct file *, struct dentry *, int);
-	int err = nfs_ok;
+	int err;
 
-	filemap_fdatawrite(inode->i_mapping);
-	if (fop && (fsync = fop->fsync))
-		err=fsync(filp, dp, 0);
-	filemap_fdatawait(inode->i_mapping);
+	err = filemap_fdatawrite(inode->i_mapping);
+	if (err == 0 && fop && (fsync = fop->fsync))
+		err = fsync(filp, dp, 0);
+	if (err == 0)
+		err = filemap_fdatawait(inode->i_mapping);
 
-	return nfserrno(err);
+	return err;
 }
 	
 
@@ -734,10 +735,10 @@ nfsd_sync(struct file *filp)
 	return err;
 }
 
-void
+int
 nfsd_sync_dir(struct dentry *dp)
 {
-	nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
+	return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
 }
 
 /*
@@ -1065,6 +1066,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (EX_ISSYNC(fhp->fh_export)) {
 		if (file->f_op && file->f_op->fsync) {
 			err = nfsd_sync(file);
+			err = nfserrno(err);
 		} else {
 			err = nfserr_notsupp;
 		}
@@ -1175,7 +1177,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		goto out_nfserr;
 
 	if (EX_ISSYNC(fhp->fh_export)) {
-		nfsd_sync_dir(dentry);
+		err = nfsd_sync_dir(dentry);
 		write_inode_now(dchild->d_inode, 1);
 	}
 
@@ -1185,9 +1187,11 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	 * send along the gid when it tries to implement setgid
 	 * directories via NFS.
 	 */
-	err = 0;
-	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0)
-		err = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
+		int err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+		if (err2)
+			err = err2;
+	}
 	/*
 	 * Update the file handle to get the new inode info.
 	 */
@@ -1306,17 +1310,12 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		goto out_nfserr;
 
 	if (EX_ISSYNC(fhp->fh_export)) {
-		nfsd_sync_dir(dentry);
+		err = nfsd_sync_dir(dentry);
+		if (err)
+			err = nfserrno(err);
 		/* setattr will sync the child (or not) */
 	}
 
-	/*
-	 * Update the filehandle to get the new inode info.
-	 */
-	err = fh_update(resfhp);
-	if (err)
-		goto out;
-
 	if (createmode == NFS3_CREATE_EXCLUSIVE) {
 		/* Cram the verifier into atime/mtime/mode */
 		iap->ia_valid = ATTR_MTIME|ATTR_ATIME
@@ -1337,8 +1336,17 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	 * implement setgid directories via NFS. Clear out all that cruft.
 	 */
  set_attr:
-	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID)) != 0)
- 		err = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID)) != 0) {
+ 		int err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+		if (err2)
+			err = nfserrno(err2);
+	}
+
+	/*
+	 * Update the filehandle to get the new inode info.
+	 */
+	if (!err)
+		err = fh_update(resfhp);
 
  out:
 	fh_unlock(fhp);
@@ -1447,10 +1455,10 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	} else
 		err = vfs_symlink(dentry->d_inode, dnew, path, mode);
 
-	if (!err) {
+	if (!err)
 		if (EX_ISSYNC(fhp->fh_export))
-			nfsd_sync_dir(dentry);
-	} else
+			err = nfsd_sync_dir(dentry);
+	if (err)
 		err = nfserrno(err);
 	fh_unlock(fhp);
 
@@ -1506,8 +1514,10 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 	err = vfs_link(dold, dirp, dnew);
 	if (!err) {
 		if (EX_ISSYNC(ffhp->fh_export)) {
-			nfsd_sync_dir(ddir);
+			err = nfsd_sync_dir(ddir);
 			write_inode_now(dest, 1);
+			if (err)
+				err = nfserrno(err);
 		}
 	} else {
 		if (err == -EXDEV && rqstp->rq_vers == 2)
@@ -1595,8 +1605,9 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 #endif
 	err = vfs_rename(fdir, odentry, tdir, ndentry);
 	if (!err && EX_ISSYNC(tfhp->fh_export)) {
-		nfsd_sync_dir(tdentry);
-		nfsd_sync_dir(fdentry);
+		err = nfsd_sync_dir(tdentry);
+		if (!err)
+			err = nfsd_sync_dir(fdentry);
 	}
 
  out_dput_new:
@@ -1671,17 +1682,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 
 	dput(rdentry);
 
-	if (err)
-		goto out_nfserr;
-	if (EX_ISSYNC(fhp->fh_export)) 
-		nfsd_sync_dir(dentry);
-
-out:
-	return err;
+	if (err == 0 &&
+	    EX_ISSYNC(fhp->fh_export))
+			err = nfsd_sync_dir(dentry);
 
 out_nfserr:
 	err = nfserrno(err);
-	goto out;
+out:
+	return err;
 }
 
 /*
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 51c231a1e5a6..ec7c2e872d72 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -124,7 +124,7 @@ int		nfsd_statfs(struct svc_rqst *, struct svc_fh *,
 
 int		nfsd_notify_change(struct inode *, struct iattr *);
 int		nfsd_permission(struct svc_export *, struct dentry *, int);
-void		nfsd_sync_dir(struct dentry *dp);
+int		nfsd_sync_dir(struct dentry *dp);
 
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
 #ifdef CONFIG_NFSD_V2_ACL
-- 
cgit v1.2.3-71-gd317


From 1918e341383ab787d6c5b17200f4ed901b10c777 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 18 Jan 2006 17:43:16 -0800
Subject: [PATCH] svcrpc: save and restore the daddr field when request
 deferred

The server code currently keeps track of the destination address on every
request so that it can reply using the same address.  However we forget to do
that in the case of a deferred request.  Remedy this oversight.  >From folks
at PolyServe.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sunrpc/svc.h | 1 +
 net/sunrpc/svcsock.c       | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index e4086ec8b952..50cab2a09f28 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -246,6 +246,7 @@ struct svc_deferred_req {
 	u32			prot;	/* protocol (UDP or TCP) */
 	struct sockaddr_in	addr;
 	struct svc_sock		*svsk;	/* where reply must go */
+	u32			daddr;	/* where reply must come from */
 	struct cache_deferred_req handle;
 	int			argslen;
 	u32			args[0];
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index e67613e4eb18..50580620e897 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1527,6 +1527,7 @@ svc_defer(struct cache_req *req)
 		dr->handle.owner = rqstp->rq_server;
 		dr->prot = rqstp->rq_prot;
 		dr->addr = rqstp->rq_addr;
+		dr->daddr = rqstp->rq_daddr;
 		dr->argslen = rqstp->rq_arg.len >> 2;
 		memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2);
 	}
@@ -1552,6 +1553,7 @@ static int svc_deferred_recv(struct svc_rqst *rqstp)
 	rqstp->rq_arg.len = dr->argslen<<2;
 	rqstp->rq_prot        = dr->prot;
 	rqstp->rq_addr        = dr->addr;
+	rqstp->rq_daddr       = dr->daddr;
 	return dr->argslen<<2;
 }
 
-- 
cgit v1.2.3-71-gd317


From 3a65588adc4401622b204caa897123e16a4a0318 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 18 Jan 2006 17:43:19 -0800
Subject: [PATCH] nfsd4: rename lk_stateowner

One of the things that's confusing about nfsd4_lock is that the lk_stateowner
field could be set to either of two different lockowners: the open owner or
the lock owner.  Rename to lk_replay_owner and add a comment to make it clear
that it's used for whichever stateowner has its sequence id bumped for replay
detection.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4state.c       | 16 ++++++++--------
 fs/nfsd/nfs4xdr.c         |  5 ++---
 include/linux/nfsd/xdr4.h |  5 +++--
 3 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 5bf7fd3947ce..578ea521c827 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2725,11 +2725,11 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 				        lock->lk_new_open_seqid,
 		                        &lock->lk_new_open_stateid,
 		                        CHECK_FH | OPEN_STATE,
-		                        &lock->lk_stateowner, &open_stp,
+		                        &lock->lk_replay_owner, &open_stp,
 					lock);
 		if (status)
 			goto out;
-		open_sop = lock->lk_stateowner;
+		open_sop = lock->lk_replay_owner;
 		/* create lockowner and lock stateid */
 		fp = open_stp->st_file;
 		strhashval = lock_ownerstr_hashval(fp->fi_inode, 
@@ -2752,12 +2752,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 				       lock->lk_old_lock_seqid, 
 				       &lock->lk_old_lock_stateid, 
 				       CHECK_FH | LOCK_STATE, 
-				       &lock->lk_stateowner, &lock_stp, lock);
+				       &lock->lk_replay_owner, &lock_stp, lock);
 		if (status)
 			goto out;
-		lock_sop = lock->lk_stateowner;
+		lock_sop = lock->lk_replay_owner;
 	}
-	/* lock->lk_stateowner and lock_stp have been created or found */
+	/* lock->lk_replay_owner and lock_stp have been created or found */
 	filp = lock_stp->st_vfs_file;
 
 	status = nfserr_grace;
@@ -2830,9 +2830,9 @@ conflicting_lock:
 out:
 	if (status && lock->lk_is_new && lock_sop)
 		release_stateowner(lock_sop);
-	if (lock->lk_stateowner) {
-		nfs4_get_stateowner(lock->lk_stateowner);
-		*replay_owner = lock->lk_stateowner;
+	if (lock->lk_replay_owner) {
+		nfs4_get_stateowner(lock->lk_replay_owner);
+		*replay_owner = lock->lk_replay_owner;
 	}
 	nfs4_unlock_state();
 	return status;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index dcd673186944..6b743327686c 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -528,7 +528,7 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
 {
 	DECODE_HEAD;
 
-	lock->lk_stateowner = NULL;
+	lock->lk_replay_owner = NULL;
 	/*
 	* type, reclaim(boolean), offset, length, new_lock_owner(boolean)
 	*/
@@ -1895,7 +1895,6 @@ nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie
 static void
 nfsd4_encode_lock(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock *lock)
 {
-
 	ENCODE_SEQID_OP_HEAD;
 
 	if (!nfserr) {
@@ -1906,7 +1905,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock
 	} else if (nfserr == nfserr_denied)
 		nfsd4_encode_lock_denied(resp, &lock->lk_denied);
 
-	ENCODE_SEQID_OP_TAIL(lock->lk_stateowner);
+	ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner);
 }
 
 static void
diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h
index 8903688890ce..77adba7d2281 100644
--- a/include/linux/nfsd/xdr4.h
+++ b/include/linux/nfsd/xdr4.h
@@ -145,8 +145,9 @@ struct nfsd4_lock {
 		} ok;
 		struct nfsd4_lock_denied        denied;
 	} u;
-
-	struct nfs4_stateowner *lk_stateowner;
+	/* The lk_replay_owner is the open owner in the open_to_lock_owner
+	 * case and the lock owner otherwise: */
+	struct nfs4_stateowner *lk_replay_owner;
 };
 #define lk_new_open_seqid       v.new.open_seqid
 #define lk_new_open_stateid     v.new.open_stateid
-- 
cgit v1.2.3-71-gd317


From 5590ff0d5528b60153c0b4e7b771472b5a95e297 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Wed, 18 Jan 2006 17:43:53 -0800
Subject: [PATCH] vfs: *at functions: core

Here is a series of patches which introduce in total 13 new system calls
which take a file descriptor/filename pair instead of a single file
name.  These functions, openat etc, have been discussed on numerous
occasions.  They are needed to implement race-free filesystem traversal,
they are necessary to implement a virtual per-thread current working
directory (think multi-threaded backup software), etc.

We have in glibc today implementations of the interfaces which use the
/proc/self/fd magic.  But this code is rather expensive.  Here are some
results (similar to what Jim Meyering posted before).

The test creates a deep directory hierarchy on a tmpfs filesystem.  Then
rm -fr is used to remove all directories.  Without syscall support I get
this:

real    0m31.921s
user    0m0.688s
sys     0m31.234s

With syscall support the results are much better:

real    0m20.699s
user    0m0.536s
sys     0m20.149s

The interfaces are for obvious reasons currently not much used.  But they'll
be used.  coreutils (and Jeff's posixutils) are already using them.
Furthermore, code like ftw/fts in libc (maybe even glob) will also start using
them.  I expect a patch to make follow soon.  Every program which is walking
the filesystem tree will benefit.

Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@ftp.linux.org.uk>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/osf_sys.c       |   2 +-
 arch/sparc64/kernel/sys_sparc32.c |   4 +-
 fs/compat.c                       |  48 +++++++++--
 fs/exec.c                         |   2 +-
 fs/namei.c                        | 172 ++++++++++++++++++++++++++++++--------
 fs/open.c                         |  83 ++++++++++++++----
 fs/stat.c                         |  66 ++++++++++++---
 include/linux/fcntl.h             |   7 ++
 include/linux/fs.h                |   7 +-
 include/linux/namei.h             |   7 +-
 include/linux/time.h              |   2 +-
 11 files changed, 320 insertions(+), 80 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 01fe990d3e54..7fb14f42a125 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -960,7 +960,7 @@ osf_utimes(char __user *filename, struct timeval32 __user *tvs)
 			return -EFAULT;
 	}
 
-	return do_utimes(filename, tvs ? ktvs : NULL);
+	return do_utimes(AT_FDCWD, filename, tvs ? ktvs : NULL);
 }
 
 #define MAX_SELECT_SECONDS \
diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c
index d4b7a100cb8a..9264ccbaaafa 100644
--- a/arch/sparc64/kernel/sys_sparc32.c
+++ b/arch/sparc64/kernel/sys_sparc32.c
@@ -821,7 +821,7 @@ asmlinkage long sys32_utimes(char __user *filename,
 			return -EFAULT;
 	}
 
-	return do_utimes(filename, (tvs ? &ktvs[0] : NULL));
+	return do_utimes(AT_FDCWD, filename, (tvs ? &ktvs[0] : NULL));
 }
 
 /* These are here just in case some old sparc32 binary calls it. */
@@ -1003,7 +1003,7 @@ asmlinkage long sys32_adjtimex(struct timex32 __user *utp)
 asmlinkage long sparc32_open(const char __user *filename,
 			     int flags, int mode)
 {
-	return do_sys_open(filename, flags, mode);
+	return do_sys_open(AT_FDCWD, filename, flags, mode);
 }
 
 extern unsigned long do_mremap(unsigned long addr,
diff --git a/fs/compat.c b/fs/compat.c
index 2468ac1df2f0..c6ba9deabada 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -68,10 +68,10 @@ asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __
 		tv[0].tv_usec = 0;
 		tv[1].tv_usec = 0;
 	}
-	return do_utimes(filename, t ? tv : NULL);
+	return do_utimes(AT_FDCWD, filename, t ? tv : NULL);
 }
 
-asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval __user *t)
+asmlinkage long compat_sys_futimesat(int dfd, char __user *filename, struct compat_timeval __user *t)
 {
 	struct timeval tv[2];
 
@@ -82,14 +82,19 @@ asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval _
 		    get_user(tv[1].tv_usec, &t[1].tv_usec))
 			return -EFAULT; 
 	} 
-	return do_utimes(filename, t ? tv : NULL);
+	return do_utimes(dfd, filename, t ? tv : NULL);
+}
+
+asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval __user *t)
+{
+	return compat_sys_futimesat(AT_FDCWD, filename, t);
 }
 
 asmlinkage long compat_sys_newstat(char __user * filename,
 		struct compat_stat __user *statbuf)
 {
 	struct kstat stat;
-	int error = vfs_stat(filename, &stat);
+	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
 
 	if (!error)
 		error = cp_compat_stat(&stat, statbuf);
@@ -100,13 +105,34 @@ asmlinkage long compat_sys_newlstat(char __user * filename,
 		struct compat_stat __user *statbuf)
 {
 	struct kstat stat;
-	int error = vfs_lstat(filename, &stat);
+	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
 
 	if (!error)
 		error = cp_compat_stat(&stat, statbuf);
 	return error;
 }
 
+asmlinkage long compat_sys_newfstatat(int dfd, char __user *filename,
+		struct compat_stat __user *statbuf, int flag)
+{
+	struct kstat stat;
+	int error = -EINVAL;
+
+	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+		goto out;
+
+	if (flag & AT_SYMLINK_NOFOLLOW)
+		error = vfs_lstat_fd(dfd, filename, &stat);
+	else
+		error = vfs_stat_fd(dfd, filename, &stat);
+
+	if (!error)
+		error = cp_compat_stat(&stat, statbuf);
+
+out:
+	return error;
+}
+
 asmlinkage long compat_sys_newfstat(unsigned int fd,
 		struct compat_stat __user * statbuf)
 {
@@ -1290,7 +1316,17 @@ out:
 asmlinkage long
 compat_sys_open(const char __user *filename, int flags, int mode)
 {
-	return do_sys_open(filename, flags, mode);
+	return do_sys_open(AT_FDCWD, filename, flags, mode);
+}
+
+/*
+ * Exactly like fs/open.c:sys_openat(), except that it doesn't set the
+ * O_LARGEFILE flag.
+ */
+asmlinkage long
+compat_sys_openat(int dfd, const char __user *filename, int flags, int mode)
+{
+	return do_sys_open(dfd, filename, flags, mode);
 }
 
 /*
diff --git a/fs/exec.c b/fs/exec.c
index 62b40af68cc4..055378d2513e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -477,7 +477,7 @@ struct file *open_exec(const char *name)
 	int err;
 	struct file *file;
 
-	err = path_lookup_open(name, LOOKUP_FOLLOW, &nd, FMODE_READ);
+	err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, FMODE_READ);
 	file = ERR_PTR(err);
 
 	if (!err) {
diff --git a/fs/namei.c b/fs/namei.c
index 33fb5bd34a81..4acdac043b6b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -30,6 +30,8 @@
 #include <linux/audit.h>
 #include <linux/capability.h>
 #include <linux/file.h>
+#include <linux/fcntl.h>
+#include <linux/namei.h>
 #include <asm/namei.h>
 #include <asm/uaccess.h>
 
@@ -1063,7 +1065,8 @@ set_it:
 }
 
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
-int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd)
+static int fastcall do_path_lookup(int dfd, const char *name,
+				unsigned int flags, struct nameidata *nd)
 {
 	int retval = 0;
 
@@ -1083,9 +1086,38 @@ int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata
 		}
 		nd->mnt = mntget(current->fs->rootmnt);
 		nd->dentry = dget(current->fs->root);
-	} else {
+	} else if (dfd == AT_FDCWD) {
 		nd->mnt = mntget(current->fs->pwdmnt);
 		nd->dentry = dget(current->fs->pwd);
+	} else {
+		struct file *file;
+		int fput_needed;
+		struct dentry *dentry;
+
+		file = fget_light(dfd, &fput_needed);
+		if (!file) {
+			retval = -EBADF;
+			goto out_fail;
+		}
+
+		dentry = file->f_dentry;
+
+		if (!S_ISDIR(dentry->d_inode->i_mode)) {
+			retval = -ENOTDIR;
+			fput_light(file, fput_needed);
+			goto out_fail;
+		}
+
+		retval = file_permission(file, MAY_EXEC);
+		if (retval) {
+			fput_light(file, fput_needed);
+			goto out_fail;
+		}
+
+		nd->mnt = mntget(file->f_vfsmnt);
+		nd->dentry = dget(dentry);
+
+		fput_light(file, fput_needed);
 	}
 	read_unlock(&current->fs->lock);
 	current->total_link_count = 0;
@@ -1094,11 +1126,19 @@ out:
 	if (unlikely(current->audit_context
 		     && nd && nd->dentry && nd->dentry->d_inode))
 		audit_inode(name, nd->dentry->d_inode, flags);
+out_fail:
 	return retval;
 }
 
-static int __path_lookup_intent_open(const char *name, unsigned int lookup_flags,
-		struct nameidata *nd, int open_flags, int create_mode)
+int fastcall path_lookup(const char *name, unsigned int flags,
+			struct nameidata *nd)
+{
+	return do_path_lookup(AT_FDCWD, name, flags, nd);
+}
+
+static int __path_lookup_intent_open(int dfd, const char *name,
+		unsigned int lookup_flags, struct nameidata *nd,
+		int open_flags, int create_mode)
 {
 	struct file *filp = get_empty_filp();
 	int err;
@@ -1108,7 +1148,7 @@ static int __path_lookup_intent_open(const char *name, unsigned int lookup_flags
 	nd->intent.open.file = filp;
 	nd->intent.open.flags = open_flags;
 	nd->intent.open.create_mode = create_mode;
-	err = path_lookup(name, lookup_flags|LOOKUP_OPEN, nd);
+	err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd);
 	if (IS_ERR(nd->intent.open.file)) {
 		if (err == 0) {
 			err = PTR_ERR(nd->intent.open.file);
@@ -1126,10 +1166,10 @@ static int __path_lookup_intent_open(const char *name, unsigned int lookup_flags
  * @nd: pointer to nameidata
  * @open_flags: open intent flags
  */
-int path_lookup_open(const char *name, unsigned int lookup_flags,
+int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags,
 		struct nameidata *nd, int open_flags)
 {
-	return __path_lookup_intent_open(name, lookup_flags, nd,
+	return __path_lookup_intent_open(dfd, name, lookup_flags, nd,
 			open_flags, 0);
 }
 
@@ -1141,12 +1181,12 @@ int path_lookup_open(const char *name, unsigned int lookup_flags,
  * @open_flags: open intent flags
  * @create_mode: create intent flags
  */
-static int path_lookup_create(const char *name, unsigned int lookup_flags,
-			      struct nameidata *nd, int open_flags,
-			      int create_mode)
+static int path_lookup_create(int dfd, const char *name,
+			      unsigned int lookup_flags, struct nameidata *nd,
+			      int open_flags, int create_mode)
 {
-	return __path_lookup_intent_open(name, lookup_flags|LOOKUP_CREATE, nd,
-			open_flags, create_mode);
+	return __path_lookup_intent_open(dfd, name, lookup_flags|LOOKUP_CREATE,
+			nd, open_flags, create_mode);
 }
 
 int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags,
@@ -1156,7 +1196,7 @@ int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags,
 	int err = PTR_ERR(tmp);
 
 	if (!IS_ERR(tmp)) {
-		err = __path_lookup_intent_open(tmp, lookup_flags, nd, open_flags, 0);
+		err = __path_lookup_intent_open(AT_FDCWD, tmp, lookup_flags, nd, open_flags, 0);
 		putname(tmp);
 	}
 	return err;
@@ -1248,18 +1288,24 @@ access:
  * that namei follows links, while lnamei does not.
  * SMP-safe
  */
-int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
+int fastcall __user_walk_fd(int dfd, const char __user *name, unsigned flags,
+			    struct nameidata *nd)
 {
 	char *tmp = getname(name);
 	int err = PTR_ERR(tmp);
 
 	if (!IS_ERR(tmp)) {
-		err = path_lookup(tmp, flags, nd);
+		err = do_path_lookup(dfd, tmp, flags, nd);
 		putname(tmp);
 	}
 	return err;
 }
 
+int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
+{
+	return __user_walk_fd(AT_FDCWD, name, flags, nd);
+}
+
 /*
  * It's inline, so penalty for filesystems that don't use sticky bit is
  * minimal.
@@ -1518,7 +1564,8 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
  * for symlinks (where the permissions are checked later).
  * SMP-safe
  */
-int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
+int open_namei(int dfd, const char *pathname, int flag,
+		int mode, struct nameidata *nd)
 {
 	int acc_mode, error;
 	struct path path;
@@ -1540,7 +1587,8 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
 	 * The simplest case - just a plain lookup.
 	 */
 	if (!(flag & O_CREAT)) {
-		error = path_lookup_open(pathname, lookup_flags(flag), nd, flag);
+		error = path_lookup_open(dfd, pathname, lookup_flags(flag),
+					 nd, flag);
 		if (error)
 			return error;
 		goto ok;
@@ -1549,7 +1597,7 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
 	/*
 	 * Create - we need to know the parent.
 	 */
-	error = path_lookup_create(pathname, LOOKUP_PARENT, nd, flag, mode);
+	error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode);
 	if (error)
 		return error;
 
@@ -1744,7 +1792,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	return error;
 }
 
-asmlinkage long sys_mknod(const char __user * filename, int mode, unsigned dev)
+asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
+				unsigned dev)
 {
 	int error = 0;
 	char * tmp;
@@ -1757,7 +1806,7 @@ asmlinkage long sys_mknod(const char __user * filename, int mode, unsigned dev)
 	if (IS_ERR(tmp))
 		return PTR_ERR(tmp);
 
-	error = path_lookup(tmp, LOOKUP_PARENT, &nd);
+	error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
 	if (error)
 		goto out;
 	dentry = lookup_create(&nd, 0);
@@ -1793,6 +1842,11 @@ out:
 	return error;
 }
 
+asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev)
+{
+	return sys_mknodat(AT_FDCWD, filename, mode, dev);
+}
+
 int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	int error = may_create(dir, dentry, NULL);
@@ -1815,7 +1869,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	return error;
 }
 
-asmlinkage long sys_mkdir(const char __user * pathname, int mode)
+asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
 {
 	int error = 0;
 	char * tmp;
@@ -1826,7 +1880,7 @@ asmlinkage long sys_mkdir(const char __user * pathname, int mode)
 		struct dentry *dentry;
 		struct nameidata nd;
 
-		error = path_lookup(tmp, LOOKUP_PARENT, &nd);
+		error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
 		if (error)
 			goto out;
 		dentry = lookup_create(&nd, 1);
@@ -1846,6 +1900,11 @@ out:
 	return error;
 }
 
+asmlinkage long sys_mkdir(const char __user *pathname, int mode)
+{
+	return sys_mkdirat(AT_FDCWD, pathname, mode);
+}
+
 /*
  * We try to drop the dentry early: we should have
  * a usage count of 2 if we're the only user of this
@@ -1907,7 +1966,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 	return error;
 }
 
-asmlinkage long sys_rmdir(const char __user * pathname)
+static long do_rmdir(int dfd, const char __user *pathname)
 {
 	int error = 0;
 	char * name;
@@ -1918,7 +1977,7 @@ asmlinkage long sys_rmdir(const char __user * pathname)
 	if(IS_ERR(name))
 		return PTR_ERR(name);
 
-	error = path_lookup(name, LOOKUP_PARENT, &nd);
+	error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd);
 	if (error)
 		goto exit;
 
@@ -1948,6 +2007,11 @@ exit:
 	return error;
 }
 
+asmlinkage long sys_rmdir(const char __user *pathname)
+{
+	return do_rmdir(AT_FDCWD, pathname);
+}
+
 int vfs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int error = may_delete(dir, dentry, 0);
@@ -1984,7 +2048,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
  * writeout happening, and we don't want to prevent access to the directory
  * while waiting on the I/O.
  */
-asmlinkage long sys_unlink(const char __user * pathname)
+static long do_unlinkat(int dfd, const char __user *pathname)
 {
 	int error = 0;
 	char * name;
@@ -1996,7 +2060,7 @@ asmlinkage long sys_unlink(const char __user * pathname)
 	if(IS_ERR(name))
 		return PTR_ERR(name);
 
-	error = path_lookup(name, LOOKUP_PARENT, &nd);
+	error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd);
 	if (error)
 		goto exit;
 	error = -EISDIR;
@@ -2031,6 +2095,22 @@ slashes:
 	goto exit2;
 }
 
+asmlinkage long sys_unlinkat(int dfd, const char __user *pathname, int flag)
+{
+	if ((flag & ~AT_REMOVEDIR) != 0)
+		return -EINVAL;
+
+	if (flag & AT_REMOVEDIR)
+		return do_rmdir(dfd, pathname);
+
+	return do_unlinkat(dfd, pathname);
+}
+
+asmlinkage long sys_unlink(const char __user *pathname)
+{
+	return do_unlinkat(AT_FDCWD, pathname);
+}
+
 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode)
 {
 	int error = may_create(dir, dentry, NULL);
@@ -2052,7 +2132,8 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, i
 	return error;
 }
 
-asmlinkage long sys_symlink(const char __user * oldname, const char __user * newname)
+asmlinkage long sys_symlinkat(const char __user *oldname,
+			      int newdfd, const char __user *newname)
 {
 	int error = 0;
 	char * from;
@@ -2067,7 +2148,7 @@ asmlinkage long sys_symlink(const char __user * oldname, const char __user * new
 		struct dentry *dentry;
 		struct nameidata nd;
 
-		error = path_lookup(to, LOOKUP_PARENT, &nd);
+		error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
 		if (error)
 			goto out;
 		dentry = lookup_create(&nd, 0);
@@ -2085,6 +2166,11 @@ out:
 	return error;
 }
 
+asmlinkage long sys_symlink(const char __user *oldname, const char __user *newname)
+{
+	return sys_symlinkat(oldname, AT_FDCWD, newname);
+}
+
 int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
 {
 	struct inode *inode = old_dentry->d_inode;
@@ -2132,7 +2218,8 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
  * with linux 2.0, and to avoid hard-linking to directories
  * and other special files.  --ADM
  */
-asmlinkage long sys_link(const char __user * oldname, const char __user * newname)
+asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
+			   int newdfd, const char __user *newname)
 {
 	struct dentry *new_dentry;
 	struct nameidata nd, old_nd;
@@ -2143,10 +2230,10 @@ asmlinkage long sys_link(const char __user * oldname, const char __user * newnam
 	if (IS_ERR(to))
 		return PTR_ERR(to);
 
-	error = __user_walk(oldname, 0, &old_nd);
+	error = __user_walk_fd(olddfd, oldname, 0, &old_nd);
 	if (error)
 		goto exit;
-	error = path_lookup(to, LOOKUP_PARENT, &nd);
+	error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
 	if (error)
 		goto out;
 	error = -EXDEV;
@@ -2169,6 +2256,11 @@ exit:
 	return error;
 }
 
+asmlinkage long sys_link(const char __user *oldname, const char __user *newname)
+{
+	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname);
+}
+
 /*
  * The worst of all namespace operations - renaming directory. "Perverted"
  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
@@ -2315,7 +2407,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return error;
 }
 
-static int do_rename(const char * oldname, const char * newname)
+static int do_rename(int olddfd, const char *oldname,
+			int newdfd, const char *newname)
 {
 	int error = 0;
 	struct dentry * old_dir, * new_dir;
@@ -2323,11 +2416,11 @@ static int do_rename(const char * oldname, const char * newname)
 	struct dentry * trap;
 	struct nameidata oldnd, newnd;
 
-	error = path_lookup(oldname, LOOKUP_PARENT, &oldnd);
+	error = do_path_lookup(olddfd, oldname, LOOKUP_PARENT, &oldnd);
 	if (error)
 		goto exit;
 
-	error = path_lookup(newname, LOOKUP_PARENT, &newnd);
+	error = do_path_lookup(newdfd, newname, LOOKUP_PARENT, &newnd);
 	if (error)
 		goto exit1;
 
@@ -2391,7 +2484,8 @@ exit:
 	return error;
 }
 
-asmlinkage long sys_rename(const char __user * oldname, const char __user * newname)
+asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
+			     int newdfd, const char __user *newname)
 {
 	int error;
 	char * from;
@@ -2403,13 +2497,18 @@ asmlinkage long sys_rename(const char __user * oldname, const char __user * newn
 	to = getname(newname);
 	error = PTR_ERR(to);
 	if (!IS_ERR(to)) {
-		error = do_rename(from,to);
+		error = do_rename(olddfd, from, newdfd, to);
 		putname(to);
 	}
 	putname(from);
 	return error;
 }
 
+asmlinkage long sys_rename(const char __user *oldname, const char __user *newname)
+{
+	return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
+}
+
 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
 {
 	int len;
@@ -2553,6 +2652,7 @@ struct inode_operations page_symlink_inode_operations = {
 };
 
 EXPORT_SYMBOL(__user_walk);
+EXPORT_SYMBOL(__user_walk_fd);
 EXPORT_SYMBOL(follow_down);
 EXPORT_SYMBOL(follow_up);
 EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
diff --git a/fs/open.c b/fs/open.c
index 8e20c1f32563..70e0230d8e77 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -20,6 +20,7 @@
 #include <linux/security.h>
 #include <linux/mount.h>
 #include <linux/vfs.h>
+#include <linux/fcntl.h>
 #include <asm/uaccess.h>
 #include <linux/fs.h>
 #include <linux/personality.h>
@@ -383,7 +384,7 @@ asmlinkage long sys_utime(char __user * filename, struct utimbuf __user * times)
 
 		error = get_user(newattrs.ia_atime.tv_sec, &times->actime);
 		newattrs.ia_atime.tv_nsec = 0;
-		if (!error) 
+		if (!error)
 			error = get_user(newattrs.ia_mtime.tv_sec, &times->modtime);
 		newattrs.ia_mtime.tv_nsec = 0;
 		if (error)
@@ -414,14 +415,14 @@ out:
  * must be owner or have write permission.
  * Else, update from *times, must be owner or super user.
  */
-long do_utimes(char __user * filename, struct timeval * times)
+long do_utimes(int dfd, char __user *filename, struct timeval *times)
 {
 	int error;
 	struct nameidata nd;
 	struct inode * inode;
 	struct iattr newattrs;
 
-	error = user_path_walk(filename, &nd);
+	error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd);
 
 	if (error)
 		goto out;
@@ -461,13 +462,18 @@ out:
 	return error;
 }
 
-asmlinkage long sys_utimes(char __user * filename, struct timeval __user * utimes)
+asmlinkage long sys_futimesat(int dfd, char __user *filename, struct timeval __user *utimes)
 {
 	struct timeval times[2];
 
 	if (utimes && copy_from_user(&times, utimes, sizeof(times)))
 		return -EFAULT;
-	return do_utimes(filename, utimes ? times : NULL);
+	return do_utimes(dfd, filename, utimes ? times : NULL);
+}
+
+asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes)
+{
+	return sys_futimesat(AT_FDCWD, filename, utimes);
 }
 
 
@@ -476,7 +482,7 @@ asmlinkage long sys_utimes(char __user * filename, struct timeval __user * utime
  * We do this by temporarily clearing all FS-related capabilities and
  * switching the fsuid/fsgid around to the real ones.
  */
-asmlinkage long sys_access(const char __user * filename, int mode)
+asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 {
 	struct nameidata nd;
 	int old_fsuid, old_fsgid;
@@ -506,7 +512,7 @@ asmlinkage long sys_access(const char __user * filename, int mode)
 	else
 		current->cap_effective = current->cap_permitted;
 
-	res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
+	res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
 	if (!res) {
 		res = vfs_permission(&nd, mode);
 		/* SuS v2 requires we report a read only fs too */
@@ -523,6 +529,11 @@ asmlinkage long sys_access(const char __user * filename, int mode)
 	return res;
 }
 
+asmlinkage long sys_access(const char __user *filename, int mode)
+{
+	return sys_faccessat(AT_FDCWD, filename, mode);
+}
+
 asmlinkage long sys_chdir(const char __user * filename)
 {
 	struct nameidata nd;
@@ -635,14 +646,15 @@ out:
 	return err;
 }
 
-asmlinkage long sys_chmod(const char __user * filename, mode_t mode)
+asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
+			     mode_t mode)
 {
 	struct nameidata nd;
 	struct inode * inode;
 	int error;
 	struct iattr newattrs;
 
-	error = user_path_walk(filename, &nd);
+	error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd);
 	if (error)
 		goto out;
 	inode = nd.dentry->d_inode;
@@ -669,6 +681,11 @@ out:
 	return error;
 }
 
+asmlinkage long sys_chmod(const char __user *filename, mode_t mode)
+{
+	return sys_fchmodat(AT_FDCWD, filename, mode);
+}
+
 static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
 {
 	struct inode * inode;
@@ -717,6 +734,26 @@ asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group)
 	return error;
 }
 
+asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
+			     gid_t group, int flag)
+{
+	struct nameidata nd;
+	int error = -EINVAL;
+	int follow;
+
+	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+		goto out;
+
+	follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+	error = __user_walk_fd(dfd, filename, follow, &nd);
+	if (!error) {
+		error = chown_common(nd.dentry, user, group);
+		path_release(&nd);
+	}
+out:
+	return error;
+}
+
 asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group)
 {
 	struct nameidata nd;
@@ -820,7 +857,8 @@ cleanup_file:
  * for the internal routines (ie open_namei()/follow_link() etc). 00 is
  * used by symlinks.
  */
-struct file *filp_open(const char * filename, int flags, int mode)
+static struct file *do_filp_open(int dfd, const char *filename, int flags,
+				 int mode)
 {
 	int namei_flags, error;
 	struct nameidata nd;
@@ -829,12 +867,17 @@ struct file *filp_open(const char * filename, int flags, int mode)
 	if ((namei_flags+1) & O_ACCMODE)
 		namei_flags++;
 
-	error = open_namei(filename, namei_flags, mode, &nd);
+	error = open_namei(dfd, filename, namei_flags, mode, &nd);
 	if (!error)
 		return nameidata_to_filp(&nd, flags);
 
 	return ERR_PTR(error);
 }
+
+struct file *filp_open(const char *filename, int flags, int mode)
+{
+	return do_filp_open(AT_FDCWD, filename, flags, mode);
+}
 EXPORT_SYMBOL(filp_open);
 
 /**
@@ -991,7 +1034,7 @@ void fastcall put_unused_fd(unsigned int fd)
 EXPORT_SYMBOL(put_unused_fd);
 
 /*
- * Install a file pointer in the fd array.  
+ * Install a file pointer in the fd array.
  *
  * The VFS is full of places where we drop the files lock between
  * setting the open_fds bitmap and installing the file in the file
@@ -1016,7 +1059,7 @@ void fastcall fd_install(unsigned int fd, struct file * file)
 
 EXPORT_SYMBOL(fd_install);
 
-long do_sys_open(const char __user *filename, int flags, int mode)
+long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
 {
 	char *tmp = getname(filename);
 	int fd = PTR_ERR(tmp);
@@ -1024,7 +1067,7 @@ long do_sys_open(const char __user *filename, int flags, int mode)
 	if (!IS_ERR(tmp)) {
 		fd = get_unused_fd();
 		if (fd >= 0) {
-			struct file *f = filp_open(tmp, flags, mode);
+			struct file *f = do_filp_open(dfd, tmp, flags, mode);
 			if (IS_ERR(f)) {
 				put_unused_fd(fd);
 				fd = PTR_ERR(f);
@@ -1043,10 +1086,20 @@ asmlinkage long sys_open(const char __user *filename, int flags, int mode)
 	if (force_o_largefile())
 		flags |= O_LARGEFILE;
 
-	return do_sys_open(filename, flags, mode);
+	return do_sys_open(AT_FDCWD, filename, flags, mode);
 }
 EXPORT_SYMBOL_GPL(sys_open);
 
+asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
+			   int mode)
+{
+	if (force_o_largefile())
+		flags |= O_LARGEFILE;
+
+	return do_sys_open(dfd, filename, flags, mode);
+}
+EXPORT_SYMBOL_GPL(sys_openat);
+
 #ifndef __alpha__
 
 /*
diff --git a/fs/stat.c b/fs/stat.c
index b8a0e5110ab2..24211b030f39 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -63,12 +63,12 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 
 EXPORT_SYMBOL(vfs_getattr);
 
-int vfs_stat(char __user *name, struct kstat *stat)
+int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat)
 {
 	struct nameidata nd;
 	int error;
 
-	error = user_path_walk(name, &nd);
+	error = __user_walk_fd(dfd, name, LOOKUP_FOLLOW, &nd);
 	if (!error) {
 		error = vfs_getattr(nd.mnt, nd.dentry, stat);
 		path_release(&nd);
@@ -76,14 +76,19 @@ int vfs_stat(char __user *name, struct kstat *stat)
 	return error;
 }
 
+int vfs_stat(char __user *name, struct kstat *stat)
+{
+	return vfs_stat_fd(AT_FDCWD, name, stat);
+}
+
 EXPORT_SYMBOL(vfs_stat);
 
-int vfs_lstat(char __user *name, struct kstat *stat)
+int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat)
 {
 	struct nameidata nd;
 	int error;
 
-	error = user_path_walk_link(name, &nd);
+	error = __user_walk_fd(dfd, name, 0, &nd);
 	if (!error) {
 		error = vfs_getattr(nd.mnt, nd.dentry, stat);
 		path_release(&nd);
@@ -91,6 +96,11 @@ int vfs_lstat(char __user *name, struct kstat *stat)
 	return error;
 }
 
+int vfs_lstat(char __user *name, struct kstat *stat)
+{
+	return vfs_lstat_fd(AT_FDCWD, name, stat);
+}
+
 EXPORT_SYMBOL(vfs_lstat);
 
 int vfs_fstat(unsigned int fd, struct kstat *stat)
@@ -151,7 +161,7 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
 asmlinkage long sys_stat(char __user * filename, struct __old_kernel_stat __user * statbuf)
 {
 	struct kstat stat;
-	int error = vfs_stat(filename, &stat);
+	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
 
 	if (!error)
 		error = cp_old_stat(&stat, statbuf);
@@ -161,7 +171,7 @@ asmlinkage long sys_stat(char __user * filename, struct __old_kernel_stat __user
 asmlinkage long sys_lstat(char __user * filename, struct __old_kernel_stat __user * statbuf)
 {
 	struct kstat stat;
-	int error = vfs_lstat(filename, &stat);
+	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
 
 	if (!error)
 		error = cp_old_stat(&stat, statbuf);
@@ -229,27 +239,50 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-asmlinkage long sys_newstat(char __user * filename, struct stat __user * statbuf)
+asmlinkage long sys_newstat(char __user *filename, struct stat __user *statbuf)
 {
 	struct kstat stat;
-	int error = vfs_stat(filename, &stat);
+	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
 
 	if (!error)
 		error = cp_new_stat(&stat, statbuf);
 
 	return error;
 }
-asmlinkage long sys_newlstat(char __user * filename, struct stat __user * statbuf)
+
+asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf)
 {
 	struct kstat stat;
-	int error = vfs_lstat(filename, &stat);
+	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
 
 	if (!error)
 		error = cp_new_stat(&stat, statbuf);
 
 	return error;
 }
-asmlinkage long sys_newfstat(unsigned int fd, struct stat __user * statbuf)
+
+asmlinkage long sys_newfstatat(int dfd, char __user *filename,
+				struct stat __user *statbuf, int flag)
+{
+	struct kstat stat;
+	int error = -EINVAL;
+
+	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+		goto out;
+
+	if (flag & AT_SYMLINK_NOFOLLOW)
+		error = vfs_lstat_fd(dfd, filename, &stat);
+	else
+		error = vfs_stat_fd(dfd, filename, &stat);
+
+	if (!error)
+		error = cp_new_stat(&stat, statbuf);
+
+out:
+	return error;
+}
+
+asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf)
 {
 	struct kstat stat;
 	int error = vfs_fstat(fd, &stat);
@@ -260,7 +293,8 @@ asmlinkage long sys_newfstat(unsigned int fd, struct stat __user * statbuf)
 	return error;
 }
 
-asmlinkage long sys_readlink(const char __user * path, char __user * buf, int bufsiz)
+asmlinkage long sys_readlinkat(int dfd, const char __user *path,
+				char __user *buf, int bufsiz)
 {
 	struct nameidata nd;
 	int error;
@@ -268,7 +302,7 @@ asmlinkage long sys_readlink(const char __user * path, char __user * buf, int bu
 	if (bufsiz <= 0)
 		return -EINVAL;
 
-	error = user_path_walk_link(path, &nd);
+	error = __user_walk_fd(dfd, path, 0, &nd);
 	if (!error) {
 		struct inode * inode = nd.dentry->d_inode;
 
@@ -285,6 +319,12 @@ asmlinkage long sys_readlink(const char __user * path, char __user * buf, int bu
 	return error;
 }
 
+asmlinkage long sys_readlink(const char __user *path, char __user *buf,
+				int bufsiz)
+{
+	return sys_readlinkat(AT_FDCWD, path, buf, bufsiz);
+}
+
 
 /* ---------- LFS-64 ----------- */
 #ifdef __ARCH_WANT_STAT64
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index 8a7c82151de9..c52a63755fdd 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -23,6 +23,13 @@
 #define DN_ATTRIB	0x00000020	/* File changed attibutes */
 #define DN_MULTISHOT	0x80000000	/* Don't remove notifier */
 
+#define AT_FDCWD		-100    /* Special value used to indicate
+                                           openat should use the current
+                                           working directory. */
+#define AT_SYMLINK_NOFOLLOW	0x100   /* Do not follow symbolic links.  */
+#define AT_REMOVEDIR		0x200   /* Remove directory instead of
+                                           unlinking file.  */
+
 #ifdef __KERNEL__
 
 #ifndef force_o_largefile
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b77f2608eef9..84bb449b9b01 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1340,7 +1340,8 @@ static inline int break_lease(struct inode *inode, unsigned int mode)
 
 extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
 		       struct file *filp);
-extern long do_sys_open(const char __user *filename, int flags, int mode);
+extern long do_sys_open(int fdf, const char __user *filename, int flags,
+			int mode);
 extern struct file *filp_open(const char *, int, int);
 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
 extern int filp_close(struct file *, fl_owner_t id);
@@ -1479,7 +1480,7 @@ static inline void allow_write_access(struct file *file)
 }
 extern int do_pipe(int *);
 
-extern int open_namei(const char *, int, int, struct nameidata *);
+extern int open_namei(int dfd, const char *, int, int, struct nameidata *);
 extern int may_open(struct nameidata *, int, int);
 
 extern int kernel_read(struct file *, unsigned long, char *, unsigned long);
@@ -1677,6 +1678,8 @@ extern int vfs_readdir(struct file *, filldir_t, void *);
 
 extern int vfs_stat(char __user *, struct kstat *);
 extern int vfs_lstat(char __user *, struct kstat *);
+extern int vfs_stat_fd(int dfd, char __user *, struct kstat *);
+extern int vfs_lstat_fd(int dfd, char __user *, struct kstat *);
 extern int vfs_fstat(unsigned int, struct kstat *);
 
 extern int vfs_ioctl(struct file *, unsigned int, unsigned int, unsigned long);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index b699e427c00c..e6698013e4d0 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -56,10 +56,11 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_ACCESS		(0x0400)
 
 extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *));
+extern int FASTCALL(__user_walk_fd(int dfd, const char __user *, unsigned, struct nameidata *));
 #define user_path_walk(name,nd) \
-	__user_walk(name, LOOKUP_FOLLOW, nd)
+	__user_walk_fd(AT_FDCWD, name, LOOKUP_FOLLOW, nd)
 #define user_path_walk_link(name,nd) \
-	__user_walk(name, 0, nd)
+	__user_walk_fd(AT_FDCWD, name, 0, nd)
 extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *));
 extern int FASTCALL(path_walk(const char *, struct nameidata *));
 extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
@@ -67,7 +68,7 @@ extern void path_release(struct nameidata *);
 extern void path_release_on_umount(struct nameidata *);
 
 extern int __user_path_lookup_open(const char __user *, unsigned lookup_flags, struct nameidata *nd, int open_flags);
-extern int path_lookup_open(const char *, unsigned lookup_flags, struct nameidata *, int open_flags);
+extern int path_lookup_open(int dfd, const char *name, unsigned lookup_flags, struct nameidata *, int open_flags);
 extern struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry,
 		int (*open)(struct inode *, struct file *));
 extern struct file *nameidata_to_filp(struct nameidata *nd, int flags);
diff --git a/include/linux/time.h b/include/linux/time.h
index f2aca7ec6325..614dd8465839 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -74,7 +74,7 @@ extern void do_gettimeofday(struct timeval *tv);
 extern int do_settimeofday(struct timespec *tv);
 extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz);
 #define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts)
-extern long do_utimes(char __user *filename, struct timeval *times);
+extern long do_utimes(int dfd, char __user *filename, struct timeval *times);
 struct itimerval;
 extern int do_setitimer(int which, struct itimerval *value,
 			struct itimerval *ovalue);
-- 
cgit v1.2.3-71-gd317


From 150256d8aadb3a337c31efa9e175cbd25bf06b06 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 18 Jan 2006 17:43:57 -0800
Subject: [PATCH] Generic sys_rt_sigsuspend()

The TIF_RESTORE_SIGMASK flag allows us to have a generic implementation of
sys_rt_sigsuspend() instead of duplicating it for each architecture.  This
provides such an implementation and makes arch/powerpc use it.

It also tidies up the ppc32 sys_sigsuspend() to use TIF_RESTORE_SIGMASK.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/powerpc/kernel/signal_32.c | 56 ++++-------------------------------------
 arch/powerpc/kernel/signal_64.c | 36 --------------------------
 include/asm-powerpc/unistd.h    |  2 ++
 include/linux/sched.h           |  1 +
 kernel/compat.c                 | 28 +++++++++++++++++++++
 kernel/signal.c                 | 26 +++++++++++++++++++
 6 files changed, 62 insertions(+), 87 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 177bba78fb0b..7f0d5ce2567d 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -252,8 +252,7 @@ int do_signal(sigset_t *oldset, struct pt_regs *regs);
 /*
  * Atomically swap in the new signal mask, and wait for a signal.
  */
-long sys_sigsuspend(old_sigset_t mask, int p2, int p3, int p4, int p6, int p7,
-	       struct pt_regs *regs)
+long sys_sigsuspend(old_sigset_t mask)
 {
 	sigset_t saveset;
 
@@ -264,55 +263,10 @@ long sys_sigsuspend(old_sigset_t mask, int p2, int p3, int p4, int p6, int p7,
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 
-	regs->result = -EINTR;
-	regs->gpr[3] = EINTR;
-	regs->ccr |= 0x10000000;
-	while (1) {
-		current->state = TASK_INTERRUPTIBLE;
-		schedule();
-		if (do_signal(&saveset, regs)) {
-			set_thread_flag(TIF_RESTOREALL);
-			return 0;
-		}
-	}
-}
-
-long sys_rt_sigsuspend(
-#ifdef CONFIG_PPC64
-		compat_sigset_t __user *unewset,
-#else
-		sigset_t __user *unewset,
-#endif
-		size_t sigsetsize, int p3, int p4,
-		int p6, int p7, struct pt_regs *regs)
-{
-	sigset_t saveset, newset;
-
-	/* XXX: Don't preclude handling different sized sigset_t's.  */
-	if (sigsetsize != sizeof(sigset_t))
-		return -EINVAL;
-
-	if (get_sigset_t(&newset, unewset))
-		return -EFAULT;
-	sigdelsetmask(&newset, ~_BLOCKABLE);
-
-	spin_lock_irq(&current->sighand->siglock);
-	saveset = current->blocked;
-	current->blocked = newset;
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-
-	regs->result = -EINTR;
-	regs->gpr[3] = EINTR;
-	regs->ccr |= 0x10000000;
-	while (1) {
-		current->state = TASK_INTERRUPTIBLE;
-		schedule();
-		if (do_signal(&saveset, regs)) {
-			set_thread_flag(TIF_RESTOREALL);
-			return 0;
-		}
-	}
+ 	current->state = TASK_INTERRUPTIBLE;
+ 	schedule();
+ 	set_thread_flag(TIF_RESTORE_SIGMASK);
+ 	return -ERESTARTNOHAND;
 }
 
 #ifdef CONFIG_PPC32
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 7b9d999e2115..a4a6812e815e 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -67,42 +67,6 @@ struct rt_sigframe {
 	char abigap[288];
 } __attribute__ ((aligned (16)));
 
-
-/*
- * Atomically swap in the new signal mask, and wait for a signal.
- */
-long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, int p3, int p4,
-		       int p6, int p7, struct pt_regs *regs)
-{
-	sigset_t saveset, newset;
-
-	/* XXX: Don't preclude handling different sized sigset_t's.  */
-	if (sigsetsize != sizeof(sigset_t))
-		return -EINVAL;
-
-	if (copy_from_user(&newset, unewset, sizeof(newset)))
-		return -EFAULT;
-	sigdelsetmask(&newset, ~_BLOCKABLE);
-
-	spin_lock_irq(&current->sighand->siglock);
-	saveset = current->blocked;
-	current->blocked = newset;
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-
-	regs->result = -EINTR;
-	regs->gpr[3] = EINTR;
-	regs->ccr |= 0x10000000;
-	while (1) {
-		current->state = TASK_INTERRUPTIBLE;
-		schedule();
-		if (do_signal(&saveset, regs)) {
-			set_thread_flag(TIF_RESTOREALL);
-			return 0;
-		}
-	}
-}
-
 long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, unsigned long r5,
 		     unsigned long r6, unsigned long r7, unsigned long r8,
 		     struct pt_regs *regs)
diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h
index 19eaac3fbbf9..76daec496062 100644
--- a/include/asm-powerpc/unistd.h
+++ b/include/asm-powerpc/unistd.h
@@ -444,11 +444,13 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6
 #define __ARCH_WANT_SYS_SIGPENDING
 #define __ARCH_WANT_SYS_SIGPROCMASK
 #define __ARCH_WANT_SYS_RT_SIGACTION
+#define __ARCH_WANT_SYS_RT_SIGSUSPEND
 #ifdef CONFIG_PPC32
 #define __ARCH_WANT_OLD_STAT
 #endif
 #ifdef CONFIG_PPC64
 #define __ARCH_WANT_COMPAT_SYS_TIME
+#define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
 #endif
 
 /*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2df1a1a2fee5..0cfcd1c7865e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -809,6 +809,7 @@ struct task_struct {
 	struct sighand_struct *sighand;
 
 	sigset_t blocked, real_blocked;
+	sigset_t saved_sigmask;		/* To be restored with TIF_RESTORE_SIGMASK */
 	struct sigpending pending;
 
 	unsigned long sas_ss_sp;
diff --git a/kernel/compat.c b/kernel/compat.c
index 256e5d9f0647..1867290c37e3 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -871,3 +871,31 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
 }
 
 #endif /* __ARCH_WANT_COMPAT_SYS_TIME */
+
+#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
+asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
+{
+	sigset_t newset;
+	compat_sigset_t newset32;
+
+	/* XXX: Don't preclude handling different sized sigset_t's.  */
+	if (sigsetsize != sizeof(sigset_t))
+		return -EINVAL;
+
+	if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
+		return -EFAULT;
+	sigset_from_compat(&newset, &newset32);
+	sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+	spin_lock_irq(&current->sighand->siglock);
+	current->saved_sigmask = current->blocked;
+	current->blocked = newset;
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+
+	current->state = TASK_INTERRUPTIBLE;
+	schedule();
+	set_thread_flag(TIF_RESTORE_SIGMASK);
+	return -ERESTARTNOHAND;
+}
+#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
diff --git a/kernel/signal.c b/kernel/signal.c
index 5dafbd36d62e..d3efafd8109a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2721,6 +2721,32 @@ sys_pause(void)
 
 #endif
 
+#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
+asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
+{
+	sigset_t newset;
+
+	/* XXX: Don't preclude handling different sized sigset_t's.  */
+	if (sigsetsize != sizeof(sigset_t))
+		return -EINVAL;
+
+	if (copy_from_user(&newset, unewset, sizeof(newset)))
+		return -EFAULT;
+	sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+	spin_lock_irq(&current->sighand->siglock);
+	current->saved_sigmask = current->blocked;
+	current->blocked = newset;
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+
+	current->state = TASK_INTERRUPTIBLE;
+	schedule();
+	set_thread_flag(TIF_RESTORE_SIGMASK);
+	return -ERESTARTNOHAND;
+}
+#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
+
 void __init signals_init(void)
 {
 	sigqueue_cachep =
-- 
cgit v1.2.3-71-gd317


From 9f72949f679df06021c9e43886c9191494fdb007 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 18 Jan 2006 17:44:05 -0800
Subject: [PATCH] Add pselect/ppoll system call implementation

The following implementation of ppoll() and pselect() system calls
depends on the architecture providing a TIF_RESTORE_SIGMASK flag in the
thread_info.

These system calls have to change the signal mask during their
operation, and signal handlers must be invoked using the new, temporary
signal mask. The old signal mask must be restored either upon successful
exit from the system call, or upon returning from the invoked signal
handler if the system call is interrupted. We can't simply restore the
original signal mask and return to userspace, since the restored signal
mask may actually block the signal which interrupted the system call.

The TIF_RESTORE_SIGMASK flag deals with this by causing the syscall exit
path to trap into do_signal() just as TIF_SIGPENDING does, and by
causing do_signal() to use the saved signal mask instead of the current
signal mask when setting up the stack frame for the signal handler -- or
by causing do_signal() to simply restore the saved signal mask in the
case where there is no handler to be invoked.

The first patch implements the sys_pselect() and sys_ppoll() system
calls, which are present only if TIF_RESTORE_SIGMASK is defined. That
#ifdef should go away in time when all architectures have implemented
it. The second patch implements TIF_RESTORE_SIGMASK for the PowerPC
kernel (in the -mm tree), and the third patch then removes the
arch-specific implementations of sys_rt_sigsuspend() and replaces them
with generic versions using the same trick.

The fourth and fifth patches, provided by David Howells, implement
TIF_RESTORE_SIGMASK for FR-V and i386 respectively, and the sixth patch
adds the syscalls to the i386 syscall table.

This patch:

Add the pselect() and ppoll() system calls, providing core routines usable by
the original select() and poll() system calls and also the new calls (with
their semantics w.r.t timeouts).

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat.c          | 260 ++++++++++++++++++++++++++++++++------
 fs/select.c          | 348 ++++++++++++++++++++++++++++++++++++++++++---------
 include/linux/poll.h |   6 +-
 3 files changed, 519 insertions(+), 95 deletions(-)

(limited to 'include/linux')

diff --git a/fs/compat.c b/fs/compat.c
index c6ba9deabada..18b21b4c9e3a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -53,6 +53,8 @@
 #include <asm/mmu_context.h>
 #include <asm/ioctls.h>
 
+extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
+
 /*
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
@@ -1657,36 +1659,14 @@ static void select_bits_free(void *bits, int size)
 #define MAX_SELECT_SECONDS \
 	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
 
-asmlinkage long
-compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp,
-		compat_ulong_t __user *exp, struct compat_timeval __user *tvp)
+int compat_core_sys_select(int n, compat_ulong_t __user *inp,
+	compat_ulong_t __user *outp, compat_ulong_t __user *exp, s64 *timeout)
 {
 	fd_set_bits fds;
 	char *bits;
-	long timeout;
 	int size, max_fdset, ret = -EINVAL;
 	struct fdtable *fdt;
 
-	timeout = MAX_SCHEDULE_TIMEOUT;
-	if (tvp) {
-		time_t sec, usec;
-
-		if (!access_ok(VERIFY_READ, tvp, sizeof(*tvp))
-		    || __get_user(sec, &tvp->tv_sec)
-		    || __get_user(usec, &tvp->tv_usec)) {
-			ret = -EFAULT;
-			goto out_nofds;
-		}
-
-		if (sec < 0 || usec < 0)
-			goto out_nofds;
-
-		if ((unsigned long) sec < MAX_SELECT_SECONDS) {
-			timeout = ROUND_UP(usec, 1000000/HZ);
-			timeout += sec * (unsigned long) HZ;
-		}
-	}
-
 	if (n < 0)
 		goto out_nofds;
 
@@ -1723,19 +1703,7 @@ compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp
 	zero_fd_set(n, fds.res_out);
 	zero_fd_set(n, fds.res_ex);
 
-	ret = do_select(n, &fds, &timeout);
-
-	if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
-		time_t sec = 0, usec = 0;
-		if (timeout) {
-			sec = timeout / HZ;
-			usec = timeout % HZ;
-			usec *= (1000000/HZ);
-		}
-		if (put_user(sec, &tvp->tv_sec) ||
-		    put_user(usec, &tvp->tv_usec))
-			ret = -EFAULT;
-	}
+	ret = do_select(n, &fds, timeout);
 
 	if (ret < 0)
 		goto out;
@@ -1756,6 +1724,224 @@ out_nofds:
 	return ret;
 }
 
+asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
+	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+	struct compat_timeval __user *tvp)
+{
+	s64 timeout = -1;
+	struct compat_timeval tv;
+	int ret;
+
+	if (tvp) {
+		if (copy_from_user(&tv, tvp, sizeof(tv)))
+			return -EFAULT;
+
+		if (tv.tv_sec < 0 || tv.tv_usec < 0)
+			return -EINVAL;
+
+		/* Cast to u64 to make GCC stop complaining */
+		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
+			timeout = -1;	/* infinite */
+		else {
+			timeout = ROUND_UP(tv.tv_sec, 1000000/HZ);
+			timeout += tv.tv_sec * HZ;
+		}
+	}
+
+	ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
+
+	if (tvp) {
+		if (current->personality & STICKY_TIMEOUTS)
+			goto sticky;
+		tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
+		tv.tv_sec = timeout;
+		if (copy_to_user(tvp, &tv, sizeof(tv))) {
+sticky:
+			/*
+			 * If an application puts its timeval in read-only
+			 * memory, we don't want the Linux-specific update to
+			 * the timeval to cause a fault after the select has
+			 * completed successfully. However, because we're not
+			 * updating the timeval, we can't restart the system
+			 * call.
+			 */
+			if (ret == -ERESTARTNOHAND)
+				ret = -EINTR;
+		}
+	}
+
+	return ret;
+}
+
+#ifdef TIF_RESTORE_SIGMASK
+asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
+	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+	struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
+	compat_size_t sigsetsize)
+{
+	compat_sigset_t ss32;
+	sigset_t ksigmask, sigsaved;
+	long timeout = MAX_SCHEDULE_TIMEOUT;
+	struct compat_timespec ts;
+	int ret;
+
+	if (tsp) {
+		if (copy_from_user(&ts, tsp, sizeof(ts)))
+			return -EFAULT;
+
+		if (ts.tv_sec < 0 || ts.tv_nsec < 0)
+			return -EINVAL;
+	}
+
+	if (sigmask) {
+		if (sigsetsize != sizeof(compat_sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+			return -EFAULT;
+		sigset_from_compat(&ksigmask, &ss32);
+
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	do {
+		if (tsp) {
+			if ((unsigned long)ts.tv_sec < MAX_SELECT_SECONDS) {
+				timeout = ROUND_UP(ts.tv_nsec, 1000000000/HZ);
+				timeout += ts.tv_sec * (unsigned long)HZ;
+				ts.tv_sec = 0;
+				ts.tv_nsec = 0;
+			} else {
+				ts.tv_sec -= MAX_SELECT_SECONDS;
+				timeout = MAX_SELECT_SECONDS * HZ;
+			}
+		}
+
+		ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
+
+	} while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec));
+
+	if (tsp && !(current->personality & STICKY_TIMEOUTS)) {
+		ts.tv_sec += timeout / HZ;
+		ts.tv_nsec += (timeout % HZ) * (1000000000/HZ);
+		if (ts.tv_nsec >= 1000000000) {
+			ts.tv_sec++;
+			ts.tv_nsec -= 1000000000;
+		}
+		(void)copy_to_user(tsp, &ts, sizeof(ts));
+	}
+
+	if (ret == -ERESTARTNOHAND) {
+		/*
+		 * Don't restore the signal mask yet. Let do_signal() deliver
+		 * the signal on the way back to userspace, before the signal
+		 * mask is restored.
+		 */
+		if (sigmask) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+					sizeof(sigsaved));
+			set_thread_flag(TIF_RESTORE_SIGMASK);
+		}
+	} else if (sigmask)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	return ret;
+}
+
+asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
+	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+	struct compat_timespec __user *tsp, void __user *sig)
+{
+	compat_size_t sigsetsize = 0;
+	compat_uptr_t up = 0;
+
+	if (sig) {
+		if (!access_ok(VERIFY_READ, sig,
+				sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
+		    	__get_user(up, (compat_uptr_t __user *)sig) ||
+		    	__get_user(sigsetsize,
+				(compat_size_t __user *)(sig+sizeof(up))))
+			return -EFAULT;
+	}
+	return compat_sys_pselect7(n, inp, outp, exp, tsp, compat_ptr(up),
+					sigsetsize);
+}
+
+asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
+	unsigned int nfds, struct compat_timespec __user *tsp,
+	const compat_sigset_t __user *sigmask, compat_size_t sigsetsize)
+{
+	compat_sigset_t ss32;
+	sigset_t ksigmask, sigsaved;
+	struct compat_timespec ts;
+	s64 timeout = -1;
+	int ret;
+
+	if (tsp) {
+		if (copy_from_user(&ts, tsp, sizeof(ts)))
+			return -EFAULT;
+
+		/* We assume that ts.tv_sec is always lower than
+		   the number of seconds that can be expressed in
+		   an s64. Otherwise the compiler bitches at us */
+		timeout = ROUND_UP(ts.tv_sec, 1000000000/HZ);
+		timeout += ts.tv_sec * HZ;
+	}
+
+	if (sigmask) {
+		if (sigsetsize |= sizeof(compat_sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+			return -EFAULT;
+		sigset_from_compat(&ksigmask, &ss32);
+
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	ret = do_sys_poll(ufds, nfds, &timeout);
+
+	/* We can restart this syscall, usually */
+	if (ret == -EINTR) {
+		/*
+		 * Don't restore the signal mask yet. Let do_signal() deliver
+		 * the signal on the way back to userspace, before the signal
+		 * mask is restored.
+		 */
+		if (sigmask) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+				sizeof(sigsaved));
+			set_thread_flag(TIF_RESTORE_SIGMASK);
+		}
+		ret = -ERESTARTNOHAND;
+	} else if (sigmask)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	if (tsp && timeout >= 0) {
+		if (current->personality & STICKY_TIMEOUTS)
+			goto sticky;
+		/* Yes, we know it's actually an s64, but it's also positive. */
+		ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000;
+		ts.tv_sec = timeout;
+		if (copy_to_user(tsp, &ts, sizeof(ts))) {
+sticky:
+			/*
+			 * If an application puts its timeval in read-only
+			 * memory, we don't want the Linux-specific update to
+			 * the timeval to cause a fault after the select has
+			 * completed successfully. However, because we're not
+			 * updating the timeval, we can't restart the system
+			 * call.
+			 */
+			if (ret == -ERESTARTNOHAND && timeout >= 0)
+				ret = -EINTR;
+		}
+	}
+
+	return ret;
+}
+#endif /* TIF_RESTORE_SIGMASK */
+
 #if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)
 /* Stuff for NFS server syscalls... */
 struct compat_nfsctl_svc {
diff --git a/fs/select.c b/fs/select.c
index f10a10317d54..c0f02d36c60e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -179,12 +179,11 @@ get_max:
 #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
 #define POLLEX_SET (POLLPRI)
 
-int do_select(int n, fd_set_bits *fds, long *timeout)
+int do_select(int n, fd_set_bits *fds, s64 *timeout)
 {
 	struct poll_wqueues table;
 	poll_table *wait;
 	int retval, i;
-	long __timeout = *timeout;
 
 	rcu_read_lock();
 	retval = max_select_fd(n, fds);
@@ -196,11 +195,12 @@ int do_select(int n, fd_set_bits *fds, long *timeout)
 
 	poll_initwait(&table);
 	wait = &table.pt;
-	if (!__timeout)
+	if (!*timeout)
 		wait = NULL;
 	retval = 0;
 	for (;;) {
 		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
+		long __timeout;
 
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -255,22 +255,32 @@ int do_select(int n, fd_set_bits *fds, long *timeout)
 				*rexp = res_ex;
 		}
 		wait = NULL;
-		if (retval || !__timeout || signal_pending(current))
+		if (retval || !*timeout || signal_pending(current))
 			break;
 		if(table.error) {
 			retval = table.error;
 			break;
 		}
+
+		if (*timeout < 0) {
+			/* Wait indefinitely */
+			__timeout = MAX_SCHEDULE_TIMEOUT;
+		} else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {
+			/* Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in a loop */
+			__timeout = MAX_SCHEDULE_TIMEOUT - 1;
+			*timeout -= __timeout;
+		} else {
+			__timeout = *timeout;
+			*timeout = 0;
+		}
 		__timeout = schedule_timeout(__timeout);
+		if (*timeout >= 0)
+			*timeout += __timeout;
 	}
 	__set_current_state(TASK_RUNNING);
 
 	poll_freewait(&table);
 
-	/*
-	 * Up-to-date the caller timeout.
-	 */
-	*timeout = __timeout;
 	return retval;
 }
 
@@ -295,36 +305,14 @@ static void select_bits_free(void *bits, int size)
 #define MAX_SELECT_SECONDS \
 	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
 
-asmlinkage long
-sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
+static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
+			   fd_set __user *exp, s64 *timeout)
 {
 	fd_set_bits fds;
 	char *bits;
-	long timeout;
 	int ret, size, max_fdset;
 	struct fdtable *fdt;
 
-	timeout = MAX_SCHEDULE_TIMEOUT;
-	if (tvp) {
-		time_t sec, usec;
-
-		if (!access_ok(VERIFY_READ, tvp, sizeof(*tvp))
-		    || __get_user(sec, &tvp->tv_sec)
-		    || __get_user(usec, &tvp->tv_usec)) {
-			ret = -EFAULT;
-			goto out_nofds;
-		}
-
-		ret = -EINVAL;
-		if (sec < 0 || usec < 0)
-			goto out_nofds;
-
-		if ((unsigned long) sec < MAX_SELECT_SECONDS) {
-			timeout = ROUND_UP(usec, 1000000/HZ);
-			timeout += sec * (unsigned long) HZ;
-		}
-	}
-
 	ret = -EINVAL;
 	if (n < 0)
 		goto out_nofds;
@@ -362,18 +350,7 @@ sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s
 	zero_fd_set(n, fds.res_out);
 	zero_fd_set(n, fds.res_ex);
 
-	ret = do_select(n, &fds, &timeout);
-
-	if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
-		time_t sec = 0, usec = 0;
-		if (timeout) {
-			sec = timeout / HZ;
-			usec = timeout % HZ;
-			usec *= (1000000/HZ);
-		}
-		put_user(sec, &tvp->tv_sec);
-		put_user(usec, &tvp->tv_usec);
-	}
+	ret = do_select(n, &fds, timeout);
 
 	if (ret < 0)
 		goto out;
@@ -395,6 +372,154 @@ out_nofds:
 	return ret;
 }
 
+asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
+			fd_set __user *exp, struct timeval __user *tvp)
+{
+	s64 timeout = -1;
+	struct timeval tv;
+	int ret;
+
+	if (tvp) {
+		if (copy_from_user(&tv, tvp, sizeof(tv)))
+			return -EFAULT;
+
+		if (tv.tv_sec < 0 || tv.tv_usec < 0)
+			return -EINVAL;
+
+		/* Cast to u64 to make GCC stop complaining */
+		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
+			timeout = -1;	/* infinite */
+		else {
+			timeout = ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
+			timeout += tv.tv_sec * HZ;
+		}
+	}
+
+	ret = core_sys_select(n, inp, outp, exp, &timeout);
+
+	if (tvp) {
+		if (current->personality & STICKY_TIMEOUTS)
+			goto sticky;
+		tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
+		tv.tv_sec = timeout;
+		if (copy_to_user(tvp, &tv, sizeof(tv))) {
+sticky:
+			/*
+			 * If an application puts its timeval in read-only
+			 * memory, we don't want the Linux-specific update to
+			 * the timeval to cause a fault after the select has
+			 * completed successfully. However, because we're not
+			 * updating the timeval, we can't restart the system
+			 * call.
+			 */
+			if (ret == -ERESTARTNOHAND)
+				ret = -EINTR;
+		}
+	}
+
+	return ret;
+}
+
+#ifdef TIF_RESTORE_SIGMASK
+asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
+		fd_set __user *exp, struct timespec __user *tsp,
+		const sigset_t __user *sigmask, size_t sigsetsize)
+{
+	s64 timeout = MAX_SCHEDULE_TIMEOUT;
+	sigset_t ksigmask, sigsaved;
+	struct timespec ts;
+	int ret;
+
+	if (tsp) {
+		if (copy_from_user(&ts, tsp, sizeof(ts)))
+			return -EFAULT;
+
+		if (ts.tv_sec < 0 || ts.tv_nsec < 0)
+			return -EINVAL;
+
+		/* Cast to u64 to make GCC stop complaining */
+		if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
+			timeout = -1;	/* infinite */
+		else {
+			timeout = ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
+			timeout += ts.tv_sec * HZ;
+		}
+	}
+
+	if (sigmask) {
+		/* XXX: Don't preclude handling different sized sigset_t's.  */
+		if (sigsetsize != sizeof(sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+			return -EFAULT;
+
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	ret = core_sys_select(n, inp, outp, exp, &timeout);
+
+	if (tsp) {
+		if (current->personality & STICKY_TIMEOUTS)
+			goto sticky;
+		ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000;
+		ts.tv_sec = timeout;
+		if (copy_to_user(tsp, &ts, sizeof(ts))) {
+sticky:
+			/*
+			 * If an application puts its timeval in read-only
+			 * memory, we don't want the Linux-specific update to
+			 * the timeval to cause a fault after the select has
+			 * completed successfully. However, because we're not
+			 * updating the timeval, we can't restart the system
+			 * call.
+			 */
+			if (ret == -ERESTARTNOHAND)
+				ret = -EINTR;
+		}
+	}
+
+	if (ret == -ERESTARTNOHAND) {
+		/*
+		 * Don't restore the signal mask yet. Let do_signal() deliver
+		 * the signal on the way back to userspace, before the signal
+		 * mask is restored.
+		 */
+		if (sigmask) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+					sizeof(sigsaved));
+			set_thread_flag(TIF_RESTORE_SIGMASK);
+		}
+	} else if (sigmask)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	return ret;
+}
+
+/*
+ * Most architectures can't handle 7-argument syscalls. So we provide a
+ * 6-argument version where the sixth argument is a pointer to a structure
+ * which has a pointer to the sigset_t itself followed by a size_t containing
+ * the sigset size.
+ */
+asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp,
+	fd_set __user *exp, struct timespec __user *tsp, void __user *sig)
+{
+	size_t sigsetsize = 0;
+	sigset_t __user *up = NULL;
+
+	if (sig) {
+		if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
+		    || __get_user(up, (sigset_t * __user *)sig)
+		    || __get_user(sigsetsize,
+				(size_t * __user)(sig+sizeof(void *))))
+			return -EFAULT;
+	}
+
+	return sys_pselect7(n, inp, outp, exp, tsp, up, sigsetsize);
+}
+#endif /* TIF_RESTORE_SIGMASK */
+
 struct poll_list {
 	struct poll_list *next;
 	int len;
@@ -436,16 +561,19 @@ static void do_pollfd(unsigned int num, struct pollfd * fdpage,
 }
 
 static int do_poll(unsigned int nfds,  struct poll_list *list,
-			struct poll_wqueues *wait, long timeout)
+		   struct poll_wqueues *wait, s64 *timeout)
 {
 	int count = 0;
 	poll_table* pt = &wait->pt;
 
-	if (!timeout)
+	/* Optimise the no-wait case */
+	if (!(*timeout))
 		pt = NULL;
  
 	for (;;) {
 		struct poll_list *walk;
+		long __timeout;
+
 		set_current_state(TASK_INTERRUPTIBLE);
 		walk = list;
 		while(walk != NULL) {
@@ -453,18 +581,36 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 			walk = walk->next;
 		}
 		pt = NULL;
-		if (count || !timeout || signal_pending(current))
+		if (count || !*timeout || signal_pending(current))
 			break;
 		count = wait->error;
 		if (count)
 			break;
-		timeout = schedule_timeout(timeout);
+
+		if (*timeout < 0) {
+			/* Wait indefinitely */
+			__timeout = MAX_SCHEDULE_TIMEOUT;
+		} else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT-1)) {
+			/*
+			 * Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in
+			 * a loop
+			 */
+			__timeout = MAX_SCHEDULE_TIMEOUT - 1;
+			*timeout -= __timeout;
+		} else {
+			__timeout = *timeout;
+			*timeout = 0;
+		}
+
+		__timeout = schedule_timeout(__timeout);
+		if (*timeout >= 0)
+			*timeout += __timeout;
 	}
 	__set_current_state(TASK_RUNNING);
 	return count;
 }
 
-asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long timeout)
+int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
 {
 	struct poll_wqueues table;
  	int fdcount, err;
@@ -482,14 +628,6 @@ asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long ti
 	if (nfds > max_fdset && nfds > OPEN_MAX)
 		return -EINVAL;
 
-	if (timeout) {
-		/* Careful about overflow in the intermediate values */
-		if ((unsigned long) timeout < MAX_SCHEDULE_TIMEOUT / HZ)
-			timeout = (unsigned long)(timeout*HZ+999)/1000+1;
-		else /* Negative or overflow */
-			timeout = MAX_SCHEDULE_TIMEOUT;
-	}
-
 	poll_initwait(&table);
 
 	head = NULL;
@@ -519,6 +657,7 @@ asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long ti
 		}
 		i -= pp->len;
 	}
+
 	fdcount = do_poll(nfds, head, &table, timeout);
 
 	/* OK, now copy the revents fields back to user space. */
@@ -547,3 +686,98 @@ out_fds:
 	poll_freewait(&table);
 	return err;
 }
+
+asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
+			long timeout_msecs)
+{
+	s64 timeout_jiffies = 0;
+
+	if (timeout_msecs) {
+#if HZ > 1000
+		/* We can only overflow if HZ > 1000 */
+		if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ)
+			timeout_jiffies = -1;
+		else
+#endif
+			timeout_jiffies = msecs_to_jiffies(timeout_msecs);
+	}
+
+	return do_sys_poll(ufds, nfds, &timeout_jiffies);
+}
+
+#ifdef TIF_RESTORE_SIGMASK
+asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
+	struct timespec __user *tsp, const sigset_t __user *sigmask,
+	size_t sigsetsize)
+{
+	sigset_t ksigmask, sigsaved;
+	struct timespec ts;
+	s64 timeout = -1;
+	int ret;
+
+	if (tsp) {
+		if (copy_from_user(&ts, tsp, sizeof(ts)))
+			return -EFAULT;
+
+		/* Cast to u64 to make GCC stop complaining */
+		if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
+			timeout = -1;	/* infinite */
+		else {
+			timeout = ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
+			timeout += ts.tv_sec * HZ;
+		}
+	}
+
+	if (sigmask) {
+		/* XXX: Don't preclude handling different sized sigset_t's.  */
+		if (sigsetsize != sizeof(sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+			return -EFAULT;
+
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	ret = do_sys_poll(ufds, nfds, &timeout);
+
+	/* We can restart this syscall, usually */
+	if (ret == -EINTR) {
+		/*
+		 * Don't restore the signal mask yet. Let do_signal() deliver
+		 * the signal on the way back to userspace, before the signal
+		 * mask is restored.
+		 */
+		if (sigmask) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+					sizeof(sigsaved));
+			set_thread_flag(TIF_RESTORE_SIGMASK);
+		}
+		ret = -ERESTARTNOHAND;
+	} else if (sigmask)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	if (tsp && timeout >= 0) {
+		if (current->personality & STICKY_TIMEOUTS)
+			goto sticky;
+		/* Yes, we know it's actually an s64, but it's also positive. */
+		ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000;
+		ts.tv_sec = timeout;
+		if (copy_to_user(tsp, &ts, sizeof(ts))) {
+		sticky:
+			/*
+			 * If an application puts its timeval in read-only
+			 * memory, we don't want the Linux-specific update to
+			 * the timeval to cause a fault after the select has
+			 * completed successfully. However, because we're not
+			 * updating the timeval, we can't restart the system
+			 * call.
+			 */
+			if (ret == -ERESTARTNOHAND && timeout >= 0)
+				ret = -EINTR;
+		}
+	}
+
+	return ret;
+}
+#endif /* TIF_RESTORE_SIGMASK */
diff --git a/include/linux/poll.h b/include/linux/poll.h
index f6da702088f4..8e8f6098508a 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -92,7 +92,11 @@ void zero_fd_set(unsigned long nr, unsigned long *fdset)
 	memset(fdset, 0, FDS_BYTES(nr));
 }
 
-extern int do_select(int n, fd_set_bits *fds, long *timeout);
+#define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)
+
+extern int do_select(int n, fd_set_bits *fds, s64 *timeout);
+extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds,
+		       s64 *timeout);
 
 #endif /* KERNEL */
 
-- 
cgit v1.2.3-71-gd317