From 58a7ce64426394a46e80cdc9440cc1e7c195e85d Mon Sep 17 00:00:00 2001
From: Krzysztof Halasa <khc@pm.waw.pl>
Date: Thu, 30 Mar 2006 17:01:53 +0200
Subject: [PATCH] Goramo PCI200SYN WAN driver subsystem ID patch

Goramo finally got PCI subsystem ID for their PCI200SYN card. The
attached patch adds support for it - cards with old EEPROM data
will emit a warning with URL for update tool.

Signed-off-by: Krzysztof Halasa <khc@pm.waw.pl>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 include/linux/pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 870fe38378b1..6a1ef8b635c5 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -930,6 +930,7 @@
 #define PCI_DEVICE_ID_PLX_DJINN_ITOO	0x1151
 #define PCI_DEVICE_ID_PLX_R753		0x1152
 #define PCI_DEVICE_ID_PLX_OLITEC	0x1187
+#define PCI_DEVICE_ID_PLX_PCI200SYN	0x3196
 #define PCI_DEVICE_ID_PLX_9050		0x9050
 #define PCI_DEVICE_ID_PLX_9080		0x9080
 #define PCI_DEVICE_ID_PLX_GTEK_SERIAL2	0xa001
-- 
cgit v1.2.3-71-gd317


From 3a720d726a6aa0a7cd9190f694587adf7bafdf4f Mon Sep 17 00:00:00 2001
From: Brice Goglin <brice@myri.com>
Date: Tue, 23 May 2006 06:10:01 -0400
Subject: [PATCH] Revive pci_find_ext_capability

This patch revives pci_find_ext_capability (has been disabled a couple month
ago since it was not used anywhere. See http://lkml.org/lkml/2006/1/20/247).
It will now be used by the myri10ge driver.

Signed-off-by: Brice Goglin <brice@myri.com>
Signed-off-by: Andrew J. Gallatin <gallatin@myri.com>

 drivers/pci/pci.c   |    3 +--
 include/linux/pci.h |    2 ++
 2 files changed, 3 insertions(+), 2 deletions(-)
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/pci/pci.c   | 3 +--
 include/linux/pci.h | 2 ++
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 2329f941a0dc..8d107c6c2c70 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -164,7 +164,6 @@ int pci_bus_find_capability(struct pci_bus *bus, unsigned int devfn, int cap)
 	return __pci_bus_find_cap(bus, devfn, hdr_type & 0x7f, cap);
 }
 
-#if 0
 /**
  * pci_find_ext_capability - Find an extended capability
  * @dev: PCI device to query
@@ -212,7 +211,7 @@ int pci_find_ext_capability(struct pci_dev *dev, int cap)
 
 	return 0;
 }
-#endif  /*  0  */
+EXPORT_SYMBOL_GPL(pci_find_ext_capability);
 
 /**
  * pci_find_parent_resource - return resource region of parent bus of given region
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 3a6a4e37a482..6fd36cb09160 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -442,6 +442,7 @@ struct pci_dev *pci_find_device_reverse (unsigned int vendor, unsigned int devic
 struct pci_dev *pci_find_slot (unsigned int bus, unsigned int devfn);
 int pci_find_capability (struct pci_dev *dev, int cap);
 int pci_find_next_capability (struct pci_dev *dev, u8 pos, int cap);
+int pci_find_ext_capability (struct pci_dev *dev, int cap);
 struct pci_bus * pci_find_next_bus(const struct pci_bus *from);
 
 struct pci_dev *pci_get_device (unsigned int vendor, unsigned int device, struct pci_dev *from);
@@ -662,6 +663,7 @@ static inline int pci_register_driver(struct pci_driver *drv) { return 0;}
 static inline void pci_unregister_driver(struct pci_driver *drv) { }
 static inline int pci_find_capability (struct pci_dev *dev, int cap) {return 0; }
 static inline int pci_find_next_capability (struct pci_dev *dev, u8 post, int cap) { return 0; }
+static inline int pci_find_ext_capability (struct pci_dev *dev, int cap) {return 0; }
 static inline const struct pci_device_id *pci_match_device(const struct pci_device_id *ids, const struct pci_dev *dev) { return NULL; }
 
 /* Power management related routines */
-- 
cgit v1.2.3-71-gd317


From 0da34b6dfe55810ae60db57e08e2af8a808c0a55 Mon Sep 17 00:00:00 2001
From: Brice Goglin <brice@myri.com>
Date: Tue, 23 May 2006 06:10:15 -0400
Subject: [PATCH] Add Myri-10G Ethernet driver

Signed-off-by: Brice Goglin <brice@myri.com>
Signed-off-by: Andrew J. Gallatin <gallatin@myri.com>

 drivers/net/Kconfig                            |   17
 drivers/net/Makefile                           |    1
 drivers/net/myri10ge/Makefile                  |    5
 drivers/net/myri10ge/myri10ge.c                | 2851 +++++++++++++++
 drivers/net/myri10ge/myri10ge_mcp.h            |  205 +
 drivers/net/myri10ge/myri10ge_mcp_gen_header.h |   58
 include/linux/pci_ids.h                        |    1
 7 files changed, 3138 insertions(+)
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/net/Kconfig                            |   17 +
 drivers/net/Makefile                           |    1 +
 drivers/net/myri10ge/Makefile                  |    5 +
 drivers/net/myri10ge/myri10ge.c                | 2851 ++++++++++++++++++++++++
 drivers/net/myri10ge/myri10ge_mcp.h            |  205 ++
 drivers/net/myri10ge/myri10ge_mcp_gen_header.h |   58 +
 include/linux/pci_ids.h                        |    1 +
 7 files changed, 3138 insertions(+)
 create mode 100644 drivers/net/myri10ge/Makefile
 create mode 100644 drivers/net/myri10ge/myri10ge.c
 create mode 100644 drivers/net/myri10ge/myri10ge_mcp.h
 create mode 100644 drivers/net/myri10ge/myri10ge_mcp_gen_header.h

(limited to 'include/linux')

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 68bc073b8b31..f499a3bc629f 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2327,6 +2327,23 @@ config S2IO_NAPI
 
 	  If in doubt, say N.
 
+config MYRI10GE
+	tristate "Myricom Myri-10G Ethernet support"
+	depends on PCI
+	select FW_LOADER
+	select CRC32
+	---help---
+	  This driver supports Myricom Myri-10G Dual Protocol interface in
+	  Ethernet mode. If the eeprom on your board is not recent enough,
+	  you will need a newer firmware image.
+	  You may get this image or more information, at:
+
+	  <http://www.myri.com/Myri-10G/>
+
+	  To compile this driver as a module, choose M here and read
+	  <file:Documentation/networking/net-modules.txt>.  The module
+	  will be called myri10ge.
+
 endmenu
 
 source "drivers/net/tokenring/Kconfig"
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index b01cc9a3cb10..1eced3287507 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -192,6 +192,7 @@ obj-$(CONFIG_R8169) += r8169.o
 obj-$(CONFIG_AMD8111_ETH) += amd8111e.o
 obj-$(CONFIG_IBMVETH) += ibmveth.o
 obj-$(CONFIG_S2IO) += s2io.o
+obj-$(CONFIG_MYRI10GE) += myri10ge/
 obj-$(CONFIG_SMC91X) += smc91x.o
 obj-$(CONFIG_SMC911X) += smc911x.o
 obj-$(CONFIG_DM9000) += dm9000.o
diff --git a/drivers/net/myri10ge/Makefile b/drivers/net/myri10ge/Makefile
new file mode 100644
index 000000000000..5df891647aee
--- /dev/null
+++ b/drivers/net/myri10ge/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the Myricom Myri-10G ethernet driver
+#
+
+obj-$(CONFIG_MYRI10GE) += myri10ge.o
diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
new file mode 100644
index 000000000000..87933cba7e22
--- /dev/null
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -0,0 +1,2851 @@
+/*************************************************************************
+ * myri10ge.c: Myricom Myri-10G Ethernet driver.
+ *
+ * Copyright (C) 2005, 2006 Myricom, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Myricom, Inc. nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *
+ * If the eeprom on your board is not recent enough, you will need to get a
+ * newer firmware image at:
+ *   http://www.myri.com/scs/download-Myri10GE.html
+ *
+ * Contact Information:
+ *   <help@myri.com>
+ *   Myricom, Inc., 325N Santa Anita Avenue, Arcadia, CA 91006
+ *************************************************************************/
+
+#include <linux/tcp.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/ethtool.h>
+#include <linux/firmware.h>
+#include <linux/delay.h>
+#include <linux/version.h>
+#include <linux/timer.h>
+#include <linux/vmalloc.h>
+#include <linux/crc32.h>
+#include <linux/moduleparam.h>
+#include <linux/io.h>
+#include <net/checksum.h>
+#include <asm/byteorder.h>
+#include <asm/io.h>
+#include <asm/pci.h>
+#include <asm/processor.h>
+#ifdef CONFIG_MTRR
+#include <asm/mtrr.h>
+#endif
+
+#include "myri10ge_mcp.h"
+#include "myri10ge_mcp_gen_header.h"
+
+#define MYRI10GE_VERSION_STR "0.9.0"
+
+MODULE_DESCRIPTION("Myricom 10G driver (10GbE)");
+MODULE_AUTHOR("Maintainer: help@myri.com");
+MODULE_VERSION(MYRI10GE_VERSION_STR);
+MODULE_LICENSE("Dual BSD/GPL");
+
+#define MYRI10GE_MAX_ETHER_MTU 9014
+
+#define MYRI10GE_ETH_STOPPED 0
+#define MYRI10GE_ETH_STOPPING 1
+#define MYRI10GE_ETH_STARTING 2
+#define MYRI10GE_ETH_RUNNING 3
+#define MYRI10GE_ETH_OPEN_FAILED 4
+
+#define MYRI10GE_EEPROM_STRINGS_SIZE 256
+#define MYRI10GE_MAX_SEND_DESC_TSO ((65536 / 2048) * 2)
+
+#define MYRI10GE_NO_CONFIRM_DATA 0xffffffff
+#define MYRI10GE_NO_RESPONSE_RESULT 0xffffffff
+
+struct myri10ge_rx_buffer_state {
+	struct sk_buff *skb;
+	 DECLARE_PCI_UNMAP_ADDR(bus)
+	 DECLARE_PCI_UNMAP_LEN(len)
+};
+
+struct myri10ge_tx_buffer_state {
+	struct sk_buff *skb;
+	int last;
+	 DECLARE_PCI_UNMAP_ADDR(bus)
+	 DECLARE_PCI_UNMAP_LEN(len)
+};
+
+struct myri10ge_cmd {
+	u32 data0;
+	u32 data1;
+	u32 data2;
+};
+
+struct myri10ge_rx_buf {
+	struct mcp_kreq_ether_recv __iomem *lanai;	/* lanai ptr for recv ring */
+	u8 __iomem *wc_fifo;	/* w/c rx dma addr fifo address */
+	struct mcp_kreq_ether_recv *shadow;	/* host shadow of recv ring */
+	struct myri10ge_rx_buffer_state *info;
+	int cnt;
+	int alloc_fail;
+	int mask;		/* number of rx slots -1 */
+};
+
+struct myri10ge_tx_buf {
+	struct mcp_kreq_ether_send __iomem *lanai;	/* lanai ptr for sendq */
+	u8 __iomem *wc_fifo;	/* w/c send fifo address */
+	struct mcp_kreq_ether_send *req_list;	/* host shadow of sendq */
+	char *req_bytes;
+	struct myri10ge_tx_buffer_state *info;
+	int mask;		/* number of transmit slots -1  */
+	int boundary;		/* boundary transmits cannot cross */
+	int req ____cacheline_aligned;	/* transmit slots submitted     */
+	int pkt_start;		/* packets started */
+	int done ____cacheline_aligned;	/* transmit slots completed     */
+	int pkt_done;		/* packets completed */
+};
+
+struct myri10ge_rx_done {
+	struct mcp_slot *entry;
+	dma_addr_t bus;
+	int cnt;
+	int idx;
+};
+
+struct myri10ge_priv {
+	int running;		/* running?             */
+	int csum_flag;		/* rx_csums?            */
+	struct myri10ge_tx_buf tx;	/* transmit ring        */
+	struct myri10ge_rx_buf rx_small;
+	struct myri10ge_rx_buf rx_big;
+	struct myri10ge_rx_done rx_done;
+	int small_bytes;
+	struct net_device *dev;
+	struct net_device_stats stats;
+	u8 __iomem *sram;
+	int sram_size;
+	unsigned long board_span;
+	unsigned long iomem_base;
+	u32 __iomem *irq_claim;
+	u32 __iomem *irq_deassert;
+	char *mac_addr_string;
+	struct mcp_cmd_response *cmd;
+	dma_addr_t cmd_bus;
+	struct mcp_irq_data *fw_stats;
+	dma_addr_t fw_stats_bus;
+	struct pci_dev *pdev;
+	int msi_enabled;
+	unsigned int link_state;
+	unsigned int rdma_tags_available;
+	int intr_coal_delay;
+	u32 __iomem *intr_coal_delay_ptr;
+	int mtrr;
+	int wake_queue;
+	int stop_queue;
+	int down_cnt;
+	wait_queue_head_t down_wq;
+	struct work_struct watchdog_work;
+	struct timer_list watchdog_timer;
+	int watchdog_tx_done;
+	int watchdog_resets;
+	int tx_linearized;
+	int pause;
+	char *fw_name;
+	char eeprom_strings[MYRI10GE_EEPROM_STRINGS_SIZE];
+	char fw_version[128];
+	u8 mac_addr[6];		/* eeprom mac address */
+	unsigned long serial_number;
+	int vendor_specific_offset;
+	u32 devctl;
+	u16 msi_flags;
+	u32 pm_state[16];
+	u32 read_dma;
+	u32 write_dma;
+	u32 read_write_dma;
+};
+
+static char *myri10ge_fw_unaligned = "myri10ge_ethp_z8e.dat";
+static char *myri10ge_fw_aligned = "myri10ge_eth_z8e.dat";
+
+static char *myri10ge_fw_name = NULL;
+module_param(myri10ge_fw_name, charp, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(myri10ge_fw_name, "Firmware image name\n");
+
+static int myri10ge_ecrc_enable = 1;
+module_param(myri10ge_ecrc_enable, int, S_IRUGO);
+MODULE_PARM_DESC(myri10ge_ecrc_enable, "Enable Extended CRC on PCI-E\n");
+
+static int myri10ge_max_intr_slots = 1024;
+module_param(myri10ge_max_intr_slots, int, S_IRUGO);
+MODULE_PARM_DESC(myri10ge_max_intr_slots, "Interrupt queue slots\n");
+
+static int myri10ge_small_bytes = -1;	/* -1 == auto */
+module_param(myri10ge_small_bytes, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(myri10ge_small_bytes, "Threshold of small packets\n");
+
+static int myri10ge_msi = 1;	/* enable msi by default */
+module_param(myri10ge_msi, int, S_IRUGO);
+MODULE_PARM_DESC(myri10ge_msi, "Enable Message Signalled Interrupts\n");
+
+static int myri10ge_intr_coal_delay = 25;
+module_param(myri10ge_intr_coal_delay, int, S_IRUGO);
+MODULE_PARM_DESC(myri10ge_intr_coal_delay, "Interrupt coalescing delay\n");
+
+static int myri10ge_flow_control = 1;
+module_param(myri10ge_flow_control, int, S_IRUGO);
+MODULE_PARM_DESC(myri10ge_flow_control, "Pause parameter\n");
+
+static int myri10ge_deassert_wait = 1;
+module_param(myri10ge_deassert_wait, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(myri10ge_deassert_wait,
+		 "Wait when deasserting legacy interrupts\n");
+
+static int myri10ge_force_firmware = 0;
+module_param(myri10ge_force_firmware, int, S_IRUGO);
+MODULE_PARM_DESC(myri10ge_force_firmware,
+		 "Force firmware to assume aligned completions\n");
+
+static int myri10ge_skb_cross_4k = 0;
+module_param(myri10ge_skb_cross_4k, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(myri10ge_skb_cross_4k,
+		 "Can a small skb cross a 4KB boundary?\n");
+
+static int myri10ge_initial_mtu = MYRI10GE_MAX_ETHER_MTU - ETH_HLEN;
+module_param(myri10ge_initial_mtu, int, S_IRUGO);
+MODULE_PARM_DESC(myri10ge_initial_mtu, "Initial MTU\n");
+
+static int myri10ge_napi_weight = 64;
+module_param(myri10ge_napi_weight, int, S_IRUGO);
+MODULE_PARM_DESC(myri10ge_napi_weight, "Set NAPI weight\n");
+
+static int myri10ge_watchdog_timeout = 1;
+module_param(myri10ge_watchdog_timeout, int, S_IRUGO);
+MODULE_PARM_DESC(myri10ge_watchdog_timeout, "Set watchdog timeout\n");
+
+static int myri10ge_max_irq_loops = 1048576;
+module_param(myri10ge_max_irq_loops, int, S_IRUGO);
+MODULE_PARM_DESC(myri10ge_max_irq_loops,
+		 "Set stuck legacy IRQ detection threshold\n");
+
+#define MYRI10GE_FW_OFFSET 1024*1024
+#define MYRI10GE_HIGHPART_TO_U32(X) \
+(sizeof (X) == 8) ? ((u32)((u64)(X) >> 32)) : (0)
+#define MYRI10GE_LOWPART_TO_U32(X) ((u32)(X))
+
+#define myri10ge_pio_copy(to,from,size) __iowrite64_copy(to,from,size/8)
+
+static int
+myri10ge_send_cmd(struct myri10ge_priv *mgp, u32 cmd,
+		  struct myri10ge_cmd *data, int atomic)
+{
+	struct mcp_cmd *buf;
+	char buf_bytes[sizeof(*buf) + 8];
+	struct mcp_cmd_response *response = mgp->cmd;
+	char __iomem *cmd_addr = mgp->sram + MXGEFW_CMD_OFFSET;
+	u32 dma_low, dma_high, result, value;
+	int sleep_total = 0;
+
+	/* ensure buf is aligned to 8 bytes */
+	buf = (struct mcp_cmd *)ALIGN((unsigned long)buf_bytes, 8);
+
+	buf->data0 = htonl(data->data0);
+	buf->data1 = htonl(data->data1);
+	buf->data2 = htonl(data->data2);
+	buf->cmd = htonl(cmd);
+	dma_low = MYRI10GE_LOWPART_TO_U32(mgp->cmd_bus);
+	dma_high = MYRI10GE_HIGHPART_TO_U32(mgp->cmd_bus);
+
+	buf->response_addr.low = htonl(dma_low);
+	buf->response_addr.high = htonl(dma_high);
+	response->result = MYRI10GE_NO_RESPONSE_RESULT;
+	mb();
+	myri10ge_pio_copy(cmd_addr, buf, sizeof(*buf));
+
+	/* wait up to 15ms. Longest command is the DMA benchmark,
+	 * which is capped at 5ms, but runs from a timeout handler
+	 * that runs every 7.8ms. So a 15ms timeout leaves us with
+	 * a 2.2ms margin
+	 */
+	if (atomic) {
+		/* if atomic is set, do not sleep,
+		 * and try to get the completion quickly
+		 * (1ms will be enough for those commands) */
+		for (sleep_total = 0;
+		     sleep_total < 1000
+		     && response->result == MYRI10GE_NO_RESPONSE_RESULT;
+		     sleep_total += 10)
+			udelay(10);
+	} else {
+		/* use msleep for most command */
+		for (sleep_total = 0;
+		     sleep_total < 15
+		     && response->result == MYRI10GE_NO_RESPONSE_RESULT;
+		     sleep_total++)
+			msleep(1);
+	}
+
+	result = ntohl(response->result);
+	value = ntohl(response->data);
+	if (result != MYRI10GE_NO_RESPONSE_RESULT) {
+		if (result == 0) {
+			data->data0 = value;
+			return 0;
+		} else {
+			dev_err(&mgp->pdev->dev,
+				"command %d failed, result = %d\n",
+				cmd, result);
+			return -ENXIO;
+		}
+	}
+
+	dev_err(&mgp->pdev->dev, "command %d timed out, result = %d\n",
+		cmd, result);
+	return -EAGAIN;
+}
+
+/*
+ * The eeprom strings on the lanaiX have the format
+ * SN=x\0
+ * MAC=x:x:x:x:x:x\0
+ * PT:ddd mmm xx xx:xx:xx xx\0
+ * PV:ddd mmm xx xx:xx:xx xx\0
+ */
+static int myri10ge_read_mac_addr(struct myri10ge_priv *mgp)
+{
+	char *ptr, *limit;
+	int i;
+
+	ptr = mgp->eeprom_strings;
+	limit = mgp->eeprom_strings + MYRI10GE_EEPROM_STRINGS_SIZE;
+
+	while (*ptr != '\0' && ptr < limit) {
+		if (memcmp(ptr, "MAC=", 4) == 0) {
+			ptr += 4;
+			mgp->mac_addr_string = ptr;
+			for (i = 0; i < 6; i++) {
+				if ((ptr + 2) > limit)
+					goto abort;
+				mgp->mac_addr[i] =
+				    simple_strtoul(ptr, &ptr, 16);
+				ptr += 1;
+			}
+		}
+		if (memcmp((const void *)ptr, "SN=", 3) == 0) {
+			ptr += 3;
+			mgp->serial_number = simple_strtoul(ptr, &ptr, 10);
+		}
+		while (ptr < limit && *ptr++) ;
+	}
+
+	return 0;
+
+abort:
+	dev_err(&mgp->pdev->dev, "failed to parse eeprom_strings\n");
+	return -ENXIO;
+}
+
+/*
+ * Enable or disable periodic RDMAs from the host to make certain
+ * chipsets resend dropped PCIe messages
+ */
+
+static void myri10ge_dummy_rdma(struct myri10ge_priv *mgp, int enable)
+{
+	char __iomem *submit;
+	u32 buf[16];
+	u32 dma_low, dma_high;
+	int i;
+
+	/* clear confirmation addr */
+	mgp->cmd->data = 0;
+	mb();
+
+	/* send a rdma command to the PCIe engine, and wait for the
+	 * response in the confirmation address.  The firmware should
+	 * write a -1 there to indicate it is alive and well
+	 */
+	dma_low = MYRI10GE_LOWPART_TO_U32(mgp->cmd_bus);
+	dma_high = MYRI10GE_HIGHPART_TO_U32(mgp->cmd_bus);
+
+	buf[0] = htonl(dma_high);	/* confirm addr MSW */
+	buf[1] = htonl(dma_low);	/* confirm addr LSW */
+	buf[2] = htonl(MYRI10GE_NO_CONFIRM_DATA);	/* confirm data */
+	buf[3] = htonl(dma_high);	/* dummy addr MSW */
+	buf[4] = htonl(dma_low);	/* dummy addr LSW */
+	buf[5] = htonl(enable);	/* enable? */
+
+	submit = mgp->sram + 0xfc01c0;
+
+	myri10ge_pio_copy(submit, &buf, sizeof(buf));
+	for (i = 0; mgp->cmd->data != MYRI10GE_NO_CONFIRM_DATA && i < 20; i++)
+		msleep(1);
+	if (mgp->cmd->data != MYRI10GE_NO_CONFIRM_DATA)
+		dev_err(&mgp->pdev->dev, "dummy rdma %s failed\n",
+			(enable ? "enable" : "disable"));
+}
+
+static int
+myri10ge_validate_firmware(struct myri10ge_priv *mgp,
+			   struct mcp_gen_header *hdr)
+{
+	struct device *dev = &mgp->pdev->dev;
+	int major, minor;
+
+	/* check firmware type */
+	if (ntohl(hdr->mcp_type) != MCP_TYPE_ETH) {
+		dev_err(dev, "Bad firmware type: 0x%x\n", ntohl(hdr->mcp_type));
+		return -EINVAL;
+	}
+
+	/* save firmware version for ethtool */
+	strncpy(mgp->fw_version, hdr->version, sizeof(mgp->fw_version));
+
+	sscanf(mgp->fw_version, "%d.%d", &major, &minor);
+
+	if (!(major == MXGEFW_VERSION_MAJOR && minor == MXGEFW_VERSION_MINOR)) {
+		dev_err(dev, "Found firmware version %s\n", mgp->fw_version);
+		dev_err(dev, "Driver needs %d.%d\n", MXGEFW_VERSION_MAJOR,
+			MXGEFW_VERSION_MINOR);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int myri10ge_load_hotplug_firmware(struct myri10ge_priv *mgp, u32 * size)
+{
+	unsigned crc, reread_crc;
+	const struct firmware *fw;
+	struct device *dev = &mgp->pdev->dev;
+	struct mcp_gen_header *hdr;
+	size_t hdr_offset;
+	int status;
+
+	if ((status = request_firmware(&fw, mgp->fw_name, dev)) < 0) {
+		dev_err(dev, "Unable to load %s firmware image via hotplug\n",
+			mgp->fw_name);
+		status = -EINVAL;
+		goto abort_with_nothing;
+	}
+
+	/* check size */
+
+	if (fw->size >= mgp->sram_size - MYRI10GE_FW_OFFSET ||
+	    fw->size < MCP_HEADER_PTR_OFFSET + 4) {
+		dev_err(dev, "Firmware size invalid:%d\n", (int)fw->size);
+		status = -EINVAL;
+		goto abort_with_fw;
+	}
+
+	/* check id */
+	hdr_offset = ntohl(*(u32 *) (fw->data + MCP_HEADER_PTR_OFFSET));
+	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw->size) {
+		dev_err(dev, "Bad firmware file\n");
+		status = -EINVAL;
+		goto abort_with_fw;
+	}
+	hdr = (void *)(fw->data + hdr_offset);
+
+	status = myri10ge_validate_firmware(mgp, hdr);
+	if (status != 0)
+		goto abort_with_fw;
+
+	crc = crc32(~0, fw->data, fw->size);
+	memcpy_toio(mgp->sram + MYRI10GE_FW_OFFSET, fw->data, fw->size);
+	/* corruption checking is good for parity recovery and buggy chipset */
+	memcpy_fromio(fw->data, mgp->sram + MYRI10GE_FW_OFFSET, fw->size);
+	reread_crc = crc32(~0, fw->data, fw->size);
+	if (crc != reread_crc) {
+		dev_err(dev, "CRC failed(fw-len=%u), got 0x%x (expect 0x%x)\n",
+			(unsigned)fw->size, reread_crc, crc);
+		status = -EIO;
+		goto abort_with_fw;
+	}
+	*size = (u32) fw->size;
+
+abort_with_fw:
+	release_firmware(fw);
+
+abort_with_nothing:
+	return status;
+}
+
+static int myri10ge_adopt_running_firmware(struct myri10ge_priv *mgp)
+{
+	struct mcp_gen_header *hdr;
+	struct device *dev = &mgp->pdev->dev;
+	const size_t bytes = sizeof(struct mcp_gen_header);
+	size_t hdr_offset;
+	int status;
+
+	/* find running firmware header */
+	hdr_offset = ntohl(__raw_readl(mgp->sram + MCP_HEADER_PTR_OFFSET));
+
+	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > mgp->sram_size) {
+		dev_err(dev, "Running firmware has bad header offset (%d)\n",
+			(int)hdr_offset);
+		return -EIO;
+	}
+
+	/* copy header of running firmware from SRAM to host memory to
+	 * validate firmware */
+	hdr = kmalloc(bytes, GFP_KERNEL);
+	if (hdr == NULL) {
+		dev_err(dev, "could not malloc firmware hdr\n");
+		return -ENOMEM;
+	}
+	memcpy_fromio(hdr, mgp->sram + hdr_offset, bytes);
+	status = myri10ge_validate_firmware(mgp, hdr);
+	kfree(hdr);
+	return status;
+}
+
+static int myri10ge_load_firmware(struct myri10ge_priv *mgp)
+{
+	char __iomem *submit;
+	u32 buf[16];
+	u32 dma_low, dma_high, size;
+	int status, i;
+
+	status = myri10ge_load_hotplug_firmware(mgp, &size);
+	if (status) {
+		dev_warn(&mgp->pdev->dev, "hotplug firmware loading failed\n");
+
+		/* Do not attempt to adopt firmware if there
+		 * was a bad crc */
+		if (status == -EIO)
+			return status;
+
+		status = myri10ge_adopt_running_firmware(mgp);
+		if (status != 0) {
+			dev_err(&mgp->pdev->dev,
+				"failed to adopt running firmware\n");
+			return status;
+		}
+		dev_info(&mgp->pdev->dev,
+			 "Successfully adopted running firmware\n");
+		if (mgp->tx.boundary == 4096) {
+			dev_warn(&mgp->pdev->dev,
+				 "Using firmware currently running on NIC"
+				 ".  For optimal\n");
+			dev_warn(&mgp->pdev->dev,
+				 "performance consider loading optimized "
+				 "firmware\n");
+			dev_warn(&mgp->pdev->dev, "via hotplug\n");
+		}
+
+		mgp->fw_name = "adopted";
+		mgp->tx.boundary = 2048;
+		return status;
+	}
+
+	/* clear confirmation addr */
+	mgp->cmd->data = 0;
+	mb();
+
+	/* send a reload command to the bootstrap MCP, and wait for the
+	 *  response in the confirmation address.  The firmware should
+	 * write a -1 there to indicate it is alive and well
+	 */
+	dma_low = MYRI10GE_LOWPART_TO_U32(mgp->cmd_bus);
+	dma_high = MYRI10GE_HIGHPART_TO_U32(mgp->cmd_bus);
+
+	buf[0] = htonl(dma_high);	/* confirm addr MSW */
+	buf[1] = htonl(dma_low);	/* confirm addr LSW */
+	buf[2] = htonl(MYRI10GE_NO_CONFIRM_DATA);	/* confirm data */
+
+	/* FIX: All newest firmware should un-protect the bottom of
+	 * the sram before handoff. However, the very first interfaces
+	 * do not. Therefore the handoff copy must skip the first 8 bytes
+	 */
+	buf[3] = htonl(MYRI10GE_FW_OFFSET + 8);	/* where the code starts */
+	buf[4] = htonl(size - 8);	/* length of code */
+	buf[5] = htonl(8);	/* where to copy to */
+	buf[6] = htonl(0);	/* where to jump to */
+
+	submit = mgp->sram + 0xfc0000;
+
+	myri10ge_pio_copy(submit, &buf, sizeof(buf));
+	mb();
+	msleep(1);
+	mb();
+	i = 0;
+	while (mgp->cmd->data != MYRI10GE_NO_CONFIRM_DATA && i < 20) {
+		msleep(1);
+		i++;
+	}
+	if (mgp->cmd->data != MYRI10GE_NO_CONFIRM_DATA) {
+		dev_err(&mgp->pdev->dev, "handoff failed\n");
+		return -ENXIO;
+	}
+	dev_info(&mgp->pdev->dev, "handoff confirmed\n");
+	myri10ge_dummy_rdma(mgp, mgp->tx.boundary != 4096);
+
+	return 0;
+}
+
+static int myri10ge_update_mac_address(struct myri10ge_priv *mgp, u8 * addr)
+{
+	struct myri10ge_cmd cmd;
+	int status;
+
+	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
+		     | (addr[2] << 8) | addr[3]);
+
+	cmd.data1 = ((addr[4] << 8) | (addr[5]));
+
+	status = myri10ge_send_cmd(mgp, MXGEFW_SET_MAC_ADDRESS, &cmd, 0);
+	return status;
+}
+
+static int myri10ge_change_pause(struct myri10ge_priv *mgp, int pause)
+{
+	struct myri10ge_cmd cmd;
+	int status, ctl;
+
+	ctl = pause ? MXGEFW_ENABLE_FLOW_CONTROL : MXGEFW_DISABLE_FLOW_CONTROL;
+	status = myri10ge_send_cmd(mgp, ctl, &cmd, 0);
+
+	if (status) {
+		printk(KERN_ERR
+		       "myri10ge: %s: Failed to set flow control mode\n",
+		       mgp->dev->name);
+		return status;
+	}
+	mgp->pause = pause;
+	return 0;
+}
+
+static void
+myri10ge_change_promisc(struct myri10ge_priv *mgp, int promisc, int atomic)
+{
+	struct myri10ge_cmd cmd;
+	int status, ctl;
+
+	ctl = promisc ? MXGEFW_ENABLE_PROMISC : MXGEFW_DISABLE_PROMISC;
+	status = myri10ge_send_cmd(mgp, ctl, &cmd, atomic);
+	if (status)
+		printk(KERN_ERR "myri10ge: %s: Failed to set promisc mode\n",
+		       mgp->dev->name);
+}
+
+static int myri10ge_reset(struct myri10ge_priv *mgp)
+{
+	struct myri10ge_cmd cmd;
+	int status;
+	size_t bytes;
+	u32 len;
+
+	/* try to send a reset command to the card to see if it
+	 * is alive */
+	memset(&cmd, 0, sizeof(cmd));
+	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_RESET, &cmd, 0);
+	if (status != 0) {
+		dev_err(&mgp->pdev->dev, "failed reset\n");
+		return -ENXIO;
+	}
+
+	/* Now exchange information about interrupts  */
+
+	bytes = myri10ge_max_intr_slots * sizeof(*mgp->rx_done.entry);
+	memset(mgp->rx_done.entry, 0, bytes);
+	cmd.data0 = (u32) bytes;
+	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd, 0);
+	cmd.data0 = MYRI10GE_LOWPART_TO_U32(mgp->rx_done.bus);
+	cmd.data1 = MYRI10GE_HIGHPART_TO_U32(mgp->rx_done.bus);
+	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_DMA, &cmd, 0);
+
+	status |=
+	    myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd, 0);
+	mgp->irq_claim = (__iomem u32 *) (mgp->sram + cmd.data0);
+	if (!mgp->msi_enabled) {
+		status |= myri10ge_send_cmd
+		    (mgp, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd, 0);
+		mgp->irq_deassert = (__iomem u32 *) (mgp->sram + cmd.data0);
+
+	}
+	status |= myri10ge_send_cmd
+	    (mgp, MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd, 0);
+	mgp->intr_coal_delay_ptr = (__iomem u32 *) (mgp->sram + cmd.data0);
+	if (status != 0) {
+		dev_err(&mgp->pdev->dev, "failed set interrupt parameters\n");
+		return status;
+	}
+	__raw_writel(htonl(mgp->intr_coal_delay), mgp->intr_coal_delay_ptr);
+
+	/* Run a small DMA test.
+	 * The magic multipliers to the length tell the firmware
+	 * to do DMA read, write, or read+write tests.  The
+	 * results are returned in cmd.data0.  The upper 16
+	 * bits or the return is the number of transfers completed.
+	 * The lower 16 bits is the time in 0.5us ticks that the
+	 * transfers took to complete.
+	 */
+
+	len = mgp->tx.boundary;
+
+	cmd.data0 = MYRI10GE_LOWPART_TO_U32(mgp->rx_done.bus);
+	cmd.data1 = MYRI10GE_HIGHPART_TO_U32(mgp->rx_done.bus);
+	cmd.data2 = len * 0x10000;
+	status = myri10ge_send_cmd(mgp, MXGEFW_DMA_TEST, &cmd, 0);
+	if (status == 0)
+		mgp->read_dma = ((cmd.data0 >> 16) * len * 2) /
+		    (cmd.data0 & 0xffff);
+	else
+		dev_warn(&mgp->pdev->dev, "DMA read benchmark failed: %d\n",
+			 status);
+	cmd.data0 = MYRI10GE_LOWPART_TO_U32(mgp->rx_done.bus);
+	cmd.data1 = MYRI10GE_HIGHPART_TO_U32(mgp->rx_done.bus);
+	cmd.data2 = len * 0x1;
+	status = myri10ge_send_cmd(mgp, MXGEFW_DMA_TEST, &cmd, 0);
+	if (status == 0)
+		mgp->write_dma = ((cmd.data0 >> 16) * len * 2) /
+		    (cmd.data0 & 0xffff);
+	else
+		dev_warn(&mgp->pdev->dev, "DMA write benchmark failed: %d\n",
+			 status);
+
+	cmd.data0 = MYRI10GE_LOWPART_TO_U32(mgp->rx_done.bus);
+	cmd.data1 = MYRI10GE_HIGHPART_TO_U32(mgp->rx_done.bus);
+	cmd.data2 = len * 0x10001;
+	status = myri10ge_send_cmd(mgp, MXGEFW_DMA_TEST, &cmd, 0);
+	if (status == 0)
+		mgp->read_write_dma = ((cmd.data0 >> 16) * len * 2 * 2) /
+		    (cmd.data0 & 0xffff);
+	else
+		dev_warn(&mgp->pdev->dev,
+			 "DMA read/write benchmark failed: %d\n", status);
+
+	memset(mgp->rx_done.entry, 0, bytes);
+
+	/* reset mcp/driver shared state back to 0 */
+	mgp->tx.req = 0;
+	mgp->tx.done = 0;
+	mgp->tx.pkt_start = 0;
+	mgp->tx.pkt_done = 0;
+	mgp->rx_big.cnt = 0;
+	mgp->rx_small.cnt = 0;
+	mgp->rx_done.idx = 0;
+	mgp->rx_done.cnt = 0;
+	status = myri10ge_update_mac_address(mgp, mgp->dev->dev_addr);
+	myri10ge_change_promisc(mgp, 0, 0);
+	myri10ge_change_pause(mgp, mgp->pause);
+	return status;
+}
+
+static inline void
+myri10ge_submit_8rx(struct mcp_kreq_ether_recv __iomem * dst,
+		    struct mcp_kreq_ether_recv *src)
+{
+	u32 low;
+
+	low = src->addr_low;
+	src->addr_low = DMA_32BIT_MASK;
+	myri10ge_pio_copy(dst, src, 8 * sizeof(*src));
+	mb();
+	src->addr_low = low;
+	__raw_writel(low, &dst->addr_low);
+	mb();
+}
+
+/*
+ * Set of routunes to get a new receive buffer.  Any buffer which
+ * crosses a 4KB boundary must start on a 4KB boundary due to PCIe
+ * wdma restrictions. We also try to align any smaller allocation to
+ * at least a 16 byte boundary for efficiency.  We assume the linux
+ * memory allocator works by powers of 2, and will not return memory
+ * smaller than 2KB which crosses a 4KB boundary.  If it does, we fall
+ * back to allocating 2x as much space as required.
+ *
+ * We intend to replace large (>4KB) skb allocations by using
+ * pages directly and building a fraglist in the near future.
+ */
+
+static inline struct sk_buff *myri10ge_alloc_big(int bytes)
+{
+	struct sk_buff *skb;
+	unsigned long data, roundup;
+
+	skb = dev_alloc_skb(bytes + 4096 + MXGEFW_PAD);
+	if (skb == NULL)
+		return NULL;
+
+	/* Correct skb->truesize so that socket buffer
+	 * accounting is not confused the rounding we must
+	 * do to satisfy alignment constraints.
+	 */
+	skb->truesize -= 4096;
+
+	data = (unsigned long)(skb->data);
+	roundup = (-data) & (4095);
+	skb_reserve(skb, roundup);
+	return skb;
+}
+
+/* Allocate 2x as much space as required and use whichever portion
+ * does not cross a 4KB boundary */
+static inline struct sk_buff *myri10ge_alloc_small_safe(unsigned int bytes)
+{
+	struct sk_buff *skb;
+	unsigned long data, boundary;
+
+	skb = dev_alloc_skb(2 * (bytes + MXGEFW_PAD) - 1);
+	if (unlikely(skb == NULL))
+		return NULL;
+
+	/* Correct skb->truesize so that socket buffer
+	 * accounting is not confused the rounding we must
+	 * do to satisfy alignment constraints.
+	 */
+	skb->truesize -= bytes + MXGEFW_PAD;
+
+	data = (unsigned long)(skb->data);
+	boundary = (data + 4095UL) & ~4095UL;
+	if ((boundary - data) >= (bytes + MXGEFW_PAD))
+		return skb;
+
+	skb_reserve(skb, boundary - data);
+	return skb;
+}
+
+/* Allocate just enough space, and verify that the allocated
+ * space does not cross a 4KB boundary */
+static inline struct sk_buff *myri10ge_alloc_small(int bytes)
+{
+	struct sk_buff *skb;
+	unsigned long roundup, data, end;
+
+	skb = dev_alloc_skb(bytes + 16 + MXGEFW_PAD);
+	if (unlikely(skb == NULL))
+		return NULL;
+
+	/* Round allocated buffer to 16 byte boundary */
+	data = (unsigned long)(skb->data);
+	roundup = (-data) & 15UL;
+	skb_reserve(skb, roundup);
+	/* Verify that the data buffer does not cross a page boundary */
+	data = (unsigned long)(skb->data);
+	end = data + bytes + MXGEFW_PAD - 1;
+	if (unlikely(((end >> 12) != (data >> 12)) && (data & 4095UL))) {
+		printk(KERN_NOTICE
+		       "myri10ge_alloc_small: small skb crossed 4KB boundary\n");
+		myri10ge_skb_cross_4k = 1;
+		dev_kfree_skb_any(skb);
+		skb = myri10ge_alloc_small_safe(bytes);
+	}
+	return skb;
+}
+
+static inline int
+myri10ge_getbuf(struct myri10ge_rx_buf *rx, struct pci_dev *pdev, int bytes,
+		int idx)
+{
+	struct sk_buff *skb;
+	dma_addr_t bus;
+	int len, retval = 0;
+
+	bytes += VLAN_HLEN;	/* account for 802.1q vlan tag */
+
+	if ((bytes + MXGEFW_PAD) > (4096 - 16) /* linux overhead */ )
+		skb = myri10ge_alloc_big(bytes);
+	else if (myri10ge_skb_cross_4k)
+		skb = myri10ge_alloc_small_safe(bytes);
+	else
+		skb = myri10ge_alloc_small(bytes);
+
+	if (unlikely(skb == NULL)) {
+		rx->alloc_fail++;
+		retval = -ENOBUFS;
+		goto done;
+	}
+
+	/* set len so that it only covers the area we
+	 * need mapped for DMA */
+	len = bytes + MXGEFW_PAD;
+
+	bus = pci_map_single(pdev, skb->data, len, PCI_DMA_FROMDEVICE);
+	rx->info[idx].skb = skb;
+	pci_unmap_addr_set(&rx->info[idx], bus, bus);
+	pci_unmap_len_set(&rx->info[idx], len, len);
+	rx->shadow[idx].addr_low = htonl(MYRI10GE_LOWPART_TO_U32(bus));
+	rx->shadow[idx].addr_high = htonl(MYRI10GE_HIGHPART_TO_U32(bus));
+
+done:
+	/* copy 8 descriptors (64-bytes) to the mcp at a time */
+	if ((idx & 7) == 7) {
+		if (rx->wc_fifo == NULL)
+			myri10ge_submit_8rx(&rx->lanai[idx - 7],
+					    &rx->shadow[idx - 7]);
+		else {
+			mb();
+			myri10ge_pio_copy(rx->wc_fifo,
+					  &rx->shadow[idx - 7], 64);
+		}
+	}
+	return retval;
+}
+
+static inline void myri10ge_vlan_ip_csum(struct sk_buff *skb, u16 hw_csum)
+{
+	struct vlan_hdr *vh = (struct vlan_hdr *)(skb->data);
+
+	if ((skb->protocol == ntohs(ETH_P_8021Q)) &&
+	    (vh->h_vlan_encapsulated_proto == htons(ETH_P_IP) ||
+	     vh->h_vlan_encapsulated_proto == htons(ETH_P_IPV6))) {
+		skb->csum = hw_csum;
+		skb->ip_summed = CHECKSUM_HW;
+	}
+}
+
+static inline unsigned long
+myri10ge_rx_done(struct myri10ge_priv *mgp, struct myri10ge_rx_buf *rx,
+		 int bytes, int len, int csum)
+{
+	dma_addr_t bus;
+	struct sk_buff *skb;
+	int idx, unmap_len;
+
+	idx = rx->cnt & rx->mask;
+	rx->cnt++;
+
+	/* save a pointer to the received skb */
+	skb = rx->info[idx].skb;
+	bus = pci_unmap_addr(&rx->info[idx], bus);
+	unmap_len = pci_unmap_len(&rx->info[idx], len);
+
+	/* try to replace the received skb */
+	if (myri10ge_getbuf(rx, mgp->pdev, bytes, idx)) {
+		/* drop the frame -- the old skbuf is re-cycled */
+		mgp->stats.rx_dropped += 1;
+		return 0;
+	}
+
+	/* unmap the recvd skb */
+	pci_unmap_single(mgp->pdev, bus, unmap_len, PCI_DMA_FROMDEVICE);
+
+	/* mcp implicitly skips 1st bytes so that packet is properly
+	 * aligned */
+	skb_reserve(skb, MXGEFW_PAD);
+
+	/* set the length of the frame */
+	skb_put(skb, len);
+
+	skb->protocol = eth_type_trans(skb, mgp->dev);
+	skb->dev = mgp->dev;
+	if (mgp->csum_flag) {
+		if ((skb->protocol == ntohs(ETH_P_IP)) ||
+		    (skb->protocol == ntohs(ETH_P_IPV6))) {
+			skb->csum = ntohs((u16) csum);
+			skb->ip_summed = CHECKSUM_HW;
+		} else
+			myri10ge_vlan_ip_csum(skb, ntohs((u16) csum));
+	}
+
+	netif_receive_skb(skb);
+	mgp->dev->last_rx = jiffies;
+	return 1;
+}
+
+static inline void myri10ge_tx_done(struct myri10ge_priv *mgp, int mcp_index)
+{
+	struct pci_dev *pdev = mgp->pdev;
+	struct myri10ge_tx_buf *tx = &mgp->tx;
+	struct sk_buff *skb;
+	int idx, len;
+	int limit = 0;
+
+	while (tx->pkt_done != mcp_index) {
+		idx = tx->done & tx->mask;
+		skb = tx->info[idx].skb;
+
+		/* Mark as free */
+		tx->info[idx].skb = NULL;
+		if (tx->info[idx].last) {
+			tx->pkt_done++;
+			tx->info[idx].last = 0;
+		}
+		tx->done++;
+		len = pci_unmap_len(&tx->info[idx], len);
+		pci_unmap_len_set(&tx->info[idx], len, 0);
+		if (skb) {
+			mgp->stats.tx_bytes += skb->len;
+			mgp->stats.tx_packets++;
+			dev_kfree_skb_irq(skb);
+			if (len)
+				pci_unmap_single(pdev,
+						 pci_unmap_addr(&tx->info[idx],
+								bus), len,
+						 PCI_DMA_TODEVICE);
+		} else {
+			if (len)
+				pci_unmap_page(pdev,
+					       pci_unmap_addr(&tx->info[idx],
+							      bus), len,
+					       PCI_DMA_TODEVICE);
+		}
+
+		/* limit potential for livelock by only handling
+		 * 2 full tx rings per call */
+		if (unlikely(++limit > 2 * tx->mask))
+			break;
+	}
+	/* start the queue if we've stopped it */
+	if (netif_queue_stopped(mgp->dev)
+	    && tx->req - tx->done < (tx->mask >> 1)) {
+		mgp->wake_queue++;
+		netif_wake_queue(mgp->dev);
+	}
+}
+
+static inline void myri10ge_clean_rx_done(struct myri10ge_priv *mgp, int *limit)
+{
+	struct myri10ge_rx_done *rx_done = &mgp->rx_done;
+	unsigned long rx_bytes = 0;
+	unsigned long rx_packets = 0;
+	unsigned long rx_ok;
+
+	int idx = rx_done->idx;
+	int cnt = rx_done->cnt;
+	u16 length;
+	u16 checksum;
+
+	while (rx_done->entry[idx].length != 0 && *limit != 0) {
+		length = ntohs(rx_done->entry[idx].length);
+		rx_done->entry[idx].length = 0;
+		checksum = ntohs(rx_done->entry[idx].checksum);
+		if (length <= mgp->small_bytes)
+			rx_ok = myri10ge_rx_done(mgp, &mgp->rx_small,
+						 mgp->small_bytes,
+						 length, checksum);
+		else
+			rx_ok = myri10ge_rx_done(mgp, &mgp->rx_big,
+						 mgp->dev->mtu + ETH_HLEN,
+						 length, checksum);
+		rx_packets += rx_ok;
+		rx_bytes += rx_ok * (unsigned long)length;
+		cnt++;
+		idx = cnt & (myri10ge_max_intr_slots - 1);
+
+		/* limit potential for livelock by only handling a
+		 * limited number of frames. */
+		(*limit)--;
+	}
+	rx_done->idx = idx;
+	rx_done->cnt = cnt;
+	mgp->stats.rx_packets += rx_packets;
+	mgp->stats.rx_bytes += rx_bytes;
+}
+
+static inline void myri10ge_check_statblock(struct myri10ge_priv *mgp)
+{
+	struct mcp_irq_data *stats = mgp->fw_stats;
+
+	if (unlikely(stats->stats_updated)) {
+		if (mgp->link_state != stats->link_up) {
+			mgp->link_state = stats->link_up;
+			if (mgp->link_state) {
+				printk(KERN_INFO "myri10ge: %s: link up\n",
+				       mgp->dev->name);
+				netif_carrier_on(mgp->dev);
+			} else {
+				printk(KERN_INFO "myri10ge: %s: link down\n",
+				       mgp->dev->name);
+				netif_carrier_off(mgp->dev);
+			}
+		}
+		if (mgp->rdma_tags_available !=
+		    ntohl(mgp->fw_stats->rdma_tags_available)) {
+			mgp->rdma_tags_available =
+			    ntohl(mgp->fw_stats->rdma_tags_available);
+			printk(KERN_WARNING "myri10ge: %s: RDMA timed out! "
+			       "%d tags left\n", mgp->dev->name,
+			       mgp->rdma_tags_available);
+		}
+		mgp->down_cnt += stats->link_down;
+		if (stats->link_down)
+			wake_up(&mgp->down_wq);
+	}
+}
+
+static int myri10ge_poll(struct net_device *netdev, int *budget)
+{
+	struct myri10ge_priv *mgp = netdev_priv(netdev);
+	struct myri10ge_rx_done *rx_done = &mgp->rx_done;
+	int limit, orig_limit, work_done;
+
+	/* process as many rx events as NAPI will allow */
+	limit = min(*budget, netdev->quota);
+	orig_limit = limit;
+	myri10ge_clean_rx_done(mgp, &limit);
+	work_done = orig_limit - limit;
+	*budget -= work_done;
+	netdev->quota -= work_done;
+
+	if (rx_done->entry[rx_done->idx].length == 0 || !netif_running(netdev)) {
+		netif_rx_complete(netdev);
+		__raw_writel(htonl(3), mgp->irq_claim);
+		return 0;
+	}
+	return 1;
+}
+
+static irqreturn_t myri10ge_intr(int irq, void *arg, struct pt_regs *regs)
+{
+	struct myri10ge_priv *mgp = arg;
+	struct mcp_irq_data *stats = mgp->fw_stats;
+	struct myri10ge_tx_buf *tx = &mgp->tx;
+	u32 send_done_count;
+	int i;
+
+	/* make sure it is our IRQ, and that the DMA has finished */
+	if (unlikely(!stats->valid))
+		return (IRQ_NONE);
+
+	/* low bit indicates receives are present, so schedule
+	 * napi poll handler */
+	if (stats->valid & 1)
+		netif_rx_schedule(mgp->dev);
+
+	if (!mgp->msi_enabled) {
+		__raw_writel(0, mgp->irq_deassert);
+		if (!myri10ge_deassert_wait)
+			stats->valid = 0;
+		mb();
+	} else
+		stats->valid = 0;
+
+	/* Wait for IRQ line to go low, if using INTx */
+	i = 0;
+	while (1) {
+		i++;
+		/* check for transmit completes and receives */
+		send_done_count = ntohl(stats->send_done_count);
+		if (send_done_count != tx->pkt_done)
+			myri10ge_tx_done(mgp, (int)send_done_count);
+		if (unlikely(i > myri10ge_max_irq_loops)) {
+			printk(KERN_WARNING "myri10ge: %s: irq stuck?\n",
+			       mgp->dev->name);
+			stats->valid = 0;
+			schedule_work(&mgp->watchdog_work);
+		}
+		if (likely(stats->valid == 0))
+			break;
+		cpu_relax();
+		barrier();
+	}
+
+	myri10ge_check_statblock(mgp);
+
+	__raw_writel(htonl(3), mgp->irq_claim + 1);
+	return (IRQ_HANDLED);
+}
+
+static int
+myri10ge_get_settings(struct net_device *netdev, struct ethtool_cmd *cmd)
+{
+	cmd->autoneg = AUTONEG_DISABLE;
+	cmd->speed = SPEED_10000;
+	cmd->duplex = DUPLEX_FULL;
+	return 0;
+}
+
+static void
+myri10ge_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *info)
+{
+	struct myri10ge_priv *mgp = netdev_priv(netdev);
+
+	strlcpy(info->driver, "myri10ge", sizeof(info->driver));
+	strlcpy(info->version, MYRI10GE_VERSION_STR, sizeof(info->version));
+	strlcpy(info->fw_version, mgp->fw_version, sizeof(info->fw_version));
+	strlcpy(info->bus_info, pci_name(mgp->pdev), sizeof(info->bus_info));
+}
+
+static int
+myri10ge_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *coal)
+{
+	struct myri10ge_priv *mgp = netdev_priv(netdev);
+	coal->rx_coalesce_usecs = mgp->intr_coal_delay;
+	return 0;
+}
+
+static int
+myri10ge_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *coal)
+{
+	struct myri10ge_priv *mgp = netdev_priv(netdev);
+
+	mgp->intr_coal_delay = coal->rx_coalesce_usecs;
+	__raw_writel(htonl(mgp->intr_coal_delay), mgp->intr_coal_delay_ptr);
+	return 0;
+}
+
+static void
+myri10ge_get_pauseparam(struct net_device *netdev,
+			struct ethtool_pauseparam *pause)
+{
+	struct myri10ge_priv *mgp = netdev_priv(netdev);
+
+	pause->autoneg = 0;
+	pause->rx_pause = mgp->pause;
+	pause->tx_pause = mgp->pause;
+}
+
+static int
+myri10ge_set_pauseparam(struct net_device *netdev,
+			struct ethtool_pauseparam *pause)
+{
+	struct myri10ge_priv *mgp = netdev_priv(netdev);
+
+	if (pause->tx_pause != mgp->pause)
+		return myri10ge_change_pause(mgp, pause->tx_pause);
+	if (pause->rx_pause != mgp->pause)
+		return myri10ge_change_pause(mgp, pause->tx_pause);
+	if (pause->autoneg != 0)
+		return -EINVAL;
+	return 0;
+}
+
+static void
+myri10ge_get_ringparam(struct net_device *netdev,
+		       struct ethtool_ringparam *ring)
+{
+	struct myri10ge_priv *mgp = netdev_priv(netdev);
+
+	ring->rx_mini_max_pending = mgp->rx_small.mask + 1;
+	ring->rx_max_pending = mgp->rx_big.mask + 1;
+	ring->rx_jumbo_max_pending = 0;
+	ring->tx_max_pending = mgp->rx_small.mask + 1;
+	ring->rx_mini_pending = ring->rx_mini_max_pending;
+	ring->rx_pending = ring->rx_max_pending;
+	ring->rx_jumbo_pending = ring->rx_jumbo_max_pending;
+	ring->tx_pending = ring->tx_max_pending;
+}
+
+static u32 myri10ge_get_rx_csum(struct net_device *netdev)
+{
+	struct myri10ge_priv *mgp = netdev_priv(netdev);
+	if (mgp->csum_flag)
+		return 1;
+	else
+		return 0;
+}
+
+static int myri10ge_set_rx_csum(struct net_device *netdev, u32 csum_enabled)
+{
+	struct myri10ge_priv *mgp = netdev_priv(netdev);
+	if (csum_enabled)
+		mgp->csum_flag = MXGEFW_FLAGS_CKSUM;
+	else
+		mgp->csum_flag = 0;
+	return 0;
+}
+
+static const char myri10ge_gstrings_stats[][ETH_GSTRING_LEN] = {
+	"rx_packets", "tx_packets", "rx_bytes", "tx_bytes", "rx_errors",
+	"tx_errors", "rx_dropped", "tx_dropped", "multicast", "collisions",
+	"rx_length_errors", "rx_over_errors", "rx_crc_errors",
+	"rx_frame_errors", "rx_fifo_errors", "rx_missed_errors",
+	"tx_aborted_errors", "tx_carrier_errors", "tx_fifo_errors",
+	"tx_heartbeat_errors", "tx_window_errors",
+	/* device-specific stats */
+	"read_dma_bw_MBs", "write_dma_bw_MBs", "read_write_dma_bw_MBs",
+	"serial_number", "tx_pkt_start", "tx_pkt_done",
+	"tx_req", "tx_done", "rx_small_cnt", "rx_big_cnt",
+	"wake_queue", "stop_queue", "watchdog_resets", "tx_linearized",
+	"link_up", "dropped_link_overflow", "dropped_link_error_or_filtered",
+	"dropped_runt", "dropped_overrun", "dropped_no_small_buffer",
+	"dropped_no_big_buffer"
+};
+
+#define MYRI10GE_NET_STATS_LEN      21
+#define MYRI10GE_STATS_LEN  sizeof(myri10ge_gstrings_stats) / ETH_GSTRING_LEN
+
+static void
+myri10ge_get_strings(struct net_device *netdev, u32 stringset, u8 * data)
+{
+	switch (stringset) {
+	case ETH_SS_STATS:
+		memcpy(data, *myri10ge_gstrings_stats,
+		       sizeof(myri10ge_gstrings_stats));
+		break;
+	}
+}
+
+static int myri10ge_get_stats_count(struct net_device *netdev)
+{
+	return MYRI10GE_STATS_LEN;
+}
+
+static void
+myri10ge_get_ethtool_stats(struct net_device *netdev,
+			   struct ethtool_stats *stats, u64 * data)
+{
+	struct myri10ge_priv *mgp = netdev_priv(netdev);
+	int i;
+
+	for (i = 0; i < MYRI10GE_NET_STATS_LEN; i++)
+		data[i] = ((unsigned long *)&mgp->stats)[i];
+
+	data[i++] = (unsigned int)mgp->read_dma;
+	data[i++] = (unsigned int)mgp->write_dma;
+	data[i++] = (unsigned int)mgp->read_write_dma;
+	data[i++] = (unsigned int)mgp->serial_number;
+	data[i++] = (unsigned int)mgp->tx.pkt_start;
+	data[i++] = (unsigned int)mgp->tx.pkt_done;
+	data[i++] = (unsigned int)mgp->tx.req;
+	data[i++] = (unsigned int)mgp->tx.done;
+	data[i++] = (unsigned int)mgp->rx_small.cnt;
+	data[i++] = (unsigned int)mgp->rx_big.cnt;
+	data[i++] = (unsigned int)mgp->wake_queue;
+	data[i++] = (unsigned int)mgp->stop_queue;
+	data[i++] = (unsigned int)mgp->watchdog_resets;
+	data[i++] = (unsigned int)mgp->tx_linearized;
+	data[i++] = (unsigned int)ntohl(mgp->fw_stats->link_up);
+	data[i++] = (unsigned int)ntohl(mgp->fw_stats->dropped_link_overflow);
+	data[i++] =
+	    (unsigned int)ntohl(mgp->fw_stats->dropped_link_error_or_filtered);
+	data[i++] = (unsigned int)ntohl(mgp->fw_stats->dropped_runt);
+	data[i++] = (unsigned int)ntohl(mgp->fw_stats->dropped_overrun);
+	data[i++] = (unsigned int)ntohl(mgp->fw_stats->dropped_no_small_buffer);
+	data[i++] = (unsigned int)ntohl(mgp->fw_stats->dropped_no_big_buffer);
+}
+
+static struct ethtool_ops myri10ge_ethtool_ops = {
+	.get_settings = myri10ge_get_settings,
+	.get_drvinfo = myri10ge_get_drvinfo,
+	.get_coalesce = myri10ge_get_coalesce,
+	.set_coalesce = myri10ge_set_coalesce,
+	.get_pauseparam = myri10ge_get_pauseparam,
+	.set_pauseparam = myri10ge_set_pauseparam,
+	.get_ringparam = myri10ge_get_ringparam,
+	.get_rx_csum = myri10ge_get_rx_csum,
+	.set_rx_csum = myri10ge_set_rx_csum,
+	.get_tx_csum = ethtool_op_get_tx_csum,
+	.set_tx_csum = ethtool_op_set_tx_csum,
+	.get_sg = ethtool_op_get_sg,
+	.set_sg = ethtool_op_set_sg,
+#ifdef NETIF_F_TSO
+	.get_tso = ethtool_op_get_tso,
+	.set_tso = ethtool_op_set_tso,
+#endif
+	.get_strings = myri10ge_get_strings,
+	.get_stats_count = myri10ge_get_stats_count,
+	.get_ethtool_stats = myri10ge_get_ethtool_stats
+};
+
+static int myri10ge_allocate_rings(struct net_device *dev)
+{
+	struct myri10ge_priv *mgp;
+	struct myri10ge_cmd cmd;
+	int tx_ring_size, rx_ring_size;
+	int tx_ring_entries, rx_ring_entries;
+	int i, status;
+	size_t bytes;
+
+	mgp = netdev_priv(dev);
+
+	/* get ring sizes */
+
+	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd, 0);
+	tx_ring_size = cmd.data0;
+	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd, 0);
+	rx_ring_size = cmd.data0;
+
+	tx_ring_entries = tx_ring_size / sizeof(struct mcp_kreq_ether_send);
+	rx_ring_entries = rx_ring_size / sizeof(struct mcp_dma_addr);
+	mgp->tx.mask = tx_ring_entries - 1;
+	mgp->rx_small.mask = mgp->rx_big.mask = rx_ring_entries - 1;
+
+	/* allocate the host shadow rings */
+
+	bytes = 8 + (MYRI10GE_MAX_SEND_DESC_TSO + 4)
+	    * sizeof(*mgp->tx.req_list);
+	mgp->tx.req_bytes = kzalloc(bytes, GFP_KERNEL);
+	if (mgp->tx.req_bytes == NULL)
+		goto abort_with_nothing;
+
+	/* ensure req_list entries are aligned to 8 bytes */
+	mgp->tx.req_list = (struct mcp_kreq_ether_send *)
+	    ALIGN((unsigned long)mgp->tx.req_bytes, 8);
+
+	bytes = rx_ring_entries * sizeof(*mgp->rx_small.shadow);
+	mgp->rx_small.shadow = kzalloc(bytes, GFP_KERNEL);
+	if (mgp->rx_small.shadow == NULL)
+		goto abort_with_tx_req_bytes;
+
+	bytes = rx_ring_entries * sizeof(*mgp->rx_big.shadow);
+	mgp->rx_big.shadow = kzalloc(bytes, GFP_KERNEL);
+	if (mgp->rx_big.shadow == NULL)
+		goto abort_with_rx_small_shadow;
+
+	/* allocate the host info rings */
+
+	bytes = tx_ring_entries * sizeof(*mgp->tx.info);
+	mgp->tx.info = kzalloc(bytes, GFP_KERNEL);
+	if (mgp->tx.info == NULL)
+		goto abort_with_rx_big_shadow;
+
+	bytes = rx_ring_entries * sizeof(*mgp->rx_small.info);
+	mgp->rx_small.info = kzalloc(bytes, GFP_KERNEL);
+	if (mgp->rx_small.info == NULL)
+		goto abort_with_tx_info;
+
+	bytes = rx_ring_entries * sizeof(*mgp->rx_big.info);
+	mgp->rx_big.info = kzalloc(bytes, GFP_KERNEL);
+	if (mgp->rx_big.info == NULL)
+		goto abort_with_rx_small_info;
+
+	/* Fill the receive rings */
+
+	for (i = 0; i <= mgp->rx_small.mask; i++) {
+		status = myri10ge_getbuf(&mgp->rx_small, mgp->pdev,
+					 mgp->small_bytes, i);
+		if (status) {
+			printk(KERN_ERR
+			       "myri10ge: %s: alloced only %d small bufs\n",
+			       dev->name, i);
+			goto abort_with_rx_small_ring;
+		}
+	}
+
+	for (i = 0; i <= mgp->rx_big.mask; i++) {
+		status =
+		    myri10ge_getbuf(&mgp->rx_big, mgp->pdev,
+				    dev->mtu + ETH_HLEN, i);
+		if (status) {
+			printk(KERN_ERR
+			       "myri10ge: %s: alloced only %d big bufs\n",
+			       dev->name, i);
+			goto abort_with_rx_big_ring;
+		}
+	}
+
+	return 0;
+
+abort_with_rx_big_ring:
+	for (i = 0; i <= mgp->rx_big.mask; i++) {
+		if (mgp->rx_big.info[i].skb != NULL)
+			dev_kfree_skb_any(mgp->rx_big.info[i].skb);
+		if (pci_unmap_len(&mgp->rx_big.info[i], len))
+			pci_unmap_single(mgp->pdev,
+					 pci_unmap_addr(&mgp->rx_big.info[i],
+							bus),
+					 pci_unmap_len(&mgp->rx_big.info[i],
+						       len),
+					 PCI_DMA_FROMDEVICE);
+	}
+
+abort_with_rx_small_ring:
+	for (i = 0; i <= mgp->rx_small.mask; i++) {
+		if (mgp->rx_small.info[i].skb != NULL)
+			dev_kfree_skb_any(mgp->rx_small.info[i].skb);
+		if (pci_unmap_len(&mgp->rx_small.info[i], len))
+			pci_unmap_single(mgp->pdev,
+					 pci_unmap_addr(&mgp->rx_small.info[i],
+							bus),
+					 pci_unmap_len(&mgp->rx_small.info[i],
+						       len),
+					 PCI_DMA_FROMDEVICE);
+	}
+	kfree(mgp->rx_big.info);
+
+abort_with_rx_small_info:
+	kfree(mgp->rx_small.info);
+
+abort_with_tx_info:
+	kfree(mgp->tx.info);
+
+abort_with_rx_big_shadow:
+	kfree(mgp->rx_big.shadow);
+
+abort_with_rx_small_shadow:
+	kfree(mgp->rx_small.shadow);
+
+abort_with_tx_req_bytes:
+	kfree(mgp->tx.req_bytes);
+	mgp->tx.req_bytes = NULL;
+	mgp->tx.req_list = NULL;
+
+abort_with_nothing:
+	return status;
+}
+
+static void myri10ge_free_rings(struct net_device *dev)
+{
+	struct myri10ge_priv *mgp;
+	struct sk_buff *skb;
+	struct myri10ge_tx_buf *tx;
+	int i, len, idx;
+
+	mgp = netdev_priv(dev);
+
+	for (i = 0; i <= mgp->rx_big.mask; i++) {
+		if (mgp->rx_big.info[i].skb != NULL)
+			dev_kfree_skb_any(mgp->rx_big.info[i].skb);
+		if (pci_unmap_len(&mgp->rx_big.info[i], len))
+			pci_unmap_single(mgp->pdev,
+					 pci_unmap_addr(&mgp->rx_big.info[i],
+							bus),
+					 pci_unmap_len(&mgp->rx_big.info[i],
+						       len),
+					 PCI_DMA_FROMDEVICE);
+	}
+
+	for (i = 0; i <= mgp->rx_small.mask; i++) {
+		if (mgp->rx_small.info[i].skb != NULL)
+			dev_kfree_skb_any(mgp->rx_small.info[i].skb);
+		if (pci_unmap_len(&mgp->rx_small.info[i], len))
+			pci_unmap_single(mgp->pdev,
+					 pci_unmap_addr(&mgp->rx_small.info[i],
+							bus),
+					 pci_unmap_len(&mgp->rx_small.info[i],
+						       len),
+					 PCI_DMA_FROMDEVICE);
+	}
+
+	tx = &mgp->tx;
+	while (tx->done != tx->req) {
+		idx = tx->done & tx->mask;
+		skb = tx->info[idx].skb;
+
+		/* Mark as free */
+		tx->info[idx].skb = NULL;
+		tx->done++;
+		len = pci_unmap_len(&tx->info[idx], len);
+		pci_unmap_len_set(&tx->info[idx], len, 0);
+		if (skb) {
+			mgp->stats.tx_dropped++;
+			dev_kfree_skb_any(skb);
+			if (len)
+				pci_unmap_single(mgp->pdev,
+						 pci_unmap_addr(&tx->info[idx],
+								bus), len,
+						 PCI_DMA_TODEVICE);
+		} else {
+			if (len)
+				pci_unmap_page(mgp->pdev,
+					       pci_unmap_addr(&tx->info[idx],
+							      bus), len,
+					       PCI_DMA_TODEVICE);
+		}
+	}
+	kfree(mgp->rx_big.info);
+
+	kfree(mgp->rx_small.info);
+
+	kfree(mgp->tx.info);
+
+	kfree(mgp->rx_big.shadow);
+
+	kfree(mgp->rx_small.shadow);
+
+	kfree(mgp->tx.req_bytes);
+	mgp->tx.req_bytes = NULL;
+	mgp->tx.req_list = NULL;
+}
+
+static int myri10ge_open(struct net_device *dev)
+{
+	struct myri10ge_priv *mgp;
+	struct myri10ge_cmd cmd;
+	int status, big_pow2;
+
+	mgp = netdev_priv(dev);
+
+	if (mgp->running != MYRI10GE_ETH_STOPPED)
+		return -EBUSY;
+
+	mgp->running = MYRI10GE_ETH_STARTING;
+	status = myri10ge_reset(mgp);
+	if (status != 0) {
+		printk(KERN_ERR "myri10ge: %s: failed reset\n", dev->name);
+		mgp->running = MYRI10GE_ETH_STOPPED;
+		return -ENXIO;
+	}
+
+	/* decide what small buffer size to use.  For good TCP rx
+	 * performance, it is important to not receive 1514 byte
+	 * frames into jumbo buffers, as it confuses the socket buffer
+	 * accounting code, leading to drops and erratic performance.
+	 */
+
+	if (dev->mtu <= ETH_DATA_LEN)
+		mgp->small_bytes = 128;	/* enough for a TCP header */
+	else
+		mgp->small_bytes = ETH_FRAME_LEN;	/* enough for an ETH_DATA_LEN frame */
+
+	/* Override the small buffer size? */
+	if (myri10ge_small_bytes > 0)
+		mgp->small_bytes = myri10ge_small_bytes;
+
+	/* If the user sets an obscenely small MTU, adjust the small
+	 * bytes down to nearly nothing */
+	if (mgp->small_bytes >= (dev->mtu + ETH_HLEN))
+		mgp->small_bytes = 64;
+
+	/* get the lanai pointers to the send and receive rings */
+
+	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SEND_OFFSET, &cmd, 0);
+	mgp->tx.lanai =
+	    (struct mcp_kreq_ether_send __iomem *)(mgp->sram + cmd.data0);
+
+	status |=
+	    myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd, 0);
+	mgp->rx_small.lanai =
+	    (struct mcp_kreq_ether_recv __iomem *)(mgp->sram + cmd.data0);
+
+	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd, 0);
+	mgp->rx_big.lanai =
+	    (struct mcp_kreq_ether_recv __iomem *)(mgp->sram + cmd.data0);
+
+	if (status != 0) {
+		printk(KERN_ERR
+		       "myri10ge: %s: failed to get ring sizes or locations\n",
+		       dev->name);
+		mgp->running = MYRI10GE_ETH_STOPPED;
+		return -ENXIO;
+	}
+
+	if (mgp->mtrr >= 0) {
+		mgp->tx.wc_fifo = (u8 __iomem *) mgp->sram + 0x200000;
+		mgp->rx_small.wc_fifo = (u8 __iomem *) mgp->sram + 0x300000;
+		mgp->rx_big.wc_fifo = (u8 __iomem *) mgp->sram + 0x340000;
+	} else {
+		mgp->tx.wc_fifo = NULL;
+		mgp->rx_small.wc_fifo = NULL;
+		mgp->rx_big.wc_fifo = NULL;
+	}
+
+	status = myri10ge_allocate_rings(dev);
+	if (status != 0)
+		goto abort_with_nothing;
+
+	/* Firmware needs the big buff size as a power of 2.  Lie and
+	 * tell him the buffer is larger, because we only use 1
+	 * buffer/pkt, and the mtu will prevent overruns.
+	 */
+	big_pow2 = dev->mtu + ETH_HLEN + MXGEFW_PAD;
+	while ((big_pow2 & (big_pow2 - 1)) != 0)
+		big_pow2++;
+
+	/* now give firmware buffers sizes, and MTU */
+	cmd.data0 = dev->mtu + ETH_HLEN + VLAN_HLEN;
+	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_MTU, &cmd, 0);
+	cmd.data0 = mgp->small_bytes;
+	status |=
+	    myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd, 0);
+	cmd.data0 = big_pow2;
+	status |=
+	    myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd, 0);
+	if (status) {
+		printk(KERN_ERR "myri10ge: %s: Couldn't set buffer sizes\n",
+		       dev->name);
+		goto abort_with_rings;
+	}
+
+	cmd.data0 = MYRI10GE_LOWPART_TO_U32(mgp->fw_stats_bus);
+	cmd.data1 = MYRI10GE_HIGHPART_TO_U32(mgp->fw_stats_bus);
+	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_STATS_DMA, &cmd, 0);
+	if (status) {
+		printk(KERN_ERR "myri10ge: %s: Couldn't set stats DMA\n",
+		       dev->name);
+		goto abort_with_rings;
+	}
+
+	mgp->link_state = -1;
+	mgp->rdma_tags_available = 15;
+
+	netif_poll_enable(mgp->dev);	/* must happen prior to any irq */
+
+	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_UP, &cmd, 0);
+	if (status) {
+		printk(KERN_ERR "myri10ge: %s: Couldn't bring up link\n",
+		       dev->name);
+		goto abort_with_rings;
+	}
+
+	mgp->wake_queue = 0;
+	mgp->stop_queue = 0;
+	mgp->running = MYRI10GE_ETH_RUNNING;
+	mgp->watchdog_timer.expires = jiffies + myri10ge_watchdog_timeout * HZ;
+	add_timer(&mgp->watchdog_timer);
+	netif_wake_queue(dev);
+	return 0;
+
+abort_with_rings:
+	myri10ge_free_rings(dev);
+
+abort_with_nothing:
+	mgp->running = MYRI10GE_ETH_STOPPED;
+	return -ENOMEM;
+}
+
+static int myri10ge_close(struct net_device *dev)
+{
+	struct myri10ge_priv *mgp;
+	struct myri10ge_cmd cmd;
+	int status, old_down_cnt;
+
+	mgp = netdev_priv(dev);
+
+	if (mgp->running != MYRI10GE_ETH_RUNNING)
+		return 0;
+
+	if (mgp->tx.req_bytes == NULL)
+		return 0;
+
+	del_timer_sync(&mgp->watchdog_timer);
+	mgp->running = MYRI10GE_ETH_STOPPING;
+	netif_poll_disable(mgp->dev);
+	netif_carrier_off(dev);
+	netif_stop_queue(dev);
+	old_down_cnt = mgp->down_cnt;
+	mb();
+	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0);
+	if (status)
+		printk(KERN_ERR "myri10ge: %s: Couldn't bring down link\n",
+		       dev->name);
+
+	wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt, HZ);
+	if (old_down_cnt == mgp->down_cnt)
+		printk(KERN_ERR "myri10ge: %s never got down irq\n", dev->name);
+
+	netif_tx_disable(dev);
+
+	myri10ge_free_rings(dev);
+
+	mgp->running = MYRI10GE_ETH_STOPPED;
+	return 0;
+}
+
+/* copy an array of struct mcp_kreq_ether_send's to the mcp.  Copy
+ * backwards one at a time and handle ring wraps */
+
+static inline void
+myri10ge_submit_req_backwards(struct myri10ge_tx_buf *tx,
+			      struct mcp_kreq_ether_send *src, int cnt)
+{
+	int idx, starting_slot;
+	starting_slot = tx->req;
+	while (cnt > 1) {
+		cnt--;
+		idx = (starting_slot + cnt) & tx->mask;
+		myri10ge_pio_copy(&tx->lanai[idx], &src[cnt], sizeof(*src));
+		mb();
+	}
+}
+
+/*
+ * copy an array of struct mcp_kreq_ether_send's to the mcp.  Copy
+ * at most 32 bytes at a time, so as to avoid involving the software
+ * pio handler in the nic.   We re-write the first segment's flags
+ * to mark them valid only after writing the entire chain.
+ */
+
+static inline void
+myri10ge_submit_req(struct myri10ge_tx_buf *tx, struct mcp_kreq_ether_send *src,
+		    int cnt)
+{
+	int idx, i;
+	struct mcp_kreq_ether_send __iomem *dstp, *dst;
+	struct mcp_kreq_ether_send *srcp;
+	u8 last_flags;
+
+	idx = tx->req & tx->mask;
+
+	last_flags = src->flags;
+	src->flags = 0;
+	mb();
+	dst = dstp = &tx->lanai[idx];
+	srcp = src;
+
+	if ((idx + cnt) < tx->mask) {
+		for (i = 0; i < (cnt - 1); i += 2) {
+			myri10ge_pio_copy(dstp, srcp, 2 * sizeof(*src));
+			mb();	/* force write every 32 bytes */
+			srcp += 2;
+			dstp += 2;
+		}
+	} else {
+		/* submit all but the first request, and ensure
+		 * that it is submitted below */
+		myri10ge_submit_req_backwards(tx, src, cnt);
+		i = 0;
+	}
+	if (i < cnt) {
+		/* submit the first request */
+		myri10ge_pio_copy(dstp, srcp, sizeof(*src));
+		mb();		/* barrier before setting valid flag */
+	}
+
+	/* re-write the last 32-bits with the valid flags */
+	src->flags = last_flags;
+	__raw_writel(*((u32 *) src + 3), (u32 __iomem *) dst + 3);
+	tx->req += cnt;
+	mb();
+}
+
+static inline void
+myri10ge_submit_req_wc(struct myri10ge_tx_buf *tx,
+		       struct mcp_kreq_ether_send *src, int cnt)
+{
+	tx->req += cnt;
+	mb();
+	while (cnt >= 4) {
+		myri10ge_pio_copy(tx->wc_fifo, src, 64);
+		mb();
+		src += 4;
+		cnt -= 4;
+	}
+	if (cnt > 0) {
+		/* pad it to 64 bytes.  The src is 64 bytes bigger than it
+		 * needs to be so that we don't overrun it */
+		myri10ge_pio_copy(tx->wc_fifo + (cnt << 18), src, 64);
+		mb();
+	}
+}
+
+/*
+ * Transmit a packet.  We need to split the packet so that a single
+ * segment does not cross myri10ge->tx.boundary, so this makes segment
+ * counting tricky.  So rather than try to count segments up front, we
+ * just give up if there are too few segments to hold a reasonably
+ * fragmented packet currently available.  If we run
+ * out of segments while preparing a packet for DMA, we just linearize
+ * it and try again.
+ */
+
+static int myri10ge_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct myri10ge_priv *mgp = netdev_priv(dev);
+	struct mcp_kreq_ether_send *req;
+	struct myri10ge_tx_buf *tx = &mgp->tx;
+	struct skb_frag_struct *frag;
+	dma_addr_t bus;
+	u32 low, high_swapped;
+	unsigned int len;
+	int idx, last_idx, avail, frag_cnt, frag_idx, count, mss, max_segments;
+	u16 pseudo_hdr_offset, cksum_offset;
+	int cum_len, seglen, boundary, rdma_count;
+	u8 flags, odd_flag;
+
+again:
+	req = tx->req_list;
+	avail = tx->mask - 1 - (tx->req - tx->done);
+
+	mss = 0;
+	max_segments = MXGEFW_MAX_SEND_DESC;
+
+#ifdef NETIF_F_TSO
+	if (skb->len > (dev->mtu + ETH_HLEN)) {
+		mss = skb_shinfo(skb)->tso_size;
+		if (mss != 0)
+			max_segments = MYRI10GE_MAX_SEND_DESC_TSO;
+	}
+#endif				/*NETIF_F_TSO */
+
+	if ((unlikely(avail < max_segments))) {
+		/* we are out of transmit resources */
+		mgp->stop_queue++;
+		netif_stop_queue(dev);
+		return 1;
+	}
+
+	/* Setup checksum offloading, if needed */
+	cksum_offset = 0;
+	pseudo_hdr_offset = 0;
+	odd_flag = 0;
+	flags = (MXGEFW_FLAGS_NO_TSO | MXGEFW_FLAGS_FIRST);
+	if (likely(skb->ip_summed == CHECKSUM_HW)) {
+		cksum_offset = (skb->h.raw - skb->data);
+		pseudo_hdr_offset = (skb->h.raw + skb->csum) - skb->data;
+		/* If the headers are excessively large, then we must
+		 * fall back to a software checksum */
+		if (unlikely(cksum_offset > 255 || pseudo_hdr_offset > 127)) {
+			if (skb_checksum_help(skb, 0))
+				goto drop;
+			cksum_offset = 0;
+			pseudo_hdr_offset = 0;
+		} else {
+			pseudo_hdr_offset = htons(pseudo_hdr_offset);
+			odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
+			flags |= MXGEFW_FLAGS_CKSUM;
+		}
+	}
+
+	cum_len = 0;
+
+#ifdef NETIF_F_TSO
+	if (mss) {		/* TSO */
+		/* this removes any CKSUM flag from before */
+		flags = (MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST);
+
+		/* negative cum_len signifies to the
+		 * send loop that we are still in the
+		 * header portion of the TSO packet.
+		 * TSO header must be at most 134 bytes long */
+		cum_len = -((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
+
+		/* for TSO, pseudo_hdr_offset holds mss.
+		 * The firmware figures out where to put
+		 * the checksum by parsing the header. */
+		pseudo_hdr_offset = htons(mss);
+	} else
+#endif				/*NETIF_F_TSO */
+		/* Mark small packets, and pad out tiny packets */
+	if (skb->len <= MXGEFW_SEND_SMALL_SIZE) {
+		flags |= MXGEFW_FLAGS_SMALL;
+
+		/* pad frames to at least ETH_ZLEN bytes */
+		if (unlikely(skb->len < ETH_ZLEN)) {
+			skb = skb_padto(skb, ETH_ZLEN);
+			if (skb == NULL) {
+				/* The packet is gone, so we must
+				 * return 0 */
+				mgp->stats.tx_dropped += 1;
+				return 0;
+			}
+			/* adjust the len to account for the zero pad
+			 * so that the nic can know how long it is */
+			skb->len = ETH_ZLEN;
+		}
+	}
+
+	/* map the skb for DMA */
+	len = skb->len - skb->data_len;
+	idx = tx->req & tx->mask;
+	tx->info[idx].skb = skb;
+	bus = pci_map_single(mgp->pdev, skb->data, len, PCI_DMA_TODEVICE);
+	pci_unmap_addr_set(&tx->info[idx], bus, bus);
+	pci_unmap_len_set(&tx->info[idx], len, len);
+
+	frag_cnt = skb_shinfo(skb)->nr_frags;
+	frag_idx = 0;
+	count = 0;
+	rdma_count = 0;
+
+	/* "rdma_count" is the number of RDMAs belonging to the
+	 * current packet BEFORE the current send request. For
+	 * non-TSO packets, this is equal to "count".
+	 * For TSO packets, rdma_count needs to be reset
+	 * to 0 after a segment cut.
+	 *
+	 * The rdma_count field of the send request is
+	 * the number of RDMAs of the packet starting at
+	 * that request. For TSO send requests with one ore more cuts
+	 * in the middle, this is the number of RDMAs starting
+	 * after the last cut in the request. All previous
+	 * segments before the last cut implicitly have 1 RDMA.
+	 *
+	 * Since the number of RDMAs is not known beforehand,
+	 * it must be filled-in retroactively - after each
+	 * segmentation cut or at the end of the entire packet.
+	 */
+
+	while (1) {
+		/* Break the SKB or Fragment up into pieces which
+		 * do not cross mgp->tx.boundary */
+		low = MYRI10GE_LOWPART_TO_U32(bus);
+		high_swapped = htonl(MYRI10GE_HIGHPART_TO_U32(bus));
+		while (len) {
+			u8 flags_next;
+			int cum_len_next;
+
+			if (unlikely(count == max_segments))
+				goto abort_linearize;
+
+			boundary = (low + tx->boundary) & ~(tx->boundary - 1);
+			seglen = boundary - low;
+			if (seglen > len)
+				seglen = len;
+			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
+			cum_len_next = cum_len + seglen;
+#ifdef NETIF_F_TSO
+			if (mss) {	/* TSO */
+				(req - rdma_count)->rdma_count = rdma_count + 1;
+
+				if (likely(cum_len >= 0)) {	/* payload */
+					int next_is_first, chop;
+
+					chop = (cum_len_next > mss);
+					cum_len_next = cum_len_next % mss;
+					next_is_first = (cum_len_next == 0);
+					flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
+					flags_next |= next_is_first *
+					    MXGEFW_FLAGS_FIRST;
+					rdma_count |= -(chop | next_is_first);
+					rdma_count += chop & !next_is_first;
+				} else if (likely(cum_len_next >= 0)) {	/* header ends */
+					int small;
+
+					rdma_count = -1;
+					cum_len_next = 0;
+					seglen = -cum_len;
+					small = (mss <= MXGEFW_SEND_SMALL_SIZE);
+					flags_next = MXGEFW_FLAGS_TSO_PLD |
+					    MXGEFW_FLAGS_FIRST |
+					    (small * MXGEFW_FLAGS_SMALL);
+				}
+			}
+#endif				/* NETIF_F_TSO */
+			req->addr_high = high_swapped;
+			req->addr_low = htonl(low);
+			req->pseudo_hdr_offset = pseudo_hdr_offset;
+			req->pad = 0;	/* complete solid 16-byte block; does this matter? */
+			req->rdma_count = 1;
+			req->length = htons(seglen);
+			req->cksum_offset = cksum_offset;
+			req->flags = flags | ((cum_len & 1) * odd_flag);
+
+			low += seglen;
+			len -= seglen;
+			cum_len = cum_len_next;
+			flags = flags_next;
+			req++;
+			count++;
+			rdma_count++;
+			if (unlikely(cksum_offset > seglen))
+				cksum_offset -= seglen;
+			else
+				cksum_offset = 0;
+		}
+		if (frag_idx == frag_cnt)
+			break;
+
+		/* map next fragment for DMA */
+		idx = (count + tx->req) & tx->mask;
+		frag = &skb_shinfo(skb)->frags[frag_idx];
+		frag_idx++;
+		len = frag->size;
+		bus = pci_map_page(mgp->pdev, frag->page, frag->page_offset,
+				   len, PCI_DMA_TODEVICE);
+		pci_unmap_addr_set(&tx->info[idx], bus, bus);
+		pci_unmap_len_set(&tx->info[idx], len, len);
+	}
+
+	(req - rdma_count)->rdma_count = rdma_count;
+#ifdef NETIF_F_TSO
+	if (mss)
+		do {
+			req--;
+			req->flags |= MXGEFW_FLAGS_TSO_LAST;
+		} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP |
+					 MXGEFW_FLAGS_FIRST)));
+#endif
+	idx = ((count - 1) + tx->req) & tx->mask;
+	tx->info[idx].last = 1;
+	if (tx->wc_fifo == NULL)
+		myri10ge_submit_req(tx, tx->req_list, count);
+	else
+		myri10ge_submit_req_wc(tx, tx->req_list, count);
+	tx->pkt_start++;
+	if ((avail - count) < MXGEFW_MAX_SEND_DESC) {
+		mgp->stop_queue++;
+		netif_stop_queue(dev);
+	}
+	dev->trans_start = jiffies;
+	return 0;
+
+abort_linearize:
+	/* Free any DMA resources we've alloced and clear out the skb
+	 * slot so as to not trip up assertions, and to avoid a
+	 * double-free if linearizing fails */
+
+	last_idx = (idx + 1) & tx->mask;
+	idx = tx->req & tx->mask;
+	tx->info[idx].skb = NULL;
+	do {
+		len = pci_unmap_len(&tx->info[idx], len);
+		if (len) {
+			if (tx->info[idx].skb != NULL)
+				pci_unmap_single(mgp->pdev,
+						 pci_unmap_addr(&tx->info[idx],
+								bus), len,
+						 PCI_DMA_TODEVICE);
+			else
+				pci_unmap_page(mgp->pdev,
+					       pci_unmap_addr(&tx->info[idx],
+							      bus), len,
+					       PCI_DMA_TODEVICE);
+			pci_unmap_len_set(&tx->info[idx], len, 0);
+			tx->info[idx].skb = NULL;
+		}
+		idx = (idx + 1) & tx->mask;
+	} while (idx != last_idx);
+	if (skb_shinfo(skb)->tso_size) {
+		printk(KERN_ERR
+		       "myri10ge: %s: TSO but wanted to linearize?!?!?\n",
+		       mgp->dev->name);
+		goto drop;
+	}
+
+	if (skb_linearize(skb, GFP_ATOMIC))
+		goto drop;
+
+	mgp->tx_linearized++;
+	goto again;
+
+drop:
+	dev_kfree_skb_any(skb);
+	mgp->stats.tx_dropped += 1;
+	return 0;
+
+}
+
+static struct net_device_stats *myri10ge_get_stats(struct net_device *dev)
+{
+	struct myri10ge_priv *mgp = netdev_priv(dev);
+	return &mgp->stats;
+}
+
+static void myri10ge_set_multicast_list(struct net_device *dev)
+{
+	/* can be called from atomic contexts,
+	 * pass 1 to force atomicity in myri10ge_send_cmd() */
+	myri10ge_change_promisc(netdev_priv(dev), dev->flags & IFF_PROMISC, 1);
+}
+
+static int myri10ge_set_mac_address(struct net_device *dev, void *addr)
+{
+	struct sockaddr *sa = addr;
+	struct myri10ge_priv *mgp = netdev_priv(dev);
+	int status;
+
+	if (!is_valid_ether_addr(sa->sa_data))
+		return -EADDRNOTAVAIL;
+
+	status = myri10ge_update_mac_address(mgp, sa->sa_data);
+	if (status != 0) {
+		printk(KERN_ERR
+		       "myri10ge: %s: changing mac address failed with %d\n",
+		       dev->name, status);
+		return status;
+	}
+
+	/* change the dev structure */
+	memcpy(dev->dev_addr, sa->sa_data, 6);
+	return 0;
+}
+
+static int myri10ge_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct myri10ge_priv *mgp = netdev_priv(dev);
+	int error = 0;
+
+	if ((new_mtu < 68) || (ETH_HLEN + new_mtu > MYRI10GE_MAX_ETHER_MTU)) {
+		printk(KERN_ERR "myri10ge: %s: new mtu (%d) is not valid\n",
+		       dev->name, new_mtu);
+		return -EINVAL;
+	}
+	printk(KERN_INFO "%s: changing mtu from %d to %d\n",
+	       dev->name, dev->mtu, new_mtu);
+	if (mgp->running) {
+		/* if we change the mtu on an active device, we must
+		 * reset the device so the firmware sees the change */
+		myri10ge_close(dev);
+		dev->mtu = new_mtu;
+		myri10ge_open(dev);
+	} else
+		dev->mtu = new_mtu;
+
+	return error;
+}
+
+/*
+ * Enable ECRC to align PCI-E Completion packets on an 8-byte boundary.
+ * Only do it if the bridge is a root port since we don't want to disturb
+ * any other device, except if forced with myri10ge_ecrc_enable > 1.
+ */
+
+#define PCI_DEVICE_ID_NVIDIA_NFORCE_CK804_PCIE	0x005d
+
+static void myri10ge_enable_ecrc(struct myri10ge_priv *mgp)
+{
+	struct pci_dev *bridge = mgp->pdev->bus->self;
+	struct device *dev = &mgp->pdev->dev;
+	unsigned cap;
+	unsigned err_cap;
+	u16 val;
+	u8 ext_type;
+	int ret;
+
+	if (!myri10ge_ecrc_enable || !bridge)
+		return;
+
+	/* check that the bridge is a root port */
+	cap = pci_find_capability(bridge, PCI_CAP_ID_EXP);
+	pci_read_config_word(bridge, cap + PCI_CAP_FLAGS, &val);
+	ext_type = (val & PCI_EXP_FLAGS_TYPE) >> 4;
+	if (ext_type != PCI_EXP_TYPE_ROOT_PORT) {
+		if (myri10ge_ecrc_enable > 1) {
+			struct pci_dev *old_bridge = bridge;
+
+			/* Walk the hierarchy up to the root port
+			 * where ECRC has to be enabled */
+			do {
+				bridge = bridge->bus->self;
+				if (!bridge) {
+					dev_err(dev,
+						"Failed to find root port"
+						" to force ECRC\n");
+					return;
+				}
+				cap =
+				    pci_find_capability(bridge, PCI_CAP_ID_EXP);
+				pci_read_config_word(bridge,
+						     cap + PCI_CAP_FLAGS, &val);
+				ext_type = (val & PCI_EXP_FLAGS_TYPE) >> 4;
+			} while (ext_type != PCI_EXP_TYPE_ROOT_PORT);
+
+			dev_info(dev,
+				 "Forcing ECRC on non-root port %s"
+				 " (enabling on root port %s)\n",
+				 pci_name(old_bridge), pci_name(bridge));
+		} else {
+			dev_err(dev,
+				"Not enabling ECRC on non-root port %s\n",
+				pci_name(bridge));
+			return;
+		}
+	}
+
+	cap = pci_find_ext_capability(bridge, PCI_EXT_CAP_ID_ERR);
+	/* nvidia ext cap is not always linked in ext cap chain */
+	if (!cap
+	    && bridge->vendor == PCI_VENDOR_ID_NVIDIA
+	    && bridge->device == PCI_DEVICE_ID_NVIDIA_NFORCE_CK804_PCIE)
+		cap = 0x160;
+
+	if (!cap)
+		return;
+
+	ret = pci_read_config_dword(bridge, cap + PCI_ERR_CAP, &err_cap);
+	if (ret) {
+		dev_err(dev, "failed reading ext-conf-space of %s\n",
+			pci_name(bridge));
+		dev_err(dev, "\t pci=nommconf in use? "
+			"or buggy/incomplete/absent ACPI MCFG attr?\n");
+		return;
+	}
+	if (!(err_cap & PCI_ERR_CAP_ECRC_GENC))
+		return;
+
+	err_cap |= PCI_ERR_CAP_ECRC_GENE;
+	pci_write_config_dword(bridge, cap + PCI_ERR_CAP, err_cap);
+	dev_info(dev, "Enabled ECRC on upstream bridge %s\n", pci_name(bridge));
+	mgp->tx.boundary = 4096;
+	mgp->fw_name = myri10ge_fw_aligned;
+}
+
+/*
+ * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
+ * when the PCI-E Completion packets are aligned on an 8-byte
+ * boundary.  Some PCI-E chip sets always align Completion packets; on
+ * the ones that do not, the alignment can be enforced by enabling
+ * ECRC generation (if supported).
+ *
+ * When PCI-E Completion packets are not aligned, it is actually more
+ * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
+ *
+ * If the driver can neither enable ECRC nor verify that it has
+ * already been enabled, then it must use a firmware image which works
+ * around unaligned completion packets (myri10ge_ethp_z8e.dat), and it
+ * should also ensure that it never gives the device a Read-DMA which is
+ * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
+ * enabled, then the driver should use the aligned (myri10ge_eth_z8e.dat)
+ * firmware image, and set tx.boundary to 4KB.
+ */
+
+#define PCI_DEVICE_ID_SERVERWORKS_HT2000_PCIE	0x0132
+
+static void myri10ge_select_firmware(struct myri10ge_priv *mgp)
+{
+	struct pci_dev *bridge = mgp->pdev->bus->self;
+
+	mgp->tx.boundary = 2048;
+	mgp->fw_name = myri10ge_fw_unaligned;
+
+	if (myri10ge_force_firmware == 0) {
+		myri10ge_enable_ecrc(mgp);
+
+		/* Check to see if the upstream bridge is known to
+		 * provide aligned completions */
+		if (bridge
+		    /* ServerWorks HT2000/HT1000 */
+		    && bridge->vendor == PCI_VENDOR_ID_SERVERWORKS
+		    && bridge->device ==
+		    PCI_DEVICE_ID_SERVERWORKS_HT2000_PCIE) {
+			dev_info(&mgp->pdev->dev,
+				 "Assuming aligned completions (0x%x:0x%x)\n",
+				 bridge->vendor, bridge->device);
+			mgp->tx.boundary = 4096;
+			mgp->fw_name = myri10ge_fw_aligned;
+		}
+	} else {
+		if (myri10ge_force_firmware == 1) {
+			dev_info(&mgp->pdev->dev,
+				 "Assuming aligned completions (forced)\n");
+			mgp->tx.boundary = 4096;
+			mgp->fw_name = myri10ge_fw_aligned;
+		} else {
+			dev_info(&mgp->pdev->dev,
+				 "Assuming unaligned completions (forced)\n");
+			mgp->tx.boundary = 2048;
+			mgp->fw_name = myri10ge_fw_unaligned;
+		}
+	}
+	if (myri10ge_fw_name != NULL) {
+		dev_info(&mgp->pdev->dev, "overriding firmware to %s\n",
+			 myri10ge_fw_name);
+		mgp->fw_name = myri10ge_fw_name;
+	}
+}
+
+static void myri10ge_save_state(struct myri10ge_priv *mgp)
+{
+	struct pci_dev *pdev = mgp->pdev;
+	int cap;
+
+	pci_save_state(pdev);
+	/* now save PCIe and MSI state that Linux will not
+	 * save for us */
+	cap = pci_find_capability(pdev, PCI_CAP_ID_EXP);
+	pci_read_config_dword(pdev, cap + PCI_EXP_DEVCTL, &mgp->devctl);
+	cap = pci_find_capability(pdev, PCI_CAP_ID_MSI);
+	pci_read_config_word(pdev, cap + PCI_MSI_FLAGS, &mgp->msi_flags);
+}
+
+static void myri10ge_restore_state(struct myri10ge_priv *mgp)
+{
+	struct pci_dev *pdev = mgp->pdev;
+	int cap;
+
+	/* restore PCIe and MSI state that linux will not */
+	cap = pci_find_capability(pdev, PCI_CAP_ID_EXP);
+	pci_write_config_dword(pdev, cap + PCI_CAP_ID_EXP, mgp->devctl);
+	cap = pci_find_capability(pdev, PCI_CAP_ID_MSI);
+	pci_write_config_word(pdev, cap + PCI_MSI_FLAGS, mgp->msi_flags);
+
+	pci_restore_state(pdev);
+}
+
+#ifdef CONFIG_PM
+
+static int myri10ge_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+	struct myri10ge_priv *mgp;
+	struct net_device *netdev;
+
+	mgp = pci_get_drvdata(pdev);
+	if (mgp == NULL)
+		return -EINVAL;
+	netdev = mgp->dev;
+
+	netif_device_detach(netdev);
+	if (netif_running(netdev)) {
+		printk(KERN_INFO "myri10ge: closing %s\n", netdev->name);
+		rtnl_lock();
+		myri10ge_close(netdev);
+		rtnl_unlock();
+	}
+	myri10ge_dummy_rdma(mgp, 0);
+	free_irq(pdev->irq, mgp);
+	myri10ge_save_state(mgp);
+	pci_disable_device(pdev);
+	pci_set_power_state(pdev, pci_choose_state(pdev, state));
+	return 0;
+}
+
+static int myri10ge_resume(struct pci_dev *pdev)
+{
+	struct myri10ge_priv *mgp;
+	struct net_device *netdev;
+	int status;
+	u16 vendor;
+
+	mgp = pci_get_drvdata(pdev);
+	if (mgp == NULL)
+		return -EINVAL;
+	netdev = mgp->dev;
+	pci_set_power_state(pdev, 0);	/* zeros conf space as a side effect */
+	msleep(5);		/* give card time to respond */
+	pci_read_config_word(mgp->pdev, PCI_VENDOR_ID, &vendor);
+	if (vendor == 0xffff) {
+		printk(KERN_ERR "myri10ge: %s: device disappeared!\n",
+		       mgp->dev->name);
+		return -EIO;
+	}
+	myri10ge_restore_state(mgp);
+	pci_enable_device(pdev);
+	pci_set_master(pdev);
+
+	status = request_irq(pdev->irq, myri10ge_intr, SA_SHIRQ,
+			     netdev->name, mgp);
+	if (status != 0) {
+		dev_err(&pdev->dev, "failed to allocate IRQ\n");
+		goto abort_with_msi;
+	}
+
+	myri10ge_reset(mgp);
+	myri10ge_dummy_rdma(mgp, mgp->tx.boundary != 4096);
+
+	/* Save configuration space to be restored if the
+	 * nic resets due to a parity error */
+	myri10ge_save_state(mgp);
+
+	if (netif_running(netdev)) {
+		rtnl_lock();
+		myri10ge_open(netdev);
+		rtnl_unlock();
+	}
+	netif_device_attach(netdev);
+
+	return 0;
+
+abort_with_msi:
+	return -EIO;
+
+}
+
+#endif				/* CONFIG_PM */
+
+static u32 myri10ge_read_reboot(struct myri10ge_priv *mgp)
+{
+	struct pci_dev *pdev = mgp->pdev;
+	int vs = mgp->vendor_specific_offset;
+	u32 reboot;
+
+	/*enter read32 mode */
+	pci_write_config_byte(pdev, vs + 0x10, 0x3);
+
+	/*read REBOOT_STATUS (0xfffffff0) */
+	pci_write_config_dword(pdev, vs + 0x18, 0xfffffff0);
+	pci_read_config_dword(pdev, vs + 0x14, &reboot);
+	return reboot;
+}
+
+/*
+ * This watchdog is used to check whether the board has suffered
+ * from a parity error and needs to be recovered.
+ */
+static void myri10ge_watchdog(void *arg)
+{
+	struct myri10ge_priv *mgp = arg;
+	u32 reboot;
+	int status;
+	u16 cmd, vendor;
+
+	mgp->watchdog_resets++;
+	pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd);
+	if ((cmd & PCI_COMMAND_MASTER) == 0) {
+		/* Bus master DMA disabled?  Check to see
+		 * if the card rebooted due to a parity error
+		 * For now, just report it */
+		reboot = myri10ge_read_reboot(mgp);
+		printk(KERN_ERR
+		       "myri10ge: %s: NIC rebooted (0x%x), resetting\n",
+		       mgp->dev->name, reboot);
+		/*
+		 * A rebooted nic will come back with config space as
+		 * it was after power was applied to PCIe bus.
+		 * Attempt to restore config space which was saved
+		 * when the driver was loaded, or the last time the
+		 * nic was resumed from power saving mode.
+		 */
+		myri10ge_restore_state(mgp);
+	} else {
+		/* if we get back -1's from our slot, perhaps somebody
+		 * powered off our card.  Don't try to reset it in
+		 * this case */
+		if (cmd == 0xffff) {
+			pci_read_config_word(mgp->pdev, PCI_VENDOR_ID, &vendor);
+			if (vendor == 0xffff) {
+				printk(KERN_ERR
+				       "myri10ge: %s: device disappeared!\n",
+				       mgp->dev->name);
+				return;
+			}
+		}
+		/* Perhaps it is a software error.  Try to reset */
+
+		printk(KERN_ERR "myri10ge: %s: device timeout, resetting\n",
+		       mgp->dev->name);
+		printk(KERN_INFO "myri10ge: %s: %d %d %d %d %d\n",
+		       mgp->dev->name, mgp->tx.req, mgp->tx.done,
+		       mgp->tx.pkt_start, mgp->tx.pkt_done,
+		       (int)ntohl(mgp->fw_stats->send_done_count));
+		msleep(2000);
+		printk(KERN_INFO "myri10ge: %s: %d %d %d %d %d\n",
+		       mgp->dev->name, mgp->tx.req, mgp->tx.done,
+		       mgp->tx.pkt_start, mgp->tx.pkt_done,
+		       (int)ntohl(mgp->fw_stats->send_done_count));
+	}
+	rtnl_lock();
+	myri10ge_close(mgp->dev);
+	status = myri10ge_load_firmware(mgp);
+	if (status != 0)
+		printk(KERN_ERR "myri10ge: %s: failed to load firmware\n",
+		       mgp->dev->name);
+	else
+		myri10ge_open(mgp->dev);
+	rtnl_unlock();
+}
+
+/*
+ * We use our own timer routine rather than relying upon
+ * netdev->tx_timeout because we have a very large hardware transmit
+ * queue.  Due to the large queue, the netdev->tx_timeout function
+ * cannot detect a NIC with a parity error in a timely fashion if the
+ * NIC is lightly loaded.
+ */
+static void myri10ge_watchdog_timer(unsigned long arg)
+{
+	struct myri10ge_priv *mgp;
+
+	mgp = (struct myri10ge_priv *)arg;
+	if (mgp->tx.req != mgp->tx.done &&
+	    mgp->tx.done == mgp->watchdog_tx_done)
+		/* nic seems like it might be stuck.. */
+		schedule_work(&mgp->watchdog_work);
+	else
+		/* rearm timer */
+		mod_timer(&mgp->watchdog_timer,
+			  jiffies + myri10ge_watchdog_timeout * HZ);
+
+	mgp->watchdog_tx_done = mgp->tx.done;
+}
+
+static int myri10ge_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	struct net_device *netdev;
+	struct myri10ge_priv *mgp;
+	struct device *dev = &pdev->dev;
+	size_t bytes;
+	int i;
+	int status = -ENXIO;
+	int cap;
+	int dac_enabled;
+	u16 val;
+
+	netdev = alloc_etherdev(sizeof(*mgp));
+	if (netdev == NULL) {
+		dev_err(dev, "Could not allocate ethernet device\n");
+		return -ENOMEM;
+	}
+
+	mgp = netdev_priv(netdev);
+	memset(mgp, 0, sizeof(*mgp));
+	mgp->dev = netdev;
+	mgp->pdev = pdev;
+	mgp->csum_flag = MXGEFW_FLAGS_CKSUM;
+	mgp->pause = myri10ge_flow_control;
+	mgp->intr_coal_delay = myri10ge_intr_coal_delay;
+	init_waitqueue_head(&mgp->down_wq);
+
+	if (pci_enable_device(pdev)) {
+		dev_err(&pdev->dev, "pci_enable_device call failed\n");
+		status = -ENODEV;
+		goto abort_with_netdev;
+	}
+	myri10ge_select_firmware(mgp);
+
+	/* Find the vendor-specific cap so we can check
+	 * the reboot register later on */
+	mgp->vendor_specific_offset
+	    = pci_find_capability(pdev, PCI_CAP_ID_VNDR);
+
+	/* Set our max read request to 4KB */
+	cap = pci_find_capability(pdev, PCI_CAP_ID_EXP);
+	if (cap < 64) {
+		dev_err(&pdev->dev, "Bad PCI_CAP_ID_EXP location %d\n", cap);
+		goto abort_with_netdev;
+	}
+	status = pci_read_config_word(pdev, cap + PCI_EXP_DEVCTL, &val);
+	if (status != 0) {
+		dev_err(&pdev->dev, "Error %d reading PCI_EXP_DEVCTL\n",
+			status);
+		goto abort_with_netdev;
+	}
+	val = (val & ~PCI_EXP_DEVCTL_READRQ) | (5 << 12);
+	status = pci_write_config_word(pdev, cap + PCI_EXP_DEVCTL, val);
+	if (status != 0) {
+		dev_err(&pdev->dev, "Error %d writing PCI_EXP_DEVCTL\n",
+			status);
+		goto abort_with_netdev;
+	}
+
+	pci_set_master(pdev);
+	dac_enabled = 1;
+	status = pci_set_dma_mask(pdev, DMA_64BIT_MASK);
+	if (status != 0) {
+		dac_enabled = 0;
+		dev_err(&pdev->dev,
+			"64-bit pci address mask was refused, trying 32-bit");
+		status = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
+	}
+	if (status != 0) {
+		dev_err(&pdev->dev, "Error %d setting DMA mask\n", status);
+		goto abort_with_netdev;
+	}
+	mgp->cmd = pci_alloc_consistent(pdev, sizeof(*mgp->cmd), &mgp->cmd_bus);
+	if (mgp->cmd == NULL)
+		goto abort_with_netdev;
+
+	mgp->fw_stats = pci_alloc_consistent(pdev, sizeof(*mgp->fw_stats),
+					     &mgp->fw_stats_bus);
+	if (mgp->fw_stats == NULL)
+		goto abort_with_cmd;
+
+	mgp->board_span = pci_resource_len(pdev, 0);
+	mgp->iomem_base = pci_resource_start(pdev, 0);
+	mgp->mtrr = -1;
+#ifdef CONFIG_MTRR
+	mgp->mtrr = mtrr_add(mgp->iomem_base, mgp->board_span,
+			     MTRR_TYPE_WRCOMB, 1);
+#endif
+	/* Hack.  need to get rid of these magic numbers */
+	mgp->sram_size =
+	    2 * 1024 * 1024 - (2 * (48 * 1024) + (32 * 1024)) - 0x100;
+	if (mgp->sram_size > mgp->board_span) {
+		dev_err(&pdev->dev, "board span %ld bytes too small\n",
+			mgp->board_span);
+		goto abort_with_wc;
+	}
+	mgp->sram = ioremap(mgp->iomem_base, mgp->board_span);
+	if (mgp->sram == NULL) {
+		dev_err(&pdev->dev, "ioremap failed for %ld bytes at 0x%lx\n",
+			mgp->board_span, mgp->iomem_base);
+		status = -ENXIO;
+		goto abort_with_wc;
+	}
+	memcpy_fromio(mgp->eeprom_strings,
+		      mgp->sram + mgp->sram_size - MYRI10GE_EEPROM_STRINGS_SIZE,
+		      MYRI10GE_EEPROM_STRINGS_SIZE);
+	memset(mgp->eeprom_strings + MYRI10GE_EEPROM_STRINGS_SIZE - 2, 0, 2);
+	status = myri10ge_read_mac_addr(mgp);
+	if (status)
+		goto abort_with_ioremap;
+
+	for (i = 0; i < ETH_ALEN; i++)
+		netdev->dev_addr[i] = mgp->mac_addr[i];
+
+	/* allocate rx done ring */
+	bytes = myri10ge_max_intr_slots * sizeof(*mgp->rx_done.entry);
+	mgp->rx_done.entry =
+	    pci_alloc_consistent(pdev, bytes, &mgp->rx_done.bus);
+	if (mgp->rx_done.entry == NULL)
+		goto abort_with_ioremap;
+	memset(mgp->rx_done.entry, 0, bytes);
+
+	status = myri10ge_load_firmware(mgp);
+	if (status != 0) {
+		dev_err(&pdev->dev, "failed to load firmware\n");
+		goto abort_with_rx_done;
+	}
+
+	status = myri10ge_reset(mgp);
+	if (status != 0) {
+		dev_err(&pdev->dev, "failed reset\n");
+		goto abort_with_firmware;
+	}
+
+	if (myri10ge_msi) {
+		status = pci_enable_msi(pdev);
+		if (status != 0)
+			dev_err(&pdev->dev,
+				"Error %d setting up MSI; falling back to xPIC\n",
+				status);
+		else
+			mgp->msi_enabled = 1;
+	}
+
+	status = request_irq(pdev->irq, myri10ge_intr, SA_SHIRQ,
+			     netdev->name, mgp);
+	if (status != 0) {
+		dev_err(&pdev->dev, "failed to allocate IRQ\n");
+		goto abort_with_firmware;
+	}
+
+	pci_set_drvdata(pdev, mgp);
+	if ((myri10ge_initial_mtu + ETH_HLEN) > MYRI10GE_MAX_ETHER_MTU)
+		myri10ge_initial_mtu = MYRI10GE_MAX_ETHER_MTU - ETH_HLEN;
+	if ((myri10ge_initial_mtu + ETH_HLEN) < 68)
+		myri10ge_initial_mtu = 68;
+	netdev->mtu = myri10ge_initial_mtu;
+	netdev->open = myri10ge_open;
+	netdev->stop = myri10ge_close;
+	netdev->hard_start_xmit = myri10ge_xmit;
+	netdev->get_stats = myri10ge_get_stats;
+	netdev->base_addr = mgp->iomem_base;
+	netdev->irq = pdev->irq;
+	netdev->change_mtu = myri10ge_change_mtu;
+	netdev->set_multicast_list = myri10ge_set_multicast_list;
+	netdev->set_mac_address = myri10ge_set_mac_address;
+	netdev->features = NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_TSO;
+	if (dac_enabled)
+		netdev->features |= NETIF_F_HIGHDMA;
+	netdev->poll = myri10ge_poll;
+	netdev->weight = myri10ge_napi_weight;
+
+	/* Save configuration space to be restored if the
+	 * nic resets due to a parity error */
+	myri10ge_save_state(mgp);
+	/* Restore state immediately since pci_save_msi_state disables MSI */
+	myri10ge_restore_state(mgp);
+
+	/* Setup the watchdog timer */
+	setup_timer(&mgp->watchdog_timer, myri10ge_watchdog_timer,
+		    (unsigned long)mgp);
+
+	SET_ETHTOOL_OPS(netdev, &myri10ge_ethtool_ops);
+	INIT_WORK(&mgp->watchdog_work, myri10ge_watchdog, mgp);
+	status = register_netdev(netdev);
+	if (status != 0) {
+		dev_err(&pdev->dev, "register_netdev failed: %d\n", status);
+		goto abort_with_irq;
+	}
+
+	printk(KERN_INFO "myri10ge: %s: %s IRQ %d, tx bndry %d, fw %s, WC %s\n",
+	       netdev->name, (mgp->msi_enabled ? "MSI" : "xPIC"),
+	       pdev->irq, mgp->tx.boundary, mgp->fw_name,
+	       (mgp->mtrr >= 0 ? "Enabled" : "Disabled"));
+
+	return 0;
+
+abort_with_irq:
+	free_irq(pdev->irq, mgp);
+	if (mgp->msi_enabled)
+		pci_disable_msi(pdev);
+
+abort_with_firmware:
+	myri10ge_dummy_rdma(mgp, 0);
+
+abort_with_rx_done:
+	bytes = myri10ge_max_intr_slots * sizeof(*mgp->rx_done.entry);
+	pci_free_consistent(pdev, bytes, mgp->rx_done.entry, mgp->rx_done.bus);
+
+abort_with_ioremap:
+	iounmap(mgp->sram);
+
+abort_with_wc:
+#ifdef CONFIG_MTRR
+	if (mgp->mtrr >= 0)
+		mtrr_del(mgp->mtrr, mgp->iomem_base, mgp->board_span);
+#endif
+	pci_free_consistent(pdev, sizeof(*mgp->fw_stats),
+			    mgp->fw_stats, mgp->fw_stats_bus);
+
+abort_with_cmd:
+	pci_free_consistent(pdev, sizeof(*mgp->cmd), mgp->cmd, mgp->cmd_bus);
+
+abort_with_netdev:
+
+	free_netdev(netdev);
+	return status;
+}
+
+/*
+ * myri10ge_remove
+ *
+ * Does what is necessary to shutdown one Myrinet device. Called
+ *   once for each Myrinet card by the kernel when a module is
+ *   unloaded.
+ */
+static void myri10ge_remove(struct pci_dev *pdev)
+{
+	struct myri10ge_priv *mgp;
+	struct net_device *netdev;
+	size_t bytes;
+
+	mgp = pci_get_drvdata(pdev);
+	if (mgp == NULL)
+		return;
+
+	flush_scheduled_work();
+	netdev = mgp->dev;
+	unregister_netdev(netdev);
+	free_irq(pdev->irq, mgp);
+	if (mgp->msi_enabled)
+		pci_disable_msi(pdev);
+
+	myri10ge_dummy_rdma(mgp, 0);
+
+	bytes = myri10ge_max_intr_slots * sizeof(*mgp->rx_done.entry);
+	pci_free_consistent(pdev, bytes, mgp->rx_done.entry, mgp->rx_done.bus);
+
+	iounmap(mgp->sram);
+
+#ifdef CONFIG_MTRR
+	if (mgp->mtrr >= 0)
+		mtrr_del(mgp->mtrr, mgp->iomem_base, mgp->board_span);
+#endif
+	pci_free_consistent(pdev, sizeof(*mgp->fw_stats),
+			    mgp->fw_stats, mgp->fw_stats_bus);
+
+	pci_free_consistent(pdev, sizeof(*mgp->cmd), mgp->cmd, mgp->cmd_bus);
+
+	free_netdev(netdev);
+	pci_set_drvdata(pdev, NULL);
+}
+
+#define PCI_DEVICE_ID_MYIRCOM_MYRI10GE_Z8E 	0x0008
+
+static struct pci_device_id myri10ge_pci_tbl[] = {
+	{PCI_DEVICE(PCI_VENDOR_ID_MYRICOM, PCI_DEVICE_ID_MYIRCOM_MYRI10GE_Z8E)},
+	{0},
+};
+
+static struct pci_driver myri10ge_driver = {
+	.name = "myri10ge",
+	.probe = myri10ge_probe,
+	.remove = myri10ge_remove,
+	.id_table = myri10ge_pci_tbl,
+#ifdef CONFIG_PM
+	.suspend = myri10ge_suspend,
+	.resume = myri10ge_resume,
+#endif
+};
+
+static __init int myri10ge_init_module(void)
+{
+	printk(KERN_INFO "%s: Version %s\n", myri10ge_driver.name,
+	       MYRI10GE_VERSION_STR);
+	return pci_register_driver(&myri10ge_driver);
+}
+
+module_init(myri10ge_init_module);
+
+static __exit void myri10ge_cleanup_module(void)
+{
+	pci_unregister_driver(&myri10ge_driver);
+}
+
+module_exit(myri10ge_cleanup_module);
diff --git a/drivers/net/myri10ge/myri10ge_mcp.h b/drivers/net/myri10ge/myri10ge_mcp.h
new file mode 100644
index 000000000000..ac23e18db63d
--- /dev/null
+++ b/drivers/net/myri10ge/myri10ge_mcp.h
@@ -0,0 +1,205 @@
+#ifndef __MYRI10GE_MCP_H__
+#define __MYRI10GE_MCP_H__
+
+#define MXGEFW_VERSION_MAJOR	1
+#define MXGEFW_VERSION_MINOR	4
+
+/* 8 Bytes */
+struct mcp_dma_addr {
+	u32 high;
+	u32 low;
+};
+
+/* 4 Bytes */
+struct mcp_slot {
+	u16 checksum;
+	u16 length;
+};
+
+/* 64 Bytes */
+struct mcp_cmd {
+	u32 cmd;
+	u32 data0;		/* will be low portion if data > 32 bits */
+	/* 8 */
+	u32 data1;		/* will be high portion if data > 32 bits */
+	u32 data2;		/* currently unused.. */
+	/* 16 */
+	struct mcp_dma_addr response_addr;
+	/* 24 */
+	u8 pad[40];
+};
+
+/* 8 Bytes */
+struct mcp_cmd_response {
+	u32 data;
+	u32 result;
+};
+
+/* 
+ * flags used in mcp_kreq_ether_send_t:
+ * 
+ * The SMALL flag is only needed in the first segment. It is raised
+ * for packets that are total less or equal 512 bytes.
+ * 
+ * The CKSUM flag must be set in all segments.
+ * 
+ * The PADDED flags is set if the packet needs to be padded, and it
+ * must be set for all segments.
+ * 
+ * The  MXGEFW_FLAGS_ALIGN_ODD must be set if the cumulative
+ * length of all previous segments was odd.
+ */
+
+#define MXGEFW_FLAGS_SMALL      0x1
+#define MXGEFW_FLAGS_TSO_HDR    0x1
+#define MXGEFW_FLAGS_FIRST      0x2
+#define MXGEFW_FLAGS_ALIGN_ODD  0x4
+#define MXGEFW_FLAGS_CKSUM      0x8
+#define MXGEFW_FLAGS_TSO_LAST   0x8
+#define MXGEFW_FLAGS_NO_TSO     0x10
+#define MXGEFW_FLAGS_TSO_CHOP   0x10
+#define MXGEFW_FLAGS_TSO_PLD    0x20
+
+#define MXGEFW_SEND_SMALL_SIZE  1520
+#define MXGEFW_MAX_MTU          9400
+
+union mcp_pso_or_cumlen {
+	u16 pseudo_hdr_offset;
+	u16 cum_len;
+};
+
+#define	MXGEFW_MAX_SEND_DESC 12
+#define MXGEFW_PAD	    2
+
+/* 16 Bytes */
+struct mcp_kreq_ether_send {
+	u32 addr_high;
+	u32 addr_low;
+	u16 pseudo_hdr_offset;
+	u16 length;
+	u8 pad;
+	u8 rdma_count;
+	u8 cksum_offset;	/* where to start computing cksum */
+	u8 flags;		/* as defined above */
+};
+
+/* 8 Bytes */
+struct mcp_kreq_ether_recv {
+	u32 addr_high;
+	u32 addr_low;
+};
+
+/* Commands */
+
+#define MXGEFW_CMD_OFFSET 0xf80000
+
+enum myri10ge_mcp_cmd_type {
+	MXGEFW_CMD_NONE = 0,
+	/* Reset the mcp, it is left in a safe state, waiting
+	 * for the driver to set all its parameters */
+	MXGEFW_CMD_RESET,
+
+	/* get the version number of the current firmware..
+	 * (may be available in the eeprom strings..? */
+	MXGEFW_GET_MCP_VERSION,
+
+	/* Parameters which must be set by the driver before it can
+	 * issue MXGEFW_CMD_ETHERNET_UP. They persist until the next
+	 * MXGEFW_CMD_RESET is issued */
+
+	MXGEFW_CMD_SET_INTRQ_DMA,
+	MXGEFW_CMD_SET_BIG_BUFFER_SIZE,	/* in bytes, power of 2 */
+	MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,	/* in bytes */
+
+	/* Parameters which refer to lanai SRAM addresses where the 
+	 * driver must issue PIO writes for various things */
+
+	MXGEFW_CMD_GET_SEND_OFFSET,
+	MXGEFW_CMD_GET_SMALL_RX_OFFSET,
+	MXGEFW_CMD_GET_BIG_RX_OFFSET,
+	MXGEFW_CMD_GET_IRQ_ACK_OFFSET,
+	MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
+
+	/* Parameters which refer to rings stored on the MCP,
+	 * and whose size is controlled by the mcp */
+
+	MXGEFW_CMD_GET_SEND_RING_SIZE,	/* in bytes */
+	MXGEFW_CMD_GET_RX_RING_SIZE,	/* in bytes */
+
+	/* Parameters which refer to rings stored in the host,
+	 * and whose size is controlled by the host.  Note that
+	 * all must be physically contiguous and must contain 
+	 * a power of 2 number of entries.  */
+
+	MXGEFW_CMD_SET_INTRQ_SIZE,	/* in bytes */
+
+	/* command to bring ethernet interface up.  Above parameters
+	 * (plus mtu & mac address) must have been exchanged prior
+	 * to issuing this command  */
+	MXGEFW_CMD_ETHERNET_UP,
+
+	/* command to bring ethernet interface down.  No further sends
+	 * or receives may be processed until an MXGEFW_CMD_ETHERNET_UP
+	 * is issued, and all interrupt queues must be flushed prior
+	 * to ack'ing this command */
+
+	MXGEFW_CMD_ETHERNET_DOWN,
+
+	/* commands the driver may issue live, without resetting
+	 * the nic.  Note that increasing the mtu "live" should
+	 * only be done if the driver has already supplied buffers
+	 * sufficiently large to handle the new mtu.  Decreasing
+	 * the mtu live is safe */
+
+	MXGEFW_CMD_SET_MTU,
+	MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET,	/* in microseconds */
+	MXGEFW_CMD_SET_STATS_INTERVAL,	/* in microseconds */
+	MXGEFW_CMD_SET_STATS_DMA,
+
+	MXGEFW_ENABLE_PROMISC,
+	MXGEFW_DISABLE_PROMISC,
+	MXGEFW_SET_MAC_ADDRESS,
+
+	MXGEFW_ENABLE_FLOW_CONTROL,
+	MXGEFW_DISABLE_FLOW_CONTROL,
+
+	/* do a DMA test
+	 * data0,data1 = DMA address
+	 * data2       = RDMA length (MSH), WDMA length (LSH)
+	 * command return data = repetitions (MSH), 0.5-ms ticks (LSH)
+	 */
+	MXGEFW_DMA_TEST
+};
+
+enum myri10ge_mcp_cmd_status {
+	MXGEFW_CMD_OK = 0,
+	MXGEFW_CMD_UNKNOWN,
+	MXGEFW_CMD_ERROR_RANGE,
+	MXGEFW_CMD_ERROR_BUSY,
+	MXGEFW_CMD_ERROR_EMPTY,
+	MXGEFW_CMD_ERROR_CLOSED,
+	MXGEFW_CMD_ERROR_HASH_ERROR,
+	MXGEFW_CMD_ERROR_BAD_PORT,
+	MXGEFW_CMD_ERROR_RESOURCES
+};
+
+/* 40 Bytes */
+struct mcp_irq_data {
+	u32 send_done_count;
+
+	u32 link_up;
+	u32 dropped_link_overflow;
+	u32 dropped_link_error_or_filtered;
+	u32 dropped_runt;
+	u32 dropped_overrun;
+	u32 dropped_no_small_buffer;
+	u32 dropped_no_big_buffer;
+	u32 rdma_tags_available;
+
+	u8 tx_stopped;
+	u8 link_down;
+	u8 stats_updated;
+	u8 valid;
+};
+
+#endif				/* __MYRI10GE_MCP_H__ */
diff --git a/drivers/net/myri10ge/myri10ge_mcp_gen_header.h b/drivers/net/myri10ge/myri10ge_mcp_gen_header.h
new file mode 100644
index 000000000000..1d3a35f10789
--- /dev/null
+++ b/drivers/net/myri10ge/myri10ge_mcp_gen_header.h
@@ -0,0 +1,58 @@
+#ifndef __MYRI10GE_MCP_GEN_HEADER_H__
+#define __MYRI10GE_MCP_GEN_HEADER_H__
+
+/* this file define a standard header used as a first entry point to
+ * exchange information between firmware/driver and driver.  The
+ * header structure can be anywhere in the mcp. It will usually be in
+ * the .data section, because some fields needs to be initialized at
+ * compile time.
+ * The 32bit word at offset MX_HEADER_PTR_OFFSET in the mcp must
+ * contains the location of the header. 
+ * 
+ * Typically a MCP will start with the following:
+ * .text
+ * .space 52    ! to help catch MEMORY_INT errors
+ * bt start     ! jump to real code
+ * nop
+ * .long _gen_mcp_header
+ * 
+ * The source will have a definition like:
+ * 
+ * mcp_gen_header_t gen_mcp_header = {
+ * .header_length = sizeof(mcp_gen_header_t),
+ * .mcp_type = MCP_TYPE_XXX,
+ * .version = "something $Id: mcp_gen_header.h,v 1.2 2006/05/13 10:04:35 bgoglin Exp $",
+ * .mcp_globals = (unsigned)&Globals
+ * };
+ */
+
+#define MCP_HEADER_PTR_OFFSET  0x3c
+
+#define MCP_TYPE_MX 0x4d582020	/* "MX  " */
+#define MCP_TYPE_PCIE 0x70636965	/* "PCIE" pcie-only MCP */
+#define MCP_TYPE_ETH 0x45544820	/* "ETH " */
+#define MCP_TYPE_MCP0 0x4d435030	/* "MCP0" */
+
+struct mcp_gen_header {
+	/* the first 4 fields are filled at compile time */
+	unsigned header_length;
+	unsigned mcp_type;
+	char version[128];
+	unsigned mcp_globals;	/* pointer to mcp-type specific structure */
+
+	/* filled by the MCP at run-time */
+	unsigned sram_size;
+	unsigned string_specs;	/* either the original STRING_SPECS or a superset */
+	unsigned string_specs_len;
+
+	/* Fields above this comment are guaranteed to be present.
+	 * 
+	 * Fields below this comment are extensions added in later versions
+	 * of this struct, drivers should compare the header_length against
+	 * offsetof(field) to check wether a given MCP implements them.
+	 * 
+	 * Never remove any field.  Keep everything naturally align.
+	 */
+};
+
+#endif				/* __MYRI10GE_MCP_GEN_HEADER_H__ */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 563eeadbea46..151549a69ee0 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1827,6 +1827,7 @@
 
 #define PCI_VENDOR_ID_SAMSUNG		0x144d
 
+#define PCI_VENDOR_ID_MYRICOM		0x14c1
 
 #define PCI_VENDOR_ID_TITAN		0x14D2
 #define PCI_DEVICE_ID_TITAN_010L	0x8001
-- 
cgit v1.2.3-71-gd317


From 4c0c2fd486b6598e37c77b5d81a08bc2d948aa7b Mon Sep 17 00:00:00 2001
From: Ayaz Abdulla <aabdulla@nvidia.com>
Date: Thu, 25 May 2006 13:10:38 -0400
Subject: [PATCH] pci_ids: add new device ids

This patch adds new device ids for MCP61 and MCP65 chips.

Signed-Off-By: Ayaz Abdulla <aabdulla@nvidia.com>

Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 include/linux/pci_ids.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 151549a69ee0..dce2534ab7dc 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1183,6 +1183,14 @@
 #define PCI_DEVICE_ID_NVIDIA_QUADRO_FX_1100         0x034E
 #define PCI_DEVICE_ID_NVIDIA_NVENET_14              0x0372
 #define PCI_DEVICE_ID_NVIDIA_NVENET_15              0x0373
+#define PCI_DEVICE_ID_NVIDIA_NVENET_16              0x03E5
+#define PCI_DEVICE_ID_NVIDIA_NVENET_17              0x03E6
+#define PCI_DEVICE_ID_NVIDIA_NVENET_18              0x03EE
+#define PCI_DEVICE_ID_NVIDIA_NVENET_19              0x03EF
+#define PCI_DEVICE_ID_NVIDIA_NVENET_20              0x0450
+#define PCI_DEVICE_ID_NVIDIA_NVENET_21              0x0451
+#define PCI_DEVICE_ID_NVIDIA_NVENET_22              0x0452
+#define PCI_DEVICE_ID_NVIDIA_NVENET_23              0x0453
 
 #define PCI_VENDOR_ID_IMS		0x10e0
 #define PCI_DEVICE_ID_IMS_TT128		0x9128
-- 
cgit v1.2.3-71-gd317


From 57a62fed871eb2a95f296fe6c5c250ce21b81a79 Mon Sep 17 00:00:00 2001
From: Markus Lidel <Markus.Lidel@shadowconnect.com>
Date: Sat, 10 Jun 2006 09:54:14 -0700
Subject: [PATCH] I2O: Bugfixes to get I2O working again

From: Markus Lidel <Markus.Lidel@shadowconnect.com>

- Fixed locking of struct i2o_exec_wait in Executive-OSM

- Removed LCT Notify in i2o_exec_probe() which caused freeing memory and
  accessing freed memory during first enumeration of I2O devices

- Added missing locking in i2o_exec_lct_notify()

- removed put_device() of I2O controller in i2o_iop_remove() which caused
  the controller structure get freed to early

- Fixed size of mempool in i2o_iop_alloc()

- Fixed access to freed memory in i2o_msg_get()

See http://bugzilla.kernel.org/show_bug.cgi?id=6561

Signed-off-by: Markus Lidel <Markus.Lidel@shadowconnect.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/message/i2o/exec-osm.c | 72 ++++++++++++++++++++++--------------------
 drivers/message/i2o/iop.c      |  4 +--
 include/linux/i2o.h            |  5 ++-
 3 files changed, 42 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c
index 5ea133c59afb..7bd4d85d0b42 100644
--- a/drivers/message/i2o/exec-osm.c
+++ b/drivers/message/i2o/exec-osm.c
@@ -55,6 +55,7 @@ struct i2o_exec_wait {
 	u32 m;			/* message id */
 	struct i2o_message *msg;	/* pointer to the reply message */
 	struct list_head list;	/* node in global wait list */
+	spinlock_t lock;	/* lock before modifying */
 };
 
 /* Work struct needed to handle LCT NOTIFY replies */
@@ -87,6 +88,7 @@ static struct i2o_exec_wait *i2o_exec_wait_alloc(void)
 		return NULL;
 
 	INIT_LIST_HEAD(&wait->list);
+	spin_lock_init(&wait->lock);
 
 	return wait;
 };
@@ -125,6 +127,7 @@ int i2o_msg_post_wait_mem(struct i2o_controller *c, struct i2o_message *msg,
 	DECLARE_WAIT_QUEUE_HEAD(wq);
 	struct i2o_exec_wait *wait;
 	static u32 tcntxt = 0x80000000;
+	long flags;
 	int rc = 0;
 
 	wait = i2o_exec_wait_alloc();
@@ -146,33 +149,28 @@ int i2o_msg_post_wait_mem(struct i2o_controller *c, struct i2o_message *msg,
 	wait->tcntxt = tcntxt++;
 	msg->u.s.tcntxt = cpu_to_le32(wait->tcntxt);
 
+	wait->wq = &wq;
+	/*
+	 * we add elements to the head, because if a entry in the list will
+	 * never be removed, we have to iterate over it every time
+	 */
+	list_add(&wait->list, &i2o_exec_wait_list);
+
 	/*
 	 * Post the message to the controller. At some point later it will
 	 * return. If we time out before it returns then complete will be zero.
 	 */
 	i2o_msg_post(c, msg);
 
-	if (!wait->complete) {
-		wait->wq = &wq;
-		/*
-		 * we add elements add the head, because if a entry in the list
-		 * will never be removed, we have to iterate over it every time
-		 */
-		list_add(&wait->list, &i2o_exec_wait_list);
-
-		wait_event_interruptible_timeout(wq, wait->complete,
-						 timeout * HZ);
+	wait_event_interruptible_timeout(wq, wait->complete, timeout * HZ);
 
-		wait->wq = NULL;
-	}
+	spin_lock_irqsave(&wait->lock, flags);
 
-	barrier();
+	wait->wq = NULL;
 
-	if (wait->complete) {
+	if (wait->complete)
 		rc = le32_to_cpu(wait->msg->body[0]) >> 24;
-		i2o_flush_reply(c, wait->m);
-		i2o_exec_wait_free(wait);
-	} else {
+	else {
 		/*
 		 * We cannot remove it now. This is important. When it does
 		 * terminate (which it must do if the controller has not
@@ -186,6 +184,13 @@ int i2o_msg_post_wait_mem(struct i2o_controller *c, struct i2o_message *msg,
 		rc = -ETIMEDOUT;
 	}
 
+	spin_unlock_irqrestore(&wait->lock, flags);
+
+	if (rc != -ETIMEDOUT) {
+		i2o_flush_reply(c, wait->m);
+		i2o_exec_wait_free(wait);
+	}
+
 	return rc;
 };
 
@@ -213,7 +218,6 @@ static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
 {
 	struct i2o_exec_wait *wait, *tmp;
 	unsigned long flags;
-	static spinlock_t lock = SPIN_LOCK_UNLOCKED;
 	int rc = 1;
 
 	/*
@@ -223,23 +227,24 @@ static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
 	 * already expired. Not much we can do about that except log it for
 	 * debug purposes, increase timeout, and recompile.
 	 */
-	spin_lock_irqsave(&lock, flags);
 	list_for_each_entry_safe(wait, tmp, &i2o_exec_wait_list, list) {
 		if (wait->tcntxt == context) {
-			list_del(&wait->list);
+			spin_lock_irqsave(&wait->lock, flags);
 
-			spin_unlock_irqrestore(&lock, flags);
+			list_del(&wait->list);
 
 			wait->m = m;
 			wait->msg = msg;
 			wait->complete = 1;
 
-			barrier();
-
-			if (wait->wq) {
-				wake_up_interruptible(wait->wq);
+			if (wait->wq)
 				rc = 0;
-			} else {
+			else
+				rc = -1;
+
+			spin_unlock_irqrestore(&wait->lock, flags);
+
+			if (rc) {
 				struct device *dev;
 
 				dev = &c->pdev->dev;
@@ -248,15 +253,13 @@ static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
 					 c->name);
 				i2o_dma_free(dev, &wait->dma);
 				i2o_exec_wait_free(wait);
-				rc = -1;
-			}
+			} else
+				wake_up_interruptible(wait->wq);
 
 			return rc;
 		}
 	}
 
-	spin_unlock_irqrestore(&lock, flags);
-
 	osm_warn("%s: Bogus reply in POST WAIT (tr-context: %08x)!\n", c->name,
 		 context);
 
@@ -322,14 +325,9 @@ static DEVICE_ATTR(product_id, S_IRUGO, i2o_exec_show_product_id, NULL);
 static int i2o_exec_probe(struct device *dev)
 {
 	struct i2o_device *i2o_dev = to_i2o_device(dev);
-	struct i2o_controller *c = i2o_dev->iop;
 
 	i2o_event_register(i2o_dev, &i2o_exec_driver, 0, 0xffffffff);
 
-	c->exec = i2o_dev;
-
-	i2o_exec_lct_notify(c, c->lct->change_ind + 1);
-
 	device_create_file(dev, &dev_attr_vendor_id);
 	device_create_file(dev, &dev_attr_product_id);
 
@@ -523,6 +521,8 @@ static int i2o_exec_lct_notify(struct i2o_controller *c, u32 change_ind)
 	struct device *dev;
 	struct i2o_message *msg;
 
+	down(&c->lct_lock);
+
 	dev = &c->pdev->dev;
 
 	if (i2o_dma_realloc
@@ -545,6 +545,8 @@ static int i2o_exec_lct_notify(struct i2o_controller *c, u32 change_ind)
 
 	i2o_msg_post(c, msg);
 
+	up(&c->lct_lock);
+
 	return 0;
 };
 
diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c
index 492167446936..febbdd4e0605 100644
--- a/drivers/message/i2o/iop.c
+++ b/drivers/message/i2o/iop.c
@@ -804,8 +804,6 @@ void i2o_iop_remove(struct i2o_controller *c)
 
 	/* Ask the IOP to switch to RESET state */
 	i2o_iop_reset(c);
-
-	put_device(&c->device);
 }
 
 /**
@@ -1059,7 +1057,7 @@ struct i2o_controller *i2o_iop_alloc(void)
 
 	snprintf(poolname, sizeof(poolname), "i2o_%s_msg_inpool", c->name);
 	if (i2o_pool_alloc
-	    (&c->in_msg, poolname, I2O_INBOUND_MSG_FRAME_SIZE * 4,
+	    (&c->in_msg, poolname, I2O_INBOUND_MSG_FRAME_SIZE * 4 + sizeof(u32),
 	     I2O_MSG_INPOOL_MIN)) {
 		kfree(c);
 		return ERR_PTR(-ENOMEM);
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index dd7d627bf66f..c115e9e840b4 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -1114,8 +1114,11 @@ static inline struct i2o_message *i2o_msg_get(struct i2o_controller *c)
 
 	mmsg->mfa = readl(c->in_port);
 	if (unlikely(mmsg->mfa >= c->in_queue.len)) {
+		u32 mfa = mmsg->mfa;
+
 		mempool_free(mmsg, c->in_msg.mempool);
-		if(mmsg->mfa == I2O_QUEUE_EMPTY)
+
+		if (mfa == I2O_QUEUE_EMPTY)
 			return ERR_PTR(-EBUSY);
 		return ERR_PTR(-EFAULT);
 	}
-- 
cgit v1.2.3-71-gd317


From 0ce030395b92270567423d57d9d432eb77df32f2 Mon Sep 17 00:00:00 2001
From: "akpm@osdl.org" <akpm@osdl.org>
Date: Sat, 13 May 2006 08:30:52 -0700
Subject: [PATCH] PCI: fix pciehp compile issue when CONFIG_ACPI is not enabled

Fix build error when CONFIG_ACPI not defined

Signed-off-by: Kristen Carlson Accardi <kristen.c.accardi@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/pci-acpi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
index 4877e35ae202..936ef82ed76a 100644
--- a/include/linux/pci-acpi.h
+++ b/include/linux/pci-acpi.h
@@ -50,7 +50,7 @@
 extern acpi_status pci_osc_control_set(acpi_handle handle, u32 flags);
 extern acpi_status pci_osc_support_set(u32 flags);
 #else
-#if !defined(acpi_status)
+#if !defined(AE_ERROR)
 typedef u32 		acpi_status;
 #define AE_ERROR      	(acpi_status) (0x0001)
 #endif    
-- 
cgit v1.2.3-71-gd317


From c13c8260da3155f2cefb63b0d1b7dcdcb405c644 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Tue, 23 May 2006 17:18:44 -0700
Subject: [I/OAT]: DMA memcpy subsystem

Provides an API for offloading memory copies to DMA devices

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/Kconfig           |   2 +
 drivers/Makefile          |   1 +
 drivers/dma/Kconfig       |  13 ++
 drivers/dma/Makefile      |   1 +
 drivers/dma/dmaengine.c   | 408 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dmaengine.h | 337 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 762 insertions(+)
 create mode 100644 drivers/dma/Kconfig
 create mode 100644 drivers/dma/Makefile
 create mode 100644 drivers/dma/dmaengine.c
 create mode 100644 include/linux/dmaengine.h

(limited to 'include/linux')

diff --git a/drivers/Kconfig b/drivers/Kconfig
index aeb5ab2391e4..8b11cebe65df 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -72,4 +72,6 @@ source "drivers/edac/Kconfig"
 
 source "drivers/rtc/Kconfig"
 
+source "drivers/dma/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 447d8e68887a..3c5170310bd0 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -74,3 +74,4 @@ obj-$(CONFIG_SGI_SN)		+= sn/
 obj-y				+= firmware/
 obj-$(CONFIG_CRYPTO)		+= crypto/
 obj-$(CONFIG_SUPERH)		+= sh/
+obj-$(CONFIG_DMA_ENGINE)	+= dma/
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
new file mode 100644
index 000000000000..f9ac4bcf8652
--- /dev/null
+++ b/drivers/dma/Kconfig
@@ -0,0 +1,13 @@
+#
+# DMA engine configuration
+#
+
+menu "DMA Engine support"
+
+config DMA_ENGINE
+	bool "Support for DMA engines"
+	---help---
+	  DMA engines offload copy operations from the CPU to dedicated
+	  hardware, allowing the copies to happen asynchronously.
+
+endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
new file mode 100644
index 000000000000..10b739138c93
--- /dev/null
+++ b/drivers/dma/Makefile
@@ -0,0 +1 @@
+obj-y += dmaengine.o
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
new file mode 100644
index 000000000000..473c47b6f094
--- /dev/null
+++ b/drivers/dma/dmaengine.c
@@ -0,0 +1,408 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code implements the DMA subsystem. It provides a HW-neutral interface
+ * for other kernel code to use asynchronous memory copy capabilities,
+ * if present, and allows different HW DMA drivers to register as providing
+ * this capability.
+ *
+ * Due to the fact we are accelerating what is already a relatively fast
+ * operation, the code goes to great lengths to avoid additional overhead,
+ * such as locking.
+ *
+ * LOCKING:
+ *
+ * The subsystem keeps two global lists, dma_device_list and dma_client_list.
+ * Both of these are protected by a mutex, dma_list_mutex.
+ *
+ * Each device has a channels list, which runs unlocked but is never modified
+ * once the device is registered, it's just setup by the driver.
+ *
+ * Each client has a channels list, it's only modified under the client->lock
+ * and in an RCU callback, so it's safe to read under rcu_read_lock().
+ *
+ * Each device has a kref, which is initialized to 1 when the device is
+ * registered. A kref_put is done for each class_device registered.  When the
+ * class_device is released, the coresponding kref_put is done in the release
+ * method. Every time one of the device's channels is allocated to a client,
+ * a kref_get occurs.  When the channel is freed, the coresponding kref_put
+ * happens. The device's release function does a completion, so
+ * unregister_device does a remove event, class_device_unregister, a kref_put
+ * for the first reference, then waits on the completion for all other
+ * references to finish.
+ *
+ * Each channel has an open-coded implementation of Rusty Russell's "bigref,"
+ * with a kref and a per_cpu local_t.  A single reference is set when on an
+ * ADDED event, and removed with a REMOVE event.  Net DMA client takes an
+ * extra reference per outstanding transaction.  The relase function does a
+ * kref_put on the device. -ChrisL
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/dmaengine.h>
+#include <linux/hardirq.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/mutex.h>
+
+static DEFINE_MUTEX(dma_list_mutex);
+static LIST_HEAD(dma_device_list);
+static LIST_HEAD(dma_client_list);
+
+/* --- sysfs implementation --- */
+
+static ssize_t show_memcpy_count(struct class_device *cd, char *buf)
+{
+	struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+	unsigned long count = 0;
+	int i;
+
+	for_each_cpu(i)
+		count += per_cpu_ptr(chan->local, i)->memcpy_count;
+
+	return sprintf(buf, "%lu\n", count);
+}
+
+static ssize_t show_bytes_transferred(struct class_device *cd, char *buf)
+{
+	struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+	unsigned long count = 0;
+	int i;
+
+	for_each_cpu(i)
+		count += per_cpu_ptr(chan->local, i)->bytes_transferred;
+
+	return sprintf(buf, "%lu\n", count);
+}
+
+static ssize_t show_in_use(struct class_device *cd, char *buf)
+{
+	struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+
+	return sprintf(buf, "%d\n", (chan->client ? 1 : 0));
+}
+
+static struct class_device_attribute dma_class_attrs[] = {
+	__ATTR(memcpy_count, S_IRUGO, show_memcpy_count, NULL),
+	__ATTR(bytes_transferred, S_IRUGO, show_bytes_transferred, NULL),
+	__ATTR(in_use, S_IRUGO, show_in_use, NULL),
+	__ATTR_NULL
+};
+
+static void dma_async_device_cleanup(struct kref *kref);
+
+static void dma_class_dev_release(struct class_device *cd)
+{
+	struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+	kref_put(&chan->device->refcount, dma_async_device_cleanup);
+}
+
+static struct class dma_devclass = {
+	.name            = "dma",
+	.class_dev_attrs = dma_class_attrs,
+	.release = dma_class_dev_release,
+};
+
+/* --- client and device registration --- */
+
+/**
+ * dma_client_chan_alloc - try to allocate a channel to a client
+ * @client: &dma_client
+ *
+ * Called with dma_list_mutex held.
+ */
+static struct dma_chan *dma_client_chan_alloc(struct dma_client *client)
+{
+	struct dma_device *device;
+	struct dma_chan *chan;
+	unsigned long flags;
+	int desc;	/* allocated descriptor count */
+
+	/* Find a channel, any DMA engine will do */
+	list_for_each_entry(device, &dma_device_list, global_node) {
+		list_for_each_entry(chan, &device->channels, device_node) {
+			if (chan->client)
+				continue;
+
+			desc = chan->device->device_alloc_chan_resources(chan);
+			if (desc >= 0) {
+				kref_get(&device->refcount);
+				kref_init(&chan->refcount);
+				chan->slow_ref = 0;
+				INIT_RCU_HEAD(&chan->rcu);
+				chan->client = client;
+				spin_lock_irqsave(&client->lock, flags);
+				list_add_tail_rcu(&chan->client_node,
+				                  &client->channels);
+				spin_unlock_irqrestore(&client->lock, flags);
+				return chan;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * dma_client_chan_free - release a DMA channel
+ * @chan: &dma_chan
+ */
+void dma_chan_cleanup(struct kref *kref)
+{
+	struct dma_chan *chan = container_of(kref, struct dma_chan, refcount);
+	chan->device->device_free_chan_resources(chan);
+	chan->client = NULL;
+	kref_put(&chan->device->refcount, dma_async_device_cleanup);
+}
+
+static void dma_chan_free_rcu(struct rcu_head *rcu)
+{
+	struct dma_chan *chan = container_of(rcu, struct dma_chan, rcu);
+	int bias = 0x7FFFFFFF;
+	int i;
+	for_each_cpu(i)
+		bias -= local_read(&per_cpu_ptr(chan->local, i)->refcount);
+	atomic_sub(bias, &chan->refcount.refcount);
+	kref_put(&chan->refcount, dma_chan_cleanup);
+}
+
+static void dma_client_chan_free(struct dma_chan *chan)
+{
+	atomic_add(0x7FFFFFFF, &chan->refcount.refcount);
+	chan->slow_ref = 1;
+	call_rcu(&chan->rcu, dma_chan_free_rcu);
+}
+
+/**
+ * dma_chans_rebalance - reallocate channels to clients
+ *
+ * When the number of DMA channel in the system changes,
+ * channels need to be rebalanced among clients
+ */
+static void dma_chans_rebalance(void)
+{
+	struct dma_client *client;
+	struct dma_chan *chan;
+	unsigned long flags;
+
+	mutex_lock(&dma_list_mutex);
+
+	list_for_each_entry(client, &dma_client_list, global_node) {
+		while (client->chans_desired > client->chan_count) {
+			chan = dma_client_chan_alloc(client);
+			if (!chan)
+				break;
+			client->chan_count++;
+			client->event_callback(client,
+	                                       chan,
+	                                       DMA_RESOURCE_ADDED);
+		}
+		while (client->chans_desired < client->chan_count) {
+			spin_lock_irqsave(&client->lock, flags);
+			chan = list_entry(client->channels.next,
+			                  struct dma_chan,
+			                  client_node);
+			list_del_rcu(&chan->client_node);
+			spin_unlock_irqrestore(&client->lock, flags);
+			client->chan_count--;
+			client->event_callback(client,
+			                       chan,
+			                       DMA_RESOURCE_REMOVED);
+			dma_client_chan_free(chan);
+		}
+	}
+
+	mutex_unlock(&dma_list_mutex);
+}
+
+/**
+ * dma_async_client_register - allocate and register a &dma_client
+ * @event_callback: callback for notification of channel addition/removal
+ */
+struct dma_client *dma_async_client_register(dma_event_callback event_callback)
+{
+	struct dma_client *client;
+
+	client = kzalloc(sizeof(*client), GFP_KERNEL);
+	if (!client)
+		return NULL;
+
+	INIT_LIST_HEAD(&client->channels);
+	spin_lock_init(&client->lock);
+	client->chans_desired = 0;
+	client->chan_count = 0;
+	client->event_callback = event_callback;
+
+	mutex_lock(&dma_list_mutex);
+	list_add_tail(&client->global_node, &dma_client_list);
+	mutex_unlock(&dma_list_mutex);
+
+	return client;
+}
+
+/**
+ * dma_async_client_unregister - unregister a client and free the &dma_client
+ * @client:
+ *
+ * Force frees any allocated DMA channels, frees the &dma_client memory
+ */
+void dma_async_client_unregister(struct dma_client *client)
+{
+	struct dma_chan *chan;
+
+	if (!client)
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(chan, &client->channels, client_node)
+		dma_client_chan_free(chan);
+	rcu_read_unlock();
+
+	mutex_lock(&dma_list_mutex);
+	list_del(&client->global_node);
+	mutex_unlock(&dma_list_mutex);
+
+	kfree(client);
+	dma_chans_rebalance();
+}
+
+/**
+ * dma_async_client_chan_request - request DMA channels
+ * @client: &dma_client
+ * @number: count of DMA channels requested
+ *
+ * Clients call dma_async_client_chan_request() to specify how many
+ * DMA channels they need, 0 to free all currently allocated.
+ * The resulting allocations/frees are indicated to the client via the
+ * event callback.
+ */
+void dma_async_client_chan_request(struct dma_client *client,
+			unsigned int number)
+{
+	client->chans_desired = number;
+	dma_chans_rebalance();
+}
+
+/**
+ * dma_async_device_register -
+ * @device: &dma_device
+ */
+int dma_async_device_register(struct dma_device *device)
+{
+	static int id;
+	int chancnt = 0;
+	struct dma_chan* chan;
+
+	if (!device)
+		return -ENODEV;
+
+	init_completion(&device->done);
+	kref_init(&device->refcount);
+	device->dev_id = id++;
+
+	/* represent channels in sysfs. Probably want devs too */
+	list_for_each_entry(chan, &device->channels, device_node) {
+		chan->local = alloc_percpu(typeof(*chan->local));
+		if (chan->local == NULL)
+			continue;
+
+		chan->chan_id = chancnt++;
+		chan->class_dev.class = &dma_devclass;
+		chan->class_dev.dev = NULL;
+		snprintf(chan->class_dev.class_id, BUS_ID_SIZE, "dma%dchan%d",
+		         device->dev_id, chan->chan_id);
+
+		kref_get(&device->refcount);
+		class_device_register(&chan->class_dev);
+	}
+
+	mutex_lock(&dma_list_mutex);
+	list_add_tail(&device->global_node, &dma_device_list);
+	mutex_unlock(&dma_list_mutex);
+
+	dma_chans_rebalance();
+
+	return 0;
+}
+
+/**
+ * dma_async_device_unregister -
+ * @device: &dma_device
+ */
+static void dma_async_device_cleanup(struct kref *kref)
+{
+	struct dma_device *device;
+
+	device = container_of(kref, struct dma_device, refcount);
+	complete(&device->done);
+}
+
+void dma_async_device_unregister(struct dma_device* device)
+{
+	struct dma_chan *chan;
+	unsigned long flags;
+
+	mutex_lock(&dma_list_mutex);
+	list_del(&device->global_node);
+	mutex_unlock(&dma_list_mutex);
+
+	list_for_each_entry(chan, &device->channels, device_node) {
+		if (chan->client) {
+			spin_lock_irqsave(&chan->client->lock, flags);
+			list_del(&chan->client_node);
+			chan->client->chan_count--;
+			spin_unlock_irqrestore(&chan->client->lock, flags);
+			chan->client->event_callback(chan->client,
+			                             chan,
+			                             DMA_RESOURCE_REMOVED);
+			dma_client_chan_free(chan);
+		}
+		class_device_unregister(&chan->class_dev);
+	}
+	dma_chans_rebalance();
+
+	kref_put(&device->refcount, dma_async_device_cleanup);
+	wait_for_completion(&device->done);
+}
+
+static int __init dma_bus_init(void)
+{
+	mutex_init(&dma_list_mutex);
+	return class_register(&dma_devclass);
+}
+
+subsys_initcall(dma_bus_init);
+
+EXPORT_SYMBOL(dma_async_client_register);
+EXPORT_SYMBOL(dma_async_client_unregister);
+EXPORT_SYMBOL(dma_async_client_chan_request);
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf);
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg);
+EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg);
+EXPORT_SYMBOL(dma_async_memcpy_complete);
+EXPORT_SYMBOL(dma_async_memcpy_issue_pending);
+EXPORT_SYMBOL(dma_async_device_register);
+EXPORT_SYMBOL(dma_async_device_unregister);
+EXPORT_SYMBOL(dma_chan_cleanup);
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
new file mode 100644
index 000000000000..30781546ac99
--- /dev/null
+++ b/include/linux/dmaengine.h
@@ -0,0 +1,337 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef DMAENGINE_H
+#define DMAENGINE_H
+#include <linux/config.h>
+#ifdef CONFIG_DMA_ENGINE
+
+#include <linux/device.h>
+#include <linux/uio.h>
+#include <linux/kref.h>
+#include <linux/completion.h>
+#include <linux/rcupdate.h>
+
+/**
+ * enum dma_event - resource PNP/power managment events
+ * @DMA_RESOURCE_SUSPEND: DMA device going into low power state
+ * @DMA_RESOURCE_RESUME: DMA device returning to full power
+ * @DMA_RESOURCE_ADDED: DMA device added to the system
+ * @DMA_RESOURCE_REMOVED: DMA device removed from the system
+ */
+enum dma_event {
+	DMA_RESOURCE_SUSPEND,
+	DMA_RESOURCE_RESUME,
+	DMA_RESOURCE_ADDED,
+	DMA_RESOURCE_REMOVED,
+};
+
+/**
+ * typedef dma_cookie_t
+ *
+ * if dma_cookie_t is >0 it's a DMA request cookie, <0 it's an error code
+ */
+typedef s32 dma_cookie_t;
+
+#define dma_submit_error(cookie) ((cookie) < 0 ? 1 : 0)
+
+/**
+ * enum dma_status - DMA transaction status
+ * @DMA_SUCCESS: transaction completed successfully
+ * @DMA_IN_PROGRESS: transaction not yet processed
+ * @DMA_ERROR: transaction failed
+ */
+enum dma_status {
+	DMA_SUCCESS,
+	DMA_IN_PROGRESS,
+	DMA_ERROR,
+};
+
+/**
+ * struct dma_chan_percpu - the per-CPU part of struct dma_chan
+ * @refcount: local_t used for open-coded "bigref" counting
+ * @memcpy_count: transaction counter
+ * @bytes_transferred: byte counter
+ */
+
+struct dma_chan_percpu {
+	local_t refcount;
+	/* stats */
+	unsigned long memcpy_count;
+	unsigned long bytes_transferred;
+};
+
+/**
+ * struct dma_chan - devices supply DMA channels, clients use them
+ * @client: ptr to the client user of this chan, will be NULL when unused
+ * @device: ptr to the dma device who supplies this channel, always !NULL
+ * @cookie: last cookie value returned to client
+ * @chan_id:
+ * @class_dev:
+ * @refcount: kref, used in "bigref" slow-mode
+ * @slow_ref:
+ * @rcu:
+ * @client_node: used to add this to the client chan list
+ * @device_node: used to add this to the device chan list
+ * @local: per-cpu pointer to a struct dma_chan_percpu
+ */
+struct dma_chan {
+	struct dma_client *client;
+	struct dma_device *device;
+	dma_cookie_t cookie;
+
+	/* sysfs */
+	int chan_id;
+	struct class_device class_dev;
+
+	struct kref refcount;
+	int slow_ref;
+	struct rcu_head rcu;
+
+	struct list_head client_node;
+	struct list_head device_node;
+	struct dma_chan_percpu *local;
+};
+
+void dma_chan_cleanup(struct kref *kref);
+
+static inline void dma_chan_get(struct dma_chan *chan)
+{
+	if (unlikely(chan->slow_ref))
+		kref_get(&chan->refcount);
+	else {
+		local_inc(&(per_cpu_ptr(chan->local, get_cpu())->refcount));
+		put_cpu();
+	}
+}
+
+static inline void dma_chan_put(struct dma_chan *chan)
+{
+	if (unlikely(chan->slow_ref))
+		kref_put(&chan->refcount, dma_chan_cleanup);
+	else {
+		local_dec(&(per_cpu_ptr(chan->local, get_cpu())->refcount));
+		put_cpu();
+	}
+}
+
+/*
+ * typedef dma_event_callback - function pointer to a DMA event callback
+ */
+typedef void (*dma_event_callback) (struct dma_client *client,
+		struct dma_chan *chan, enum dma_event event);
+
+/**
+ * struct dma_client - info on the entity making use of DMA services
+ * @event_callback: func ptr to call when something happens
+ * @chan_count: number of chans allocated
+ * @chans_desired: number of chans requested. Can be +/- chan_count
+ * @lock: protects access to the channels list
+ * @channels: the list of DMA channels allocated
+ * @global_node: list_head for global dma_client_list
+ */
+struct dma_client {
+	dma_event_callback	event_callback;
+	unsigned int		chan_count;
+	unsigned int		chans_desired;
+
+	spinlock_t		lock;
+	struct list_head	channels;
+	struct list_head	global_node;
+};
+
+/**
+ * struct dma_device - info on the entity supplying DMA services
+ * @chancnt: how many DMA channels are supported
+ * @channels: the list of struct dma_chan
+ * @global_node: list_head for global dma_device_list
+ * @refcount:
+ * @done:
+ * @dev_id:
+ * Other func ptrs: used to make use of this device's capabilities
+ */
+struct dma_device {
+
+	unsigned int chancnt;
+	struct list_head channels;
+	struct list_head global_node;
+
+	struct kref refcount;
+	struct completion done;
+
+	int dev_id;
+
+	int (*device_alloc_chan_resources)(struct dma_chan *chan);
+	void (*device_free_chan_resources)(struct dma_chan *chan);
+	dma_cookie_t (*device_memcpy_buf_to_buf)(struct dma_chan *chan,
+			void *dest, void *src, size_t len);
+	dma_cookie_t (*device_memcpy_buf_to_pg)(struct dma_chan *chan,
+			struct page *page, unsigned int offset, void *kdata,
+			size_t len);
+	dma_cookie_t (*device_memcpy_pg_to_pg)(struct dma_chan *chan,
+			struct page *dest_pg, unsigned int dest_off,
+			struct page *src_pg, unsigned int src_off, size_t len);
+	enum dma_status (*device_memcpy_complete)(struct dma_chan *chan,
+			dma_cookie_t cookie, dma_cookie_t *last,
+			dma_cookie_t *used);
+	void (*device_memcpy_issue_pending)(struct dma_chan *chan);
+};
+
+/* --- public DMA engine API --- */
+
+struct dma_client *dma_async_client_register(dma_event_callback event_callback);
+void dma_async_client_unregister(struct dma_client *client);
+void dma_async_client_chan_request(struct dma_client *client,
+		unsigned int number);
+
+/**
+ * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses
+ * @chan: DMA channel to offload copy to
+ * @dest: destination address (virtual)
+ * @src: source address (virtual)
+ * @len: length
+ *
+ * Both @dest and @src must be mappable to a bus address according to the
+ * DMA mapping API rules for streaming mappings.
+ * Both @dest and @src must stay memory resident (kernel memory or locked
+ * user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
+	void *dest, void *src, size_t len)
+{
+	int cpu = get_cpu();
+	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+	put_cpu();
+
+	return chan->device->device_memcpy_buf_to_buf(chan, dest, src, len);
+}
+
+/**
+ * dma_async_memcpy_buf_to_pg - offloaded copy
+ * @chan: DMA channel to offload copy to
+ * @page: destination page
+ * @offset: offset in page to copy to
+ * @kdata: source address (virtual)
+ * @len: length
+ *
+ * Both @page/@offset and @kdata must be mappable to a bus address according
+ * to the DMA mapping API rules for streaming mappings.
+ * Both @page/@offset and @kdata must stay memory resident (kernel memory or
+ * locked user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan,
+	struct page *page, unsigned int offset, void *kdata, size_t len)
+{
+	int cpu = get_cpu();
+	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+	put_cpu();
+
+	return chan->device->device_memcpy_buf_to_pg(chan, page, offset,
+	                                             kdata, len);
+}
+
+/**
+ * dma_async_memcpy_buf_to_pg - offloaded copy
+ * @chan: DMA channel to offload copy to
+ * @dest_page: destination page
+ * @dest_off: offset in page to copy to
+ * @src_page: source page
+ * @src_off: offset in page to copy from
+ * @len: length
+ *
+ * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus
+ * address according to the DMA mapping API rules for streaming mappings.
+ * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident
+ * (kernel memory or locked user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan,
+	struct page *dest_pg, unsigned int dest_off, struct page *src_pg,
+	unsigned int src_off, size_t len)
+{
+	int cpu = get_cpu();
+	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+	put_cpu();
+
+	return chan->device->device_memcpy_pg_to_pg(chan, dest_pg, dest_off,
+	                                            src_pg, src_off, len);
+}
+
+/**
+ * dma_async_memcpy_issue_pending - flush pending copies to HW
+ * @chan:
+ *
+ * This allows drivers to push copies to HW in batches,
+ * reducing MMIO writes where possible.
+ */
+static inline void dma_async_memcpy_issue_pending(struct dma_chan *chan)
+{
+	return chan->device->device_memcpy_issue_pending(chan);
+}
+
+/**
+ * dma_async_memcpy_complete - poll for transaction completion
+ * @chan: DMA channel
+ * @cookie: transaction identifier to check status of
+ * @last: returns last completed cookie, can be NULL
+ * @used: returns last issued cookie, can be NULL
+ *
+ * If @last and @used are passed in, upon return they reflect the driver
+ * internal state and can be used with dma_async_is_complete() to check
+ * the status of multiple cookies without re-checking hardware state.
+ */
+static inline enum dma_status dma_async_memcpy_complete(struct dma_chan *chan,
+	dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used)
+{
+	return chan->device->device_memcpy_complete(chan, cookie, last, used);
+}
+
+/**
+ * dma_async_is_complete - test a cookie against chan state
+ * @cookie: transaction identifier to test status of
+ * @last_complete: last know completed transaction
+ * @last_used: last cookie value handed out
+ *
+ * dma_async_is_complete() is used in dma_async_memcpy_complete()
+ * the test logic is seperated for lightweight testing of multiple cookies
+ */
+static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie,
+			dma_cookie_t last_complete, dma_cookie_t last_used)
+{
+	if (last_complete <= last_used) {
+		if ((cookie <= last_complete) || (cookie > last_used))
+			return DMA_SUCCESS;
+	} else {
+		if ((cookie <= last_complete) && (cookie > last_used))
+			return DMA_SUCCESS;
+	}
+	return DMA_IN_PROGRESS;
+}
+
+
+/* --- DMA device --- */
+
+int dma_async_device_register(struct dma_device *device);
+void dma_async_device_unregister(struct dma_device *device);
+
+#endif /* CONFIG_DMA_ENGINE */
+#endif /* DMAENGINE_H */
-- 
cgit v1.2.3-71-gd317


From 57c651f74cd8383df10a648e677902849de1bc0b Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.sfo1.dsl.speakeasy.net>
Date: Tue, 23 May 2006 17:39:49 -0700
Subject: [I/OAT]: Move PCI_DEVICE_ID_INTEL_IOAT to linux/pci_ids.h

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/dma/ioatdma.h   | 3 +--
 include/linux/pci_ids.h | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/ioatdma.h b/drivers/dma/ioatdma.h
index 312353d12af2..a5d3b3644160 100644
--- a/drivers/dma/ioatdma.h
+++ b/drivers/dma/ioatdma.h
@@ -26,8 +26,7 @@
 #include <linux/init.h>
 #include <linux/dmapool.h>
 #include <linux/cache.h>
-
-#define PCI_DEVICE_ID_INTEL_IOAT	0x1a38
+#include <linux/pci_ids.h>
 
 #define IOAT_LOW_COMPLETION_MASK	0xffffffc0
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 590dc6dca315..819c8e1324d1 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2043,6 +2043,7 @@
 #define PCI_DEVICE_ID_INTEL_80960_RP	0x1960
 #define PCI_DEVICE_ID_INTEL_82840_HB	0x1a21
 #define PCI_DEVICE_ID_INTEL_82845_HB	0x1a30
+#define PCI_DEVICE_ID_INTEL_IOAT	0x1a38
 #define PCI_DEVICE_ID_INTEL_82801AA_0	0x2410
 #define PCI_DEVICE_ID_INTEL_82801AA_1	0x2411
 #define PCI_DEVICE_ID_INTEL_82801AA_3	0x2413
-- 
cgit v1.2.3-71-gd317


From db21733488f84a596faaad0d05430b3f51804692 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Sat, 17 Jun 2006 21:24:58 -0700
Subject: [I/OAT]: Setup the networking subsystem as a DMA client

Attempts to allocate per-CPU DMA channels

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/dma/Kconfig       |  12 ++++++
 include/linux/netdevice.h |   4 ++
 include/net/netdma.h      |  38 +++++++++++++++++
 net/core/dev.c            | 104 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 158 insertions(+)
 create mode 100644 include/net/netdma.h

(limited to 'include/linux')

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 0f15e769c6bc..30d021d1a07c 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -10,6 +10,18 @@ config DMA_ENGINE
 	  DMA engines offload copy operations from the CPU to dedicated
 	  hardware, allowing the copies to happen asynchronously.
 
+comment "DMA Clients"
+
+config NET_DMA
+	bool "Network: TCP receive copy offload"
+	depends on DMA_ENGINE && NET
+	default y
+	---help---
+	  This enables the use of DMA engines in the network stack to
+	  offload receive copy-to-user operations, freeing CPU cycles.
+	  Since this is the main user of the DMA engine, it should be enabled;
+	  say Y here.
+
 comment "DMA Devices"
 
 config INTEL_IOATDMA
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f4169bbb60eb..b5760c67af9c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -37,6 +37,7 @@
 #include <linux/config.h>
 #include <linux/device.h>
 #include <linux/percpu.h>
+#include <linux/dmaengine.h>
 
 struct divert_blk;
 struct vlan_group;
@@ -593,6 +594,9 @@ struct softnet_data
 	struct sk_buff		*completion_queue;
 
 	struct net_device	backlog_dev;	/* Sorry. 8) */
+#ifdef CONFIG_NET_DMA
+	struct dma_chan		*net_dma;
+#endif
 };
 
 DECLARE_PER_CPU(struct softnet_data,softnet_data);
diff --git a/include/net/netdma.h b/include/net/netdma.h
new file mode 100644
index 000000000000..cbfe89d7e5d0
--- /dev/null
+++ b/include/net/netdma.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef NETDMA_H
+#define NETDMA_H
+#include <linux/config.h>
+#ifdef CONFIG_NET_DMA
+#include <linux/dmaengine.h>
+
+static inline struct dma_chan *get_softnet_dma(void)
+{
+	struct dma_chan *chan;
+	rcu_read_lock();
+	chan = rcu_dereference(__get_cpu_var(softnet_data.net_dma));
+	if (chan)
+		dma_chan_get(chan);
+	rcu_read_unlock();
+	return chan;
+}
+#endif /* CONFIG_NET_DMA */
+#endif /* NETDMA_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 4fba549caf29..6bfa78c66c25 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,6 +115,7 @@
 #include <net/iw_handler.h>
 #include <asm/current.h>
 #include <linux/audit.h>
+#include <linux/dmaengine.h>
 
 /*
  *	The list of packet types we will receive (as opposed to discard)
@@ -148,6 +149,12 @@ static DEFINE_SPINLOCK(ptype_lock);
 static struct list_head ptype_base[16];	/* 16 way hashed list */
 static struct list_head ptype_all;		/* Taps */
 
+#ifdef CONFIG_NET_DMA
+static struct dma_client *net_dma_client;
+static unsigned int net_dma_count;
+static spinlock_t net_dma_event_lock;
+#endif
+
 /*
  * The @dev_base list is protected by @dev_base_lock and the rtnl
  * semaphore.
@@ -1846,6 +1853,19 @@ static void net_rx_action(struct softirq_action *h)
 		}
 	}
 out:
+#ifdef CONFIG_NET_DMA
+	/*
+	 * There may not be any more sk_buffs coming right now, so push
+	 * any pending DMA copies to hardware
+	 */
+	if (net_dma_client) {
+		struct dma_chan *chan;
+		rcu_read_lock();
+		list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node)
+			dma_async_memcpy_issue_pending(chan);
+		rcu_read_unlock();
+	}
+#endif
 	local_irq_enable();
 	return;
 
@@ -3300,6 +3320,88 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+#ifdef CONFIG_NET_DMA
+/**
+ * net_dma_rebalance -
+ * This is called when the number of channels allocated to the net_dma_client
+ * changes.  The net_dma_client tries to have one DMA channel per CPU.
+ */
+static void net_dma_rebalance(void)
+{
+	unsigned int cpu, i, n;
+	struct dma_chan *chan;
+
+	lock_cpu_hotplug();
+
+	if (net_dma_count == 0) {
+		for_each_online_cpu(cpu)
+			rcu_assign_pointer(per_cpu(softnet_data.net_dma, cpu), NULL);
+		unlock_cpu_hotplug();
+		return;
+	}
+
+	i = 0;
+	cpu = first_cpu(cpu_online_map);
+
+	rcu_read_lock();
+	list_for_each_entry(chan, &net_dma_client->channels, client_node) {
+		n = ((num_online_cpus() / net_dma_count)
+		   + (i < (num_online_cpus() % net_dma_count) ? 1 : 0));
+
+		while(n) {
+			per_cpu(softnet_data.net_dma, cpu) = chan;
+			cpu = next_cpu(cpu, cpu_online_map);
+			n--;
+		}
+		i++;
+	}
+	rcu_read_unlock();
+
+	unlock_cpu_hotplug();
+}
+
+/**
+ * netdev_dma_event - event callback for the net_dma_client
+ * @client: should always be net_dma_client
+ * @chan:
+ * @event:
+ */
+static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
+	enum dma_event event)
+{
+	spin_lock(&net_dma_event_lock);
+	switch (event) {
+	case DMA_RESOURCE_ADDED:
+		net_dma_count++;
+		net_dma_rebalance();
+		break;
+	case DMA_RESOURCE_REMOVED:
+		net_dma_count--;
+		net_dma_rebalance();
+		break;
+	default:
+		break;
+	}
+	spin_unlock(&net_dma_event_lock);
+}
+
+/**
+ * netdev_dma_regiser - register the networking subsystem as a DMA client
+ */
+static int __init netdev_dma_register(void)
+{
+	spin_lock_init(&net_dma_event_lock);
+	net_dma_client = dma_async_client_register(netdev_dma_event);
+	if (net_dma_client == NULL)
+		return -ENOMEM;
+
+	dma_async_client_chan_request(net_dma_client, num_online_cpus());
+	return 0;
+}
+
+#else
+static int __init netdev_dma_register(void) { return -ENODEV; }
+#endif /* CONFIG_NET_DMA */
 
 /*
  *	Initialize the DEV module. At boot time this walks the device list and
@@ -3353,6 +3455,8 @@ static int __init net_dev_init(void)
 		atomic_set(&queue->backlog_dev.refcnt, 1);
 	}
 
+	netdev_dma_register();
+
 	dev_boot_phase = 0;
 
 	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
-- 
cgit v1.2.3-71-gd317


From de5506e155276d385712c2aa1c2d9a27cd4ed947 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Tue, 23 May 2006 17:50:37 -0700
Subject: [I/OAT]: Utility functions for offloading sk_buff to iovec copies

Provides for pinning user space pages in memory, copying to iovecs,
and copying from sk_buffs including fragmented and chained sk_buffs.

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/dma/Makefile      |   3 +-
 drivers/dma/iovlock.c     | 301 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dmaengine.h |  22 ++++
 include/net/netdma.h      |   6 +
 net/core/Makefile         |   1 +
 net/core/user_dma.c       | 127 +++++++++++++++++++
 6 files changed, 459 insertions(+), 1 deletion(-)
 create mode 100644 drivers/dma/iovlock.c
 create mode 100644 net/core/user_dma.c

(limited to 'include/linux')

diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index c8a5f5677313..bdcfdbdb1aec 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,2 +1,3 @@
-obj-y += dmaengine.o
+obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
+obj-$(CONFIG_NET_DMA) += iovlock.o
 obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c
new file mode 100644
index 000000000000..5ed327e453a2
--- /dev/null
+++ b/drivers/dma/iovlock.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ * Portions based on net/core/datagram.c and copyrighted by their authors.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code allows the net stack to make use of a DMA engine for
+ * skb to iovec copies.
+ */
+
+#include <linux/dmaengine.h>
+#include <linux/pagemap.h>
+#include <net/tcp.h> /* for memcpy_toiovec */
+#include <asm/io.h>
+#include <asm/uaccess.h>
+
+int num_pages_spanned(struct iovec *iov)
+{
+	return
+	((PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) -
+	((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT);
+}
+
+/*
+ * Pin down all the iovec pages needed for len bytes.
+ * Return a struct dma_pinned_list to keep track of pages pinned down.
+ *
+ * We are allocating a single chunk of memory, and then carving it up into
+ * 3 sections, the latter 2 whose size depends on the number of iovecs and the
+ * total number of pages, respectively.
+ */
+struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len)
+{
+	struct dma_pinned_list *local_list;
+	struct page **pages;
+	int i;
+	int ret;
+	int nr_iovecs = 0;
+	int iovec_len_used = 0;
+	int iovec_pages_used = 0;
+	long err;
+
+	/* don't pin down non-user-based iovecs */
+	if (segment_eq(get_fs(), KERNEL_DS))
+		return NULL;
+
+	/* determine how many iovecs/pages there are, up front */
+	do {
+		iovec_len_used += iov[nr_iovecs].iov_len;
+		iovec_pages_used += num_pages_spanned(&iov[nr_iovecs]);
+		nr_iovecs++;
+	} while (iovec_len_used < len);
+
+	/* single kmalloc for pinned list, page_list[], and the page arrays */
+	local_list = kmalloc(sizeof(*local_list)
+		+ (nr_iovecs * sizeof (struct dma_page_list))
+		+ (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL);
+	if (!local_list) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* list of pages starts right after the page list array */
+	pages = (struct page **) &local_list->page_list[nr_iovecs];
+
+	for (i = 0; i < nr_iovecs; i++) {
+		struct dma_page_list *page_list = &local_list->page_list[i];
+
+		len -= iov[i].iov_len;
+
+		if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) {
+			err = -EFAULT;
+			goto unpin;
+		}
+
+		page_list->nr_pages = num_pages_spanned(&iov[i]);
+		page_list->base_address = iov[i].iov_base;
+
+		page_list->pages = pages;
+		pages += page_list->nr_pages;
+
+		/* pin pages down */
+		down_read(&current->mm->mmap_sem);
+		ret = get_user_pages(
+			current,
+			current->mm,
+			(unsigned long) iov[i].iov_base,
+			page_list->nr_pages,
+			1,	/* write */
+			0,	/* force */
+			page_list->pages,
+			NULL);
+		up_read(&current->mm->mmap_sem);
+
+		if (ret != page_list->nr_pages) {
+			err = -ENOMEM;
+			goto unpin;
+		}
+
+		local_list->nr_iovecs = i + 1;
+	}
+
+	return local_list;
+
+unpin:
+	dma_unpin_iovec_pages(local_list);
+out:
+	return ERR_PTR(err);
+}
+
+void dma_unpin_iovec_pages(struct dma_pinned_list *pinned_list)
+{
+	int i, j;
+
+	if (!pinned_list)
+		return;
+
+	for (i = 0; i < pinned_list->nr_iovecs; i++) {
+		struct dma_page_list *page_list = &pinned_list->page_list[i];
+		for (j = 0; j < page_list->nr_pages; j++) {
+			set_page_dirty_lock(page_list->pages[j]);
+			page_cache_release(page_list->pages[j]);
+		}
+	}
+
+	kfree(pinned_list);
+}
+
+static dma_cookie_t dma_memcpy_to_kernel_iovec(struct dma_chan *chan, struct
+	iovec *iov, unsigned char *kdata, size_t len)
+{
+	dma_cookie_t dma_cookie = 0;
+
+	while (len > 0) {
+		if (iov->iov_len) {
+			int copy = min_t(unsigned int, iov->iov_len, len);
+			dma_cookie = dma_async_memcpy_buf_to_buf(
+					chan,
+					iov->iov_base,
+					kdata,
+					copy);
+			kdata += copy;
+			len -= copy;
+			iov->iov_len -= copy;
+			iov->iov_base += copy;
+		}
+		iov++;
+	}
+
+	return dma_cookie;
+}
+
+/*
+ * We have already pinned down the pages we will be using in the iovecs.
+ * Each entry in iov array has corresponding entry in pinned_list->page_list.
+ * Using array indexing to keep iov[] and page_list[] in sync.
+ * Initial elements in iov array's iov->iov_len will be 0 if already copied into
+ *   by another call.
+ * iov array length remaining guaranteed to be bigger than len.
+ */
+dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len)
+{
+	int iov_byte_offset;
+	int copy;
+	dma_cookie_t dma_cookie = 0;
+	int iovec_idx;
+	int page_idx;
+
+	if (!chan)
+		return memcpy_toiovec(iov, kdata, len);
+
+	/* -> kernel copies (e.g. smbfs) */
+	if (!pinned_list)
+		return dma_memcpy_to_kernel_iovec(chan, iov, kdata, len);
+
+	iovec_idx = 0;
+	while (iovec_idx < pinned_list->nr_iovecs) {
+		struct dma_page_list *page_list;
+
+		/* skip already used-up iovecs */
+		while (!iov[iovec_idx].iov_len)
+			iovec_idx++;
+
+		page_list = &pinned_list->page_list[iovec_idx];
+
+		iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK);
+		page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK)
+			 - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT;
+
+		/* break up copies to not cross page boundary */
+		while (iov[iovec_idx].iov_len) {
+			copy = min_t(int, PAGE_SIZE - iov_byte_offset, len);
+			copy = min_t(int, copy, iov[iovec_idx].iov_len);
+
+			dma_cookie = dma_async_memcpy_buf_to_pg(chan,
+					page_list->pages[page_idx],
+					iov_byte_offset,
+					kdata,
+					copy);
+
+			len -= copy;
+			iov[iovec_idx].iov_len -= copy;
+			iov[iovec_idx].iov_base += copy;
+
+			if (!len)
+				return dma_cookie;
+
+			kdata += copy;
+			iov_byte_offset = 0;
+			page_idx++;
+		}
+		iovec_idx++;
+	}
+
+	/* really bad if we ever run out of iovecs */
+	BUG();
+	return -EFAULT;
+}
+
+dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_pinned_list *pinned_list, struct page *page,
+	unsigned int offset, size_t len)
+{
+	int iov_byte_offset;
+	int copy;
+	dma_cookie_t dma_cookie = 0;
+	int iovec_idx;
+	int page_idx;
+	int err;
+
+	/* this needs as-yet-unimplemented buf-to-buff, so punt. */
+	/* TODO: use dma for this */
+	if (!chan || !pinned_list) {
+		u8 *vaddr = kmap(page);
+		err = memcpy_toiovec(iov, vaddr + offset, len);
+		kunmap(page);
+		return err;
+	}
+
+	iovec_idx = 0;
+	while (iovec_idx < pinned_list->nr_iovecs) {
+		struct dma_page_list *page_list;
+
+		/* skip already used-up iovecs */
+		while (!iov[iovec_idx].iov_len)
+			iovec_idx++;
+
+		page_list = &pinned_list->page_list[iovec_idx];
+
+		iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK);
+		page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK)
+			 - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT;
+
+		/* break up copies to not cross page boundary */
+		while (iov[iovec_idx].iov_len) {
+			copy = min_t(int, PAGE_SIZE - iov_byte_offset, len);
+			copy = min_t(int, copy, iov[iovec_idx].iov_len);
+
+			dma_cookie = dma_async_memcpy_pg_to_pg(chan,
+					page_list->pages[page_idx],
+					iov_byte_offset,
+					page,
+					offset,
+					copy);
+
+			len -= copy;
+			iov[iovec_idx].iov_len -= copy;
+			iov[iovec_idx].iov_base += copy;
+
+			if (!len)
+				return dma_cookie;
+
+			offset += copy;
+			iov_byte_offset = 0;
+			page_idx++;
+		}
+		iovec_idx++;
+	}
+
+	/* really bad if we ever run out of iovecs */
+	BUG();
+	return -EFAULT;
+}
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 30781546ac99..78b236ca04f8 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -333,5 +333,27 @@ static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie,
 int dma_async_device_register(struct dma_device *device);
 void dma_async_device_unregister(struct dma_device *device);
 
+/* --- Helper iov-locking functions --- */
+
+struct dma_page_list {
+	char *base_address;
+	int nr_pages;
+	struct page **pages;
+};
+
+struct dma_pinned_list {
+	int nr_iovecs;
+	struct dma_page_list page_list[0];
+};
+
+struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len);
+void dma_unpin_iovec_pages(struct dma_pinned_list* pinned_list);
+
+dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len);
+dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_pinned_list *pinned_list, struct page *page,
+	unsigned int offset, size_t len);
+
 #endif /* CONFIG_DMA_ENGINE */
 #endif /* DMAENGINE_H */
diff --git a/include/net/netdma.h b/include/net/netdma.h
index cbfe89d7e5d0..19760eb131aa 100644
--- a/include/net/netdma.h
+++ b/include/net/netdma.h
@@ -23,6 +23,7 @@
 #include <linux/config.h>
 #ifdef CONFIG_NET_DMA
 #include <linux/dmaengine.h>
+#include <linux/skbuff.h>
 
 static inline struct dma_chan *get_softnet_dma(void)
 {
@@ -34,5 +35,10 @@ static inline struct dma_chan *get_softnet_dma(void)
 	rcu_read_unlock();
 	return chan;
 }
+
+int dma_skb_copy_datagram_iovec(struct dma_chan* chan,
+		const struct sk_buff *skb, int offset, struct iovec *to,
+		size_t len, struct dma_pinned_list *pinned_list);
+
 #endif /* CONFIG_NET_DMA */
 #endif /* NETDMA_H */
diff --git a/net/core/Makefile b/net/core/Makefile
index 79fe12cced27..e9bd2467d5a9 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_NET_DIVERT) += dv.o
 obj-$(CONFIG_NET_PKTGEN) += pktgen.o
 obj-$(CONFIG_WIRELESS_EXT) += wireless.o
 obj-$(CONFIG_NETPOLL) += netpoll.o
+obj-$(CONFIG_NET_DMA) += user_dma.o
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
new file mode 100644
index 000000000000..9eee91bcbf3f
--- /dev/null
+++ b/net/core/user_dma.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ * Portions based on net/core/datagram.c and copyrighted by their authors.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code allows the net stack to make use of a DMA engine for
+ * skb to iovec copies.
+ */
+
+#include <linux/dmaengine.h>
+#include <linux/socket.h>
+#include <linux/rtnetlink.h> /* for BUG_TRAP */
+#include <net/tcp.h>
+
+/**
+ *	dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
+ *	@skb - buffer to copy
+ *	@offset - offset in the buffer to start copying from
+ *	@iovec - io vector to copy to
+ *	@len - amount of data to copy from buffer to iovec
+ *	@pinned_list - locked iovec buffer data
+ *
+ *	Note: the iovec is modified during the copy.
+ */
+int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
+			struct sk_buff *skb, int offset, struct iovec *to,
+			size_t len, struct dma_pinned_list *pinned_list)
+{
+	int start = skb_headlen(skb);
+	int i, copy = start - offset;
+	dma_cookie_t cookie = 0;
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		cookie = dma_memcpy_to_iovec(chan, to, pinned_list,
+		                            skb->data + offset, copy);
+		if (cookie < 0)
+			goto fault;
+		len -= copy;
+		if (len == 0)
+			goto end;
+		offset += copy;
+	}
+
+	/* Copy paged appendix. Hmm... why does this look so complicated? */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		BUG_TRAP(start <= offset + len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		copy = end - offset;
+		if ((copy = end - offset) > 0) {
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			struct page *page = frag->page;
+
+			if (copy > len)
+				copy = len;
+
+			cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page,
+					frag->page_offset + offset - start, copy);
+			if (cookie < 0)
+				goto fault;
+			len -= copy;
+			if (len == 0)
+				goto end;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	if (skb_shinfo(skb)->frag_list) {
+		struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+		for (; list; list = list->next) {
+			int end;
+
+			BUG_TRAP(start <= offset + len);
+
+			end = start + list->len;
+			copy = end - offset;
+			if (copy > 0) {
+				if (copy > len)
+					copy = len;
+				cookie = dma_skb_copy_datagram_iovec(chan, list,
+				                offset - start, to, copy,
+				                pinned_list);
+				if (cookie < 0)
+					goto fault;
+				len -= copy;
+				if (len == 0)
+					goto end;
+				offset += copy;
+			}
+			start = end;
+		}
+	}
+
+end:
+	if (!len) {
+		skb->dma_cookie = cookie;
+		return cookie;
+	}
+
+fault:
+ 	return -EFAULT;
+}
-- 
cgit v1.2.3-71-gd317


From 97fc2f0848c928c63c2ae619deee61a0b1107b69 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Tue, 23 May 2006 17:55:33 -0700
Subject: [I/OAT]: Structure changes for TCP recv offload to I/OAT

Adds an async_wait_queue and some additional fields to tcp_sock, and a
dma_cookie_t to sk_buff.

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 4 ++++
 include/linux/tcp.h    | 8 ++++++++
 include/net/sock.h     | 2 ++
 include/net/tcp.h      | 7 +++++++
 net/core/sock.c        | 6 ++++++
 5 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f8f234708b98..23bad3bf3c9d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -29,6 +29,7 @@
 #include <linux/net.h>
 #include <linux/textsearch.h>
 #include <net/checksum.h>
+#include <linux/dmaengine.h>
 
 #define HAVE_ALLOC_SKB		/* For the drivers to know */
 #define HAVE_ALIGNABLE_SKB	/* Ditto 8)		   */
@@ -285,6 +286,9 @@ struct sk_buff {
 	__u16			tc_verd;	/* traffic control verdict */
 #endif
 #endif
+#ifdef CONFIG_NET_DMA
+	dma_cookie_t		dma_cookie;
+#endif
 
 
 	/* These elements must be at the end, see alloc_skb() for details.  */
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 542d39596bd8..c90daa5da6c3 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -18,6 +18,7 @@
 #define _LINUX_TCP_H
 
 #include <linux/types.h>
+#include <linux/dmaengine.h>
 #include <asm/byteorder.h>
 
 struct tcphdr {
@@ -233,6 +234,13 @@ struct tcp_sock {
 		struct iovec		*iov;
 		int			memory;
 		int			len;
+#ifdef CONFIG_NET_DMA
+		/* members for async copy */
+		struct dma_chan		*dma_chan;
+		int			wakeup;
+		struct dma_pinned_list	*pinned_list;
+		dma_cookie_t		dma_cookie;
+#endif
 	} ucopy;
 
 	__u32	snd_wl1;	/* Sequence for window update		*/
diff --git a/include/net/sock.h b/include/net/sock.h
index c9fad6fb629b..90c65cb091a8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -132,6 +132,7 @@ struct sock_common {
   *	@sk_receive_queue: incoming packets
   *	@sk_wmem_alloc: transmit queue bytes committed
   *	@sk_write_queue: Packet sending queue
+  *	@sk_async_wait_queue: DMA copied packets
   *	@sk_omem_alloc: "o" is "option" or "other"
   *	@sk_wmem_queued: persistent queue size
   *	@sk_forward_alloc: space allocated forward
@@ -205,6 +206,7 @@ struct sock {
 	atomic_t		sk_omem_alloc;
 	struct sk_buff_head	sk_receive_queue;
 	struct sk_buff_head	sk_write_queue;
+	struct sk_buff_head	sk_async_wait_queue;
 	int			sk_wmem_queued;
 	int			sk_forward_alloc;
 	gfp_t			sk_allocation;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3c989db8a7aa..d0c2c2fa7587 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -28,6 +28,7 @@
 #include <linux/cache.h>
 #include <linux/percpu.h>
 #include <linux/skbuff.h>
+#include <linux/dmaengine.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/inet_timewait_sock.h>
@@ -817,6 +818,12 @@ static inline void tcp_prequeue_init(struct tcp_sock *tp)
 	tp->ucopy.len = 0;
 	tp->ucopy.memory = 0;
 	skb_queue_head_init(&tp->ucopy.prequeue);
+#ifdef CONFIG_NET_DMA
+	tp->ucopy.dma_chan = NULL;
+	tp->ucopy.wakeup = 0;
+	tp->ucopy.pinned_list = NULL;
+	tp->ucopy.dma_cookie = 0;
+#endif
 }
 
 /* Packet is added to VJ-style prequeue for processing in process
diff --git a/net/core/sock.c b/net/core/sock.c
index ed2afdb9ea2d..5d820c376653 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -832,6 +832,9 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		atomic_set(&newsk->sk_omem_alloc, 0);
 		skb_queue_head_init(&newsk->sk_receive_queue);
 		skb_queue_head_init(&newsk->sk_write_queue);
+#ifdef CONFIG_NET_DMA
+		skb_queue_head_init(&newsk->sk_async_wait_queue);
+#endif
 
 		rwlock_init(&newsk->sk_dst_lock);
 		rwlock_init(&newsk->sk_callback_lock);
@@ -1383,6 +1386,9 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	skb_queue_head_init(&sk->sk_receive_queue);
 	skb_queue_head_init(&sk->sk_write_queue);
 	skb_queue_head_init(&sk->sk_error_queue);
+#ifdef CONFIG_NET_DMA
+	skb_queue_head_init(&sk->sk_async_wait_queue);
+#endif
 
 	sk->sk_send_head	=	NULL;
 
-- 
cgit v1.2.3-71-gd317


From 9593782585e0cf70babe787a8463d492a68b1744 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Tue, 23 May 2006 18:02:55 -0700
Subject: [I/OAT]: Add a sysctl for tuning the I/OAT offloaded I/O threshold

Any socket recv of less than this ammount will not be offloaded

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sysctl.h     |  1 +
 include/net/tcp.h          |  1 +
 net/core/user_dma.c        |  4 ++++
 net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++
 4 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 76eaeff76f82..cd9e7c0825ad 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -403,6 +403,7 @@ enum
  	NET_TCP_MTU_PROBING=113,
 	NET_TCP_BASE_MSS=114,
 	NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115,
+	NET_TCP_DMA_COPYBREAK=116,
 };
 
 enum {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 578cccf275d3..f1f472746e6c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -219,6 +219,7 @@ extern int sysctl_tcp_adv_win_scale;
 extern int sysctl_tcp_tw_reuse;
 extern int sysctl_tcp_frto;
 extern int sysctl_tcp_low_latency;
+extern int sysctl_tcp_dma_copybreak;
 extern int sysctl_tcp_nometrics_save;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
index 9eee91bcbf3f..b7c98dbcdb81 100644
--- a/net/core/user_dma.c
+++ b/net/core/user_dma.c
@@ -30,6 +30,10 @@
 #include <linux/rtnetlink.h> /* for BUG_TRAP */
 #include <net/tcp.h>
 
+#define NET_DMA_DEFAULT_COPYBREAK 4096
+
+int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK;
+
 /**
  *	dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
  *	@skb - buffer to copy
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6b6c3adfcf00..6a6aa537b7aa 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -688,6 +688,16 @@ ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+#ifdef CONFIG_NET_DMA
+	{
+		.ctl_name	= NET_TCP_DMA_COPYBREAK,
+		.procname	= "tcp_dma_copybreak",
+		.data		= &sysctl_tcp_dma_copybreak,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 
-- 
cgit v1.2.3-71-gd317


From 30b6c28d2aca4669f2e609ad5d77ea2a6cf0dd3a Mon Sep 17 00:00:00 2001
From: Michael Chan <mchan@broadcom.com>
Date: Fri, 26 May 2006 17:44:45 -0700
Subject: [TG3]: Add 5786 PCI ID

Add PCI ID for BCM5786 which is a variant of 5787.

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tg3.c       | 2 ++
 include/linux/pci_ids.h | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 862c226dbbe2..cb0ebf83c843 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -229,6 +229,8 @@ static struct pci_device_id tg3_pci_tbl[] = {
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5755M,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
+	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5786,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5787,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5787M,
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 819c8e1324d1..cf45e8bb69a8 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1887,6 +1887,7 @@
 #define PCI_DEVICE_ID_TIGON3_5751F	0x167e
 #define PCI_DEVICE_ID_TIGON3_5787M	0x1693
 #define PCI_DEVICE_ID_TIGON3_5782	0x1696
+#define PCI_DEVICE_ID_TIGON3_5786	0x169a
 #define PCI_DEVICE_ID_TIGON3_5787	0x169b
 #define PCI_DEVICE_ID_TIGON3_5788	0x169c
 #define PCI_DEVICE_ID_TIGON3_5789	0x169d
-- 
cgit v1.2.3-71-gd317


From b59f45d0b2878ab76f8053b0973654e6621828ee Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 27 May 2006 23:05:54 -0700
Subject: [IPSEC] xfrm: Abstract out encapsulation modes

This patch adds the structure xfrm_mode.  It is meant to represent
the operations carried out by transport/tunnel modes.

By doing this we allow additional encapsulation modes to be added
without clogging up the xfrm_input/xfrm_output paths.

Candidate modes include 4-to-6 tunnel mode, 6-to-4 tunnel mode, and
BEET modes.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/xfrm.h            |   4 ++
 include/net/xfrm.h              |  17 ++++++
 net/ipv4/Kconfig                |  18 ++++++
 net/ipv4/Makefile               |   2 +
 net/ipv4/xfrm4_input.c          |  28 +--------
 net/ipv4/xfrm4_mode_transport.c |  69 ++++++++++++++++++++++
 net/ipv4/xfrm4_mode_tunnel.c    | 125 ++++++++++++++++++++++++++++++++++++++++
 net/ipv4/xfrm4_output.c         |  61 +-------------------
 net/ipv6/Kconfig                |  20 +++++++
 net/ipv6/Makefile               |   2 +
 net/ipv6/ip6_output.c           |   2 +
 net/ipv6/xfrm6_input.c          |  29 +---------
 net/ipv6/xfrm6_mode_transport.c |  73 +++++++++++++++++++++++
 net/ipv6/xfrm6_mode_tunnel.c    | 121 ++++++++++++++++++++++++++++++++++++++
 net/ipv6/xfrm6_output.c         |  63 +-------------------
 net/xfrm/xfrm_policy.c          |  83 ++++++++++++++++++++++++++
 net/xfrm/xfrm_state.c           |   6 ++
 17 files changed, 553 insertions(+), 170 deletions(-)
 create mode 100644 net/ipv4/xfrm4_mode_transport.c
 create mode 100644 net/ipv4/xfrm4_mode_tunnel.c
 create mode 100644 net/ipv6/xfrm6_mode_transport.c
 create mode 100644 net/ipv6/xfrm6_mode_tunnel.c

(limited to 'include/linux')

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 6b42cc474c01..46a15c7a1a13 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -118,6 +118,10 @@ enum
 	XFRM_SHARE_UNIQUE	/* Use once */
 };
 
+#define XFRM_MODE_TRANSPORT 0
+#define XFRM_MODE_TUNNEL 1
+#define XFRM_MODE_MAX 2
+
 /* Netlink configuration messages.  */
 enum {
 	XFRM_MSG_BASE = 0x10,
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index ed7c9747059d..ed5bb34f817f 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -20,6 +20,8 @@
 #include <net/ip6_fib.h>
 
 #define XFRM_ALIGN8(len)	(((len) + 7) & ~7)
+#define MODULE_ALIAS_XFRM_MODE(family, encap) \
+	MODULE_ALIAS("xfrm-mode-" __stringify(family) "-" __stringify(encap))
 
 extern struct sock *xfrm_nl;
 extern u32 sysctl_xfrm_aevent_etime;
@@ -164,6 +166,7 @@ struct xfrm_state
 	/* Reference to data common to all the instances of this
 	 * transformer. */
 	struct xfrm_type	*type;
+	struct xfrm_mode	*mode;
 
 	/* Security context */
 	struct xfrm_sec_ctx	*security;
@@ -205,6 +208,7 @@ struct xfrm_dst;
 struct xfrm_policy_afinfo {
 	unsigned short		family;
 	struct xfrm_type	*type_map[256];
+	struct xfrm_mode	*mode_map[XFRM_MODE_MAX];
 	struct dst_ops		*dst_ops;
 	void			(*garbage_collect)(void);
 	int			(*dst_lookup)(struct xfrm_dst **dst, struct flowi *fl);
@@ -267,6 +271,19 @@ extern int xfrm_unregister_type(struct xfrm_type *type, unsigned short family);
 extern struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family);
 extern void xfrm_put_type(struct xfrm_type *type);
 
+struct xfrm_mode {
+	int (*input)(struct xfrm_state *x, struct sk_buff *skb);
+	int (*output)(struct sk_buff *skb);
+
+	struct module *owner;
+	unsigned int encap;
+};
+
+extern int xfrm_register_mode(struct xfrm_mode *mode, int family);
+extern int xfrm_unregister_mode(struct xfrm_mode *mode, int family);
+extern struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family);
+extern void xfrm_put_mode(struct xfrm_mode *mode);
+
 struct xfrm_tmpl
 {
 /* id in template is interpreted as:
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e40f75322377..35eb70b1448d 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -414,6 +414,24 @@ config INET_TUNNEL
 	tristate
 	default n
 
+config INET_XFRM_MODE_TRANSPORT
+	tristate "IP: IPsec transport mode"
+	default y
+	select XFRM
+	---help---
+	  Support for IPsec transport mode.
+
+	  If unsure, say Y.
+
+config INET_XFRM_MODE_TUNNEL
+	tristate "IP: IPsec tunnel mode"
+	default y
+	select XFRM
+	---help---
+	  Support for IPsec tunnel mode.
+
+	  If unsure, say Y.
+
 config INET_DIAG
 	tristate "INET: socket monitoring interface"
 	default y
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 9ef50a0b9d2c..4cc94482eacc 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -24,6 +24,8 @@ obj-$(CONFIG_INET_ESP) += esp4.o
 obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
 obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
 obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
+obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
+obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
 obj-$(CONFIG_IP_PNP) += ipconfig.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 3e174c83bfe7..817ed84511a6 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -13,7 +13,6 @@
 #include <linux/string.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
-#include <net/inet_ecn.h>
 #include <net/ip.h>
 #include <net/xfrm.h>
 
@@ -24,15 +23,6 @@ int xfrm4_rcv(struct sk_buff *skb)
 
 EXPORT_SYMBOL(xfrm4_rcv);
 
-static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
-{
-	struct iphdr *outer_iph = skb->nh.iph;
-	struct iphdr *inner_iph = skb->h.ipiph;
-
-	if (INET_ECN_is_ce(outer_iph->tos))
-		IP_ECN_set_ce(inner_iph);
-}
-
 static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq)
 {
 	switch (nexthdr) {
@@ -113,24 +103,10 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
 
 		xfrm_vec[xfrm_nr++] = x;
 
-		iph = skb->nh.iph;
+		if (x->mode->input(x, skb))
+			goto drop;
 
 		if (x->props.mode) {
-			if (iph->protocol != IPPROTO_IPIP)
-				goto drop;
-			if (!pskb_may_pull(skb, sizeof(struct iphdr)))
-				goto drop;
-			if (skb_cloned(skb) &&
-			    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
-				goto drop;
-			if (x->props.flags & XFRM_STATE_DECAP_DSCP)
-				ipv4_copy_dscp(iph, skb->h.ipiph);
-			if (!(x->props.flags & XFRM_STATE_NOECN))
-				ipip_ecn_decapsulate(skb);
-			skb->mac.raw = memmove(skb->data - skb->mac_len,
-					       skb->mac.raw, skb->mac_len);
-			skb->nh.raw = skb->data;
-			memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
 			decaps = 1;
 			break;
 		}
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
new file mode 100644
index 000000000000..e46d9a4ccc55
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -0,0 +1,69 @@
+/*
+ * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+/* Add encapsulation header.
+ *
+ * The IP header will be moved forward to make space for the encapsulation
+ * header.
+ *
+ * On exit, skb->h will be set to the start of the payload to be processed
+ * by x->type->output and skb->nh will be set to the top IP header.
+ */
+static int xfrm4_transport_output(struct sk_buff *skb)
+{
+	struct xfrm_state *x;
+	struct iphdr *iph;
+	int ihl;
+
+	iph = skb->nh.iph;
+	skb->h.ipiph = iph;
+
+	ihl = iph->ihl * 4;
+	skb->h.raw += ihl;
+
+	x = skb->dst->xfrm;
+	skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl);
+	return 0;
+}
+
+static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	return 0;
+}
+
+static struct xfrm_mode xfrm4_transport_mode = {
+	.input = xfrm4_transport_input,
+	.output = xfrm4_transport_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TRANSPORT,
+};
+
+static int __init xfrm4_transport_init(void)
+{
+	return xfrm_register_mode(&xfrm4_transport_mode, AF_INET);
+}
+
+static void __exit xfrm4_transport_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET);
+	BUG_ON(err);
+}
+
+module_init(xfrm4_transport_init);
+module_exit(xfrm4_transport_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
new file mode 100644
index 000000000000..f8d880beb12f
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -0,0 +1,125 @@
+/*
+ * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
+{
+	struct iphdr *outer_iph = skb->nh.iph;
+	struct iphdr *inner_iph = skb->h.ipiph;
+
+	if (INET_ECN_is_ce(outer_iph->tos))
+		IP_ECN_set_ce(inner_iph);
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per RFC 2401.  The following fields
+ * in it shall be filled in by x->type->output:
+ *      tot_len
+ *      check
+ *
+ * On exit, skb->h will be set to the start of the payload to be processed
+ * by x->type->output and skb->nh will be set to the top IP header.
+ */
+static int xfrm4_tunnel_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb->dst;
+	struct xfrm_state *x = dst->xfrm;
+	struct iphdr *iph, *top_iph;
+	int flags;
+
+	iph = skb->nh.iph;
+	skb->h.ipiph = iph;
+
+	skb->nh.raw = skb_push(skb, x->props.header_len);
+	top_iph = skb->nh.iph;
+
+	top_iph->ihl = 5;
+	top_iph->version = 4;
+
+	/* DS disclosed */
+	top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
+
+	flags = x->props.flags;
+	if (flags & XFRM_STATE_NOECN)
+		IP_ECN_clear(top_iph);
+
+	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
+		0 : (iph->frag_off & htons(IP_DF));
+	if (!top_iph->frag_off)
+		__ip_select_ident(top_iph, dst->child, 0);
+
+	top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
+
+	top_iph->saddr = x->props.saddr.a4;
+	top_iph->daddr = x->id.daddr.a4;
+	top_iph->protocol = IPPROTO_IPIP;
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+	return 0;
+}
+
+static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct iphdr *iph = skb->nh.iph;
+	int err = -EINVAL;
+
+	if (iph->protocol != IPPROTO_IPIP)
+		goto out;
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto out;
+
+	if (skb_cloned(skb) &&
+	    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+		goto out;
+
+	if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+		ipv4_copy_dscp(iph, skb->h.ipiph);
+	if (!(x->props.flags & XFRM_STATE_NOECN))
+		ipip_ecn_decapsulate(skb);
+	skb->mac.raw = memmove(skb->data - skb->mac_len,
+			       skb->mac.raw, skb->mac_len);
+	skb->nh.raw = skb->data;
+	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+	err = 0;
+
+out:
+	return err;
+}
+
+static struct xfrm_mode xfrm4_tunnel_mode = {
+	.input = xfrm4_tunnel_input,
+	.output = xfrm4_tunnel_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TUNNEL,
+};
+
+static int __init xfrm4_tunnel_init(void)
+{
+	return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET);
+}
+
+static void __exit xfrm4_tunnel_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET);
+	BUG_ON(err);
+}
+
+module_init(xfrm4_tunnel_init);
+module_exit(xfrm4_tunnel_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 4ef8efaf6a67..ac9d91d4bb05 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -12,67 +12,10 @@
 #include <linux/skbuff.h>
 #include <linux/spinlock.h>
 #include <linux/netfilter_ipv4.h>
-#include <net/inet_ecn.h>
 #include <net/ip.h>
 #include <net/xfrm.h>
 #include <net/icmp.h>
 
-/* Add encapsulation header.
- *
- * In transport mode, the IP header will be moved forward to make space
- * for the encapsulation header.
- *
- * In tunnel mode, the top IP header will be constructed per RFC 2401.
- * The following fields in it shall be filled in by x->type->output:
- *	tot_len
- *	check
- *
- * On exit, skb->h will be set to the start of the payload to be processed
- * by x->type->output and skb->nh will be set to the top IP header.
- */
-static void xfrm4_encap(struct sk_buff *skb)
-{
-	struct dst_entry *dst = skb->dst;
-	struct xfrm_state *x = dst->xfrm;
-	struct iphdr *iph, *top_iph;
-	int flags;
-
-	iph = skb->nh.iph;
-	skb->h.ipiph = iph;
-
-	skb->nh.raw = skb_push(skb, x->props.header_len);
-	top_iph = skb->nh.iph;
-
-	if (!x->props.mode) {
-		skb->h.raw += iph->ihl*4;
-		memmove(top_iph, iph, iph->ihl*4);
-		return;
-	}
-
-	top_iph->ihl = 5;
-	top_iph->version = 4;
-
-	/* DS disclosed */
-	top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
-
-	flags = x->props.flags;
-	if (flags & XFRM_STATE_NOECN)
-		IP_ECN_clear(top_iph);
-
-	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
-		0 : (iph->frag_off & htons(IP_DF));
-	if (!top_iph->frag_off)
-		__ip_select_ident(top_iph, dst->child, 0);
-
-	top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
-
-	top_iph->saddr = x->props.saddr.a4;
-	top_iph->daddr = x->id.daddr.a4;
-	top_iph->protocol = IPPROTO_IPIP;
-
-	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
-}
-
 static int xfrm4_tunnel_check_size(struct sk_buff *skb)
 {
 	int mtu, ret = 0;
@@ -121,7 +64,9 @@ static int xfrm4_output_one(struct sk_buff *skb)
 		if (err)
 			goto error;
 
-		xfrm4_encap(skb);
+		err = x->mode->output(skb);
+		if (err)
+			goto error;
 
 		err = x->type->output(x, skb);
 		if (err)
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index f8a107ab5592..e923d4dea418 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -106,6 +106,26 @@ config INET6_TUNNEL
 	tristate
 	default n
 
+config INET6_XFRM_MODE_TRANSPORT
+	tristate "IPv6: IPsec transport mode"
+	depends on IPV6
+	default IPV6
+	select XFRM
+	---help---
+	  Support for IPsec transport mode.
+
+	  If unsure, say Y.
+
+config INET6_XFRM_MODE_TUNNEL
+	tristate "IPv6: IPsec tunnel mode"
+	depends on IPV6
+	default IPV6
+	select XFRM
+	---help---
+	  Support for IPsec tunnel mode.
+
+	  If unsure, say Y.
+
 config IPV6_TUNNEL
 	tristate "IPv6: IPv6-in-IPv6 tunnel"
 	select INET6_TUNNEL
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index a760b0988fbb..386e0a626948 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -20,6 +20,8 @@ obj-$(CONFIG_INET6_ESP) += esp6.o
 obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
 obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o
 obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o
+obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o
+obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o
 obj-$(CONFIG_NETFILTER)	+= netfilter/
 
 obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index e46048974f37..416f6e428a0a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -39,6 +39,7 @@
 #include <linux/in6.h>
 #include <linux/tcp.h>
 #include <linux/route.h>
+#include <linux/module.h>
 
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
@@ -488,6 +489,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 
 	return offset;
 }
+EXPORT_SYMBOL_GPL(ip6_find_1stfragopt);
 
 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 {
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 00cfdee18dca..0405d74ff910 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -13,21 +13,9 @@
 #include <linux/string.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
-#include <net/ip.h>
 #include <net/ipv6.h>
 #include <net/xfrm.h>
 
-static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
-{
-	struct ipv6hdr *outer_iph = skb->nh.ipv6h;
-	struct ipv6hdr *inner_iph = skb->h.ipv6h;
-
-	if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph)))
-		IP6_ECN_set_ce(inner_iph);
-}
-
 int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi)
 {
 	int err;
@@ -81,21 +69,10 @@ int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi)
 
 		xfrm_vec[xfrm_nr++] = x;
 
+		if (x->mode->input(x, skb))
+			goto drop;
+
 		if (x->props.mode) { /* XXX */
-			if (nexthdr != IPPROTO_IPV6)
-				goto drop;
-			if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
-				goto drop;
-			if (skb_cloned(skb) &&
-			    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
-				goto drop;
-			if (x->props.flags & XFRM_STATE_DECAP_DSCP)
-				ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h);
-			if (!(x->props.flags & XFRM_STATE_NOECN))
-				ipip6_ecn_decapsulate(skb);
-			skb->mac.raw = memmove(skb->data - skb->mac_len,
-					       skb->mac.raw, skb->mac_len);
-			skb->nh.raw = skb->data;
 			decaps = 1;
 			break;
 		}
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
new file mode 100644
index 000000000000..5efbbae08ef0
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_transport.c
@@ -0,0 +1,73 @@
+/*
+ * xfrm6_mode_transport.c - Transport mode encapsulation for IPv6.
+ *
+ * Copyright (C) 2002 USAGI/WIDE Project
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+
+/* Add encapsulation header.
+ *
+ * The IP header and mutable extension headers will be moved forward to make
+ * space for the encapsulation header.
+ *
+ * On exit, skb->h will be set to the start of the encapsulation header to be
+ * filled in by x->type->output and skb->nh will be set to the nextheader field
+ * of the extension header directly preceding the encapsulation header, or in
+ * its absence, that of the top IP header.  The value of skb->data will always
+ * point to the top IP header.
+ */
+static int xfrm6_transport_output(struct sk_buff *skb)
+{
+	struct xfrm_state *x = skb->dst->xfrm;
+	struct ipv6hdr *iph;
+	u8 *prevhdr;
+	int hdr_len;
+
+	skb_push(skb, x->props.header_len);
+	iph = skb->nh.ipv6h;
+
+	hdr_len = ip6_find_1stfragopt(skb, &prevhdr);
+	skb->nh.raw = prevhdr - x->props.header_len;
+	skb->h.raw = skb->data + hdr_len;
+	memmove(skb->data, iph, hdr_len);
+	return 0;
+}
+
+static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	return 0;
+}
+
+static struct xfrm_mode xfrm6_transport_mode = {
+	.input = xfrm6_transport_input,
+	.output = xfrm6_transport_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TRANSPORT,
+};
+
+static int __init xfrm6_transport_init(void)
+{
+	return xfrm_register_mode(&xfrm6_transport_mode, AF_INET6);
+}
+
+static void __exit xfrm6_transport_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm6_transport_mode, AF_INET6);
+	BUG_ON(err);
+}
+
+module_init(xfrm6_transport_init);
+module_exit(xfrm6_transport_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c
new file mode 100644
index 000000000000..8af79be2edca
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_tunnel.c
@@ -0,0 +1,121 @@
+/*
+ * xfrm6_mode_tunnel.c - Tunnel mode encapsulation for IPv6.
+ *
+ * Copyright (C) 2002 USAGI/WIDE Project
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dsfield.h>
+#include <net/dst.h>
+#include <net/inet_ecn.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+
+static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
+{
+	struct ipv6hdr *outer_iph = skb->nh.ipv6h;
+	struct ipv6hdr *inner_iph = skb->h.ipv6h;
+
+	if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph)))
+		IP6_ECN_set_ce(inner_iph);
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per RFC 2401.  The following fields
+ * in it shall be filled in by x->type->output:
+ *	payload_len
+ *
+ * On exit, skb->h will be set to the start of the encapsulation header to be
+ * filled in by x->type->output and skb->nh will be set to the nextheader field
+ * of the extension header directly preceding the encapsulation header, or in
+ * its absence, that of the top IP header.  The value of skb->data will always
+ * point to the top IP header.
+ */
+static int xfrm6_tunnel_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb->dst;
+	struct xfrm_state *x = dst->xfrm;
+	struct ipv6hdr *iph, *top_iph;
+	int dsfield;
+
+	skb_push(skb, x->props.header_len);
+	iph = skb->nh.ipv6h;
+
+	skb->nh.raw = skb->data;
+	top_iph = skb->nh.ipv6h;
+	skb->nh.raw = &top_iph->nexthdr;
+	skb->h.ipv6h = top_iph + 1;
+
+	top_iph->version = 6;
+	top_iph->priority = iph->priority;
+	top_iph->flow_lbl[0] = iph->flow_lbl[0];
+	top_iph->flow_lbl[1] = iph->flow_lbl[1];
+	top_iph->flow_lbl[2] = iph->flow_lbl[2];
+	dsfield = ipv6_get_dsfield(top_iph);
+	dsfield = INET_ECN_encapsulate(dsfield, dsfield);
+	if (x->props.flags & XFRM_STATE_NOECN)
+		dsfield &= ~INET_ECN_MASK;
+	ipv6_change_dsfield(top_iph, 0, dsfield);
+	top_iph->nexthdr = IPPROTO_IPV6; 
+	top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT);
+	ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr);
+	ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr);
+	return 0;
+}
+
+static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err = -EINVAL;
+
+	if (skb->nh.raw[IP6CB(skb)->nhoff] != IPPROTO_IPV6)
+		goto out;
+	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+		goto out;
+
+	if (skb_cloned(skb) &&
+	    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+		goto out;
+
+	if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+		ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h);
+	if (!(x->props.flags & XFRM_STATE_NOECN))
+		ipip6_ecn_decapsulate(skb);
+	skb->mac.raw = memmove(skb->data - skb->mac_len,
+			       skb->mac.raw, skb->mac_len);
+	skb->nh.raw = skb->data;
+	err = 0;
+
+out:
+	return err;
+}
+
+static struct xfrm_mode xfrm6_tunnel_mode = {
+	.input = xfrm6_tunnel_input,
+	.output = xfrm6_tunnel_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TUNNEL,
+};
+
+static int __init xfrm6_tunnel_init(void)
+{
+	return xfrm_register_mode(&xfrm6_tunnel_mode, AF_INET6);
+}
+
+static void __exit xfrm6_tunnel_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm6_tunnel_mode, AF_INET6);
+	BUG_ON(err);
+}
+
+module_init(xfrm6_tunnel_init);
+module_exit(xfrm6_tunnel_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TUNNEL);
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 80242172a5df..16e84254a252 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -14,68 +14,9 @@
 #include <linux/spinlock.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter_ipv6.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
 #include <net/ipv6.h>
 #include <net/xfrm.h>
 
-/* Add encapsulation header.
- *
- * In transport mode, the IP header and mutable extension headers will be moved
- * forward to make space for the encapsulation header.
- *
- * In tunnel mode, the top IP header will be constructed per RFC 2401.
- * The following fields in it shall be filled in by x->type->output:
- *	payload_len
- *
- * On exit, skb->h will be set to the start of the encapsulation header to be
- * filled in by x->type->output and skb->nh will be set to the nextheader field
- * of the extension header directly preceding the encapsulation header, or in
- * its absence, that of the top IP header.  The value of skb->data will always
- * point to the top IP header.
- */
-static void xfrm6_encap(struct sk_buff *skb)
-{
-	struct dst_entry *dst = skb->dst;
-	struct xfrm_state *x = dst->xfrm;
-	struct ipv6hdr *iph, *top_iph;
-	int dsfield;
-
-	skb_push(skb, x->props.header_len);
-	iph = skb->nh.ipv6h;
-
-	if (!x->props.mode) {
-		u8 *prevhdr;
-		int hdr_len;
-
-		hdr_len = ip6_find_1stfragopt(skb, &prevhdr);
-		skb->nh.raw = prevhdr - x->props.header_len;
-		skb->h.raw = skb->data + hdr_len;
-		memmove(skb->data, iph, hdr_len);
-		return;
-	}
-
-	skb->nh.raw = skb->data;
-	top_iph = skb->nh.ipv6h;
-	skb->nh.raw = &top_iph->nexthdr;
-	skb->h.ipv6h = top_iph + 1;
-
-	top_iph->version = 6;
-	top_iph->priority = iph->priority;
-	top_iph->flow_lbl[0] = iph->flow_lbl[0];
-	top_iph->flow_lbl[1] = iph->flow_lbl[1];
-	top_iph->flow_lbl[2] = iph->flow_lbl[2];
-	dsfield = ipv6_get_dsfield(top_iph);
-	dsfield = INET_ECN_encapsulate(dsfield, dsfield);
-	if (x->props.flags & XFRM_STATE_NOECN)
-		dsfield &= ~INET_ECN_MASK;
-	ipv6_change_dsfield(top_iph, 0, dsfield);
-	top_iph->nexthdr = IPPROTO_IPV6; 
-	top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT);
-	ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr);
-	ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr);
-}
-
 static int xfrm6_tunnel_check_size(struct sk_buff *skb)
 {
 	int mtu, ret = 0;
@@ -118,7 +59,9 @@ static int xfrm6_output_one(struct sk_buff *skb)
 		if (err)
 			goto error;
 
-		xfrm6_encap(skb);
+		err = x->mode->output(skb);
+		if (err)
+			goto error;
 
 		err = x->type->output(x, skb);
 		if (err)
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 44b64a593c01..b8936926c24b 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -138,6 +138,89 @@ void xfrm_put_type(struct xfrm_type *type)
 	module_put(type->owner);
 }
 
+int xfrm_register_mode(struct xfrm_mode *mode, int family)
+{
+	struct xfrm_policy_afinfo *afinfo;
+	struct xfrm_mode **modemap;
+	int err;
+
+	if (unlikely(mode->encap >= XFRM_MODE_MAX))
+		return -EINVAL;
+
+	afinfo = xfrm_policy_lock_afinfo(family);
+	if (unlikely(afinfo == NULL))
+		return -EAFNOSUPPORT;
+
+	err = -EEXIST;
+	modemap = afinfo->mode_map;
+	if (likely(modemap[mode->encap] == NULL)) {
+		modemap[mode->encap] = mode;
+		err = 0;
+	}
+
+	xfrm_policy_unlock_afinfo(afinfo);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_register_mode);
+
+int xfrm_unregister_mode(struct xfrm_mode *mode, int family)
+{
+	struct xfrm_policy_afinfo *afinfo;
+	struct xfrm_mode **modemap;
+	int err;
+
+	if (unlikely(mode->encap >= XFRM_MODE_MAX))
+		return -EINVAL;
+
+	afinfo = xfrm_policy_lock_afinfo(family);
+	if (unlikely(afinfo == NULL))
+		return -EAFNOSUPPORT;
+
+	err = -ENOENT;
+	modemap = afinfo->mode_map;
+	if (likely(modemap[mode->encap] == mode)) {
+		modemap[mode->encap] = NULL;
+		err = 0;
+	}
+
+	xfrm_policy_unlock_afinfo(afinfo);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_unregister_mode);
+
+struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
+{
+	struct xfrm_policy_afinfo *afinfo;
+	struct xfrm_mode *mode;
+	int modload_attempted = 0;
+
+	if (unlikely(encap >= XFRM_MODE_MAX))
+		return NULL;
+
+retry:
+	afinfo = xfrm_policy_get_afinfo(family);
+	if (unlikely(afinfo == NULL))
+		return NULL;
+
+	mode = afinfo->mode_map[encap];
+	if (unlikely(mode && !try_module_get(mode->owner)))
+		mode = NULL;
+	if (!mode && !modload_attempted) {
+		xfrm_policy_put_afinfo(afinfo);
+		request_module("xfrm-mode-%d-%d", family, encap);
+		modload_attempted = 1;
+		goto retry;
+	}
+
+	xfrm_policy_put_afinfo(afinfo);
+	return mode;
+}
+
+void xfrm_put_mode(struct xfrm_mode *mode)
+{
+	module_put(mode->owner);
+}
+
 static inline unsigned long make_jiffies(long secs)
 {
 	if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index ee62c239a7e3..17b29ec3c417 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -77,6 +77,8 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
 	kfree(x->ealg);
 	kfree(x->calg);
 	kfree(x->encap);
+	if (x->mode)
+		xfrm_put_mode(x->mode);
 	if (x->type) {
 		x->type->destructor(x);
 		xfrm_put_type(x->type);
@@ -1193,6 +1195,10 @@ int xfrm_init_state(struct xfrm_state *x)
 	if (err)
 		goto error;
 
+	x->mode = xfrm_get_mode(x->props.mode, family);
+	if (x->mode == NULL)
+		goto error;
+
 	x->km.state = XFRM_STATE_VALID;
 
 error:
-- 
cgit v1.2.3-71-gd317


From 62b7743483b402f8fb73545d5d487ca714e82766 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 29 May 2006 18:20:32 -0700
Subject: [NETFILTER]: x_tables: add quota match

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/xt_quota.h | 16 +++++++
 net/netfilter/Kconfig              | 10 ++++
 net/netfilter/Makefile             |  1 +
 net/netfilter/xt_quota.c           | 96 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 123 insertions(+)
 create mode 100644 include/linux/netfilter/xt_quota.h
 create mode 100644 net/netfilter/xt_quota.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter/xt_quota.h b/include/linux/netfilter/xt_quota.h
new file mode 100644
index 000000000000..acd7fd77bbee
--- /dev/null
+++ b/include/linux/netfilter/xt_quota.h
@@ -0,0 +1,16 @@
+#ifndef _XT_QUOTA_H
+#define _XT_QUOTA_H
+
+enum xt_quota_flags {
+	XT_QUOTA_INVERT		= 0x1,
+};
+#define XT_QUOTA_MASK		0x1
+
+struct xt_quota_info {
+	u_int32_t		flags;
+	u_int32_t		pad;
+	aligned_u64		quota;
+	struct xt_quota_info	*master;
+};
+
+#endif /* _XT_QUOTA_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e2893effdfaa..5543c7b74535 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -329,6 +329,16 @@ config NETFILTER_XT_MATCH_PKTTYPE
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_MATCH_QUOTA
+	tristate '"quota" match support'
+	depends on NETFILTER_XTABLES
+	help
+	  This option adds a `quota' match, which allows to match on a
+	  byte counter.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
 config NETFILTER_XT_MATCH_REALM
 	tristate  '"realm" match support'
 	depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 95b7e416512d..4b6a6ea07373 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_MARK) += xt_mark.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
new file mode 100644
index 000000000000..4cdba7469dc4
--- /dev/null
+++ b/net/netfilter/xt_quota.c
@@ -0,0 +1,96 @@
+/*
+ * netfilter module to enforce network quotas
+ *
+ * Sam Johnston <samj@samj.net>
+ */
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_quota.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Sam Johnston <samj@samj.net>");
+
+static DEFINE_SPINLOCK(quota_lock);
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in, const struct net_device *out,
+      const struct xt_match *match, const void *matchinfo,
+      int offset, unsigned int protoff, int *hotdrop)
+{
+	struct xt_quota_info *q = ((struct xt_quota_info *)matchinfo)->master;
+	int ret = q->flags & XT_QUOTA_INVERT ? 1 : 0;
+
+	spin_lock_bh(&quota_lock);
+	if (q->quota >= skb->len) {
+		q->quota -= skb->len;
+		ret ^= 1;
+	} else {
+	        /* we do not allow even small packets from now on */
+	        q->quota = 0;
+	}
+	spin_unlock_bh(&quota_lock);
+
+	return ret;
+}
+
+static int
+checkentry(const char *tablename, const void *entry,
+	   const struct xt_match *match, void *matchinfo,
+	   unsigned int matchsize, unsigned int hook_mask)
+{
+	struct xt_quota_info *q = (struct xt_quota_info *)matchinfo;
+
+	if (q->flags & ~XT_QUOTA_MASK)
+		return 0;
+	/* For SMP, we only want to use one set of counters. */
+	q->master = q;
+	return 1;
+}
+
+static struct xt_match quota_match = {
+	.name		= "quota",
+	.family		= AF_INET,
+	.match		= match,
+	.matchsize	= sizeof(struct xt_quota_info),
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE
+};
+
+static struct xt_match quota_match6 = {
+	.name		= "quota",
+	.family		= AF_INET6,
+	.match		= match,
+	.matchsize	= sizeof(struct xt_quota_info),
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE
+};
+
+static int __init xt_quota_init(void)
+{
+	int ret;
+
+	ret = xt_register_match(&quota_match);
+	if (ret)
+		goto err1;
+	ret = xt_register_match(&quota_match6);
+	if (ret)
+		goto err2;
+	return ret;
+
+err2:
+	xt_unregister_match(&quota_match);
+err1:
+	return ret;
+}
+
+static void __exit xt_quota_fini(void)
+{
+	xt_unregister_match(&quota_match6);
+	xt_unregister_match(&quota_match);
+}
+
+module_init(xt_quota_init);
+module_exit(xt_quota_fini);
-- 
cgit v1.2.3-71-gd317


From f3389805e53a13bd969ee1c8fc5a4137b7c6c167 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 29 May 2006 18:21:00 -0700
Subject: [NETFILTER]: x_tables: add statistic match

Add statistic match which is a combination of the nth and random matches.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/xt_statistic.h |  32 ++++++++++
 net/netfilter/Kconfig                  |   6 ++
 net/netfilter/Makefile                 |   1 +
 net/netfilter/xt_statistic.c           | 112 +++++++++++++++++++++++++++++++++
 4 files changed, 151 insertions(+)
 create mode 100644 include/linux/netfilter/xt_statistic.h
 create mode 100644 net/netfilter/xt_statistic.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter/xt_statistic.h b/include/linux/netfilter/xt_statistic.h
new file mode 100644
index 000000000000..c344e9916e23
--- /dev/null
+++ b/include/linux/netfilter/xt_statistic.h
@@ -0,0 +1,32 @@
+#ifndef _XT_STATISTIC_H
+#define _XT_STATISTIC_H
+
+enum xt_statistic_mode {
+	XT_STATISTIC_MODE_RANDOM,
+	XT_STATISTIC_MODE_NTH,
+	__XT_STATISTIC_MODE_MAX
+};
+#define XT_STATISTIC_MODE_MAX (__XT_STATISTIC_MODE_MAX - 1)
+
+enum xt_statistic_flags {
+	XT_STATISTIC_INVERT		= 0x1,
+};
+#define XT_STATISTIC_MASK		0x1
+
+struct xt_statistic_info {
+	u_int16_t			mode;
+	u_int16_t			flags;
+	union {
+		struct {
+			u_int32_t	probability;
+		} random;
+		struct {
+			u_int32_t	every;
+			u_int32_t	packet;
+			u_int32_t	count;
+		} nth;
+	} u;
+	struct xt_statistic_info	*master __attribute__((aligned(8)));
+};
+
+#endif /* _XT_STATISTIC_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 5543c7b74535..85a7e1770252 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -375,6 +375,12 @@ config NETFILTER_XT_MATCH_STATE
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_MATCH_STATISTIC
+	tristate '"statistic" match support'
+	depends on NETFILTER_XTABLES
+	help
+	  statistic module
+
 config NETFILTER_XT_MATCH_STRING
 	tristate  '"string" match support'
 	depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 4b6a6ea07373..df1da25f40df 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
new file mode 100644
index 000000000000..de1037f58596
--- /dev/null
+++ b/net/netfilter/xt_statistic.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on ipt_random and ipt_nth by Fabrice MARIE <fabrice@netfilter.org>.
+ */
+
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+
+#include <linux/netfilter/xt_statistic.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("xtables statistical match module");
+MODULE_ALIAS("ipt_statistic");
+MODULE_ALIAS("ip6t_statistic");
+
+static DEFINE_SPINLOCK(nth_lock);
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in, const struct net_device *out,
+      const struct xt_match *match, const void *matchinfo,
+      int offset, unsigned int protoff, int *hotdrop)
+{
+	struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo;
+	int ret = info->flags & XT_STATISTIC_INVERT ? 1 : 0;
+
+	switch (info->mode) {
+	case XT_STATISTIC_MODE_RANDOM:
+		if ((net_random() & 0x7FFFFFFF) < info->u.random.probability)
+			ret ^= 1;
+		break;
+	case XT_STATISTIC_MODE_NTH:
+		info = info->master;
+		spin_lock_bh(&nth_lock);
+		if (info->u.nth.count++ == info->u.nth.every) {
+			info->u.nth.count = 0;
+			ret ^= 1;
+		}
+		spin_unlock_bh(&nth_lock);
+		break;
+	}
+
+	return ret;
+}
+
+static int
+checkentry(const char *tablename, const void *entry,
+	   const struct xt_match *match, void *matchinfo,
+	   unsigned int matchsize, unsigned int hook_mask)
+{
+	struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo;
+
+	if (info->mode > XT_STATISTIC_MODE_MAX ||
+	    info->flags & ~XT_STATISTIC_MASK)
+		return 0;
+	info->master = info;
+	return 1;
+}
+
+static struct xt_match statistic_match = {
+	.name		= "statistic",
+	.match		= match,
+	.matchsize	= sizeof(struct xt_statistic_info),
+	.checkentry	= checkentry,
+	.family		= AF_INET,
+	.me		= THIS_MODULE,
+};
+
+static struct xt_match statistic_match6 = {
+	.name		= "statistic",
+	.match		= match,
+	.matchsize	= sizeof(struct xt_statistic_info),
+	.checkentry	= checkentry,
+	.family		= AF_INET6,
+	.me		= THIS_MODULE,
+};
+
+static int __init xt_statistic_init(void)
+{
+	int ret;
+
+	ret = xt_register_match(&statistic_match);
+	if (ret)
+		goto err1;
+
+	ret = xt_register_match(&statistic_match6);
+	if (ret)
+		goto err2;
+	return ret;
+err2:
+	xt_unregister_match(&statistic_match);
+err1:
+	return ret;
+}
+
+static void __exit xt_statistic_fini(void)
+{
+	xt_unregister_match(&statistic_match6);
+	xt_unregister_match(&statistic_match);
+}
+
+module_init(xt_statistic_init);
+module_exit(xt_statistic_fini);
-- 
cgit v1.2.3-71-gd317


From 39a27a35c5c1b5be499a0576a35c45a011788bf8 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 29 May 2006 18:23:54 -0700
Subject: [NETFILTER]: conntrack: add sysctl to disable checksumming

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack.h    |  1 +
 include/linux/sysctl.h                         |  2 ++
 include/net/netfilter/nf_conntrack.h           |  1 +
 net/ipv4/netfilter/ip_conntrack_proto_icmp.c   |  2 +-
 net/ipv4/netfilter/ip_conntrack_proto_tcp.c    |  2 +-
 net/ipv4/netfilter/ip_conntrack_proto_udp.c    |  2 +-
 net/ipv4/netfilter/ip_conntrack_standalone.c   | 11 +++++++++++
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |  2 +-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |  2 +-
 net/netfilter/nf_conntrack_proto_tcp.c         |  5 +++--
 net/netfilter/nf_conntrack_proto_udp.c         |  3 ++-
 net/netfilter/nf_conntrack_standalone.c        | 11 +++++++++++
 12 files changed, 36 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index d54d7b278e96..5473c01f69ea 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -293,6 +293,7 @@ static inline int is_dying(struct ip_conntrack *ct)
 }
 
 extern unsigned int ip_conntrack_htable_size;
+extern int ip_conntrack_checksum;
  
 #define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++)
 
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index cd9e7c0825ad..98338ed2c0b6 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -313,6 +313,7 @@ enum
 	NET_NF_CONNTRACK_FRAG6_TIMEOUT=29,
 	NET_NF_CONNTRACK_FRAG6_LOW_THRESH=30,
 	NET_NF_CONNTRACK_FRAG6_HIGH_THRESH=31,
+	NET_NF_CONNTRACK_CHECKSUM=32,
 };
 
 /* /proc/sys/net/ipv4 */
@@ -492,6 +493,7 @@ enum
  	NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD=25,
  	NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT=26,
 	NET_IPV4_NF_CONNTRACK_COUNT=27,
+	NET_IPV4_NF_CONNTRACK_CHECKSUM=28,
 };
  
 /* /proc/sys/net/ipv6 */
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 916013ca4a5c..dbe7a114d0c5 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -285,6 +285,7 @@ static inline int nf_ct_is_dying(struct nf_conn *ct)
 }
 
 extern unsigned int nf_conntrack_htable_size;
+extern int nf_conntrack_checksum;
 
 #define NF_CT_STAT_INC(count) (__get_cpu_var(nf_conntrack_stat).count++)
 
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index d8b14a9010a6..23f1c504586d 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -224,7 +224,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	}
 
 	/* See ip_conntrack_proto_tcp.c */
-	if (hooknum == NF_IP_PRE_ROUTING &&
+	if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) {
 		if (LOG_INVALID(IPPROTO_ICMP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 062b252b58ad..c5c2ce5cdeb8 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -870,7 +870,7 @@ static int tcp_error(struct sk_buff *skb,
 	 * and moreover root might send raw packets.
 	 */
 	/* FIXME: Source route IP option packets --RR */
-	if (hooknum == NF_IP_PRE_ROUTING &&
+	if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) {
 		if (LOG_INVALID(IPPROTO_TCP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 70899868783b..9b2c16b4d2ff 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -120,7 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	 * because the semantic of CHECKSUM_HW is different there 
 	 * and moreover root might send raw packets.
 	 * FIXME: Source route IP option packets --RR */
-	if (hooknum == NF_IP_PRE_ROUTING &&
+	if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) {
 		if (LOG_INVALID(IPPROTO_UDP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index f0cc7feb0da3..6cb9b989d14c 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -564,6 +564,8 @@ extern unsigned int ip_ct_generic_timeout;
 static int log_invalid_proto_min = 0;
 static int log_invalid_proto_max = 255;
 
+int ip_conntrack_checksum = 1;
+
 static struct ctl_table_header *ip_ct_sysctl_header;
 
 static ctl_table ip_ct_sysctl_table[] = {
@@ -591,6 +593,14 @@ static ctl_table ip_ct_sysctl_table[] = {
 		.mode		= 0444,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_CHECKSUM,
+		.procname	= "ip_conntrack_checksum",
+		.data		= &ip_conntrack_checksum,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name	= NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
 		.procname	= "ip_conntrack_tcp_timeout_syn_sent",
@@ -946,6 +956,7 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname);
 EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get);
 EXPORT_SYMBOL_GPL(ip_conntrack_proto_put);
 EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find);
+EXPORT_SYMBOL_GPL(ip_conntrack_checksum);
 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
 EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr);
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 4b0d361cc6e6..663a73ee3f2f 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -235,7 +235,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff,
 	}
 
 	/* See ip_conntrack_proto_tcp.c */
-	if (hooknum == NF_IP_PRE_ROUTING &&
+	if (nf_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, dataoff, 0)) {
 		if (LOG_INVALID(IPPROTO_ICMP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 86c6703265d0..ef18a7b7014b 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -233,7 +233,7 @@ icmpv6_error(struct sk_buff *skb, unsigned int dataoff,
 		return -NF_ACCEPT;
 	}
 
-	if (hooknum == NF_IP6_PRE_ROUTING &&
+	if (nf_conntrack_checksum && hooknum == NF_IP6_PRE_ROUTING &&
 	    nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) {
 		nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
 			      "nf_ct_icmpv6: ICMPv6 checksum failed\n");
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 69899f27d26a..12fb7c0a1509 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -828,8 +828,9 @@ static int tcp_error(struct sk_buff *skb,
 	 * and moreover root might send raw packets.
 	 */
 	/* FIXME: Source route IP option packets --RR */
-	if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
-	     (pf == PF_INET6 && hooknum  == NF_IP6_PRE_ROUTING)) &&
+	if (nf_conntrack_checksum &&
+	    ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
+	     (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) &&
 	    nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
 		if (LOG_INVALID(IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index d93edbfde9e3..ae07ebe3ab37 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -134,7 +134,8 @@ static int udp_error(struct sk_buff *skb, unsigned int dataoff,
 	 * because the semantic of CHECKSUM_HW is different there
 	 * and moreover root might send raw packets.
 	 * FIXME: Source route IP option packets --RR */
-	if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
+	if (nf_conntrack_checksum &&
+	    ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
 	     (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) &&
 	    nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) {
 		if (LOG_INVALID(IPPROTO_UDP))
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 408960c6a544..e01d20d8e287 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -455,6 +455,8 @@ extern unsigned int nf_ct_generic_timeout;
 static int log_invalid_proto_min = 0;
 static int log_invalid_proto_max = 255;
 
+int nf_conntrack_checksum = 1;
+
 static struct ctl_table_header *nf_ct_sysctl_header;
 
 static ctl_table nf_ct_sysctl_table[] = {
@@ -482,6 +484,14 @@ static ctl_table nf_ct_sysctl_table[] = {
 		.mode           = 0444,
 		.proc_handler   = &proc_dointvec,
 	},
+	{
+		.ctl_name	= NET_NF_CONNTRACK_CHECKSUM,
+		.procname	= "nf_conntrack_checksum",
+		.data		= &nf_conntrack_checksum,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name	= NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
 		.procname	= "nf_conntrack_tcp_timeout_syn_sent",
@@ -851,6 +861,7 @@ EXPORT_SYMBOL(nf_ct_proto_put);
 EXPORT_SYMBOL(nf_ct_l3proto_find_get);
 EXPORT_SYMBOL(nf_ct_l3proto_put);
 EXPORT_SYMBOL(nf_ct_l3protos);
+EXPORT_SYMBOL_GPL(nf_conntrack_checksum);
 EXPORT_SYMBOL(nf_conntrack_expect_alloc);
 EXPORT_SYMBOL(nf_conntrack_expect_put);
 EXPORT_SYMBOL(nf_conntrack_expect_related);
-- 
cgit v1.2.3-71-gd317


From 997ae831ade74bdaed4172b1c02060b9efd6e206 Mon Sep 17 00:00:00 2001
From: Eric Leblond <eric@inl.fr>
Date: Mon, 29 May 2006 18:24:20 -0700
Subject: [NETFILTER]: conntrack: add fixed timeout flag in connection tracking

Add a flag in a connection status to have a non updated timeout.
This permits to have connection that automatically die at a given
time.

Signed-off-by: Eric Leblond <eric@inl.fr>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nf_conntrack_common.h | 4 ++++
 net/ipv4/netfilter/ip_conntrack_core.c        | 6 ++++++
 net/netfilter/nf_conntrack_core.c             | 6 ++++++
 3 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
index 3ff88c878308..d2e4bd7a7a14 100644
--- a/include/linux/netfilter/nf_conntrack_common.h
+++ b/include/linux/netfilter/nf_conntrack_common.h
@@ -69,6 +69,10 @@ enum ip_conntrack_status {
 	/* Connection is dying (removed from lists), can not be unset. */
 	IPS_DYING_BIT = 9,
 	IPS_DYING = (1 << IPS_DYING_BIT),
+
+	/* Connection has fixed timeout. */
+	IPS_FIXED_TIMEOUT_BIT = 10,
+	IPS_FIXED_TIMEOUT = (1 << IPS_FIXED_TIMEOUT_BIT),
 };
 
 /* Connection tracking event bits */
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index a297da7bbef5..4fe9e69378df 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -1130,6 +1130,12 @@ void __ip_ct_refresh_acct(struct ip_conntrack *ct,
 
 	write_lock_bh(&ip_conntrack_lock);
 
+	/* Only update if this is not a fixed timeout */
+	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
+		write_unlock_bh(&ip_conntrack_lock);
+		return;
+	}
+
 	/* If not in hash table, timer will not be active yet */
 	if (!is_confirmed(ct)) {
 		ct->timeout.expires = extra_jiffies;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index f9b83f91371a..bc2bd4c3859e 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1396,6 +1396,12 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
 
 	write_lock_bh(&nf_conntrack_lock);
 
+	/* Only update if this is not a fixed timeout */
+	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
+		write_unlock_bh(&nf_conntrack_lock);
+		return;
+	}
+
 	/* If not in hash table, timer will not be active yet */
 	if (!nf_ct_is_confirmed(ct)) {
 		ct->timeout.expires = extra_jiffies;
-- 
cgit v1.2.3-71-gd317


From 3726add76643c715d437aceda320d319153b6113 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 29 May 2006 18:24:39 -0700
Subject: [NETFILTER]: ctnetlink: fix NAT configuration

The current configuration only allows to configure one manip and overloads
conntrack status flags with netlink semantic.

Signed-off-by: Patrick Mchardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nfnetlink_conntrack.h |  4 +-
 net/ipv4/netfilter/ip_conntrack_netlink.c     | 53 +++++++++++----------------
 net/netfilter/nf_conntrack_netlink.c          | 53 +++++++++++----------------
 3 files changed, 47 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h
index 668ec946c8e2..b5883ccee295 100644
--- a/include/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -27,13 +27,15 @@ enum ctattr_type {
 	CTA_STATUS,
 	CTA_PROTOINFO,
 	CTA_HELP,
-	CTA_NAT,
+	CTA_NAT_SRC,
+#define CTA_NAT	CTA_NAT_SRC	/* backwards compatibility */
 	CTA_TIMEOUT,
 	CTA_MARK,
 	CTA_COUNTERS_ORIG,
 	CTA_COUNTERS_REPLY,
 	CTA_USE,
 	CTA_ID,
+	CTA_NAT_DST,
 	__CTA_MAX
 };
 #define CTA_MAX (__CTA_MAX - 1)
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index 01bd7cab9367..af152e3623dc 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -629,7 +629,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = {
 };
 
 static inline int
-ctnetlink_parse_nat(struct nfattr *cda[],
+ctnetlink_parse_nat(struct nfattr *nat,
 		    const struct ip_conntrack *ct, struct ip_nat_range *range)
 {
 	struct nfattr *tb[CTA_NAT_MAX];
@@ -639,7 +639,7 @@ ctnetlink_parse_nat(struct nfattr *cda[],
 
 	memset(range, 0, sizeof(*range));
 	
-	nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]);
+	nfattr_parse_nested(tb, CTA_NAT_MAX, nat);
 
 	if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat))
 		return -EINVAL;
@@ -854,39 +854,30 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
 		/* ASSURED bit can only be set */
 		return -EINVAL;
 
-	if (cda[CTA_NAT-1]) {
+	if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
 #ifndef CONFIG_IP_NF_NAT_NEEDED
 		return -EINVAL;
 #else
-		unsigned int hooknum;
 		struct ip_nat_range range;
 
-		if (ctnetlink_parse_nat(cda, ct, &range) < 0)
-			return -EINVAL;
-
-		DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", 
-		       NIPQUAD(range.min_ip), NIPQUAD(range.max_ip),
-		       htons(range.min.all), htons(range.max.all));
-		
-		/* This is tricky but it works. ip_nat_setup_info needs the
-		 * hook number as parameter, so let's do the correct 
-		 * conversion and run away */
-		if (status & IPS_SRC_NAT_DONE)
-			hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */
-		else if (status & IPS_DST_NAT_DONE)
-			hooknum = NF_IP_PRE_ROUTING;  /* IP_NAT_MANIP_DST */
-		else 
-			return -EINVAL; /* Missing NAT flags */
-
-		DEBUGP("NAT status: %lu\n", 
-		       status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
-		
-		if (ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
-			return -EEXIST;
-		ip_nat_setup_info(ct, &range, hooknum);
-
-                DEBUGP("NAT status after setup_info: %lu\n",
-                       ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
+		if (cda[CTA_NAT_DST-1]) {
+			if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct,
+						&range) < 0)
+				return -EINVAL;
+			if (ip_nat_initialized(ct,
+					       HOOK2MANIP(NF_IP_PRE_ROUTING)))
+				return -EEXIST;
+			ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
+		}
+		if (cda[CTA_NAT_SRC-1]) {
+			if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct,
+						&range) < 0)
+				return -EINVAL;
+			if (ip_nat_initialized(ct,
+					       HOOK2MANIP(NF_IP_POST_ROUTING)))
+				return -EEXIST;
+			ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
+		}
 #endif
 	}
 
@@ -1106,7 +1097,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 	/* implicit 'else' */
 
 	/* we only allow nat config for new conntracks */
-	if (cda[CTA_NAT-1]) {
+	if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
 		err = -EINVAL;
 		goto out_unlock;
 	}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index bd10eb944b65..8f27fe9446f2 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -641,7 +641,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = {
 };
 
 static inline int
-ctnetlink_parse_nat(struct nfattr *cda[],
+ctnetlink_parse_nat(struct nfattr *nat,
 		    const struct nf_conn *ct, struct ip_nat_range *range)
 {
 	struct nfattr *tb[CTA_NAT_MAX];
@@ -651,7 +651,7 @@ ctnetlink_parse_nat(struct nfattr *cda[],
 
 	memset(range, 0, sizeof(*range));
 	
-	nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]);
+	nfattr_parse_nested(tb, CTA_NAT_MAX, nat);
 
 	if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat))
 		return -EINVAL;
@@ -866,39 +866,30 @@ ctnetlink_change_status(struct nf_conn *ct, struct nfattr *cda[])
 		/* ASSURED bit can only be set */
 		return -EINVAL;
 
-	if (cda[CTA_NAT-1]) {
+	if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
 #ifndef CONFIG_IP_NF_NAT_NEEDED
 		return -EINVAL;
 #else
-		unsigned int hooknum;
 		struct ip_nat_range range;
 
-		if (ctnetlink_parse_nat(cda, ct, &range) < 0)
-			return -EINVAL;
-
-		DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", 
-		       NIPQUAD(range.min_ip), NIPQUAD(range.max_ip),
-		       htons(range.min.all), htons(range.max.all));
-		
-		/* This is tricky but it works. ip_nat_setup_info needs the
-		 * hook number as parameter, so let's do the correct 
-		 * conversion and run away */
-		if (status & IPS_SRC_NAT_DONE)
-			hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */
-		else if (status & IPS_DST_NAT_DONE)
-			hooknum = NF_IP_PRE_ROUTING;  /* IP_NAT_MANIP_DST */
-		else 
-			return -EINVAL; /* Missing NAT flags */
-
-		DEBUGP("NAT status: %lu\n", 
-		       status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
-		
-		if (ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
-			return -EEXIST;
-		ip_nat_setup_info(ct, &range, hooknum);
-
-                DEBUGP("NAT status after setup_info: %lu\n",
-                       ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
+		if (cda[CTA_NAT_DST-1]) {
+			if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct,
+						&range) < 0)
+				return -EINVAL;
+			if (ip_nat_initialized(ct,
+					       HOOK2MANIP(NF_IP_PRE_ROUTING)))
+				return -EEXIST;
+			ip_nat_setup_info(ct, &range, hooknum);
+		}
+		if (cda[CTA_NAT_SRC-1]) {
+			if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct,
+						&range) < 0)
+				return -EINVAL;
+			if (ip_nat_initialized(ct,
+					       HOOK2MANIP(NF_IP_POST_ROUTING)))
+				return -EEXIST;
+			ip_nat_setup_info(ct, &range, hooknum);
+		}
 #endif
 	}
 
@@ -1122,7 +1113,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 	/* implicit 'else' */
 
 	/* we only allow nat config for new conntracks */
-	if (cda[CTA_NAT-1]) {
+	if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
 		err = -EINVAL;
 		goto out_unlock;
 	}
-- 
cgit v1.2.3-71-gd317


From c0d4cfd96dd0cc0dbf49435898808b5553af4822 Mon Sep 17 00:00:00 2001
From: Jing Min Zhao <zhaojingmin@users.sourceforge.net>
Date: Mon, 29 May 2006 18:26:27 -0700
Subject: [NETFILTER]: H.323 helper: Add support for Call Forwarding

Signed-off-by: Jing Min Zhao <zhaojingmin@users.sourceforge.net>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack.h        |   1 +
 include/linux/netfilter_ipv4/ip_conntrack_h323.h   |   7 ++
 .../ip_conntrack_helper_h323_types.h               |   3 +-
 net/ipv4/netfilter/Kconfig                         |   8 +-
 net/ipv4/netfilter/ip_conntrack_helper_h323.c      | 112 +++++++++++++++++++++
 .../netfilter/ip_conntrack_helper_h323_types.c     |   6 +-
 net/ipv4/netfilter/ip_nat_helper_h323.c            |  77 ++++++++++++++
 7 files changed, 206 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index 5473c01f69ea..17d7ef938a09 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -154,6 +154,7 @@ struct ip_conntrack_expect
 	unsigned int flags;
 
 #ifdef CONFIG_IP_NF_NAT_NEEDED
+	u_int32_t saved_ip;
 	/* This is the original per-proto part, used to map the
 	 * expected connection the way the recipient expects. */
 	union ip_conntrack_manip_proto saved_proto;
diff --git a/include/linux/netfilter_ipv4/ip_conntrack_h323.h b/include/linux/netfilter_ipv4/ip_conntrack_h323.h
index eace86bd2adb..3cbff7379002 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_h323.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_h323.h
@@ -71,6 +71,13 @@ extern int (*nat_h245_hook) (struct sk_buff ** pskb, struct ip_conntrack * ct,
 			     unsigned char **data, int dataoff,
 			     TransportAddress * addr, u_int16_t port,
 			     struct ip_conntrack_expect * exp);
+extern int (*nat_callforwarding_hook) (struct sk_buff ** pskb,
+				       struct ip_conntrack * ct,
+				       enum ip_conntrack_info ctinfo,
+				       unsigned char **data, int dataoff,
+				       TransportAddress * addr,
+				       u_int16_t port,
+				       struct ip_conntrack_expect * exp);
 extern int (*nat_q931_hook) (struct sk_buff ** pskb, struct ip_conntrack * ct,
 			     enum ip_conntrack_info ctinfo,
 			     unsigned char **data, TransportAddress * addr,
diff --git a/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h b/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h
index cc98f7aa5abe..3d4a773799fc 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h
@@ -1,4 +1,4 @@
-/* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006
+/* Generated by Jing Min Zhao's ASN.1 parser, Apr 20 2006
  *
  * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
  *
@@ -412,6 +412,7 @@ typedef struct Facility_UUIE {	/* SEQUENCE */
 		eFacility_UUIE_destinationInfo = (1 << 14),
 		eFacility_UUIE_h245SecurityMode = (1 << 13),
 	} options;
+	TransportAddress alternativeAddress;
 	FacilityReason reason;
 	TransportAddress h245Address;
 	Facility_UUIE_fastStart fastStart;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 1540e227890f..f86aeda3a5a0 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -183,10 +183,10 @@ config IP_NF_H323
 	  With this module you can support H.323 on a connection tracking/NAT
 	  firewall.
 
-	  This module supports RAS, Fast-start, H.245 tunnelling, RTP/RTCP
-	  and T.120 based data and applications including audio, video, FAX,
-	  chat, whiteboard, file transfer, etc. For more information, please
-	  see http://nath323.sourceforge.net/.
+	  This module supports RAS, Fast Start, H.245 Tunnelling, Call
+	  Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat,
+	  whiteboard, file transfer, etc. For more information, please
+	  visit http://nath323.sourceforge.net/.
 
 	  If you want to compile it as a module, say 'M' here and read
 	  Documentation/modules.txt.  If unsure, say 'N'.
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c
index 518f581d39ec..3052468a6ab1 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c
@@ -22,6 +22,8 @@
 #include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
 #include <linux/netfilter_ipv4/ip_conntrack_h323.h>
 #include <linux/moduleparam.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
 
 #if 0
 #define DEBUGP printk
@@ -38,6 +40,13 @@ static int gkrouted_only = 1;
 module_param(gkrouted_only, int, 0600);
 MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper");
 
+static char *internal_net = NULL;
+static u_int32_t internal_net_addr = 0;
+static u_int32_t internal_net_mask = 0;
+module_param(internal_net, charp, 0600);
+MODULE_PARM_DESC(internal_net, "specify your internal network using format "
+		 "address/mask. this is used by call forwarding support");
+
 /* Hooks for NAT */
 int (*set_h245_addr_hook) (struct sk_buff ** pskb,
 			   unsigned char **data, int dataoff,
@@ -77,6 +86,12 @@ int (*nat_h245_hook) (struct sk_buff ** pskb,
 		      unsigned char **data, int dataoff,
 		      TransportAddress * addr, u_int16_t port,
 		      struct ip_conntrack_expect * exp);
+int (*nat_callforwarding_hook) (struct sk_buff ** pskb,
+				struct ip_conntrack * ct,
+				enum ip_conntrack_info ctinfo,
+				unsigned char **data, int dataoff,
+				TransportAddress * addr, u_int16_t port,
+				struct ip_conntrack_expect * exp);
 int (*nat_q931_hook) (struct sk_buff ** pskb,
 		      struct ip_conntrack * ct,
 		      enum ip_conntrack_info ctinfo,
@@ -683,6 +698,76 @@ static int expect_h245(struct sk_buff **pskb, struct ip_conntrack *ct,
 	return ret;
 }
 
+/* Forwarding declaration */
+void ip_conntrack_q931_expect(struct ip_conntrack *new,
+			      struct ip_conntrack_expect *this);
+
+/****************************************************************************/
+static int expect_callforwarding(struct sk_buff **pskb,
+				 struct ip_conntrack *ct,
+				 enum ip_conntrack_info ctinfo,
+				 unsigned char **data, int dataoff,
+				 TransportAddress * addr)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	int ret = 0;
+	u_int32_t ip;
+	u_int16_t port;
+	struct ip_conntrack_expect *exp = NULL;
+
+	/* Read alternativeAddress */
+	if (!get_h225_addr(*data, addr, &ip, &port) || port == 0)
+		return 0;
+
+	/* If the calling party is on the same side of the forward-to party,
+	 * we don't need to track the second call */
+	if (internal_net &&
+	    ((ip & internal_net_mask) == internal_net_addr) ==
+	    ((ct->tuplehash[!dir].tuple.src.ip & internal_net_mask) ==
+	     internal_net_addr)) {
+		DEBUGP("ip_ct_q931: Call Forwarding not tracked\n");
+		return 0;
+	}
+
+	/* Create expect for the second call leg */
+	if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
+		return -1;
+	exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
+	exp->tuple.src.u.tcp.port = 0;
+	exp->tuple.dst.ip = ip;
+	exp->tuple.dst.u.tcp.port = htons(port);
+	exp->tuple.dst.protonum = IPPROTO_TCP;
+	exp->mask.src.ip = 0xFFFFFFFF;
+	exp->mask.src.u.tcp.port = 0;
+	exp->mask.dst.ip = 0xFFFFFFFF;
+	exp->mask.dst.u.tcp.port = 0xFFFF;
+	exp->mask.dst.protonum = 0xFF;
+	exp->flags = 0;
+
+	if (ct->tuplehash[dir].tuple.src.ip !=
+	    ct->tuplehash[!dir].tuple.dst.ip && nat_callforwarding_hook) {
+		/* Need NAT */
+		ret = nat_callforwarding_hook(pskb, ct, ctinfo, data, dataoff,
+					      addr, port, exp);
+	} else {		/* Conntrack only */
+		exp->expectfn = ip_conntrack_q931_expect;
+
+		if (ip_conntrack_expect_related(exp) == 0) {
+			DEBUGP("ip_ct_q931: expect Call Forwarding "
+			       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+			       NIPQUAD(exp->tuple.src.ip),
+			       ntohs(exp->tuple.src.u.tcp.port),
+			       NIPQUAD(exp->tuple.dst.ip),
+			       ntohs(exp->tuple.dst.u.tcp.port));
+		} else
+			ret = -1;
+	}
+
+	ip_conntrack_expect_put(exp);
+
+	return ret;
+}
+
 /****************************************************************************/
 static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct,
 			 enum ip_conntrack_info ctinfo,
@@ -878,6 +963,15 @@ static int process_facility(struct sk_buff **pskb, struct ip_conntrack *ct,
 
 	DEBUGP("ip_ct_q931: Facility\n");
 
+	if (facility->reason.choice == eFacilityReason_callForwarded) {
+		if (facility->options & eFacility_UUIE_alternativeAddress)
+			return expect_callforwarding(pskb, ct, ctinfo, data,
+						     dataoff,
+						     &facility->
+						     alternativeAddress);
+		return 0;
+	}
+
 	if (facility->options & eFacility_UUIE_h245Address) {
 		ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
 				  &facility->h245Address);
@@ -1668,6 +1762,7 @@ static void fini(void)
 static int __init init(void)
 {
 	int ret;
+	char *p;
 
 	h323_buffer = kmalloc(65536, GFP_KERNEL);
 	if (!h323_buffer)
@@ -1678,6 +1773,22 @@ static int __init init(void)
 		return ret;
 	}
 
+	if (internal_net) {
+		if ((p = strchr(internal_net, '/')))
+			*p++ = 0;
+		if (isdigit(internal_net[0])) {
+			internal_net_addr = in_aton(internal_net);
+			if (p && isdigit(p[0]))
+				internal_net_mask = in_aton(p);
+			else
+				internal_net_mask = 0xffffffff;
+			internal_net_addr &= internal_net_mask;
+		}
+		DEBUGP("ip_ct_h323: internal_net = %u.%u.%u.%u/%u.%u.%u.%u\n",
+		       NIPQUAD(internal_net_addr),
+		       NIPQUAD(internal_net_mask));
+	}
+
 	DEBUGP("ip_ct_h323: init success\n");
 	return 0;
 }
@@ -1696,6 +1807,7 @@ EXPORT_SYMBOL_GPL(set_ras_addr_hook);
 EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook);
 EXPORT_SYMBOL_GPL(nat_t120_hook);
 EXPORT_SYMBOL_GPL(nat_h245_hook);
+EXPORT_SYMBOL_GPL(nat_callforwarding_hook);
 EXPORT_SYMBOL_GPL(nat_q931_hook);
 
 MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
index 022c47b9f6c9..4b359618bedd 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
@@ -1,4 +1,4 @@
-/* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006
+/* Generated by Jing Min Zhao's ASN.1 parser, Apr 20 2006
  *
  * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
  *
@@ -1069,8 +1069,8 @@ static field_t _Facility_UUIE_fastStart[] = {	/* SEQUENCE OF */
 
 static field_t _Facility_UUIE[] = {	/* SEQUENCE */
 	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
-	{FNAME("alternativeAddress") CHOICE, 3, 7, 7, SKIP | EXT | OPT, 0,
-	 _TransportAddress},
+	{FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Facility_UUIE, alternativeAddress), _TransportAddress},
 	{FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
 	 _Facility_UUIE_alternativeAliasAddress},
 	{FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL},
diff --git a/net/ipv4/netfilter/ip_nat_helper_h323.c b/net/ipv4/netfilter/ip_nat_helper_h323.c
index d45663d137a7..419b878fb467 100644
--- a/net/ipv4/netfilter/ip_nat_helper_h323.c
+++ b/net/ipv4/netfilter/ip_nat_helper_h323.c
@@ -486,6 +486,80 @@ static int nat_q931(struct sk_buff **pskb, struct ip_conntrack *ct,
 	return 0;
 }
 
+/****************************************************************************/
+static void ip_nat_callforwarding_expect(struct ip_conntrack *new,
+					 struct ip_conntrack_expect *this)
+{
+	struct ip_nat_range range;
+
+	/* This must be a fresh one. */
+	BUG_ON(new->status & IPS_NAT_DONE_MASK);
+
+	/* Change src to where master sends to */
+	range.flags = IP_NAT_RANGE_MAP_IPS;
+	range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip;
+
+	/* hook doesn't matter, but it has to do source manip */
+	ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING);
+
+	/* For DST manip, map port here to where it's expected. */
+	range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
+	range.min = range.max = this->saved_proto;
+	range.min_ip = range.max_ip = this->saved_ip;
+
+	/* hook doesn't matter, but it has to do destination manip */
+	ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING);
+
+	ip_conntrack_q931_expect(new, this);
+}
+
+/****************************************************************************/
+static int nat_callforwarding(struct sk_buff **pskb, struct ip_conntrack *ct,
+			      enum ip_conntrack_info ctinfo,
+			      unsigned char **data, int dataoff,
+			      TransportAddress * addr, u_int16_t port,
+			      struct ip_conntrack_expect *exp)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	u_int16_t nated_port;
+
+	/* Set expectations for NAT */
+	exp->saved_ip = exp->tuple.dst.ip;
+	exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->expectfn = ip_nat_callforwarding_expect;
+	exp->dir = !dir;
+
+	/* Try to get same port: if not, try to change it. */
+	for (nated_port = port; nated_port != 0; nated_port++) {
+		exp->tuple.dst.u.tcp.port = htons(nated_port);
+		if (ip_conntrack_expect_related(exp) == 0)
+			break;
+	}
+
+	if (nated_port == 0) {	/* No port available */
+		if (net_ratelimit())
+			printk("ip_nat_q931: out of TCP ports\n");
+		return 0;
+	}
+
+	/* Modify signal */
+	if (!set_h225_addr(pskb, data, dataoff, addr,
+			   ct->tuplehash[!dir].tuple.dst.ip,
+			   nated_port) == 0) {
+		ip_conntrack_unexpect_related(exp);
+		return -1;
+	}
+
+	/* Success */
+	DEBUGP("ip_nat_q931: expect Call Forwarding "
+	       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+	       NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
+	       NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
+
+	return 0;
+}
+
 /****************************************************************************/
 static int __init init(void)
 {
@@ -496,6 +570,7 @@ static int __init init(void)
 	BUG_ON(nat_rtp_rtcp_hook != NULL);
 	BUG_ON(nat_t120_hook != NULL);
 	BUG_ON(nat_h245_hook != NULL);
+	BUG_ON(nat_callforwarding_hook != NULL);
 	BUG_ON(nat_q931_hook != NULL);
 
 	set_h245_addr_hook = set_h245_addr;
@@ -505,6 +580,7 @@ static int __init init(void)
 	nat_rtp_rtcp_hook = nat_rtp_rtcp;
 	nat_t120_hook = nat_t120;
 	nat_h245_hook = nat_h245;
+	nat_callforwarding_hook = nat_callforwarding;
 	nat_q931_hook = nat_q931;
 
 	DEBUGP("ip_nat_h323: init success\n");
@@ -521,6 +597,7 @@ static void __exit fini(void)
 	nat_rtp_rtcp_hook = NULL;
 	nat_t120_hook = NULL;
 	nat_h245_hook = NULL;
+	nat_callforwarding_hook = NULL;
 	nat_q931_hook = NULL;
 	synchronize_net();
 }
-- 
cgit v1.2.3-71-gd317


From ae5b7d8ba2c28d7d9835856fe0ca5f6ec95ea768 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 29 May 2006 18:27:09 -0700
Subject: [NETFILTER]: Add SIP connection tracking helper

Add SIP connection tracking helper. Originally written by
Christian Hentschel <chentschel@arnet.com.ar>, some cleanup, minor
fixes and bidirectional SIP support added by myself.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack_sip.h |  44 +++
 net/ipv4/netfilter/Kconfig                      |  18 +
 net/ipv4/netfilter/Makefile                     |   2 +
 net/ipv4/netfilter/ip_conntrack_sip.c           | 471 ++++++++++++++++++++++++
 net/ipv4/netfilter/ip_nat_sip.c                 | 249 +++++++++++++
 5 files changed, 784 insertions(+)
 create mode 100644 include/linux/netfilter_ipv4/ip_conntrack_sip.h
 create mode 100644 net/ipv4/netfilter/ip_conntrack_sip.c
 create mode 100644 net/ipv4/netfilter/ip_nat_sip.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack_sip.h b/include/linux/netfilter_ipv4/ip_conntrack_sip.h
new file mode 100644
index 000000000000..913dad66c0fb
--- /dev/null
+++ b/include/linux/netfilter_ipv4/ip_conntrack_sip.h
@@ -0,0 +1,44 @@
+#ifndef __IP_CONNTRACK_SIP_H__
+#define __IP_CONNTRACK_SIP_H__
+#ifdef __KERNEL__
+
+#define SIP_PORT	5060
+#define SIP_TIMEOUT	3600
+
+#define POS_VIA		0
+#define POS_CONTACT	1
+#define POS_CONTENT	2
+#define POS_MEDIA	3
+#define POS_OWNER	4
+#define POS_CONNECTION	5
+#define POS_REQ_HEADER	6
+#define POS_SDP_HEADER	7
+
+struct sip_header_nfo {
+	const char	*lname;
+	const char	*sname;
+	const char	*ln_str;
+	size_t		lnlen;
+	size_t		snlen;
+	size_t		ln_strlen;
+	int		(*match_len)(const char *, const char *, int *);
+};
+
+extern unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb,
+				       enum ip_conntrack_info ctinfo,
+				       struct ip_conntrack *ct,
+				       const char **dptr);
+extern unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb,
+				       enum ip_conntrack_info ctinfo,
+				       struct ip_conntrack_expect *exp,
+				       const char *dptr);
+
+extern int ct_sip_get_info(const char *dptr, size_t dlen,
+			   unsigned int *matchoff,
+			   unsigned int *matchlen,
+			   struct sip_header_nfo *hnfo);
+extern int ct_sip_lnlen(const char *line, const char *limit);
+extern const char *ct_sip_search(const char *needle, const char *haystack,
+                                 size_t needle_len, size_t haystack_len);
+#endif /* __KERNEL__ */
+#endif /* __IP_CONNTRACK_SIP_H__ */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index f86aeda3a5a0..ff4b118f14a9 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -191,6 +191,18 @@ config IP_NF_H323
 	  If you want to compile it as a module, say 'M' here and read
 	  Documentation/modules.txt.  If unsure, say 'N'.
 
+config IP_NF_SIP
+	tristate "SIP protocol support (EXPERIMENTAL)"
+	depends on IP_NF_CONNTRACK && EXPERIMENTAL
+	help
+	  SIP is an application-layer control protocol that can establish,
+	  modify, and terminate multimedia sessions (conferences) such as
+	  Internet telephony calls. With the ip_conntrack_sip and
+	  the ip_nat_sip modules you can support the protocol on a connection
+	  tracking/NATing firewall.
+
+	  To compile it as a module, choose M here.  If unsure, say Y.
+
 config IP_NF_QUEUE
 	tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
 	help
@@ -503,6 +515,12 @@ config IP_NF_NAT_H323
 	default IP_NF_NAT if IP_NF_H323=y
 	default m if IP_NF_H323=m
 
+config IP_NF_NAT_SIP
+	tristate
+	depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
+	default IP_NF_NAT if IP_NF_SIP=y
+	default m if IP_NF_SIP=m
+
 # mangle + specific targets
 config IP_NF_MANGLE
 	tristate "Packet mangling"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 461cb1eb5de7..3ded4a3af59c 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
 obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
 obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
 obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
+obj-$(CONFIG_IP_NF_SIP) += ip_conntrack_sip.o
 obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o
 
 # NAT helpers 
@@ -40,6 +41,7 @@ obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
 obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
 obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
 obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
+obj-$(CONFIG_IP_NF_NAT_SIP) += ip_nat_sip.o
 
 # generic IP tables 
 obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
diff --git a/net/ipv4/netfilter/ip_conntrack_sip.c b/net/ipv4/netfilter/ip_conntrack_sip.c
new file mode 100644
index 000000000000..fc87ce0da40d
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_sip.c
@@ -0,0 +1,471 @@
+/* SIP extension for IP connection tracking.
+ *
+ * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
+ * based on RR's ip_conntrack_ftp.c and other modules.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_sip.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
+MODULE_DESCRIPTION("SIP connection tracking helper");
+
+#define MAX_PORTS	8
+static unsigned short ports[MAX_PORTS];
+static int ports_c;
+module_param_array(ports, ushort, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "port numbers of sip servers");
+
+static unsigned int sip_timeout = SIP_TIMEOUT;
+module_param(sip_timeout, uint, 0600);
+MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session");
+
+unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb,
+				enum ip_conntrack_info ctinfo,
+				struct ip_conntrack *ct,
+				const char **dptr);
+EXPORT_SYMBOL_GPL(ip_nat_sip_hook);
+
+unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb,
+				enum ip_conntrack_info ctinfo,
+				struct ip_conntrack_expect *exp,
+				const char *dptr);
+EXPORT_SYMBOL_GPL(ip_nat_sdp_hook);
+
+int ct_sip_get_info(const char *dptr, size_t dlen,
+				unsigned int *matchoff,
+				unsigned int *matchlen,
+				struct sip_header_nfo *hnfo);
+EXPORT_SYMBOL_GPL(ct_sip_get_info);
+
+
+static int digits_len(const char *dptr, const char *limit, int *shift);
+static int epaddr_len(const char *dptr, const char *limit, int *shift);
+static int skp_digits_len(const char *dptr, const char *limit, int *shift);
+static int skp_epaddr_len(const char *dptr, const char *limit, int *shift);
+
+struct sip_header_nfo ct_sip_hdrs[] = {
+	{ 	/* Via header */
+		.lname		= "Via:",
+		.lnlen		= sizeof("Via:") - 1,
+		.sname		= "\r\nv:",
+		.snlen		= sizeof("\r\nv:") - 1, /* rfc3261 "\r\n" */
+		.ln_str		= "UDP ",
+		.ln_strlen	= sizeof("UDP ") - 1,
+		.match_len	= epaddr_len,
+	},
+	{ 	/* Contact header */
+		.lname		= "Contact:",
+		.lnlen		= sizeof("Contact:") - 1,
+		.sname		= "\r\nm:",
+		.snlen		= sizeof("\r\nm:") - 1,
+		.ln_str		= "sip:",
+		.ln_strlen	= sizeof("sip:") - 1,
+		.match_len	= skp_epaddr_len
+	},
+	{ 	/* Content length header */
+		.lname		= "Content-Length:",
+		.lnlen		= sizeof("Content-Length:") - 1,
+		.sname		= "\r\nl:",
+		.snlen		= sizeof("\r\nl:") - 1,
+		.ln_str		= ":",
+		.ln_strlen	= sizeof(":") - 1,
+		.match_len	= skp_digits_len
+	},
+	{	/* SDP media info */
+		.lname		= "\nm=",
+		.lnlen		= sizeof("\nm=") - 1,
+		.sname		= "\rm=",
+		.snlen		= sizeof("\rm=") - 1,
+		.ln_str		= "audio ",
+		.ln_strlen	= sizeof("audio ") - 1,
+		.match_len	= digits_len
+	},
+	{ 	/* SDP owner address*/
+		.lname		= "\no=",
+		.lnlen		= sizeof("\no=") - 1,
+		.sname		= "\ro=",
+		.snlen		= sizeof("\ro=") - 1,
+		.ln_str		= "IN IP4 ",
+		.ln_strlen	= sizeof("IN IP4 ") - 1,
+		.match_len	= epaddr_len
+	},
+	{ 	/* SDP connection info */
+		.lname		= "\nc=",
+		.lnlen		= sizeof("\nc=") - 1,
+		.sname		= "\rc=",
+		.snlen		= sizeof("\rc=") - 1,
+		.ln_str		= "IN IP4 ",
+		.ln_strlen	= sizeof("IN IP4 ") - 1,
+		.match_len	= epaddr_len
+	},
+	{ 	/* Requests headers */
+		.lname		= "sip:",
+		.lnlen		= sizeof("sip:") - 1,
+		.sname		= "sip:",
+		.snlen		= sizeof("sip:") - 1, /* yes, i know.. ;) */
+		.ln_str		= "@",
+		.ln_strlen	= sizeof("@") - 1,
+		.match_len	= epaddr_len
+	},
+	{ 	/* SDP version header */
+		.lname		= "\nv=",
+		.lnlen		= sizeof("\nv=") - 1,
+		.sname		= "\rv=",
+		.snlen		= sizeof("\rv=") - 1,
+		.ln_str		= "=",
+		.ln_strlen	= sizeof("=") - 1,
+		.match_len	= digits_len
+	}
+};
+EXPORT_SYMBOL_GPL(ct_sip_hdrs);
+
+/* get line lenght until first CR or LF seen. */
+int ct_sip_lnlen(const char *line, const char *limit)
+{
+	const char *k = line;
+
+	while ((line <= limit) && (*line == '\r' || *line == '\n'))
+		line++;
+
+	while (line <= limit) {
+		if (*line == '\r' || *line == '\n')
+			break;
+		line++;
+	}
+	return line - k;
+}
+EXPORT_SYMBOL_GPL(ct_sip_lnlen);
+
+/* Linear string search, case sensitive. */
+const char *ct_sip_search(const char *needle, const char *haystack,
+                          size_t needle_len, size_t haystack_len)
+{
+	const char *limit = haystack + (haystack_len - needle_len);
+
+	while (haystack <= limit) {
+		if (memcmp(haystack, needle, needle_len) == 0)
+			return haystack;
+		haystack++;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(ct_sip_search);
+
+static int digits_len(const char *dptr, const char *limit, int *shift)
+{
+	int len = 0;
+	while (dptr <= limit && isdigit(*dptr)) {
+		dptr++;
+		len++;
+	}
+	return len;
+}
+
+/* get digits lenght, skiping blank spaces. */
+static int skp_digits_len(const char *dptr, const char *limit, int *shift)
+{
+	for (; dptr <= limit && *dptr == ' '; dptr++)
+		(*shift)++;
+
+	return digits_len(dptr, limit, shift);
+}
+
+/* Simple ipaddr parser.. */
+static int parse_ipaddr(const char *cp,	const char **endp,
+			u_int32_t *ipaddr, const char *limit)
+{
+	unsigned long int val;
+	int i, digit = 0;
+
+	for (i = 0, *ipaddr = 0; cp <= limit && i < 4; i++) {
+		digit = 0;
+		if (!isdigit(*cp))
+			break;
+
+		val = simple_strtoul(cp, (char **)&cp, 10);
+		if (val > 0xFF)
+			return -1;
+
+		((u_int8_t *)ipaddr)[i] = val;
+		digit = 1;
+
+		if (*cp != '.')
+			break;
+		cp++;
+	}
+	if (!digit)
+		return -1;
+
+	if (endp)
+		*endp = cp;
+
+	return 0;
+}
+
+/* skip ip address. returns it lenght. */
+static int epaddr_len(const char *dptr, const char *limit, int *shift)
+{
+	const char *aux = dptr;
+	u_int32_t ip;
+
+	if (parse_ipaddr(dptr, &dptr, &ip, limit) < 0) {
+		DEBUGP("ip: %s parse failed.!\n", dptr);
+		return 0;
+	}
+
+	/* Port number */
+	if (*dptr == ':') {
+		dptr++;
+		dptr += digits_len(dptr, limit, shift);
+	}
+	return dptr - aux;
+}
+
+/* get address length, skiping user info. */
+static int skp_epaddr_len(const char *dptr, const char *limit, int *shift)
+{
+	int s = *shift;
+
+	for (; dptr <= limit && *dptr != '@'; dptr++)
+		(*shift)++;
+
+	if (*dptr == '@') {
+		dptr++;
+		(*shift)++;
+	} else
+		*shift = s;
+
+	return epaddr_len(dptr, limit, shift);
+}
+
+/* Returns 0 if not found, -1 error parsing. */
+int ct_sip_get_info(const char *dptr, size_t dlen,
+		    unsigned int *matchoff,
+		    unsigned int *matchlen,
+		    struct sip_header_nfo *hnfo)
+{
+	const char *limit, *aux, *k = dptr;
+	int shift = 0;
+
+	limit = dptr + (dlen - hnfo->lnlen);
+
+	while (dptr <= limit) {
+		if ((strncmp(dptr, hnfo->lname, hnfo->lnlen) != 0) &&
+		    (strncmp(dptr, hnfo->sname, hnfo->snlen) != 0)) {
+			dptr++;
+			continue;
+		}
+		aux = ct_sip_search(hnfo->ln_str, dptr, hnfo->ln_strlen,
+		                    ct_sip_lnlen(dptr, limit));
+		if (!aux) {
+			DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str,
+			       hnfo->lname);
+			return -1;
+		}
+		aux += hnfo->ln_strlen;
+
+		*matchlen = hnfo->match_len(aux, limit, &shift);
+		if (!*matchlen)
+			return -1;
+
+		*matchoff = (aux - k) + shift;
+
+		DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname,
+		       *matchlen);
+		return 1;
+	}
+	DEBUGP("%s header not found.\n", hnfo->lname);
+	return 0;
+}
+
+static int set_expected_rtp(struct sk_buff **pskb,
+			    struct ip_conntrack *ct,
+			    enum ip_conntrack_info ctinfo,
+			    u_int32_t ipaddr, u_int16_t port,
+			    const char *dptr)
+{
+	struct ip_conntrack_expect *exp;
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	int ret;
+
+	exp = ip_conntrack_expect_alloc(ct);
+	if (exp == NULL)
+		return NF_DROP;
+
+	exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
+	exp->tuple.src.u.udp.port = 0;
+	exp->tuple.dst.ip = ipaddr;
+	exp->tuple.dst.u.udp.port = htons(port);
+	exp->tuple.dst.protonum = IPPROTO_UDP;
+
+	exp->mask.src.ip = 0xFFFFFFFF;
+	exp->mask.src.u.udp.port = 0;
+	exp->mask.dst.ip = 0xFFFFFFFF;
+	exp->mask.dst.u.udp.port = 0xFFFF;
+	exp->mask.dst.protonum = 0xFF;
+
+	exp->expectfn = NULL;
+	exp->flags = 0;
+
+	if (ip_nat_sdp_hook)
+		ret = ip_nat_sdp_hook(pskb, ctinfo, exp, dptr);
+	else {
+		if (ip_conntrack_expect_related(exp) != 0)
+			ret = NF_DROP;
+		else
+			ret = NF_ACCEPT;
+	}
+	ip_conntrack_expect_put(exp);
+
+	return ret;
+}
+
+static int sip_help(struct sk_buff **pskb,
+		    struct ip_conntrack *ct,
+		    enum ip_conntrack_info ctinfo)
+{
+	unsigned int dataoff, datalen;
+	const char *dptr;
+	int ret = NF_ACCEPT;
+	int matchoff, matchlen;
+	u_int32_t ipaddr;
+	u_int16_t port;
+
+	/* No Data ? */
+	dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+	if (dataoff >= (*pskb)->len) {
+		DEBUGP("skb->len = %u\n", (*pskb)->len);
+		return NF_ACCEPT;
+        }
+
+	ip_ct_refresh(ct, *pskb, sip_timeout * HZ);
+
+	if (!skb_is_nonlinear(*pskb))
+		dptr = (*pskb)->data + dataoff;
+	else {
+		DEBUGP("Copy of skbuff not supported yet.\n");
+		goto out;
+	}
+
+	if (ip_nat_sip_hook) {
+		if (!ip_nat_sip_hook(pskb, ctinfo, ct, &dptr)) {
+			ret = NF_DROP;
+			goto out;
+		}
+	}
+
+	/* After this point NAT, could have mangled skb, so
+	   we need to recalculate payload lenght. */
+	datalen = (*pskb)->len - dataoff;
+
+	if (datalen < (sizeof("SIP/2.0 200") - 1))
+		goto out;
+
+	/* RTP info only in some SDP pkts */
+	if (memcmp(dptr, "INVITE", sizeof("INVITE") - 1) != 0 &&
+	    memcmp(dptr, "SIP/2.0 200", sizeof("SIP/2.0 200") - 1) != 0) {
+		goto out;
+	}
+	/* Get ip and port address from SDP packet. */
+	if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen,
+	                    &ct_sip_hdrs[POS_CONNECTION]) > 0) {
+
+		/* We'll drop only if there are parse problems. */
+		if (parse_ipaddr(dptr + matchoff, NULL, &ipaddr,
+		                 dptr + datalen) < 0) {
+			ret = NF_DROP;
+			goto out;
+		}
+		if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen,
+		                    &ct_sip_hdrs[POS_MEDIA]) > 0) {
+
+			port = simple_strtoul(dptr + matchoff, NULL, 10);
+			if (port < 1024) {
+				ret = NF_DROP;
+				goto out;
+			}
+			ret = set_expected_rtp(pskb, ct, ctinfo,
+					       ipaddr, port, dptr);
+		}
+	}
+out:
+	return ret;
+}
+
+static struct ip_conntrack_helper sip[MAX_PORTS];
+static char sip_names[MAX_PORTS][10];
+
+static void fini(void)
+{
+	int i;
+	for (i = 0; i < ports_c; i++) {
+		DEBUGP("unregistering helper for port %d\n", ports[i]);
+		ip_conntrack_helper_unregister(&sip[i]);
+	}
+}
+
+static int __init init(void)
+{
+	int i, ret;
+	char *tmpname;
+
+	if (ports_c == 0)
+		ports[ports_c++] = SIP_PORT;
+
+	for (i = 0; i < ports_c; i++) {
+		/* Create helper structure */
+		memset(&sip[i], 0, sizeof(struct ip_conntrack_helper));
+
+		sip[i].tuple.dst.protonum = IPPROTO_UDP;
+		sip[i].tuple.src.u.udp.port = htons(ports[i]);
+		sip[i].mask.src.u.udp.port = 0xFFFF;
+		sip[i].mask.dst.protonum = 0xFF;
+		sip[i].max_expected = 1;
+		sip[i].timeout = 3 * 60; /* 3 minutes */
+		sip[i].me = THIS_MODULE;
+		sip[i].help = sip_help;
+
+		tmpname = &sip_names[i][0];
+		if (ports[i] == SIP_PORT)
+			sprintf(tmpname, "sip");
+		else
+			sprintf(tmpname, "sip-%d", i);
+		sip[i].name = tmpname;
+
+		DEBUGP("port #%d: %d\n", i, ports[i]);
+
+		ret = ip_conntrack_helper_register(&sip[i]);
+		if (ret) {
+			printk("ERROR registering helper for port %d\n",
+				ports[i]);
+			fini();
+			return ret;
+		}
+	}
+	return 0;
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_sip.c b/net/ipv4/netfilter/ip_nat_sip.c
new file mode 100644
index 000000000000..6ffba63adca2
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_sip.c
@@ -0,0 +1,249 @@
+/* SIP extension for UDP NAT alteration.
+ *
+ * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
+ * based on RR's ip_nat_ftp.c and other modules.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_sip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
+MODULE_DESCRIPTION("SIP NAT helper");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+extern struct sip_header_nfo ct_sip_hdrs[];
+
+static unsigned int mangle_sip_packet(struct sk_buff **pskb,
+				      enum ip_conntrack_info ctinfo,
+				      struct ip_conntrack *ct,
+				      const char **dptr, size_t dlen,
+				      char *buffer, int bufflen,
+				      struct sip_header_nfo *hnfo)
+{
+	unsigned int matchlen, matchoff;
+
+	if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, hnfo) <= 0)
+		return 0;
+
+	if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
+	                              matchoff, matchlen, buffer, bufflen))
+		return 0;
+
+	/* We need to reload this. Thanks Patrick. */
+	*dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+	return 1;
+}
+
+static unsigned int ip_nat_sip(struct sk_buff **pskb,
+			       enum ip_conntrack_info ctinfo,
+			       struct ip_conntrack *ct,
+			       const char **dptr)
+{
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
+	unsigned int bufflen, dataoff;
+	u_int32_t ip;
+	u_int16_t port;
+
+	dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+
+	ip   = ct->tuplehash[!dir].tuple.dst.ip;
+	port = ct->tuplehash[!dir].tuple.dst.u.udp.port;
+	bufflen = sprintf(buffer, "%u.%u.%u.%u:%u", NIPQUAD(ip), ntohs(port));
+
+	/* short packet ? */
+	if (((*pskb)->len - dataoff) < (sizeof("SIP/2.0") - 1))
+		return 0;
+
+	/* Basic rules: requests and responses. */
+	if (memcmp(*dptr, "SIP/2.0", sizeof("SIP/2.0") - 1) == 0) {
+		const char *aux;
+
+		if ((ctinfo) < IP_CT_IS_REPLY) {
+			mangle_sip_packet(pskb, ctinfo, ct, dptr,
+			                  (*pskb)->len - dataoff,
+			                  buffer, bufflen,
+			                  &ct_sip_hdrs[POS_CONTACT]);
+			return 1;
+		}
+
+		if (!mangle_sip_packet(pskb, ctinfo, ct, dptr,
+				       (*pskb)->len - dataoff,
+		                       buffer, bufflen, &ct_sip_hdrs[POS_VIA]))
+			return 0;
+
+		/* This search should ignore case, but later.. */
+		aux = ct_sip_search("CSeq:", *dptr, sizeof("CSeq:") - 1,
+		                    (*pskb)->len - dataoff);
+		if (!aux)
+			return 0;
+
+		if (!ct_sip_search("REGISTER", aux, sizeof("REGISTER"),
+		    ct_sip_lnlen(aux, *dptr + (*pskb)->len - dataoff)))
+			return 1;
+
+		return mangle_sip_packet(pskb, ctinfo, ct, dptr,
+					 (*pskb)->len - dataoff,
+		                         buffer, bufflen,
+					 &ct_sip_hdrs[POS_CONTACT]);
+	}
+	if ((ctinfo) < IP_CT_IS_REPLY) {
+		if (!mangle_sip_packet(pskb, ctinfo, ct, dptr,
+				       (*pskb)->len - dataoff,
+		                       buffer, bufflen, &ct_sip_hdrs[POS_VIA]))
+			return 0;
+
+		/* Mangle Contact if exists only. - watch udp_nat_mangle()! */
+		mangle_sip_packet(pskb, ctinfo, ct, dptr, (*pskb)->len - dataoff,
+		                  buffer, bufflen, &ct_sip_hdrs[POS_CONTACT]);
+		return 1;
+	}
+	/* This mangle requests headers. */
+	return mangle_sip_packet(pskb, ctinfo, ct, dptr,
+	                         ct_sip_lnlen(*dptr,
+				              *dptr + (*pskb)->len - dataoff),
+	                         buffer, bufflen, &ct_sip_hdrs[POS_REQ_HEADER]);
+}
+
+static int mangle_content_len(struct sk_buff **pskb,
+			      enum ip_conntrack_info ctinfo,
+			      struct ip_conntrack *ct,
+			      const char *dptr)
+{
+	unsigned int dataoff, matchoff, matchlen;
+	char buffer[sizeof("65536")];
+	int bufflen;
+
+	dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+
+	/* Get actual SDP lenght */
+	if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff,
+	                    &matchlen, &ct_sip_hdrs[POS_SDP_HEADER]) > 0) {
+
+		/* since ct_sip_get_info() give us a pointer passing 'v='
+		   we need to add 2 bytes in this count. */
+		int c_len = (*pskb)->len - dataoff - matchoff + 2;
+
+		/* Now, update SDP lenght */
+		if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff,
+		                    &matchlen, &ct_sip_hdrs[POS_CONTENT]) > 0) {
+
+			bufflen = sprintf(buffer, "%u", c_len);
+
+			return ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
+							matchoff, matchlen,
+							buffer, bufflen);
+		}
+	}
+	return 0;
+}
+
+static unsigned int mangle_sdp(struct sk_buff **pskb,
+			       enum ip_conntrack_info ctinfo,
+			       struct ip_conntrack *ct,
+			       u_int32_t newip, u_int16_t port,
+			       const char *dptr)
+{
+	char buffer[sizeof("nnn.nnn.nnn.nnn")];
+	unsigned int dataoff, bufflen;
+
+	dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+
+	/* Mangle owner and contact info. */
+	bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip));
+	if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
+	                       buffer, bufflen, &ct_sip_hdrs[POS_OWNER]))
+		return 0;
+
+	if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
+	                       buffer, bufflen, &ct_sip_hdrs[POS_CONNECTION]))
+		return 0;
+
+	/* Mangle media port. */
+	bufflen = sprintf(buffer, "%u", port);
+	if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
+	                       buffer, bufflen, &ct_sip_hdrs[POS_MEDIA]))
+		return 0;
+
+	return mangle_content_len(pskb, ctinfo, ct, dptr);
+}
+
+/* So, this packet has hit the connection tracking matching code.
+   Mangle it, and change the expectation to match the new version. */
+static unsigned int ip_nat_sdp(struct sk_buff **pskb,
+			       enum ip_conntrack_info ctinfo,
+			       struct ip_conntrack_expect *exp,
+			       const char *dptr)
+{
+	struct ip_conntrack *ct = exp->master;
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	u_int32_t newip;
+	u_int16_t port;
+
+	DEBUGP("ip_nat_sdp():\n");
+
+	/* Connection will come from reply */
+	newip = ct->tuplehash[!dir].tuple.dst.ip;
+
+	exp->tuple.dst.ip = newip;
+	exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
+	exp->dir = !dir;
+
+	/* When you see the packet, we need to NAT it the same as the
+	   this one. */
+	exp->expectfn = ip_nat_follow_master;
+
+	/* Try to get same port: if not, try to change it. */
+	for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) {
+		exp->tuple.dst.u.udp.port = htons(port);
+		if (ip_conntrack_expect_related(exp) == 0)
+			break;
+	}
+
+	if (port == 0)
+		return NF_DROP;
+
+	if (!mangle_sdp(pskb, ctinfo, ct, newip, port, dptr)) {
+		ip_conntrack_unexpect_related(exp);
+		return NF_DROP;
+	}
+	return NF_ACCEPT;
+}
+
+static void __exit fini(void)
+{
+	ip_nat_sip_hook = NULL;
+	ip_nat_sdp_hook = NULL;
+	/* Make sure noone calls it, meanwhile. */
+	synchronize_net();
+}
+
+static int __init init(void)
+{
+	BUG_ON(ip_nat_sip_hook);
+	BUG_ON(ip_nat_sdp_hook);
+	ip_nat_sip_hook = ip_nat_sip;
+	ip_nat_sdp_hook = ip_nat_sdp;
+	return 0;
+}
+
+module_init(init);
+module_exit(fini);
-- 
cgit v1.2.3-71-gd317


From 338fcf9886df9ad2873772197a73a57818973316 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 5 Jun 2006 21:04:39 -0700
Subject: [IPV4] igmp: Fixup struct ip_mc_list::multiaddr type

All users except two expect 32-bit big-endian value. One is of

	->multiaddr = ->multiaddr

variety. And last one is "%08lX".

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h | 2 +-
 net/ipv4/igmp.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 28f4f3b36950..899c3d4776f3 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -169,7 +169,7 @@ struct ip_sf_list
 struct ip_mc_list
 {
 	struct in_device	*interface;
-	unsigned long		multiaddr;
+	__be32			multiaddr;
 	struct ip_sf_list	*sources;
 	struct ip_sf_list	*tomb;
 	unsigned int		sfmode;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d512239a1473..ab680c851aa2 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2361,7 +2361,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
 		}
 
 		seq_printf(seq,
-			   "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n",
+			   "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
 			   im->multiaddr, im->users,
 			   im->tm_running, im->tm_running ?
 			   jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
-- 
cgit v1.2.3-71-gd317


From c8c05a8eec6f1258f6d5cb71a44ee5dc1e989b63 Mon Sep 17 00:00:00 2001
From: Catherine Zhang <cxzhang@watson.ibm.com>
Date: Thu, 8 Jun 2006 23:39:49 -0700
Subject: [LSM-IPsec]: SELinux Authorize

This patch contains a fix for the previous patch that adds security
contexts to IPsec policies and security associations.  In the previous
patch, no authorization (besides the check for write permissions to
SAD and SPD) is required to delete IPsec policies and security
assocations with security contexts.  Thus a user authorized to change
SAD and SPD can bypass the IPsec policy authorization by simply
deleteing policies with security contexts.  To fix this security hole,
an additional authorization check is added for removing security
policies and security associations with security contexts.

Note that if no security context is supplied on add or present on
policy to be deleted, the SELinux module allows the change
unconditionally.  The hook is called on deletion when no context is
present, which we may want to change.  At present, I left it up to the
module.

LSM changes:

The patch adds two new LSM hooks: xfrm_policy_delete and
xfrm_state_delete.  The new hooks are necessary to authorize deletion
of IPsec policies that have security contexts.  The existing hooks
xfrm_policy_free and xfrm_state_free lack the context to do the
authorization, so I decided to split authorization of deletion and
memory management of security data, as is typical in the LSM
interface.

Use:

The new delete hooks are checked when xfrm_policy or xfrm_state are
deleted by either the xfrm_user interface (xfrm_get_policy,
xfrm_del_sa) or the pfkey interface (pfkey_spddelete, pfkey_delete).

SELinux changes:

The new policy_delete and state_delete functions are added.

Signed-off-by: Catherine Zhang <cxzhang@watson.ibm.com>
Signed-off-by: Trent Jaeger <tjaeger@cse.psu.edu>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/security.h        | 40 ++++++++++++++++++++++++++++++++++------
 net/key/af_key.c                | 17 +++++++++++------
 net/xfrm/xfrm_user.c            | 19 ++++++++++++-------
 security/dummy.c                | 12 ++++++++++++
 security/selinux/hooks.c        |  2 ++
 security/selinux/include/xfrm.h |  2 ++
 security/selinux/xfrm.c         | 39 +++++++++++++++++++++++++++++++++++----
 7 files changed, 108 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 1bab48f6aeac..14c9bd050607 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -805,31 +805,37 @@ struct swap_info_struct;
  *	used by the XFRM system.
  *	@sec_ctx contains the security context information being provided by
  *	the user-level policy update program (e.g., setkey).
- *	Allocate a security structure to the xp->selector.security field.
+ *	Allocate a security structure to the xp->security field.
  *	The security field is initialized to NULL when the xfrm_policy is
  *	allocated.
  *	Return 0 if operation was successful (memory to allocate, legal context)
  * @xfrm_policy_clone_security:
  *	@old contains an existing xfrm_policy in the SPD.
  *	@new contains a new xfrm_policy being cloned from old.
- *	Allocate a security structure to the new->selector.security field
- *	that contains the information from the old->selector.security field.
+ *	Allocate a security structure to the new->security field
+ *	that contains the information from the old->security field.
  *	Return 0 if operation was successful (memory to allocate).
  * @xfrm_policy_free_security:
  *	@xp contains the xfrm_policy
- *	Deallocate xp->selector.security.
+ *	Deallocate xp->security.
+ * @xfrm_policy_delete_security:
+ *	@xp contains the xfrm_policy.
+ *	Authorize deletion of xp->security.
  * @xfrm_state_alloc_security:
  *	@x contains the xfrm_state being added to the Security Association
  *	Database by the XFRM system.
  *	@sec_ctx contains the security context information being provided by
  *	the user-level SA generation program (e.g., setkey or racoon).
- *	Allocate a security structure to the x->sel.security field.  The
+ *	Allocate a security structure to the x->security field.  The
  *	security field is initialized to NULL when the xfrm_state is
  *	allocated.
  *	Return 0 if operation was successful (memory to allocate, legal context).
  * @xfrm_state_free_security:
  *	@x contains the xfrm_state.
- *	Deallocate x>sel.security.
+ *	Deallocate x->security.
+ * @xfrm_state_delete_security:
+ *	@x contains the xfrm_state.
+ *	Authorize deletion of x->security.
  * @xfrm_policy_lookup:
  *	@xp contains the xfrm_policy for which the access control is being
  *	checked.
@@ -1298,8 +1304,10 @@ struct security_operations {
 	int (*xfrm_policy_alloc_security) (struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx);
 	int (*xfrm_policy_clone_security) (struct xfrm_policy *old, struct xfrm_policy *new);
 	void (*xfrm_policy_free_security) (struct xfrm_policy *xp);
+	int (*xfrm_policy_delete_security) (struct xfrm_policy *xp);
 	int (*xfrm_state_alloc_security) (struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx);
 	void (*xfrm_state_free_security) (struct xfrm_state *x);
+	int (*xfrm_state_delete_security) (struct xfrm_state *x);
 	int (*xfrm_policy_lookup)(struct xfrm_policy *xp, u32 sk_sid, u8 dir);
 #endif	/* CONFIG_SECURITY_NETWORK_XFRM */
 
@@ -2934,11 +2942,21 @@ static inline void security_xfrm_policy_free(struct xfrm_policy *xp)
 	security_ops->xfrm_policy_free_security(xp);
 }
 
+static inline int security_xfrm_policy_delete(struct xfrm_policy *xp)
+{
+	return security_ops->xfrm_policy_delete_security(xp);
+}
+
 static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
 {
 	return security_ops->xfrm_state_alloc_security(x, sec_ctx);
 }
 
+static inline int security_xfrm_state_delete(struct xfrm_state *x)
+{
+	return security_ops->xfrm_state_delete_security(x);
+}
+
 static inline void security_xfrm_state_free(struct xfrm_state *x)
 {
 	security_ops->xfrm_state_free_security(x);
@@ -2963,6 +2981,11 @@ static inline void security_xfrm_policy_free(struct xfrm_policy *xp)
 {
 }
 
+static inline int security_xfrm_policy_delete(struct xfrm_policy *xp)
+{
+	return 0;
+}
+
 static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
 {
 	return 0;
@@ -2972,6 +2995,11 @@ static inline void security_xfrm_state_free(struct xfrm_state *x)
 {
 }
 
+static inline int security_xfrm_state_delete(struct xfrm_policy *xp)
+{
+	return 0;
+}
+
 static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
 {
 	return 0;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 859582275cab..d5e2121ea207 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1454,21 +1454,23 @@ static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 	if (x == NULL)
 		return -ESRCH;
 
+	if ((err = security_xfrm_state_delete(x)))
+		goto out;
+
 	if (xfrm_state_kern(x)) {
-		xfrm_state_put(x);
-		return -EPERM;
+		err = -EPERM;
+		goto out;
 	}
 	
 	err = xfrm_state_delete(x);
-	if (err < 0) {
-		xfrm_state_put(x);
-		return err;
-	}
+	if (err < 0)
+		goto out;
 
 	c.seq = hdr->sadb_msg_seq;
 	c.pid = hdr->sadb_msg_pid;
 	c.event = XFRM_MSG_DELSA;
 	km_state_notify(x, &c);
+out:
 	xfrm_state_put(x);
 
 	return err;
@@ -2274,11 +2276,14 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
 
 	err = 0;
 
+	if ((err = security_xfrm_policy_delete(xp)))
+		goto out;
 	c.seq = hdr->sadb_msg_seq;
 	c.pid = hdr->sadb_msg_pid;
 	c.event = XFRM_MSG_DELPOLICY;
 	km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
 
+out:
 	xfrm_pol_put(xp);
 	return err;
 }
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 81d1005830f4..a3733d2db3ba 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -427,23 +427,25 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 	if (x == NULL)
 		return -ESRCH;
 
+	if (err = security_xfrm_state_delete(x))
+		goto out;
+
 	if (xfrm_state_kern(x)) {
-		xfrm_state_put(x);
-		return -EPERM;
+		err = -EPERM;
+		goto out;
 	}
 
 	err = xfrm_state_delete(x);
-	if (err < 0) {
-		xfrm_state_put(x);
-		return err;
-	}
+	if (err < 0)
+		goto out;
 
 	c.seq = nlh->nlmsg_seq;
 	c.pid = nlh->nlmsg_pid;
 	c.event = nlh->nlmsg_type;
 	km_state_notify(x, &c);
-	xfrm_state_put(x);
 
+out:
+	xfrm_state_put(x);
 	return err;
 }
 
@@ -1055,6 +1057,8 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 					      MSG_DONTWAIT);
 		}
 	} else {
+		if (err = security_xfrm_policy_delete(xp))
+			goto out;
 		c.data.byid = p->index;
 		c.event = nlh->nlmsg_type;
 		c.seq = nlh->nlmsg_seq;
@@ -1064,6 +1068,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 
 	xfrm_pol_put(xp);
 
+out:
 	return err;
 }
 
diff --git a/security/dummy.c b/security/dummy.c
index 8ccccccc12ac..64f6da0f422e 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -810,6 +810,11 @@ static void dummy_xfrm_policy_free_security(struct xfrm_policy *xp)
 {
 }
 
+static int dummy_xfrm_policy_delete_security(struct xfrm_policy *xp)
+{
+	return 0;
+}
+
 static int dummy_xfrm_state_alloc_security(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
 {
 	return 0;
@@ -819,6 +824,11 @@ static void dummy_xfrm_state_free_security(struct xfrm_state *x)
 {
 }
 
+static int dummy_xfrm_state_delete_security(struct xfrm_state *x)
+{
+	return 0;
+}
+
 static int dummy_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
 {
 	return 0;
@@ -1024,8 +1034,10 @@ void security_fixup_ops (struct security_operations *ops)
 	set_to_dummy_if_null(ops, xfrm_policy_alloc_security);
 	set_to_dummy_if_null(ops, xfrm_policy_clone_security);
 	set_to_dummy_if_null(ops, xfrm_policy_free_security);
+	set_to_dummy_if_null(ops, xfrm_policy_delete_security);
 	set_to_dummy_if_null(ops, xfrm_state_alloc_security);
 	set_to_dummy_if_null(ops, xfrm_state_free_security);
+	set_to_dummy_if_null(ops, xfrm_state_delete_security);
 	set_to_dummy_if_null(ops, xfrm_policy_lookup);
 #endif	/* CONFIG_SECURITY_NETWORK_XFRM */
 #ifdef CONFIG_KEYS
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 90b4cdc0c948..cf7b62ca886a 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4374,8 +4374,10 @@ static struct security_operations selinux_ops = {
 	.xfrm_policy_alloc_security =	selinux_xfrm_policy_alloc,
 	.xfrm_policy_clone_security =	selinux_xfrm_policy_clone,
 	.xfrm_policy_free_security =	selinux_xfrm_policy_free,
+	.xfrm_policy_delete_security =	selinux_xfrm_policy_delete,
 	.xfrm_state_alloc_security =	selinux_xfrm_state_alloc,
 	.xfrm_state_free_security =	selinux_xfrm_state_free,
+	.xfrm_state_delete_security =	selinux_xfrm_state_delete,
 	.xfrm_policy_lookup = 		selinux_xfrm_policy_lookup,
 #endif
 };
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h
index c10f1fc41502..f0f4e480ff99 100644
--- a/security/selinux/include/xfrm.h
+++ b/security/selinux/include/xfrm.h
@@ -9,8 +9,10 @@
 int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx);
 int selinux_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new);
 void selinux_xfrm_policy_free(struct xfrm_policy *xp);
+int selinux_xfrm_policy_delete(struct xfrm_policy *xp);
 int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx);
 void selinux_xfrm_state_free(struct xfrm_state *x);
+int selinux_xfrm_state_delete(struct xfrm_state *x);
 int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir);
 
 /*
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
index abe99d881376..0e24df41099f 100644
--- a/security/selinux/xfrm.c
+++ b/security/selinux/xfrm.c
@@ -132,10 +132,7 @@ static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_us
 		goto out;
 
 	/*
-	 * Does the subject have permission to set security or permission to
-	 * do the relabel?
-	 * Must be permitted to relabel from default socket type (process type)
-	 * to specified context
+	 * Does the subject have permission to set security context?
 	 */
 	rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
 			  SECCLASS_ASSOCIATION,
@@ -200,6 +197,23 @@ void selinux_xfrm_policy_free(struct xfrm_policy *xp)
 		kfree(ctx);
 }
 
+/*
+ * LSM hook implementation that authorizes deletion of labeled policies.
+ */
+int selinux_xfrm_policy_delete(struct xfrm_policy *xp)
+{
+	struct task_security_struct *tsec = current->security;
+	struct xfrm_sec_ctx *ctx = xp->security;
+	int rc = 0;
+
+	if (ctx)
+		rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
+				  SECCLASS_ASSOCIATION,
+				  ASSOCIATION__SETCONTEXT, NULL);
+
+	return rc;
+}
+
 /*
  * LSM hook implementation that allocs and transfers sec_ctx spec to
  * xfrm_state.
@@ -292,6 +306,23 @@ u32 selinux_socket_getpeer_dgram(struct sk_buff *skb)
 	return SECSID_NULL;
 }
 
+ /*
+  * LSM hook implementation that authorizes deletion of labeled SAs.
+  */
+int selinux_xfrm_state_delete(struct xfrm_state *x)
+{
+	struct task_security_struct *tsec = current->security;
+	struct xfrm_sec_ctx *ctx = x->security;
+	int rc = 0;
+
+	if (ctx)
+		rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
+				  SECCLASS_ASSOCIATION,
+				  ASSOCIATION__SETCONTEXT, NULL);
+
+	return rc;
+}
+
 /*
  * LSM hook that controls access to unlabelled packets.  If
  * a xfrm_state is authorizable (defined by macro) then it was
-- 
cgit v1.2.3-71-gd317


From 6f68dc37759b1d6ff3b4d4a9d097605a09f8f043 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Thu, 8 Jun 2006 23:58:52 -0700
Subject: [NET]: Fix warnings after LSM-IPSEC changes.

Assignment used as truth value in xfrm_del_sa()
and xfrm_get_policy().

Wrong argument type declared for security_xfrm_state_delete()
when SELINUX is disabled.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/security.h | 2 +-
 net/xfrm/xfrm_user.c     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 14c9bd050607..4dfb1b84a9b3 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2995,7 +2995,7 @@ static inline void security_xfrm_state_free(struct xfrm_state *x)
 {
 }
 
-static inline int security_xfrm_state_delete(struct xfrm_policy *xp)
+static inline int security_xfrm_state_delete(struct xfrm_state *x)
 {
 	return 0;
 }
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index a3733d2db3ba..c21dc26141ea 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -427,7 +427,7 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 	if (x == NULL)
 		return -ESRCH;
 
-	if (err = security_xfrm_state_delete(x))
+	if ((err = security_xfrm_state_delete(x)) != 0)
 		goto out;
 
 	if (xfrm_state_kern(x)) {
@@ -1057,7 +1057,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 					      MSG_DONTWAIT);
 		}
 	} else {
-		if (err = security_xfrm_policy_delete(xp))
+		if ((err = security_xfrm_policy_delete(xp)) != 0)
 			goto out;
 		c.data.byid = p->index;
 		c.event = nlh->nlmsg_type;
-- 
cgit v1.2.3-71-gd317


From c749b29fae74ed59c507d84025b3298202b42609 Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Fri, 9 Jun 2006 00:28:25 -0700
Subject: [SECMARK]: Add SELinux exports

Add and export new functions to the in-kernel SELinux API in support of the
new secmark-based packet controls.

Signed-off-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/selinux.h    | 32 ++++++++++++++++++++++++++++++++
 security/selinux/exports.c | 22 ++++++++++++++++++++++
 2 files changed, 54 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index 4047bcde4484..aad4e390d6a5 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -118,6 +118,27 @@ void selinux_get_ipc_sid(const struct kern_ipc_perm *ipcp, u32 *sid);
  */
 void selinux_get_task_sid(struct task_struct *tsk, u32 *sid);
 
+/**
+ *     selinux_string_to_sid - map a security context string to a security ID
+ *     @str: the security context string to be mapped
+ *     @sid: ID value returned via this.
+ *
+ *     Returns 0 if successful, with the SID stored in sid.  A value
+ *     of zero for sid indicates no SID could be determined (but no error
+ *     occurred).
+ */
+int selinux_string_to_sid(char *str, u32 *sid);
+
+/**
+ *     selinux_relabel_packet_permission - check permission to relabel a packet
+ *     @sid: ID value to be applied to network packet (via SECMARK, most likely)
+ *
+ *     Returns 0 if the current task is allowed to label packets with the
+ *     supplied security ID.  Note that it is implicit that the packet is always
+ *     being relabeled from the default unlabled value, and that the access
+ *     control decision is made in the AVC.
+ */
+int selinux_relabel_packet_permission(u32 sid);
 
 #else
 
@@ -172,6 +193,17 @@ static inline void selinux_get_task_sid(struct task_struct *tsk, u32 *sid)
 	*sid = 0;
 }
 
+static inline int selinux_string_to_sid(const char *str, u32 *sid)
+{
+       *sid = 0;
+       return 0;
+}
+
+static inline int selinux_relabel_packet_permission(u32 sid)
+{
+	return 0;
+}
+
 #endif	/* CONFIG_SECURITY_SELINUX */
 
 #endif /* _LINUX_SELINUX_H */
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
index ae4c73eb3085..9d7737db5e51 100644
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -72,3 +72,25 @@ void selinux_get_task_sid(struct task_struct *tsk, u32 *sid)
 	*sid = 0;
 }
 
+int selinux_string_to_sid(char *str, u32 *sid)
+{
+	if (selinux_enabled)
+		return security_context_to_sid(str, strlen(str), sid);
+	else {
+		*sid = 0;
+		return 0;
+	}
+}
+EXPORT_SYMBOL_GPL(selinux_string_to_sid);
+
+int selinux_relabel_packet_permission(u32 sid)
+{
+	if (selinux_enabled) {
+		struct task_security_struct *tsec = current->security;
+
+		return avc_has_perm(tsec->sid, sid, SECCLASS_PACKET,
+				    PACKET__RELABELTO, NULL);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(selinux_relabel_packet_permission);
-- 
cgit v1.2.3-71-gd317


From 984bc16cc92ea3c247bf34ad667cfb95331b9d3c Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Fri, 9 Jun 2006 00:29:17 -0700
Subject: [SECMARK]: Add secmark support to core networking.

Add a secmark field to the skbuff structure, to allow security subsystems to
place security markings on network packets.  This is similar to the nfmark
field, except is intended for implementing security policy, rather than than
networking policy.

This patch was already acked in principle by Dave Miller.

Signed-off-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h          | 22 ++++++++++++++++++++++
 net/Kconfig                     |  7 +++++++
 net/core/skbuff.c               |  3 ++-
 net/ipv4/ip_output.c            |  1 +
 net/ipv4/netfilter/ipt_REJECT.c |  1 +
 net/ipv6/ip6_output.c           |  1 +
 6 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 23bad3bf3c9d..fe2c58e5306f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -210,6 +210,7 @@ enum {
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *	@tc_index: Traffic control index
  *	@tc_verd: traffic control verdict
+ *	@secmark: security marking
  */
 
 struct sk_buff {
@@ -289,6 +290,9 @@ struct sk_buff {
 #ifdef CONFIG_NET_DMA
 	dma_cookie_t		dma_cookie;
 #endif
+#ifdef CONFIG_NETWORK_SECMARK
+	__u32			secmark;
+#endif
 
 
 	/* These elements must be at the end, see alloc_skb() for details.  */
@@ -1400,5 +1404,23 @@ static inline void nf_reset(struct sk_buff *skb)
 static inline void nf_reset(struct sk_buff *skb) {}
 #endif /* CONFIG_NETFILTER */
 
+#ifdef CONFIG_NETWORK_SECMARK
+static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from)
+{
+	to->secmark = from->secmark;
+}
+
+static inline void skb_init_secmark(struct sk_buff *skb)
+{
+	skb->secmark = 0;
+}
+#else
+static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from)
+{ }
+
+static inline void skb_init_secmark(struct sk_buff *skb)
+{ }
+#endif
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/net/Kconfig b/net/Kconfig
index ccadc8e48152..c6cec5aa5486 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -66,6 +66,13 @@ source "net/ipv6/Kconfig"
 
 endif # if INET
 
+config NETWORK_SECMARK
+	bool "Security Marking"
+	help
+	  This enables security marking of network packets, similar
+	  to nfmark, but designated for security purposes.
+	  If you are unsure how to answer this question, answer N.
+
 menuconfig NETFILTER
 	bool "Network packet filtering (replaces ipchains)"
 	---help---
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fb3770f9c094..96cdcbe24ba2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -464,7 +464,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 	n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
 	C(input_dev);
 #endif
-
+	skb_copy_secmark(n, skb);
 #endif
 	C(truesize);
 	atomic_set(&n->users, 1);
@@ -526,6 +526,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #endif
 	new->tc_index	= old->tc_index;
 #endif
+	skb_copy_secmark(new, old);
 	atomic_set(&new->users, 1);
 	skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size;
 	skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cff9c3a72daf..d4bb3fae4e49 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -410,6 +410,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	nf_bridge_get(to->nf_bridge);
 #endif
 #endif
+	skb_copy_secmark(to, from);
 }
 
 /*
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 0bba3c2bb786..431a3ce6f7b7 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -147,6 +147,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 	/* This packet will not be the same as the other: clear nf fields */
 	nf_reset(nskb);
 	nskb->nfmark = 0;
+	skb_init_secmark(nskb);
 
 	tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl);
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 416f6e428a0a..d29620f4910e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -459,6 +459,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	nf_bridge_get(to->nf_bridge);
 #endif
 #endif
+	skb_copy_secmark(to, from);
 }
 
 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
-- 
cgit v1.2.3-71-gd317


From 5e6874cdb8de94cd3c15d853a8ef9c6f4c305055 Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Fri, 9 Jun 2006 00:30:57 -0700
Subject: [SECMARK]: Add xtables SECMARK target

Add a SECMARK target to xtables, allowing the admin to apply security
marks to packets via both iptables and ip6tables.

The target currently handles SELinux security marking, but can be
extended for other purposes as needed.

Signed-off-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/xt_SECMARK.h |  26 ++++++
 net/netfilter/Kconfig                |   9 ++
 net/netfilter/Makefile               |   1 +
 net/netfilter/xt_SECMARK.c           | 156 +++++++++++++++++++++++++++++++++++
 4 files changed, 192 insertions(+)
 create mode 100644 include/linux/netfilter/xt_SECMARK.h
 create mode 100644 net/netfilter/xt_SECMARK.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter/xt_SECMARK.h b/include/linux/netfilter/xt_SECMARK.h
new file mode 100644
index 000000000000..c53fbffa997d
--- /dev/null
+++ b/include/linux/netfilter/xt_SECMARK.h
@@ -0,0 +1,26 @@
+#ifndef _XT_SECMARK_H_target
+#define _XT_SECMARK_H_target
+
+/*
+ * This is intended for use by various security subsystems (but not
+ * at the same time).
+ *
+ * 'mode' refers to the specific security subsystem which the
+ * packets are being marked for.
+ */
+#define SECMARK_MODE_SEL	0x01		/* SELinux */
+#define SECMARK_SELCTX_MAX	256
+
+struct xt_secmark_target_selinux_info {
+	u_int32_t selsid;
+	char selctx[SECMARK_SELCTX_MAX];
+};
+
+struct xt_secmark_target_info {
+	u_int8_t mode;
+	union {
+		struct xt_secmark_target_selinux_info sel;
+	} u;
+};
+
+#endif /*_XT_SECMARK_H_target */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 85a7e1770252..10eccdd4d6ea 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -174,6 +174,15 @@ config NETFILTER_XT_TARGET_NOTRACK
 	  If you want to compile it as a module, say M here and read
 	  <file:Documentation/modules.txt>.  If unsure, say `N'.
 
+config NETFILTER_XT_TARGET_SECMARK
+	tristate '"SECMARK" target support'
+	depends on NETFILTER_XTABLES && NETWORK_SECMARK
+	help
+	  The SECMARK target allows security marking of network
+	  packets, for use with security subsystems.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_MATCH_COMMENT
 	tristate  '"comment" match support'
 	depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index df1da25f40df..3645de046890 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o
 
 # matches
 obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
new file mode 100644
index 000000000000..c2ce9c4011cc
--- /dev/null
+++ b/net/netfilter/xt_SECMARK.c
@@ -0,0 +1,156 @@
+/*
+ * Module for modifying the secmark field of the skb, for use by
+ * security subsystems.
+ *
+ * Based on the nfmark match by:
+ * (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
+ *
+ * (C) 2006 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/selinux.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_SECMARK.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@redhat.com>");
+MODULE_DESCRIPTION("ip[6]tables SECMARK modification module");
+MODULE_ALIAS("ipt_SECMARK");
+MODULE_ALIAS("ip6t_SECMARK");
+
+#define PFX "SECMARK: "
+
+static u8 mode;
+
+static unsigned int target(struct sk_buff **pskb, const struct net_device *in,
+			   const struct net_device *out, unsigned int hooknum,
+			   const struct xt_target *target,
+			   const void *targinfo, void *userinfo)
+{
+	u32 secmark = 0;
+	const struct xt_secmark_target_info *info = targinfo;
+
+	BUG_ON(info->mode != mode);
+
+	switch (mode) {
+	case SECMARK_MODE_SEL:
+		secmark = info->u.sel.selsid;
+		break;
+
+	default:
+		BUG();
+	}
+
+	if ((*pskb)->secmark != secmark)
+		(*pskb)->secmark = secmark;
+
+	return XT_CONTINUE;
+}
+
+static int checkentry_selinux(struct xt_secmark_target_info *info)
+{
+	int err;
+	struct xt_secmark_target_selinux_info *sel = &info->u.sel;
+
+	err = selinux_string_to_sid(sel->selctx, &sel->selsid);
+	if (err) {
+		if (err == -EINVAL)
+			printk(KERN_INFO PFX "invalid SELinux context \'%s\'\n",
+			       sel->selctx);
+		return 0;
+	}
+
+	if (!sel->selsid) {
+		printk(KERN_INFO PFX "unable to map SELinux context \'%s\'\n",
+		       sel->selctx);
+		return 0;
+	}
+
+	err = selinux_relabel_packet_permission(sel->selsid);
+	if (err) {
+		printk(KERN_INFO PFX "unable to obtain relabeling permission\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static int checkentry(const char *tablename, const void *entry,
+		      const struct xt_target *target, void *targinfo,
+		      unsigned int targinfosize, unsigned int hook_mask)
+{
+	struct xt_secmark_target_info *info = targinfo;
+
+	if (mode && mode != info->mode) {
+		printk(KERN_INFO PFX "mode already set to %hu cannot mix with "
+		       "rules for mode %hu\n", mode, info->mode);
+		return 0;
+	}
+
+	switch (info->mode) {
+	case SECMARK_MODE_SEL:
+		if (!checkentry_selinux(info))
+			return 0;
+		break;
+
+	default:
+		printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode);
+		return 0;
+	}
+
+	if (!mode)
+		mode = info->mode;
+	return 1;
+}
+
+static struct xt_target ipt_secmark_reg = {
+	.name		= "SECMARK",
+	.target		= target,
+	.targetsize	= sizeof(struct xt_secmark_target_info),
+	.table		= "mangle",
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+	.family		= AF_INET,
+	.revision	= 0,
+};
+
+static struct xt_target ip6t_secmark_reg = {
+	.name		= "SECMARK",
+	.target		= target,
+	.targetsize	= sizeof(struct xt_secmark_target_info),
+	.table		= "mangle",
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+	.family		= AF_INET6,
+	.revision	= 0,
+};
+
+static int __init xt_secmark_init(void)
+{
+	int err;
+
+	err = xt_register_target(&ipt_secmark_reg);
+	if (err)
+		return err;
+
+	err = xt_register_target(&ip6t_secmark_reg);
+	if (err)
+		xt_unregister_target(&ipt_secmark_reg);
+
+	return err;
+}
+
+static void __exit xt_secmark_fini(void)
+{
+	xt_unregister_target(&ip6t_secmark_reg);
+	xt_unregister_target(&ipt_secmark_reg);
+}
+
+module_init(xt_secmark_init);
+module_exit(xt_secmark_fini);
-- 
cgit v1.2.3-71-gd317


From 7c9728c393dceb724d66d696cfabce82151a78e5 Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Fri, 9 Jun 2006 00:31:46 -0700
Subject: [SECMARK]: Add secmark support to conntrack

Add a secmark field to IP and NF conntracks, so that security markings
on packets can be copied to their associated connections, and also
copied back to packets as required.  This is similar to the network
mark field currently used with conntrack, although it is intended for
enforcement of security policy rather than network policy.

Signed-off-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack.h  |  4 ++++
 include/net/netfilter/nf_conntrack.h         |  4 ++++
 include/net/netfilter/nf_conntrack_compat.h  | 26 ++++++++++++++++++++++++++
 net/ipv4/netfilter/Kconfig                   | 12 ++++++++++++
 net/ipv4/netfilter/ip_conntrack_core.c       |  3 +++
 net/ipv4/netfilter/ip_conntrack_standalone.c |  5 +++++
 net/netfilter/Kconfig                        | 12 ++++++++++++
 net/netfilter/nf_conntrack_core.c            |  3 +++
 net/netfilter/nf_conntrack_standalone.c      |  5 +++++
 9 files changed, 74 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index 17d7ef938a09..e0e9951eb8c3 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -121,6 +121,10 @@ struct ip_conntrack
 	u_int32_t mark;
 #endif
 
+#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
+	u_int32_t secmark;
+#endif
+
 	/* Traversed often, so hopefully in different cacheline to top */
 	/* These are my tuples; original and reply */
 	struct ip_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index dbe7a114d0c5..411117815807 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -114,6 +114,10 @@ struct nf_conn
 	u_int32_t mark;
 #endif
 
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+	u_int32_t secmark;
+#endif
+
 	/* Storage reserved for other modules: */
 	union nf_conntrack_proto proto;
 
diff --git a/include/net/netfilter/nf_conntrack_compat.h b/include/net/netfilter/nf_conntrack_compat.h
index 3cac19fb3648..f1b1482d7200 100644
--- a/include/net/netfilter/nf_conntrack_compat.h
+++ b/include/net/netfilter/nf_conntrack_compat.h
@@ -20,6 +20,19 @@ static inline u_int32_t *nf_ct_get_mark(const struct sk_buff *skb,
 }
 #endif /* CONFIG_IP_NF_CONNTRACK_MARK */
 
+#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
+static inline u_int32_t *nf_ct_get_secmark(const struct sk_buff *skb,
+					   u_int32_t *ctinfo)
+{
+	struct ip_conntrack *ct = ip_conntrack_get(skb, ctinfo);
+
+	if (ct)
+		return &ct->secmark;
+	else
+		return NULL;
+}
+#endif /* CONFIG_IP_NF_CONNTRACK_SECMARK */
+
 #ifdef CONFIG_IP_NF_CT_ACCT
 static inline struct ip_conntrack_counter *
 nf_ct_get_counters(const struct sk_buff *skb)
@@ -70,6 +83,19 @@ static inline u_int32_t *nf_ct_get_mark(const struct sk_buff *skb,
 }
 #endif /* CONFIG_NF_CONNTRACK_MARK */
 
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static inline u_int32_t *nf_ct_get_secmark(const struct sk_buff *skb,
+					   u_int32_t *ctinfo)
+{
+	struct nf_conn *ct = nf_ct_get(skb, ctinfo);
+
+	if (ct)
+		return &ct->secmark;
+	else
+		return NULL;
+}
+#endif /* CONFIG_NF_CONNTRACK_MARK */
+
 #ifdef CONFIG_NF_CT_ACCT
 static inline struct ip_conntrack_counter *
 nf_ct_get_counters(const struct sk_buff *skb)
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index ff4b118f14a9..e1d7f5fbc526 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -55,6 +55,18 @@ config IP_NF_CONNTRACK_MARK
 	  of packets, but this mark value is kept in the conntrack session
 	  instead of the individual packets.
 	
+config IP_NF_CONNTRACK_SECMARK
+	bool  'Connection tracking security mark support'
+	depends on IP_NF_CONNTRACK && NETWORK_SECMARK
+	help
+	  This option enables security markings to be applied to
+	  connections.  Typically they are copied to connections from
+	  packets using the CONNSECMARK target and copied back from
+	  connections to packets with the same target, with the packets
+	  being originally labeled via SECMARK.
+
+	  If unsure, say 'N'.
+
 config IP_NF_CONNTRACK_EVENTS
 	bool "Connection tracking events (EXPERIMENTAL)"
 	depends on EXPERIMENTAL && IP_NF_CONNTRACK
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 4fe9e69378df..7e4cf9a4d15f 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -723,6 +723,9 @@ init_conntrack(struct ip_conntrack_tuple *tuple,
     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
 		/* this is ugly, but there is no other place where to put it */
 		conntrack->nat.masq_index = exp->master->nat.masq_index;
+#endif
+#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
+		conntrack->secmark = exp->master->secmark;
 #endif
 		nf_conntrack_get(&conntrack->master->ct_general);
 		CONNTRACK_STAT_INC(expect_new);
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 6cb9b989d14c..88445aac3f28 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -189,6 +189,11 @@ static int ct_seq_show(struct seq_file *s, void *v)
 		return -ENOSPC;
 #endif
 
+#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
+	if (seq_printf(s, "secmark=%u ", conntrack->secmark))
+		return -ENOSPC;
+#endif
+
 	if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
 		return -ENOSPC;
 
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 10eccdd4d6ea..023f81e5f96b 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -60,6 +60,18 @@ config NF_CONNTRACK_MARK
 	  of packets, but this mark value is kept in the conntrack session
 	  instead of the individual packets.
 
+config NF_CONNTRACK_SECMARK
+	bool  'Connection tracking security mark support'
+	depends on NF_CONNTRACK && NETWORK_SECMARK
+	help
+	  This option enables security markings to be applied to
+	  connections.  Typically they are copied to connections from
+	  packets using the CONNSECMARK target and copied back from
+	  connections to packets with the same target, with the packets
+	  being originally labeled via SECMARK.
+
+	  If unsure, say 'N'.
+
 config NF_CONNTRACK_EVENTS
 	bool "Connection tracking events (EXPERIMENTAL)"
 	depends on EXPERIMENTAL && NF_CONNTRACK
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index bc2bd4c3859e..cd299f4b7db1 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -989,6 +989,9 @@ init_conntrack(const struct nf_conntrack_tuple *tuple,
 		conntrack->master = exp->master;
 #ifdef CONFIG_NF_CONNTRACK_MARK
 		conntrack->mark = exp->master->mark;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+		conntrack->secmark = exp->master->secmark;
 #endif
 		nf_conntrack_get(&conntrack->master->ct_general);
 		NF_CT_STAT_INC(expect_new);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index e01d20d8e287..e34c574f0351 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -213,6 +213,11 @@ static int ct_seq_show(struct seq_file *s, void *v)
 		return -ENOSPC;
 #endif
 
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+	if (seq_printf(s, "secmark=%u ", conntrack->secmark))
+		return -ENOSPC;
+#endif
+
 	if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
 		return -ENOSPC;
 	
-- 
cgit v1.2.3-71-gd317


From 100468e9c05c10fb6872751c1af523b996d6afa9 Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Fri, 9 Jun 2006 00:32:39 -0700
Subject: [SECMARK]: Add CONNSECMARK xtables target

Add a new xtables target, CONNSECMARK, which is used to specify rules
for copying security marks from packets to connections, and for
copyying security marks back from connections to packets.  This is
similar to the CONNMARK target, but is more limited in scope in that
it only allows copying of security marks to and from packets, as this
is all it needs to do.

A typical scenario would be to apply a security mark to a 'new' packet
with SECMARK, then copy that to its conntrack via CONNMARK, and then
restore the security mark from the connection to established and
related packets on that connection.

Signed-off-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/xt_CONNSECMARK.h |  13 +++
 net/netfilter/Kconfig                    |  11 +++
 net/netfilter/Makefile                   |   1 +
 net/netfilter/xt_CONNSECMARK.c           | 155 +++++++++++++++++++++++++++++++
 4 files changed, 180 insertions(+)
 create mode 100644 include/linux/netfilter/xt_CONNSECMARK.h
 create mode 100644 net/netfilter/xt_CONNSECMARK.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter/xt_CONNSECMARK.h b/include/linux/netfilter/xt_CONNSECMARK.h
new file mode 100644
index 000000000000..c6bd75469ba2
--- /dev/null
+++ b/include/linux/netfilter/xt_CONNSECMARK.h
@@ -0,0 +1,13 @@
+#ifndef _XT_CONNSECMARK_H_target
+#define _XT_CONNSECMARK_H_target
+
+enum {
+	CONNSECMARK_SAVE = 1,
+	CONNSECMARK_RESTORE,
+};
+
+struct xt_connsecmark_target_info {
+	u_int8_t mode;
+};
+
+#endif /*_XT_CONNSECMARK_H_target */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 023f81e5f96b..b1622b7de1cf 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -195,6 +195,17 @@ config NETFILTER_XT_TARGET_SECMARK
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_TARGET_CONNSECMARK
+	tristate '"CONNSECMARK" target support'
+	depends on NETFILTER_XTABLES && (NF_CONNTRACK_SECMARK || IP_NF_CONNTRACK_SECMARK)
+	help
+	  The CONNSECMARK target copies security markings from packets
+	  to connections, and restores security markings from connections
+	  to packets (if the packets are not already marked).  This would
+	  normally be used in conjunction with the SECMARK target.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_MATCH_COMMENT
 	tristate  '"comment" match support'
 	depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 3645de046890..6fa4b7580458 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
 
 # matches
 obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
new file mode 100644
index 000000000000..8c011e020769
--- /dev/null
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -0,0 +1,155 @@
+/*
+ * This module is used to copy security markings from packets
+ * to connections, and restore security markings from connections
+ * back to packets.  This would normally be performed in conjunction
+ * with the SECMARK target and state match.
+ *
+ * Based somewhat on CONNMARK:
+ *   Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ *    by Henrik Nordstrom <hno@marasystems.com>
+ *
+ * (C) 2006 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CONNSECMARK.h>
+#include <net/netfilter/nf_conntrack_compat.h>
+
+#define PFX "CONNSECMARK: "
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@redhat.com>");
+MODULE_DESCRIPTION("ip[6]tables CONNSECMARK module");
+MODULE_ALIAS("ipt_CONNSECMARK");
+MODULE_ALIAS("ip6t_CONNSECMARK");
+
+/*
+ * If the packet has a security mark and the connection does not, copy
+ * the security mark from the packet to the connection.
+ */
+static void secmark_save(struct sk_buff *skb)
+{
+	if (skb->secmark) {
+		u32 *connsecmark;
+		enum ip_conntrack_info ctinfo;
+
+		connsecmark = nf_ct_get_secmark(skb, &ctinfo);
+		if (connsecmark && !*connsecmark)
+			if (*connsecmark != skb->secmark)
+				*connsecmark = skb->secmark;
+	}
+}
+
+/*
+ * If packet has no security mark, and the connection does, restore the
+ * security mark from the connection to the packet.
+ */
+static void secmark_restore(struct sk_buff *skb)
+{
+	if (!skb->secmark) {
+		u32 *connsecmark;
+		enum ip_conntrack_info ctinfo;
+
+		connsecmark = nf_ct_get_secmark(skb, &ctinfo);
+		if (connsecmark && *connsecmark)
+			if (skb->secmark != *connsecmark)
+				skb->secmark = *connsecmark;
+	}
+}
+
+static unsigned int target(struct sk_buff **pskb, const struct net_device *in,
+			   const struct net_device *out, unsigned int hooknum,
+			   const struct xt_target *target,
+			   const void *targinfo, void *userinfo)
+{
+	struct sk_buff *skb = *pskb;
+	const struct xt_connsecmark_target_info *info = targinfo;
+
+	switch (info->mode) {
+	case CONNSECMARK_SAVE:
+		secmark_save(skb);
+		break;
+
+	case CONNSECMARK_RESTORE:
+		secmark_restore(skb);
+		break;
+
+	default:
+		BUG();
+	}
+
+	return XT_CONTINUE;
+}
+
+static int checkentry(const char *tablename, const void *entry,
+		      const struct xt_target *target, void *targinfo,
+		      unsigned int targinfosize, unsigned int hook_mask)
+{
+	struct xt_connsecmark_target_info *info = targinfo;
+
+	switch (info->mode) {
+	case CONNSECMARK_SAVE:
+	case CONNSECMARK_RESTORE:
+		break;
+
+	default:
+		printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct xt_target ipt_connsecmark_reg = {
+	.name		= "CONNSECMARK",
+	.target		= target,
+	.targetsize	= sizeof(struct xt_connsecmark_target_info),
+	.table		= "mangle",
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+	.family		= AF_INET,
+	.revision	= 0,
+};
+
+static struct xt_target ip6t_connsecmark_reg = {
+	.name		= "CONNSECMARK",
+	.target		= target,
+	.targetsize	= sizeof(struct xt_connsecmark_target_info),
+	.table		= "mangle",
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+	.family		= AF_INET6,
+	.revision	= 0,
+};
+
+static int __init xt_connsecmark_init(void)
+{
+	int err;
+
+	need_conntrack();
+
+	err = xt_register_target(&ipt_connsecmark_reg);
+	if (err)
+		return err;
+
+	err = xt_register_target(&ip6t_connsecmark_reg);
+	if (err)
+		xt_unregister_target(&ipt_connsecmark_reg);
+
+	return err;
+}
+
+static void __exit xt_connsecmark_fini(void)
+{
+	xt_unregister_target(&ip6t_connsecmark_reg);
+	xt_unregister_target(&ipt_connsecmark_reg);
+}
+
+module_init(xt_connsecmark_init);
+module_exit(xt_connsecmark_fini);
-- 
cgit v1.2.3-71-gd317


From 932ff279a43ab7257942cddff2595acd541cc49b Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 9 Jun 2006 12:20:56 -0700
Subject: [NET]: Add netif_tx_lock

Various drivers use xmit_lock internally to synchronise with their
transmission routines.  They do so without setting xmit_lock_owner.
This is fine as long as netpoll is not in use.

With netpoll it is possible for deadlocks to occur if xmit_lock_owner
isn't set.  This is because if a printk occurs while xmit_lock is held
and xmit_lock_owner is not set can cause netpoll to attempt to take
xmit_lock recursively.

While it is possible to resolve this by getting netpoll to use
trylock, it is suboptimal because netpoll's sole objective is to
maximise the chance of getting the printk out on the wire.  So
delaying or dropping the message is to be avoided as much as possible.

So the only alternative is to always set xmit_lock_owner.  The
following patch does this by introducing the netif_tx_lock family of
functions that take care of setting/unsetting xmit_lock_owner.

I renamed xmit_lock to _xmit_lock to indicate that it should not be
used directly.  I didn't provide irq versions of the netif_tx_lock
functions since xmit_lock is meant to be a BH-disabling lock.

This is pretty much a straight text substitution except for a small
bug fix in winbond.  It currently uses
netif_stop_queue/spin_unlock_wait to stop transmission.  This is
unsafe as an IRQ can potentially wake up the queue.  So it is safer to
use netif_tx_disable.

The hamradio bits used spin_lock_irq but it is unnecessary as
xmit_lock must never be taken in an IRQ handler.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/netdevices.txt        |  8 +++---
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |  6 ++--
 drivers/media/dvb/dvb-core/dvb_net.c           |  4 +--
 drivers/net/bnx2.c                             |  4 +--
 drivers/net/bonding/bond_main.c                |  2 +-
 drivers/net/forcedeth.c                        | 18 ++++++------
 drivers/net/hamradio/6pack.c                   |  8 +++---
 drivers/net/hamradio/mkiss.c                   |  8 +++---
 drivers/net/ifb.c                              | 10 +++----
 drivers/net/irda/vlsi_ir.c                     |  2 +-
 drivers/net/natsemi.c                          |  4 +--
 drivers/net/tulip/winbond-840.c                |  9 +++---
 drivers/net/wireless/orinoco.c                 |  4 ++-
 include/linux/netdevice.h                      | 38 ++++++++++++++++++++++++--
 net/atm/clip.c                                 |  4 +--
 net/core/dev.c                                 | 12 ++++----
 net/core/dev_mcast.c                           | 28 +++++++++----------
 net/core/netpoll.c                             |  9 ++----
 net/core/pktgen.c                              |  4 +--
 net/sched/sch_generic.c                        | 28 ++++++++-----------
 net/sched/sch_teql.c                           |  9 ++----
 21 files changed, 121 insertions(+), 98 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/netdevices.txt b/Documentation/networking/netdevices.txt
index 3c0a5ba614d7..847cedb238f6 100644
--- a/Documentation/networking/netdevices.txt
+++ b/Documentation/networking/netdevices.txt
@@ -42,9 +42,9 @@ dev->get_stats:
 	Context: nominally process, but don't sleep inside an rwlock
 
 dev->hard_start_xmit:
-	Synchronization: dev->xmit_lock spinlock.
+	Synchronization: netif_tx_lock spinlock.
 	When the driver sets NETIF_F_LLTX in dev->features this will be
-	called without holding xmit_lock. In this case the driver 
+	called without holding netif_tx_lock. In this case the driver
 	has to lock by itself when needed. It is recommended to use a try lock
 	for this and return -1 when the spin lock fails. 
 	The locking there should also properly protect against 
@@ -62,12 +62,12 @@ dev->hard_start_xmit:
 	  Only valid when NETIF_F_LLTX is set.
 
 dev->tx_timeout:
-	Synchronization: dev->xmit_lock spinlock.
+	Synchronization: netif_tx_lock spinlock.
 	Context: BHs disabled
 	Notes: netif_queue_stopped() is guaranteed true
 
 dev->set_multicast_list:
-	Synchronization: dev->xmit_lock spinlock.
+	Synchronization: netif_tx_lock spinlock.
 	Context: BHs disabled
 
 dev->poll:
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 1dae4b238252..1d917edcf9ba 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -821,7 +821,8 @@ void ipoib_mcast_restart_task(void *dev_ptr)
 
 	ipoib_mcast_stop_thread(dev, 0);
 
-	spin_lock_irqsave(&dev->xmit_lock, flags);
+	local_irq_save(flags);
+	netif_tx_lock(dev);
 	spin_lock(&priv->lock);
 
 	/*
@@ -896,7 +897,8 @@ void ipoib_mcast_restart_task(void *dev_ptr)
 	}
 
 	spin_unlock(&priv->lock);
-	spin_unlock_irqrestore(&dev->xmit_lock, flags);
+	netif_tx_unlock(dev);
+	local_irq_restore(flags);
 
 	/* We have to cancel outside of the spinlock */
 	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
diff --git a/drivers/media/dvb/dvb-core/dvb_net.c b/drivers/media/dvb/dvb-core/dvb_net.c
index 2f0f35811bf7..9fd87521a163 100644
--- a/drivers/media/dvb/dvb-core/dvb_net.c
+++ b/drivers/media/dvb/dvb-core/dvb_net.c
@@ -1052,7 +1052,7 @@ static void wq_set_multicast_list (void *data)
 
 	dvb_net_feed_stop(dev);
 	priv->rx_mode = RX_MODE_UNI;
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 
 	if (dev->flags & IFF_PROMISC) {
 		dprintk("%s: promiscuous mode\n", dev->name);
@@ -1077,7 +1077,7 @@ static void wq_set_multicast_list (void *data)
 		}
 	}
 
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 	dvb_net_feed_start(dev);
 }
 
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 54161aef3cac..9c5a8842ed0f 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -2009,7 +2009,7 @@ bnx2_poll(struct net_device *dev, int *budget)
 	return 1;
 }
 
-/* Called with rtnl_lock from vlan functions and also dev->xmit_lock
+/* Called with rtnl_lock from vlan functions and also netif_tx_lock
  * from set_multicast.
  */
 static void
@@ -4252,7 +4252,7 @@ bnx2_vlan_rx_kill_vid(struct net_device *dev, uint16_t vid)
 }
 #endif
 
-/* Called with dev->xmit_lock.
+/* Called with netif_tx_lock.
  * hard_start_xmit is pseudo-lockless - a lock is only required when
  * the tx queue is full. This way, we get the benefit of lockless
  * operations most of the time without the complexities to handle
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 55d236726d11..46326cdfb277 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4191,7 +4191,7 @@ static int bond_init(struct net_device *bond_dev, struct bond_params *params)
 	 */
 	bond_dev->features |= NETIF_F_VLAN_CHALLENGED;
 
-	/* don't acquire bond device's xmit_lock when 
+	/* don't acquire bond device's netif_tx_lock when
 	 * transmitting */
 	bond_dev->features |= NETIF_F_LLTX;
 
diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index feb5b223cd60..5a8651b4b01d 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -533,9 +533,9 @@ typedef union _ring_type {
  * critical parts:
  * - rx is (pseudo-) lockless: it relies on the single-threading provided
  *	by the arch code for interrupts.
- * - tx setup is lockless: it relies on dev->xmit_lock. Actual submission
+ * - tx setup is lockless: it relies on netif_tx_lock. Actual submission
  *	needs dev->priv->lock :-(
- * - set_multicast_list: preparation lockless, relies on dev->xmit_lock.
+ * - set_multicast_list: preparation lockless, relies on netif_tx_lock.
  */
 
 /* in dev: base, irq */
@@ -1213,7 +1213,7 @@ static void drain_ring(struct net_device *dev)
 
 /*
  * nv_start_xmit: dev->hard_start_xmit function
- * Called with dev->xmit_lock held.
+ * Called with netif_tx_lock held.
  */
 static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
@@ -1407,7 +1407,7 @@ static void nv_tx_done(struct net_device *dev)
 
 /*
  * nv_tx_timeout: dev->tx_timeout function
- * Called with dev->xmit_lock held.
+ * Called with netif_tx_lock held.
  */
 static void nv_tx_timeout(struct net_device *dev)
 {
@@ -1737,7 +1737,7 @@ static int nv_change_mtu(struct net_device *dev, int new_mtu)
 		 * Changing the MTU is a rare event, it shouldn't matter.
 		 */
 		nv_disable_irq(dev);
-		spin_lock_bh(&dev->xmit_lock);
+		netif_tx_lock_bh(dev);
 		spin_lock(&np->lock);
 		/* stop engines */
 		nv_stop_rx(dev);
@@ -1768,7 +1768,7 @@ static int nv_change_mtu(struct net_device *dev, int new_mtu)
 		nv_start_rx(dev);
 		nv_start_tx(dev);
 		spin_unlock(&np->lock);
-		spin_unlock_bh(&dev->xmit_lock);
+		netif_tx_unlock_bh(dev);
 		nv_enable_irq(dev);
 	}
 	return 0;
@@ -1803,7 +1803,7 @@ static int nv_set_mac_address(struct net_device *dev, void *addr)
 	memcpy(dev->dev_addr, macaddr->sa_data, ETH_ALEN);
 
 	if (netif_running(dev)) {
-		spin_lock_bh(&dev->xmit_lock);
+		netif_tx_lock_bh(dev);
 		spin_lock_irq(&np->lock);
 
 		/* stop rx engine */
@@ -1815,7 +1815,7 @@ static int nv_set_mac_address(struct net_device *dev, void *addr)
 		/* restart rx engine */
 		nv_start_rx(dev);
 		spin_unlock_irq(&np->lock);
-		spin_unlock_bh(&dev->xmit_lock);
+		netif_tx_unlock_bh(dev);
 	} else {
 		nv_copy_mac_to_hw(dev);
 	}
@@ -1824,7 +1824,7 @@ static int nv_set_mac_address(struct net_device *dev, void *addr)
 
 /*
  * nv_set_multicast: dev->set_multicast function
- * Called with dev->xmit_lock held.
+ * Called with netif_tx_lock held.
  */
 static void nv_set_multicast(struct net_device *dev)
 {
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index 102c1f0b90da..d12605f0ac7c 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -308,9 +308,9 @@ static int sp_set_mac_address(struct net_device *dev, void *addr)
 {
 	struct sockaddr_ax25 *sa = addr;
 
-	spin_lock_irq(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	memcpy(dev->dev_addr, &sa->sax25_call, AX25_ADDR_LEN);
-	spin_unlock_irq(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 
 	return 0;
 }
@@ -767,9 +767,9 @@ static int sixpack_ioctl(struct tty_struct *tty, struct file *file,
 			break;
 		}
 
-		spin_lock_irq(&dev->xmit_lock);
+		netif_tx_lock_bh(dev);
 		memcpy(dev->dev_addr, &addr, AX25_ADDR_LEN);
-		spin_unlock_irq(&dev->xmit_lock);
+		netif_tx_unlock_bh(dev);
 
 		err = 0;
 		break;
diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c
index d81a8e1eeb8d..3ebbbe56b6e9 100644
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -357,9 +357,9 @@ static int ax_set_mac_address(struct net_device *dev, void *addr)
 {
 	struct sockaddr_ax25 *sa = addr;
 
-	spin_lock_irq(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	memcpy(dev->dev_addr, &sa->sax25_call, AX25_ADDR_LEN);
-	spin_unlock_irq(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 
 	return 0;
 }
@@ -886,9 +886,9 @@ static int mkiss_ioctl(struct tty_struct *tty, struct file *file,
 			break;
 		}
 
-		spin_lock_irq(&dev->xmit_lock);
+		netif_tx_lock_bh(dev);
 		memcpy(dev->dev_addr, addr, AX25_ADDR_LEN);
-		spin_unlock_irq(&dev->xmit_lock);
+		netif_tx_unlock_bh(dev);
 
 		err = 0;
 		break;
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 31fb2d75dc44..2e222ef91e22 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -76,13 +76,13 @@ static void ri_tasklet(unsigned long dev)
 	dp->st_task_enter++;
 	if ((skb = skb_peek(&dp->tq)) == NULL) {
 		dp->st_txq_refl_try++;
-		if (spin_trylock(&_dev->xmit_lock)) {
+		if (netif_tx_trylock(_dev)) {
 			dp->st_rxq_enter++;
 			while ((skb = skb_dequeue(&dp->rq)) != NULL) {
 				skb_queue_tail(&dp->tq, skb);
 				dp->st_rx2tx_tran++;
 			}
-			spin_unlock(&_dev->xmit_lock);
+			netif_tx_unlock(_dev);
 		} else {
 			/* reschedule */
 			dp->st_rxq_notenter++;
@@ -110,7 +110,7 @@ static void ri_tasklet(unsigned long dev)
 		}
 	}
 
-	if (spin_trylock(&_dev->xmit_lock)) {
+	if (netif_tx_trylock(_dev)) {
 		dp->st_rxq_check++;
 		if ((skb = skb_peek(&dp->rq)) == NULL) {
 			dp->tasklet_pending = 0;
@@ -118,10 +118,10 @@ static void ri_tasklet(unsigned long dev)
 				netif_wake_queue(_dev);
 		} else {
 			dp->st_rxq_rsch++;
-			spin_unlock(&_dev->xmit_lock);
+			netif_tx_unlock(_dev);
 			goto resched;
 		}
-		spin_unlock(&_dev->xmit_lock);
+		netif_tx_unlock(_dev);
 	} else {
 resched:
 		dp->tasklet_pending = 1;
diff --git a/drivers/net/irda/vlsi_ir.c b/drivers/net/irda/vlsi_ir.c
index 97a49e0be76b..d70b9e8d6e60 100644
--- a/drivers/net/irda/vlsi_ir.c
+++ b/drivers/net/irda/vlsi_ir.c
@@ -959,7 +959,7 @@ static int vlsi_hard_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 			    ||  (now.tv_sec==ready.tv_sec && now.tv_usec>=ready.tv_usec))
 			    	break;
 			udelay(100);
-			/* must not sleep here - we are called under xmit_lock! */
+			/* must not sleep here - called under netif_tx_lock! */
 		}
 	}
 
diff --git a/drivers/net/natsemi.c b/drivers/net/natsemi.c
index 90627756d6fa..2e4ecedba057 100644
--- a/drivers/net/natsemi.c
+++ b/drivers/net/natsemi.c
@@ -318,12 +318,12 @@ performance critical codepaths:
 The rx process only runs in the interrupt handler. Access from outside
 the interrupt handler is only permitted after disable_irq().
 
-The rx process usually runs under the dev->xmit_lock. If np->intr_tx_reap
+The rx process usually runs under the netif_tx_lock. If np->intr_tx_reap
 is set, then access is permitted under spin_lock_irq(&np->lock).
 
 Thus configuration functions that want to access everything must call
 	disable_irq(dev->irq);
-	spin_lock_bh(dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	spin_lock_irq(&np->lock);
 
 IV. Notes
diff --git a/drivers/net/tulip/winbond-840.c b/drivers/net/tulip/winbond-840.c
index 136a70c4d5e4..56d86c7c0386 100644
--- a/drivers/net/tulip/winbond-840.c
+++ b/drivers/net/tulip/winbond-840.c
@@ -1605,11 +1605,11 @@ static void __devexit w840_remove1 (struct pci_dev *pdev)
  * - get_stats:
  * 	spin_lock_irq(np->lock), doesn't touch hw if not present
  * - hard_start_xmit:
- * 	netif_stop_queue + spin_unlock_wait(&dev->xmit_lock);
+ * 	synchronize_irq + netif_tx_disable;
  * - tx_timeout:
- * 	netif_device_detach + spin_unlock_wait(&dev->xmit_lock);
+ * 	netif_device_detach + netif_tx_disable;
  * - set_multicast_list
- * 	netif_device_detach + spin_unlock_wait(&dev->xmit_lock);
+ * 	netif_device_detach + netif_tx_disable;
  * - interrupt handler
  * 	doesn't touch hw if not present, synchronize_irq waits for
  * 	running instances of the interrupt handler.
@@ -1635,11 +1635,10 @@ static int w840_suspend (struct pci_dev *pdev, pm_message_t state)
 		netif_device_detach(dev);
 		update_csr6(dev, 0);
 		iowrite32(0, ioaddr + IntrEnable);
-		netif_stop_queue(dev);
 		spin_unlock_irq(&np->lock);
 
-		spin_unlock_wait(&dev->xmit_lock);
 		synchronize_irq(dev->irq);
+		netif_tx_disable(dev);
 	
 		np->stats.rx_missed_errors += ioread32(ioaddr + RxMissed) & 0xffff;
 
diff --git a/drivers/net/wireless/orinoco.c b/drivers/net/wireless/orinoco.c
index c2d0b09e0418..a5fcfcde63d1 100644
--- a/drivers/net/wireless/orinoco.c
+++ b/drivers/net/wireless/orinoco.c
@@ -1833,7 +1833,9 @@ static int __orinoco_program_rids(struct net_device *dev)
 	/* Set promiscuity / multicast*/
 	priv->promiscuous = 0;
 	priv->mc_count = 0;
-	__orinoco_set_multicast_list(dev); /* FIXME: what about the xmit_lock */
+
+	/* FIXME: what about netif_tx_lock */
+	__orinoco_set_multicast_list(dev);
 
 	return 0;
 }
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b5760c67af9c..067b9ccafd87 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -407,7 +407,7 @@ struct net_device
  * One part is mostly used on xmit path (device)
  */
 	/* hard_start_xmit synchronizer */
-	spinlock_t		xmit_lock ____cacheline_aligned_in_smp;
+	spinlock_t		_xmit_lock ____cacheline_aligned_in_smp;
 	/* cpu id of processor entered to hard_start_xmit or -1,
 	   if nobody entered there.
 	 */
@@ -893,11 +893,43 @@ static inline void __netif_rx_complete(struct net_device *dev)
 	clear_bit(__LINK_STATE_RX_SCHED, &dev->state);
 }
 
+static inline void netif_tx_lock(struct net_device *dev)
+{
+	spin_lock(&dev->_xmit_lock);
+	dev->xmit_lock_owner = smp_processor_id();
+}
+
+static inline void netif_tx_lock_bh(struct net_device *dev)
+{
+	spin_lock_bh(&dev->_xmit_lock);
+	dev->xmit_lock_owner = smp_processor_id();
+}
+
+static inline int netif_tx_trylock(struct net_device *dev)
+{
+	int err = spin_trylock(&dev->_xmit_lock);
+	if (!err)
+		dev->xmit_lock_owner = smp_processor_id();
+	return err;
+}
+
+static inline void netif_tx_unlock(struct net_device *dev)
+{
+	dev->xmit_lock_owner = -1;
+	spin_unlock(&dev->_xmit_lock);
+}
+
+static inline void netif_tx_unlock_bh(struct net_device *dev)
+{
+	dev->xmit_lock_owner = -1;
+	spin_unlock_bh(&dev->_xmit_lock);
+}
+
 static inline void netif_tx_disable(struct net_device *dev)
 {
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	netif_stop_queue(dev);
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 }
 
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 72d852982664..f92f9c94d2c7 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -98,7 +98,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
 		printk(KERN_CRIT "!clip_vcc->entry (clip_vcc %p)\n", clip_vcc);
 		return;
 	}
-	spin_lock_bh(&entry->neigh->dev->xmit_lock);	/* block clip_start_xmit() */
+	netif_tx_lock_bh(entry->neigh->dev);	/* block clip_start_xmit() */
 	entry->neigh->used = jiffies;
 	for (walk = &entry->vccs; *walk; walk = &(*walk)->next)
 		if (*walk == clip_vcc) {
@@ -122,7 +122,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
 	printk(KERN_CRIT "ATMARP: unlink_clip_vcc failed (entry %p, vcc "
 	       "0x%p)\n", entry, clip_vcc);
       out:
-	spin_unlock_bh(&entry->neigh->dev->xmit_lock);
+	netif_tx_unlock_bh(entry->neigh->dev);
 }
 
 /* The neighbour entry n->lock is held. */
diff --git a/net/core/dev.c b/net/core/dev.c
index 6bfa78c66c25..1b09f1cae46e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1282,15 +1282,13 @@ int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask)
 
 #define HARD_TX_LOCK(dev, cpu) {			\
 	if ((dev->features & NETIF_F_LLTX) == 0) {	\
-		spin_lock(&dev->xmit_lock);		\
-		dev->xmit_lock_owner = cpu;		\
+		netif_tx_lock(dev);			\
 	}						\
 }
 
 #define HARD_TX_UNLOCK(dev) {				\
 	if ((dev->features & NETIF_F_LLTX) == 0) {	\
-		dev->xmit_lock_owner = -1;		\
-		spin_unlock(&dev->xmit_lock);		\
+		netif_tx_unlock(dev);			\
 	}						\
 }
 
@@ -1389,8 +1387,8 @@ int dev_queue_xmit(struct sk_buff *skb)
 	/* The device has no queue. Common case for software devices:
 	   loopback, all the sorts of tunnels...
 
-	   Really, it is unlikely that xmit_lock protection is necessary here.
-	   (f.e. loopback and IP tunnels are clean ignoring statistics
+	   Really, it is unlikely that netif_tx_lock protection is necessary
+	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
 	   counters.)
 	   However, it is possible, that they rely on protection
 	   made by us here.
@@ -2805,7 +2803,7 @@ int register_netdevice(struct net_device *dev)
 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
 
 	spin_lock_init(&dev->queue_lock);
-	spin_lock_init(&dev->xmit_lock);
+	spin_lock_init(&dev->_xmit_lock);
 	dev->xmit_lock_owner = -1;
 #ifdef CONFIG_NET_CLS_ACT
 	spin_lock_init(&dev->ingress_lock);
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
index 05d60850840e..c57d887da2ef 100644
--- a/net/core/dev_mcast.c
+++ b/net/core/dev_mcast.c
@@ -62,7 +62,7 @@
  *	Device mc lists are changed by bh at least if IPv6 is enabled,
  *	so that it must be bh protected.
  *
- *	We block accesses to device mc filters with dev->xmit_lock.
+ *	We block accesses to device mc filters with netif_tx_lock.
  */
 
 /*
@@ -93,9 +93,9 @@ static void __dev_mc_upload(struct net_device *dev)
 
 void dev_mc_upload(struct net_device *dev)
 {
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	__dev_mc_upload(dev);
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 }
 
 /*
@@ -107,7 +107,7 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
 	int err = 0;
 	struct dev_mc_list *dmi, **dmip;
 
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 
 	for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) {
 		/*
@@ -139,13 +139,13 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
 			 */
 			__dev_mc_upload(dev);
 			
-			spin_unlock_bh(&dev->xmit_lock);
+			netif_tx_unlock_bh(dev);
 			return 0;
 		}
 	}
 	err = -ENOENT;
 done:
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 	return err;
 }
 
@@ -160,7 +160,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
 
 	dmi1 = kmalloc(sizeof(*dmi), GFP_ATOMIC);
 
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) {
 		if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 &&
 		    dmi->dmi_addrlen == alen) {
@@ -176,7 +176,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
 	}
 
 	if ((dmi = dmi1) == NULL) {
-		spin_unlock_bh(&dev->xmit_lock);
+		netif_tx_unlock_bh(dev);
 		return -ENOMEM;
 	}
 	memcpy(dmi->dmi_addr, addr, alen);
@@ -189,11 +189,11 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
 
 	__dev_mc_upload(dev);
 	
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 	return 0;
 
 done:
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 	kfree(dmi1);
 	return err;
 }
@@ -204,7 +204,7 @@ done:
 
 void dev_mc_discard(struct net_device *dev)
 {
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	
 	while (dev->mc_list != NULL) {
 		struct dev_mc_list *tmp = dev->mc_list;
@@ -215,7 +215,7 @@ void dev_mc_discard(struct net_device *dev)
 	}
 	dev->mc_count = 0;
 
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -250,7 +250,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v)
 	struct dev_mc_list *m;
 	struct net_device *dev = v;
 
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	for (m = dev->mc_list; m; m = m->next) {
 		int i;
 
@@ -262,7 +262,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v)
 
 		seq_putc(seq, '\n');
 	}
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 	return 0;
 }
 
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index e8e05cebd95a..9cb781830380 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -273,24 +273,21 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 
 	do {
 		npinfo->tries--;
-		spin_lock(&np->dev->xmit_lock);
-		np->dev->xmit_lock_owner = smp_processor_id();
+		netif_tx_lock(np->dev);
 
 		/*
 		 * network drivers do not expect to be called if the queue is
 		 * stopped.
 		 */
 		if (netif_queue_stopped(np->dev)) {
-			np->dev->xmit_lock_owner = -1;
-			spin_unlock(&np->dev->xmit_lock);
+			netif_tx_unlock(np->dev);
 			netpoll_poll(np);
 			udelay(50);
 			continue;
 		}
 
 		status = np->dev->hard_start_xmit(skb, np->dev);
-		np->dev->xmit_lock_owner = -1;
-		spin_unlock(&np->dev->xmit_lock);
+		netif_tx_unlock(np->dev);
 
 		/* success */
 		if(!status) {
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index c23e9c06ee23..67ed14ddabd2 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2897,7 +2897,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
 		}
 	}
 
-	spin_lock_bh(&odev->xmit_lock);
+	netif_tx_lock_bh(odev);
 	if (!netif_queue_stopped(odev)) {
 
 		atomic_inc(&(pkt_dev->skb->users));
@@ -2942,7 +2942,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
 		pkt_dev->next_tx_ns = 0;
 	}
 
-	spin_unlock_bh(&odev->xmit_lock);
+	netif_tx_unlock_bh(odev);
 
 	/* If pkt_dev->count is zero, then run forever */
 	if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 138ea92ed268..b1e4c5e20ac7 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -72,9 +72,9 @@ void qdisc_unlock_tree(struct net_device *dev)
    dev->queue_lock serializes queue accesses for this device
    AND dev->qdisc pointer itself.
 
-   dev->xmit_lock serializes accesses to device driver.
+   netif_tx_lock serializes accesses to device driver.
 
-   dev->queue_lock and dev->xmit_lock are mutually exclusive,
+   dev->queue_lock and netif_tx_lock are mutually exclusive,
    if one is grabbed, another must be free.
  */
 
@@ -108,7 +108,7 @@ int qdisc_restart(struct net_device *dev)
 		 * will be requeued.
 		 */
 		if (!nolock) {
-			if (!spin_trylock(&dev->xmit_lock)) {
+			if (!netif_tx_trylock(dev)) {
 			collision:
 				/* So, someone grabbed the driver. */
 				
@@ -126,8 +126,6 @@ int qdisc_restart(struct net_device *dev)
 				__get_cpu_var(netdev_rx_stat).cpu_collision++;
 				goto requeue;
 			}
-			/* Remember that the driver is grabbed by us. */
-			dev->xmit_lock_owner = smp_processor_id();
 		}
 		
 		{
@@ -142,8 +140,7 @@ int qdisc_restart(struct net_device *dev)
 				ret = dev->hard_start_xmit(skb, dev);
 				if (ret == NETDEV_TX_OK) { 
 					if (!nolock) {
-						dev->xmit_lock_owner = -1;
-						spin_unlock(&dev->xmit_lock);
+						netif_tx_unlock(dev);
 					}
 					spin_lock(&dev->queue_lock);
 					return -1;
@@ -157,8 +154,7 @@ int qdisc_restart(struct net_device *dev)
 			/* NETDEV_TX_BUSY - we need to requeue */
 			/* Release the driver */
 			if (!nolock) { 
-				dev->xmit_lock_owner = -1;
-				spin_unlock(&dev->xmit_lock);
+				netif_tx_unlock(dev);
 			} 
 			spin_lock(&dev->queue_lock);
 			q = dev->qdisc;
@@ -187,7 +183,7 @@ static void dev_watchdog(unsigned long arg)
 {
 	struct net_device *dev = (struct net_device *)arg;
 
-	spin_lock(&dev->xmit_lock);
+	netif_tx_lock(dev);
 	if (dev->qdisc != &noop_qdisc) {
 		if (netif_device_present(dev) &&
 		    netif_running(dev) &&
@@ -203,7 +199,7 @@ static void dev_watchdog(unsigned long arg)
 				dev_hold(dev);
 		}
 	}
-	spin_unlock(&dev->xmit_lock);
+	netif_tx_unlock(dev);
 
 	dev_put(dev);
 }
@@ -227,17 +223,17 @@ void __netdev_watchdog_up(struct net_device *dev)
 
 static void dev_watchdog_up(struct net_device *dev)
 {
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	__netdev_watchdog_up(dev);
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 }
 
 static void dev_watchdog_down(struct net_device *dev)
 {
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	if (del_timer(&dev->watchdog_timer))
 		dev_put(dev);
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 }
 
 void netif_carrier_on(struct net_device *dev)
@@ -582,7 +578,7 @@ void dev_deactivate(struct net_device *dev)
 	while (test_bit(__LINK_STATE_SCHED, &dev->state))
 		yield();
 
-	spin_unlock_wait(&dev->xmit_lock);
+	spin_unlock_wait(&dev->_xmit_lock);
 }
 
 void dev_init_scheduler(struct net_device *dev)
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 79b8ef34c6e4..4c16ad57a3e4 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -302,20 +302,17 @@ restart:
 
 		switch (teql_resolve(skb, skb_res, slave)) {
 		case 0:
-			if (spin_trylock(&slave->xmit_lock)) {
-				slave->xmit_lock_owner = smp_processor_id();
+			if (netif_tx_trylock(slave)) {
 				if (!netif_queue_stopped(slave) &&
 				    slave->hard_start_xmit(skb, slave) == 0) {
-					slave->xmit_lock_owner = -1;
-					spin_unlock(&slave->xmit_lock);
+					netif_tx_unlock(slave);
 					master->slaves = NEXT_SLAVE(q);
 					netif_wake_queue(dev);
 					master->stats.tx_packets++;
 					master->stats.tx_bytes += len;
 					return 0;
 				}
-				slave->xmit_lock_owner = -1;
-				spin_unlock(&slave->xmit_lock);
+				netif_tx_unlock(slave);
 			}
 			if (netif_queue_stopped(dev))
 				busy = 1;
-- 
cgit v1.2.3-71-gd317


From 364c6badde0dd62a0a38e5ed67f85d87d6665780 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 9 Jun 2006 16:10:40 -0700
Subject: [NET]: Clean up skb_linearize

The linearisation operation doesn't need to be super-optimised.  So we can
replace __skb_linearize with __pskb_pull_tail which does the same thing but
is more general.

Also, most users of skb_linearize end up testing whether the skb is linear
or not so it helps to make skb_linearize do just that.

Some callers of skb_linearize also use it to copy cloned data, so it's
useful to have a new function skb_linearize_cow to copy the data if it's
either non-linear or cloned.

Last but not least, I've removed the gfp argument since nobody uses it
anymore.  If it's ever needed we can easily add it back.

Misc bugs fixed by this patch:

* via-velocity error handling (also, no SG => no frags)

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/block/aoe/aoenet.c |  3 +--
 drivers/net/mv643xx_eth.c  |  2 +-
 drivers/net/via-velocity.c | 10 +++++---
 include/linux/skbuff.h     | 24 +++++++++++++++---
 net/core/dev.c             | 63 ++--------------------------------------------
 net/decnet/dn_nsp_in.c     |  3 +--
 net/decnet/dn_route.c      |  3 +--
 net/ipv4/ipcomp.c          | 11 +++-----
 net/ipv6/ipcomp6.c         | 11 +++-----
 9 files changed, 39 insertions(+), 91 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
index fdff774b8ab9..c1434ed11880 100644
--- a/drivers/block/aoe/aoenet.c
+++ b/drivers/block/aoe/aoenet.c
@@ -116,8 +116,7 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
 	skb = skb_share_check(skb, GFP_ATOMIC);
 	if (skb == NULL)
 		return 0;
-	if (skb_is_nonlinear(skb))
-	if (skb_linearize(skb, GFP_ATOMIC) < 0)
+	if (skb_linearize(skb))
 		goto exit;
 	if (!is_aoe_netif(ifp))
 		goto exit;
diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
index 411f4d809c47..625ff61c9988 100644
--- a/drivers/net/mv643xx_eth.c
+++ b/drivers/net/mv643xx_eth.c
@@ -1200,7 +1200,7 @@ static int mv643xx_eth_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	if (has_tiny_unaligned_frags(skb)) {
-		if ((skb_linearize(skb, GFP_ATOMIC) != 0)) {
+		if (__skb_linearize(skb)) {
 			stats->tx_dropped++;
 			printk(KERN_DEBUG "%s: failed to linearize tiny "
 					"unaligned fragment\n", dev->name);
diff --git a/drivers/net/via-velocity.c b/drivers/net/via-velocity.c
index ed1f837c8fda..2eb6b5f9ba0d 100644
--- a/drivers/net/via-velocity.c
+++ b/drivers/net/via-velocity.c
@@ -1899,6 +1899,13 @@ static int velocity_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	int pktlen = skb->len;
 
+#ifdef VELOCITY_ZERO_COPY_SUPPORT
+	if (skb_shinfo(skb)->nr_frags > 6 && __skb_linearize(skb)) {
+		kfree_skb(skb);
+		return 0;
+	}
+#endif
+
 	spin_lock_irqsave(&vptr->lock, flags);
 
 	index = vptr->td_curr[qnum];
@@ -1914,8 +1921,6 @@ static int velocity_xmit(struct sk_buff *skb, struct net_device *dev)
 	 */
 	if (pktlen < ETH_ZLEN) {
 		/* Cannot occur until ZC support */
-		if(skb_linearize(skb, GFP_ATOMIC))
-			return 0; 
 		pktlen = ETH_ZLEN;
 		memcpy(tdinfo->buf, skb->data, skb->len);
 		memset(tdinfo->buf + skb->len, 0, ETH_ZLEN - skb->len);
@@ -1933,7 +1938,6 @@ static int velocity_xmit(struct sk_buff *skb, struct net_device *dev)
 		int nfrags = skb_shinfo(skb)->nr_frags;
 		tdinfo->skb = skb;
 		if (nfrags > 6) {
-			skb_linearize(skb, GFP_ATOMIC);
 			memcpy(tdinfo->buf, skb->data, skb->len);
 			tdinfo->skb_dma[0] = tdinfo->buf_dma;
 			td_ptr->tdesc0.pktsize = 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fe2c58e5306f..830f58fa03a2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1169,18 +1169,34 @@ static inline int skb_can_coalesce(struct sk_buff *skb, int i,
 	return 0;
 }
 
+static inline int __skb_linearize(struct sk_buff *skb)
+{
+	return __pskb_pull_tail(skb, skb->data_len) ? 0 : -ENOMEM;
+}
+
 /**
  *	skb_linearize - convert paged skb to linear one
  *	@skb: buffer to linarize
- *	@gfp: allocation mode
  *
  *	If there is no free memory -ENOMEM is returned, otherwise zero
  *	is returned and the old skb data released.
  */
-extern int __skb_linearize(struct sk_buff *skb, gfp_t gfp);
-static inline int skb_linearize(struct sk_buff *skb, gfp_t gfp)
+static inline int skb_linearize(struct sk_buff *skb)
+{
+	return skb_is_nonlinear(skb) ? __skb_linearize(skb) : 0;
+}
+
+/**
+ *	skb_linearize_cow - make sure skb is linear and writable
+ *	@skb: buffer to process
+ *
+ *	If there is no free memory -ENOMEM is returned, otherwise zero
+ *	is returned and the old skb data released.
+ */
+static inline int skb_linearize_cow(struct sk_buff *skb)
 {
-	return __skb_linearize(skb, gfp);
+	return skb_is_nonlinear(skb) || skb_cloned(skb) ?
+	       __skb_linearize(skb) : 0;
 }
 
 /**
diff --git a/net/core/dev.c b/net/core/dev.c
index 1b09f1cae46e..91361bc2b682 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1222,64 +1222,6 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 #define illegal_highdma(dev, skb)	(0)
 #endif
 
-/* Keep head the same: replace data */
-int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask)
-{
-	unsigned int size;
-	u8 *data;
-	long offset;
-	struct skb_shared_info *ninfo;
-	int headerlen = skb->data - skb->head;
-	int expand = (skb->tail + skb->data_len) - skb->end;
-
-	if (skb_shared(skb))
-		BUG();
-
-	if (expand <= 0)
-		expand = 0;
-
-	size = skb->end - skb->head + expand;
-	size = SKB_DATA_ALIGN(size);
-	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
-	if (!data)
-		return -ENOMEM;
-
-	/* Copy entire thing */
-	if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
-		BUG();
-
-	/* Set up shinfo */
-	ninfo = (struct skb_shared_info*)(data + size);
-	atomic_set(&ninfo->dataref, 1);
-	ninfo->tso_size = skb_shinfo(skb)->tso_size;
-	ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
-	ninfo->nr_frags = 0;
-	ninfo->frag_list = NULL;
-
-	/* Offset between the two in bytes */
-	offset = data - skb->head;
-
-	/* Free old data. */
-	skb_release_data(skb);
-
-	skb->head = data;
-	skb->end  = data + size;
-
-	/* Set up new pointers */
-	skb->h.raw   += offset;
-	skb->nh.raw  += offset;
-	skb->mac.raw += offset;
-	skb->tail    += offset;
-	skb->data    += offset;
-
-	/* We are no longer a clone, even if we were. */
-	skb->cloned    = 0;
-
-	skb->tail     += skb->data_len;
-	skb->data_len  = 0;
-	return 0;
-}
-
 #define HARD_TX_LOCK(dev, cpu) {			\
 	if ((dev->features & NETIF_F_LLTX) == 0) {	\
 		netif_tx_lock(dev);			\
@@ -1326,7 +1268,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 
 	if (skb_shinfo(skb)->frag_list &&
 	    !(dev->features & NETIF_F_FRAGLIST) &&
-	    __skb_linearize(skb, GFP_ATOMIC))
+	    __skb_linearize(skb))
 		goto out_kfree_skb;
 
 	/* Fragmented skb is linearized if device does not support SG,
@@ -1335,7 +1277,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 	 */
 	if (skb_shinfo(skb)->nr_frags &&
 	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
-	    __skb_linearize(skb, GFP_ATOMIC))
+	    __skb_linearize(skb))
 		goto out_kfree_skb;
 
 	/* If packet is not checksummed and device does not support
@@ -3473,7 +3415,6 @@ subsys_initcall(net_dev_init);
 EXPORT_SYMBOL(__dev_get_by_index);
 EXPORT_SYMBOL(__dev_get_by_name);
 EXPORT_SYMBOL(__dev_remove_pack);
-EXPORT_SYMBOL(__skb_linearize);
 EXPORT_SYMBOL(dev_valid_name);
 EXPORT_SYMBOL(dev_add_pack);
 EXPORT_SYMBOL(dev_alloc_name);
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 547523b41c81..a2ba9db1c376 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -801,8 +801,7 @@ got_it:
 		 * We linearize everything except data segments here.
 		 */
 		if (cb->nsp_flags & ~0x60) {
-			if (unlikely(skb_is_nonlinear(skb)) &&
-			    skb_linearize(skb, GFP_ATOMIC) != 0)
+			if (unlikely(skb_linearize(skb)))
 				goto free_out;
 		}
 
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index e172cf98d7fc..5abf7057af00 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -629,8 +629,7 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type
 			padlen);
 
         if (flags & DN_RT_PKT_CNTL) {
-		if (unlikely(skb_is_nonlinear(skb)) &&
-		    skb_linearize(skb, GFP_ATOMIC) != 0)
+		if (unlikely(skb_linearize(skb)))
 			goto dump_it;
 
                 switch(flags & DN_RT_CNTL_MSK) {
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 8e243589045f..3ed8b57a1002 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -80,15 +80,12 @@ out:
 
 static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb)
 {
-	int err = 0;
+	int err = -ENOMEM;
 	struct iphdr *iph;
 	struct ip_comp_hdr *ipch;
 
-	if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
-	    skb_linearize(skb, GFP_ATOMIC) != 0) {
-	    	err = -ENOMEM;
+	if (skb_linearize_cow(skb))
 	    	goto out;
-	}
 
 	skb->ip_summed = CHECKSUM_NONE;
 
@@ -158,10 +155,8 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
 		goto out_ok;
 	}
 
-	if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
-	    skb_linearize(skb, GFP_ATOMIC) != 0) {
+	if (skb_linearize_cow(skb))
 		goto out_ok;
-	}
 	
 	err = ipcomp_compress(x, skb);
 	iph = skb->nh.iph;
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index cec3be544b69..f28cd37feed3 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -65,7 +65,7 @@ static LIST_HEAD(ipcomp6_tfms_list);
 
 static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb)
 {
-	int err = 0;
+	int err = -ENOMEM;
 	struct ipv6hdr *iph;
 	struct ipv6_comp_hdr *ipch;
 	int plen, dlen;
@@ -74,11 +74,8 @@ static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb)
 	struct crypto_tfm *tfm;
 	int cpu;
 
-	if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
-		skb_linearize(skb, GFP_ATOMIC) != 0) {
-		err = -ENOMEM;
+	if (skb_linearize_cow(skb))
 		goto out;
-	}
 
 	skb->ip_summed = CHECKSUM_NONE;
 
@@ -142,10 +139,8 @@ static int ipcomp6_output(struct xfrm_state *x, struct sk_buff *skb)
 		goto out_ok;
 	}
 
-	if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
-		skb_linearize(skb, GFP_ATOMIC) != 0) {
+	if (skb_linearize_cow(skb))
 		goto out_ok;
-	}
 
 	/* compression */
 	plen = skb->len - hdr_len;
-- 
cgit v1.2.3-71-gd317


From 3cc0e873986fe594d0e96d07259b11f755325cb2 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 9 Jun 2006 16:13:38 -0700
Subject: [NET]: Warn in __skb_trim if skb is paged

It's better to warn and fail rather than rarely triggering BUG on paths
that incorrectly call skb_trim/__skb_trim on a non-linear skb.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 24 ++++++++++++------------
 net/core/skbuff.c      |  7 ++-----
 2 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 830f58fa03a2..93e4db221585 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -975,15 +975,16 @@ static inline void skb_reserve(struct sk_buff *skb, int len)
 #define NET_SKB_PAD	16
 #endif
 
-extern int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc);
+extern int ___pskb_trim(struct sk_buff *skb, unsigned int len);
 
 static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
 {
-	if (!skb->data_len) {
-		skb->len  = len;
-		skb->tail = skb->data + len;
-	} else
-		___pskb_trim(skb, len, 0);
+	if (unlikely(skb->data_len)) {
+		WARN_ON(1);
+		return;
+	}
+	skb->len  = len;
+	skb->tail = skb->data + len;
 }
 
 /**
@@ -993,6 +994,7 @@ static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
  *
  *	Cut the length of a buffer down by removing data from the tail. If
  *	the buffer is already under the length specified it is not modified.
+ *	The skb must be linear.
  */
 static inline void skb_trim(struct sk_buff *skb, unsigned int len)
 {
@@ -1003,12 +1005,10 @@ static inline void skb_trim(struct sk_buff *skb, unsigned int len)
 
 static inline int __pskb_trim(struct sk_buff *skb, unsigned int len)
 {
-	if (!skb->data_len) {
-		skb->len  = len;
-		skb->tail = skb->data+len;
-		return 0;
-	}
-	return ___pskb_trim(skb, len, 1);
+	if (skb->data_len)
+		return ___pskb_trim(skb, len);
+	__skb_trim(skb, len);
+	return 0;
 }
 
 static inline int pskb_trim(struct sk_buff *skb, unsigned int len)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 96cdcbe24ba2..bb7210f4005e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -801,12 +801,10 @@ struct sk_buff *skb_pad(struct sk_buff *skb, int pad)
 	return nskb;
 }	
  
-/* Trims skb to length len. It can change skb pointers, if "realloc" is 1.
- * If realloc==0 and trimming is impossible without change of data,
- * it is BUG().
+/* Trims skb to length len. It can change skb pointers.
  */
 
-int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc)
+int ___pskb_trim(struct sk_buff *skb, unsigned int len)
 {
 	int offset = skb_headlen(skb);
 	int nfrags = skb_shinfo(skb)->nr_frags;
@@ -816,7 +814,6 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc)
 		int end = offset + skb_shinfo(skb)->frags[i].size;
 		if (end > len) {
 			if (skb_cloned(skb)) {
-				BUG_ON(!realloc);
 				if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
 					return -ENOMEM;
 			}
-- 
cgit v1.2.3-71-gd317


From 35089bb203f44e33b6bbb6c4de0b0708f9a48921 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Tue, 13 Jun 2006 22:33:04 -0700
Subject: [TCP]: Add tcp_slow_start_after_idle sysctl.

A lot of people have asked for a way to disable tcp_cwnd_restart(),
and it seems reasonable to add a sysctl to do that.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 7 +++++++
 include/linux/sysctl.h                 | 1 +
 include/net/tcp.h                      | 1 +
 net/ipv4/sysctl_net_ipv4.c             | 8 ++++++++
 net/ipv4/tcp_output.c                  | 6 +++++-
 5 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index f12007b80a46..d46338af6002 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -362,6 +362,13 @@ tcp_workaround_signed_windows - BOOLEAN
 	not receive a window scaling option from them.
 	Default: 0
 
+tcp_slow_start_after_idle - BOOLEAN
+	If set, provide RFC2861 behavior and time out the congestion
+	window after an idle period.  An idle period is defined at
+	the current RTO.  If unset, the congestion window will not
+	be timed out after an idle period.
+	Default: 1
+
 IP Variables:
 
 ip_local_port_range - 2 INTEGERS
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 98338ed2c0b6..cee944dbdcd4 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -405,6 +405,7 @@ enum
 	NET_TCP_BASE_MSS=114,
 	NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115,
 	NET_TCP_DMA_COPYBREAK=116,
+	NET_TCP_SLOW_START_AFTER_IDLE=117,
 };
 
 enum {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index de88c5472bfc..bfc71f954bbe 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -227,6 +227,7 @@ extern int sysctl_tcp_abc;
 extern int sysctl_tcp_mtu_probing;
 extern int sysctl_tcp_base_mss;
 extern int sysctl_tcp_workaround_signed_windows;
+extern int sysctl_tcp_slow_start_after_idle;
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e7fb665873c6..ce4cd5f35511 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -690,6 +690,14 @@ ctl_table ipv4_table[] = {
 		.proc_handler	= &proc_dointvec
 	},
 #endif
+	{
+		.ctl_name	= NET_TCP_SLOW_START_AFTER_IDLE,
+		.procname	= "tcp_slow_start_after_idle",
+		.data		= &sysctl_tcp_slow_start_after_idle,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f33c9dddaa12..07bb5a2b375e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -59,6 +59,9 @@ int sysctl_tcp_tso_win_divisor = 3;
 int sysctl_tcp_mtu_probing = 0;
 int sysctl_tcp_base_mss = 512;
 
+/* By default, RFC2861 behavior.  */
+int sysctl_tcp_slow_start_after_idle = 1;
+
 static void update_send_head(struct sock *sk, struct tcp_sock *tp,
 			     struct sk_buff *skb)
 {
@@ -138,7 +141,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	const u32 now = tcp_time_stamp;
 
-	if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)
+	if (sysctl_tcp_slow_start_after_idle &&
+	    (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
 		tcp_cwnd_restart(sk, __sk_dst_get(sk));
 
 	tp->lsndtime = now;
-- 
cgit v1.2.3-71-gd317


From 8648b3053bff39a7ee4c711d74268079c928a657 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 17 Jun 2006 22:06:05 -0700
Subject: [NET]: Add NETIF_F_GEN_CSUM and NETIF_F_ALL_CSUM

The current stack treats NETIF_F_HW_CSUM and NETIF_F_NO_CSUM
identically so we test for them in quite a few places.  For the sake
of brevity, I'm adding the macro NETIF_F_GEN_CSUM for these two.  We
also test the disjunct of NETIF_F_IP_CSUM and the other two in various
places, for that purpose I've added NETIF_F_ALL_CSUM.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c |  7 ++-----
 include/linux/netdevice.h       |  3 +++
 net/bridge/br_if.c              |  3 +--
 net/core/dev.c                  |  6 ++----
 net/core/ethtool.c              |  6 ++----
 net/ipv4/ip_output.c            |  2 +-
 net/ipv4/tcp.c                  | 10 ++--------
 7 files changed, 13 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 46326cdfb277..8171cae06688 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1199,8 +1199,7 @@ int bond_sethwaddr(struct net_device *bond_dev, struct net_device *slave_dev)
 }
 
 #define BOND_INTERSECT_FEATURES \
-	(NETIF_F_SG|NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM|\
-	NETIF_F_TSO|NETIF_F_UFO)
+	(NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_TSO | NETIF_F_UFO)
 
 /* 
  * Compute the common dev->feature set available to all slaves.  Some
@@ -1218,9 +1217,7 @@ static int bond_compute_features(struct bonding *bond)
 		features &= (slave->dev->features & BOND_INTERSECT_FEATURES);
 
 	if ((features & NETIF_F_SG) && 
-	    !(features & (NETIF_F_IP_CSUM |
-			  NETIF_F_NO_CSUM |
-			  NETIF_F_HW_CSUM)))
+	    !(features & NETIF_F_ALL_CSUM))
 		features &= ~NETIF_F_SG;
 
 	/* 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 067b9ccafd87..e432b743dda2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -312,6 +312,9 @@ struct net_device
 #define NETIF_F_LLTX		4096	/* LockLess TX */
 #define NETIF_F_UFO             8192    /* Can offload UDP Large Send*/
 
+#define NETIF_F_GEN_CSUM	(NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
+#define NETIF_F_ALL_CSUM	(NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM)
+
 	struct net_device	*next_sched;
 
 	/* Interface index. Unique device identifier	*/
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index f5d47bf4f967..90c95f59dc82 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -376,8 +376,7 @@ void br_features_recompute(struct net_bridge *br)
 	checksum = br->feature_mask & NETIF_F_IP_CSUM;
 
 	list_for_each_entry(p, &br->port_list, list) {
-		if (!(p->dev->features 
-		      & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)))
+		if (!(p->dev->features & NETIF_F_ALL_CSUM))
 			checksum = 0;
 		features &= p->dev->features;
 	}
diff --git a/net/core/dev.c b/net/core/dev.c
index 91361bc2b682..ab39fe17cb58 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1284,7 +1284,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 	 * checksumming for this protocol, complete checksumming here.
 	 */
 	if (skb->ip_summed == CHECKSUM_HW &&
-	    (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
+	    (!(dev->features & NETIF_F_GEN_CSUM) &&
 	     (!(dev->features & NETIF_F_IP_CSUM) ||
 	      skb->protocol != htons(ETH_P_IP))))
 	      	if (skb_checksum_help(skb, 0))
@@ -2789,9 +2789,7 @@ int register_netdevice(struct net_device *dev)
 
 	/* Fix illegal SG+CSUM combinations. */
 	if ((dev->features & NETIF_F_SG) &&
-	    !(dev->features & (NETIF_F_IP_CSUM |
-			       NETIF_F_NO_CSUM |
-			       NETIF_F_HW_CSUM))) {
+	    !(dev->features & NETIF_F_ALL_CSUM)) {
 		printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
 		       dev->name);
 		dev->features &= ~NETIF_F_SG;
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e6f76106a99b..3f269e4e9438 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -30,7 +30,7 @@ u32 ethtool_op_get_link(struct net_device *dev)
 
 u32 ethtool_op_get_tx_csum(struct net_device *dev)
 {
-	return (dev->features & (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM)) != 0;
+	return (dev->features & NETIF_F_ALL_CSUM) != 0;
 }
 
 int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
@@ -551,9 +551,7 @@ static int ethtool_set_sg(struct net_device *dev, char __user *useraddr)
 		return -EFAULT;
 
 	if (edata.data && 
-	    !(dev->features & (NETIF_F_IP_CSUM |
-			       NETIF_F_NO_CSUM |
-			       NETIF_F_HW_CSUM)))
+	    !(dev->features & NETIF_F_ALL_CSUM))
 		return -EINVAL;
 
 	return __ethtool_set_sg(dev, edata.data);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d4bb3fae4e49..8538aac3d148 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -840,7 +840,7 @@ int ip_append_data(struct sock *sk,
 	 */
 	if (transhdrlen &&
 	    length + fragheaderlen <= mtu &&
-	    rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
+	    rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
 	    !exthdrlen)
 		csummode = CHECKSUM_HW;
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ff6ccda9ff46..74998f250071 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -622,14 +622,10 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 	ssize_t res;
 	struct sock *sk = sock->sk;
 
-#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
-
 	if (!(sk->sk_route_caps & NETIF_F_SG) ||
-	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
+	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
 		return sock_no_sendpage(sock, page, offset, size, flags);
 
-#undef TCP_ZC_CSUM_FLAGS
-
 	lock_sock(sk);
 	TCP_CHECK_TIMER(sk);
 	res = do_tcp_sendpages(sk, &page, offset, size, flags);
@@ -726,9 +722,7 @@ new_segment:
 				/*
 				 * Check whether we can use HW checksum.
 				 */
-				if (sk->sk_route_caps &
-				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
-				     NETIF_F_HW_CSUM))
+				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
 					skb->ip_summed = CHECKSUM_HW;
 
 				skb_entail(sk, tp, skb);
-- 
cgit v1.2.3-71-gd317


From c7ce1ae21223fe1f905feba272bc14b87994a57d Mon Sep 17 00:00:00 2001
From: Tushar Gohad <tgohad@mvista.com>
Date: Sat, 17 Jun 2006 22:54:03 -0700
Subject: [PFKEYV2]: Fix inconsistent typing in struct sadb_x_kmprivate.

Signed-off-by: Tushar Gohad <tgohad@mvista.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pfkeyv2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pfkeyv2.h b/include/linux/pfkeyv2.h
index bac0fb389cf1..d5dd471da225 100644
--- a/include/linux/pfkeyv2.h
+++ b/include/linux/pfkeyv2.h
@@ -159,7 +159,7 @@ struct sadb_spirange {
 struct sadb_x_kmprivate {
 	uint16_t	sadb_x_kmprivate_len;
 	uint16_t	sadb_x_kmprivate_exttype;
-	u_int32_t	sadb_x_kmprivate_reserved;
+	uint32_t	sadb_x_kmprivate_reserved;
 } __attribute__((packed));
 /* sizeof(struct sadb_x_kmprivate) == 8 */
 
-- 
cgit v1.2.3-71-gd317


From 557240b48e2dc4f6fa878afc3fc767ad745ca7ed Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Mon, 19 Jun 2006 18:16:01 -0700
Subject: Add support for suspending and resuming the whole console subsystem

Trying to suspend/resume with console messages flying all around is
doomed to failure, when the devices that the messages are trying to
go to are being shut down.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/console.h |  4 ++++
 kernel/power/main.c     |  2 ++
 kernel/printk.c         | 28 +++++++++++++++++++++++++++-
 3 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 721371382ae5..08734e660d41 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -117,6 +117,10 @@ extern void console_stop(struct console *);
 extern void console_start(struct console *);
 extern int is_console_locked(void);
 
+/* Suspend and resume console messages over PM events */
+extern void suspend_console(void);
+extern void resume_console(void);
+
 /* Some debug stub to catch some of the obvious races in the VT code */
 #if 1
 #define WARN_CONSOLE_UNLOCKED()	WARN_ON(!is_console_locked() && !oops_in_progress)
diff --git a/kernel/power/main.c b/kernel/power/main.c
index a6d9ef46009e..0a907f0dc56b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -86,6 +86,7 @@ static int suspend_prepare(suspend_state_t state)
 			goto Thaw;
 	}
 
+	suspend_console();
 	if ((error = device_suspend(PMSG_SUSPEND))) {
 		printk(KERN_ERR "Some devices failed to suspend\n");
 		goto Finish;
@@ -133,6 +134,7 @@ int suspend_enter(suspend_state_t state)
 static void suspend_finish(suspend_state_t state)
 {
 	device_resume();
+	resume_console();
 	thaw_processes();
 	enable_nonboot_cpus();
 	if (pm_ops && pm_ops->finish)
diff --git a/kernel/printk.c b/kernel/printk.c
index c056f3324432..19a955619294 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -67,6 +67,7 @@ EXPORT_SYMBOL(oops_in_progress);
  * driver system.
  */
 static DECLARE_MUTEX(console_sem);
+static DECLARE_MUTEX(secondary_console_sem);
 struct console *console_drivers;
 /*
  * This is used for debugging the mess that is the VT code by
@@ -76,7 +77,7 @@ struct console *console_drivers;
  * path in the console code where we end up in places I want
  * locked without the console sempahore held
  */
-static int console_locked;
+static int console_locked, console_suspended;
 
 /*
  * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
@@ -697,6 +698,23 @@ int __init add_preferred_console(char *name, int idx, char *options)
 	return 0;
 }
 
+/**
+ * suspend_console - suspend the console subsystem
+ *
+ * This disables printk() while we go into suspend states
+ */
+void suspend_console(void)
+{
+	acquire_console_sem();
+	console_suspended = 1;
+}
+
+void resume_console(void)
+{
+	console_suspended = 0;
+	release_console_sem();
+}
+
 /**
  * acquire_console_sem - lock the console system for exclusive use.
  *
@@ -708,6 +726,10 @@ int __init add_preferred_console(char *name, int idx, char *options)
 void acquire_console_sem(void)
 {
 	BUG_ON(in_interrupt());
+	if (console_suspended) {
+		down(&secondary_console_sem);
+		return;
+	}
 	down(&console_sem);
 	console_locked = 1;
 	console_may_schedule = 1;
@@ -750,6 +772,10 @@ void release_console_sem(void)
 	unsigned long _con_start, _log_end;
 	unsigned long wake_klogd = 0;
 
+	if (console_suspended) {
+		up(&secondary_console_sem);
+		return;
+	}
 	for ( ; ; ) {
 		spin_lock_irqsave(&logbuf_lock, flags);
 		wake_klogd |= log_start - log_end;
-- 
cgit v1.2.3-71-gd317